Commit 61c6eb5f authored by Gael  MILLOT's avatar Gael MILLOT
Browse files

v1.0.0 release

parent 9e08f7b4
#### DESCRIPTION
Project 12231 submitted by Lars Rogge, Institut Pasteur, Paris, France
Perfom Limma and Random forest analysis using a discovery cohorte of patient (for model building) and using a validation cohorte for validating the model.
Perfom Limma and Random forest analysis using a discovery cohorte of patients (for model building) and using a validation cohorte for validating the model.
Dedicated to SLURM (TARS CLuster) analysis
......@@ -25,10 +25,10 @@ Download the desired Tagged version, never the current master.
Open rogge_12231.conf
Set the parameters
Start the job using a command like:
STARTTIME_SH=$(date) ; JOB_ID_SH=$(date -d "$STARTTIME_SH" +%s) ; sh /pasteur/homes/gmillot/rogge12231/rogge_12231_workflow.sh | tee /pasteur/homes/gmillot/rogge12231/${JOB_ID_SH}_workflow_log
STARTTIME_SH=$(date) ; JOB_ID_SH=$(date -d "$STARTTIME_SH" +%s) ; sh /pasteur/homes/gmillot/Git_versions_to_use/rogge_12231-v1.0.0/rogge_12231_workflow.sh -c /pasteur/homes/gmillot/Git_versions_to_use/rogge_12231-v1.0.0/rogge_12231.conf | tee /pasteur/homes/gmillot/rogge12231/${JOB_ID_SH}_workflow_log
#### FILE DESCRIPTIONS
#### FILE DESCRIPTION
rogge_12231.conf config file. Parameters need to be set by the user inside this file before running
rogge_12231_workflow.sh file allowing checkings and SLURM job submission
......@@ -37,6 +37,20 @@ rogge_12231_whole_VGVR.R R script run by SLURM
The three files must be in the same directory before running
#### OUTPUT DESCRIPTION
## For each job submitted to SLURM:
workflow_log messages printed by the rogge_12231_workflow.sh script
slurm.out nothing relevant
## For each loop (defined by LOOP_NB_CONF in rogge_12231.conf):
graphs graphs
r_rogge_12231_1549557200_report.txt full report of the rogge_12231_whole_VGVR.R script
res_data.RData saved objects
rogge_12231_slurm_jobID.txt SLURM job ID
r_console_messages.txt messages printed by the R console
#### WEB LOCATION
Check for updated versions (most recent tags) at https://gitlab.pasteur.fr/gmillot/rogge_12231/tags
......
......@@ -16,7 +16,7 @@ alias R_conf='module load gcc/4.7.4 R/3.5.0 ; Rscript'
export r_main_functions_conf=/pasteur/homes/gmillot/Git_versions_to_use/cute_little_R_functions-v4.5.0/cute_little_R_functions.R
export bash_main_functions_conf=/pasteur/homes/gmillot/Git_versions_to_use/little_bash_functions-v1.0.0/little_bash_functions-v1.0.0.sh
# export r_ini_check_conf=/pasteur/homes/gmillot/rogge12231/rogge_12231_ini.R
export r_main_conf=/pasteur/homes/gmillot/rogge12231/rogge_12231_whole_VGVR.R
export r_main_conf=/pasteur/homes/gmillot/Git_versions_to_use/rogge_12231-v1.0.0/rogge_12231_whole_VGVR.R
#_______________________________________________________________________________________________
# GENERAL PARAMETERS
......@@ -48,6 +48,7 @@ PROJECT_NAME_CONF="rogge_12231"
################ graphical parameters
R_PDF_DISPLAY_CONF="TRUE" # will be converted in the R script
LABEL_SIZE=4
R_OPT_TXT_CONF="notxt"
......
......@@ -25,7 +25,7 @@ erase.graphs = TRUE # write TRUE to erase all the graphic windows in R before st
sink(stdout(), type = "message")
script <- commandArgs(trailingOnly = FALSE)[1] # recover script name, e.g., r_341_conf $check_lod_gael_conf
args <- commandArgs(trailingOnly = TRUE) # recover arguments written after the call of the Rscript, ie after r_341_conf $check_lod_gael_conf
tempo.arg.names <- c("path.lib", "path.in", "path.out", "path.function1", "file.name1", "name.source.file1", "ml.bootstrap.nb", "project.name", "activate.pdf", "optional.text", "slurm.loop.nb") # objects names exactly in the same order as in the bash code and recovered in args
tempo.arg.names <- c("path.lib", "path.in", "path.out", "path.function1", "file.name1", "name.source.file1", "ml.bootstrap.nb", "project.name", "activate.pdf", "label.size", "optional.text", "slurm.loop.nb") # objects names exactly in the same order as in the bash code and recovered in args
if(length(args) != length(tempo.arg.names)){
tempo.cat <- paste0("\n\n================\n\nERROR: THE NUMBER OF ELEMENTS IN args (", length(args),") IS DIFFERENT FROM THE NUMBER OF ELEMENTS IN tempo.arg.names (", length(tempo.arg.names),")\nargs:", paste0(args, collapse = ","), "\ntempo.arg.names:", paste0(tempo.arg.names, collapse = ","), "\n\n================\n\n")
stop(tempo.cat)
......@@ -205,7 +205,7 @@ if(any(arg.check) == TRUE){
################ Ignition
ini.time <- as.numeric(Sys.time()) # time of process begin, converted into seconds
analysis.nb <- trunc(ini.time) # to provide a specific number ot each analysis
log.file <- paste0("loop", slurm.loop.nb,"_r_", project.name, "_", analysis.nb,"report.txt")
log.file <- paste0("loop", slurm.loop.nb,"_r_", project.name, "_", analysis.nb,"_report.txt")
# name.dir <- paste0(project.name, "_", analysis.nb)
# path.out<-paste0(path.out, name.dir)
# suppressWarnings(dir.create(path.out))
......@@ -311,6 +311,9 @@ fit <- lmFit(X, design)
fit <- contrasts.fit(fit, contrasts)
fit <- eBayes(fit)
ttab <- topTable(fit, coef = "R - NR", adjust.method = "BH", p.value = 0.05, number = nrow(X))
fun_export_data(path = path.out, data = "################################ LIMMA ANALYSIS", output = log.file, sep = 4)
fun_export_data(path = path.out, data = ttab , output = log.file)
backup.name <- c(backup.name, "ttab")
# ```
#
# We performed a differential analysis between responders and non-responders with `limma [@Ritchie2015]. The differential analysis was performed only on the discovery cohort.
......@@ -319,7 +322,7 @@ ttab <- topTable(fit, coef = "R - NR", adjust.method = "BH", p.value = 0.05, num
#
# ```{r showtoptab, results='asis'}
# pander::pandoc.table(ttab)
write.table(ttab, file = paste0(path.out, "/loop", slurm.loop.nb, "_limma.res.table.txt"), append = FALSE, quote = FALSE, row.names = TRUE, col.names = NA, sep = "\t")
# ```
#
#
......@@ -404,7 +407,17 @@ plt <- plotHyperParsEffect(reff, x = "fw.abs", y = "mmce.test.mean")
dev.set(pdf.nb)
plt + geom_point(colour = "red") +
ggtitle("Evaluating optimal number of features") +
theme_bw() + labs(x="Number of selected features", y="Mean misclassification error on the test sets")
theme_bw() +
labs(x="Number of selected features", y="Mean misclassification error on the test sets") +
theme(
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size),
axis.text = ggplot2::element_text(size = label.size),
axis.title = ggplot2::element_text(size = label.size),
legend.text = ggplot2::element_text(size = label.size),
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
# ```
#
# At the end of the learning procedure, there is one optimal value for the number of genes to include in the model. We use the same strategy as in the learning procedure to select this number of genes and train the random forest algorithm on them. The name of the features is given below.
......@@ -451,7 +464,16 @@ feature_importance <- getFeatureImportance(mod$learner.model$next.model)
df_imp <- data.frame(features = names(feature_importance$res),
importance = t(feature_importance$res), stringsAsFactors = FALSE)
dev.set(pdf.nb)
ggplot2::ggplot(data = df_imp, aes(x=features, y=importance)) + geom_bar(stat = "identity") + theme_bw() + theme(axis.text.x = element_text(angle=90, vjust=0.5, hjust=1))
ggplot2::ggplot(data = df_imp, aes(x=features, y=importance)) + geom_bar(stat = "identity") + theme_bw() +
theme(
axis.text.x = element_text(angle=90, vjust=0.5, hjust=1),
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size),
axis.text = ggplot2::element_text(size = label.size),
axis.title = ggplot2::element_text(size = label.size),
legend.text = ggplot2::element_text(size = label.size),
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
# ```
#
#
......@@ -464,13 +486,13 @@ annot.rows <- dat[, "Y", drop = FALSE]
colnames(annot.rows) <- "ASDAS R/NR"
subdat <- df.tmp[, sort(genelist)]
ann_colors = list("ASDAS R/NR" = c(R = "steelblue", NR = "tomato"))
pheatmap(t(scale(subdat)), annotation_col = annot.rows, cluster_cols = FALSE, show_colnames = FALSE, border_color = NA, color = colorRampPalette(c("red", "black", "green"))(499), annotation_colors = ann_colors)
pheatmap(t(scale(subdat)), annotation_col = annot.rows, cluster_cols = FALSE, show_colnames = FALSE, border_color = NA, color = colorRampPalette(c("red", "black", "green"))(499), annotation_colors = ann_colors, fontsize_row = label.size, fontsize_col = label.size)
# ```
#
# The correlation of the selected features is reprsented hereafter. We see that some of the selected features are strongly correlated.
#
# ```{r corrplot}
corrplot(cor(subdat), tl.cex = 0.6, tl.col = "black")
corrplot(cor(subdat), tl.col = "black", tl.cex = label.size / 10)
# ```
#
# ## 4.2 Validate the model
......@@ -574,9 +596,33 @@ fun_export_data(path = path.out, data = pred1.genes.rf$data, output = log.file,
#
# ```{r roc.rf}
dev.set(pdf.nb)
roc1.genes.rf <- plotROCCurves(generateThreshVsPerfData(pred1.genes.rf, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 1.genes (only genes) \n AUC=%.2f", performance(pred1.genes.rf, auc)))
roc2.crp.rf <- plotROCCurves(generateThreshVsPerfData(pred2.crp.rf, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 2.crp (only CRP) \n AUC=%.2f", performance(pred2.crp.rf, auc)))
roc3.genes.crp.rf <- plotROCCurves(generateThreshVsPerfData(pred3.genes.crp.rf, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 3.genes.crp (genes + CRP) \n AUC=%.2f", performance(pred3.genes.crp.rf, auc)))
roc1.genes.rf <- plotROCCurves(generateThreshVsPerfData(pred1.genes.rf, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 1.genes (only genes) \n AUC=%.2f", performance(pred1.genes.rf, auc))) +
theme(
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size),
axis.text = ggplot2::element_text(size = label.size),
axis.title = ggplot2::element_text(size = label.size),
legend.text = ggplot2::element_text(size = label.size),
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
roc2.crp.rf <- plotROCCurves(generateThreshVsPerfData(pred2.crp.rf, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 2.crp (only CRP) \n AUC=%.2f", performance(pred2.crp.rf, auc))) +
theme(
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size),
axis.text = ggplot2::element_text(size = label.size),
axis.title = ggplot2::element_text(size = label.size),
legend.text = ggplot2::element_text(size = label.size),
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
roc3.genes.crp.rf <- plotROCCurves(generateThreshVsPerfData(pred3.genes.crp.rf, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 3.genes.crp (genes + CRP) \n AUC=%.2f", performance(pred3.genes.crp.rf, auc))) +
theme(
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size),
axis.text = ggplot2::element_text(size = label.size),
axis.title = ggplot2::element_text(size = label.size),
legend.text = ggplot2::element_text(size = label.size),
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
data.roc1.genes.rf <- ggplot2::ggplot_build(roc1.genes.rf)$data[[1]]
data.roc2.crp.rf <- ggplot2::ggplot_build(roc2.crp.rf)$data[[1]]
data.roc3.genes.crp.rf <- ggplot2::ggplot_build(roc3.genes.crp.rf)$data[[1]]
......@@ -623,9 +669,33 @@ backup.name <- c(backup.name, "confmat1.genes.logreg", "confmat2.crp.logreg", "c
#
# ```{r roc.logreg}
dev.set(pdf.nb)
roc1.genes.logreg <- plotROCCurves(generateThreshVsPerfData(pred1.genes.logreg, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 1.genes (only genes) \n AUC=%.2f", performance(pred1.genes.logreg, auc)))
roc2.crp.logreg <- plotROCCurves(generateThreshVsPerfData(pred2.crp.logreg, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 2.crp (only CRP) \n AUC=%.2f", performance(pred2.crp.logreg, auc)))
roc3.genes.crp.logreg <- plotROCCurves(generateThreshVsPerfData(pred3.genes.crp.logreg, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 3.genes.crp (genes + CRP) \n AUC=%.2f", performance(pred3.genes.crp.logreg, auc)))
roc1.genes.logreg <- plotROCCurves(generateThreshVsPerfData(pred1.genes.logreg, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 1.genes (only genes) \n AUC=%.2f", performance(pred1.genes.logreg, auc))) +
theme(
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size),
axis.text = ggplot2::element_text(size = label.size),
axis.title = ggplot2::element_text(size = label.size),
legend.text = ggplot2::element_text(size = label.size),
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
roc2.crp.logreg <- plotROCCurves(generateThreshVsPerfData(pred2.crp.logreg, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 2.crp (only CRP) \n AUC=%.2f", performance(pred2.crp.logreg, auc))) +
theme(
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size),
axis.text = ggplot2::element_text(size = label.size),
axis.title = ggplot2::element_text(size = label.size),
legend.text = ggplot2::element_text(size = label.size),
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
roc3.genes.crp.logreg <- plotROCCurves(generateThreshVsPerfData(pred3.genes.crp.logreg, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 3.genes.crp (genes + CRP) \n AUC=%.2f", performance(pred3.genes.crp.logreg, auc))) +
theme(
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size),
axis.text = ggplot2::element_text(size = label.size),
axis.title = ggplot2::element_text(size = label.size),
legend.text = ggplot2::element_text(size = label.size),
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
data.roc1.genes.logreg <- ggplot2::ggplot_build(roc1.genes.logreg)$data[[1]]
data.roc2.crp.logreg <- ggplot2::ggplot_build(roc2.crp.logreg)$data[[1]]
data.roc3.genes.crp.logreg <- ggplot2::ggplot_build(roc3.genes.crp.logreg)$data[[1]]
......@@ -672,9 +742,33 @@ backup.name <- c(backup.name, "confmat1.genes.rpart", "confmat2.crp.rpart", "con
#
# ```{r roc.rpart}
dev.set(pdf.nb)
roc1.genes.rpart <- plotROCCurves(generateThreshVsPerfData(pred1.genes.rpart, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 1.genes (only genes) \n AUC=%.2f", performance(pred1.genes.rpart, auc)))
roc2.crp.rpart <- plotROCCurves(generateThreshVsPerfData(pred2.crp.rpart, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 2.crp (only CRP) \n AUC=%.2f", performance(pred2.crp.rpart, auc)))
roc3.genes.crp.rpart <- plotROCCurves(generateThreshVsPerfData(pred3.genes.crp.rpart, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 3.genes.crp (genes + CRP) \n AUC=%.2f", performance(pred3.genes.crp.rpart, auc)))
roc1.genes.rpart <- plotROCCurves(generateThreshVsPerfData(pred1.genes.rpart, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 1.genes (only genes) \n AUC=%.2f", performance(pred1.genes.rpart, auc))) +
theme(
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size),
axis.text = ggplot2::element_text(size = label.size),
axis.title = ggplot2::element_text(size = label.size),
legend.text = ggplot2::element_text(size = label.size),
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
roc2.crp.rpart <- plotROCCurves(generateThreshVsPerfData(pred2.crp.rpart, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 2.crp (only CRP) \n AUC=%.2f", performance(pred2.crp.rpart, auc))) +
theme(
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size),
axis.text = ggplot2::element_text(size = label.size),
axis.title = ggplot2::element_text(size = label.size),
legend.text = ggplot2::element_text(size = label.size),
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
roc3.genes.crp.rpart <- plotROCCurves(generateThreshVsPerfData(pred3.genes.crp.rpart, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 3.genes.crp (genes + CRP) \n AUC=%.2f", performance(pred3.genes.crp.rpart, auc))) +
theme(
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size),
axis.text = ggplot2::element_text(size = label.size),
axis.title = ggplot2::element_text(size = label.size),
legend.text = ggplot2::element_text(size = label.size),
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
data.roc1.genes.rpart <- ggplot2::ggplot_build(roc1.genes.rpart)$data[[1]]
data.roc2.crp.rpart <- ggplot2::ggplot_build(roc2.crp.rpart)$data[[1]]
data.roc3.genes.crp.rpart <- ggplot2::ggplot_build(roc3.genes.crp.rpart)$data[[1]]
......@@ -702,7 +796,15 @@ ggbox <- ggplot(data = boxdat_melt, aes(x=Y, y=value, colour=Y, shape = CV, line
geom_beeswarm(dodge.width = 0.75, alpha = 0.7) +
geom_boxplot(outlier.shape = NA, fill = NA) +
facet_wrap(~ Gene, ncol = 5) +
theme_bw()
theme_bw() +
theme(
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size),
axis.text = ggplot2::element_text(size = label.size),
axis.title = ggplot2::element_text(size = label.size),
legend.text = ggplot2::element_text(size = label.size),
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
dev.set(pdf.nb)
ggbox
# ```
......
......@@ -254,8 +254,9 @@ CONF_VAR_CHECK=(
"ML_BOOTSTRAP_NB_CONF"
"LOOP_NB_CONF"
"PROJECT_NAME_CONF"
"R_OPT_TXT_CONF"
"R_PDF_DISPLAY_CONF"
"LABEL_SIZE"
"R_OPT_TXT_CONF"
# SLURM PARAMETERS
"MAX_RUNNING_TIME_CONF"
"NB_CPU_PER_TASK_CONF"
......@@ -369,8 +370,8 @@ for i in `seq 1 $LOOP_NB_CONF` ; do
# write the previous line exactly like this, with no comments, otherwise do not work
source $CONFIG_FILE # never forget this because another environment
# next line cannot be put outside (which would have been convenient -> put into the SUP_VAR_tempo for display. But SUP_VAR_tempo for sbatch do not like spaces)
R_PROC="R_conf ${r_main_conf} $PATH_LIB_CONF $PATH_IN_CONF ${OUTPUT_DIR_PATH_tempo2} $PATH_FUNCTION1_CONF $FILE_NAME1_CONF $NAME_SOURCE_FILE1_CONF $ML_BOOTSTRAP_NB_CONF $PROJECT_NAME_CONF $R_PDF_DISPLAY_CONF $R_OPT_TXT_CONF $i"
R_PROC2="${R_PROC} &> ${OUTPUT_DIR_PATH_tempo2}r_console_messages.txt" # or "$R_PROC > ${OUTPUT_DIR_PATH_tempo2}r_console_messages.txt 2>&1" # to add the estderror in the stdout
R_PROC="R_conf ${r_main_conf} $PATH_LIB_CONF $PATH_IN_CONF ${OUTPUT_DIR_PATH_tempo2} $PATH_FUNCTION1_CONF $FILE_NAME1_CONF $NAME_SOURCE_FILE1_CONF $ML_BOOTSTRAP_NB_CONF $PROJECT_NAME_CONF $R_PDF_DISPLAY_CONF $LABEL_SIZE $R_OPT_TXT_CONF $i"
R_PROC2="${R_PROC} &> ${OUTPUT_DIR_PATH_tempo2}loop${i}_r_console_messages.txt" # or "$R_PROC > ${OUTPUT_DIR_PATH_tempo2}loop${i}_r_console_messages.txt 2>&1" # to add the estderror in the stdout
eval "$R_PROC2"
' | sbatch -p $DEDICATED_CONF --qos $QOS_CONF --time $MAX_RUNNING_TIME_CONF -c $NB_CPU_PER_TASK_CONF --mem-per-cpu $MEM_PER_CPU_CONF -J job_$i --mail-type END,FAIL --mail-user $MAIL_CONF --export $SUP_VAR_tempo | tee -a ${OUTPUT_DIR_PATH_tempo2}/loop${i}_${PROJECT_NAME_CONF}_slurm_jobID.txt # write all th echo from the $PROC alaso into a log file
# tricky part of this sbatch because the SUP_VAR_tempo will be used in the script piped to sbatch
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment