Commit 8fd104bf authored by Gael  MILLOT's avatar Gael MILLOT
Browse files

interm

parent fd03eca6
...@@ -6,8 +6,8 @@ ...@@ -6,8 +6,8 @@
#_______________________________________________________________________________________________ #_______________________________________________________________________________________________
# SOFTWARES (LOCALS OR IN MODULES) # SOFTWARES (LOCALS OR IN MODULES)
shopt -s expand_aliases # to be sure that alias are expended to the different environments shopt -s expand_aliases # to be sure that alias are expended to the different environments
alias R_conf='module load gcc/4.7.4 R/3.5.0 ; Rscript' alias R_conf='module load gcc/4.9.4 R/3.5.0 ; Rscript'
# alias R_conf='module load gcc/4.7.4 ; /pasteur/homes/gmillot/softwares/R/R-3.5.2/lib64/R/bin/Rscript' # alias R_conf='module load gcc/4.9.4 ; /pasteur/homes/gmillot/softwares/R/R-3.5.2/lib64/R/bin/Rscript'
#_______________________________________________________________________________________________ #_______________________________________________________________________________________________
# SCRIPTS TO RUN # SCRIPTS TO RUN
...@@ -57,7 +57,6 @@ PROJECT_NAME_CONF="rogge_12231" ...@@ -57,7 +57,6 @@ PROJECT_NAME_CONF="rogge_12231"
################ graphical parameters ################ graphical parameters
R_PDF_DISPLAY_CONF="TRUE" # will be converted in the R script
LABEL_SIZE=4 # size of the lables in graphs LABEL_SIZE=4 # size of the lables in graphs
R_OPT_TXT_CONF="no.txt" # optional text to add R_OPT_TXT_CONF="no.txt" # optional text to add
......
...@@ -13,7 +13,7 @@ erase.graphs <- TRUE # write TRUE to erase all the graphic windows in R before s ...@@ -13,7 +13,7 @@ erase.graphs <- TRUE # write TRUE to erase all the graphic windows in R before s
sink(stdout(), type = "message") # diverts R output to a connection sink(stdout(), type = "message") # diverts R output to a connection
script <- commandArgs(trailingOnly = FALSE)[1] # recover script name, e.g., r_341_conf $check_lod_gael_conf script <- commandArgs(trailingOnly = FALSE)[1] # recover script name, e.g., r_341_conf $check_lod_gael_conf
args <- commandArgs(trailingOnly = TRUE) # recover arguments written after the call of the Rscript, ie after r_341_conf $check_lod_gael_conf args <- commandArgs(trailingOnly = TRUE) # recover arguments written after the call of the Rscript, ie after r_341_conf $check_lod_gael_conf
tempo.arg.names <- c("path.lib", "path.in", "path.out", "path.function1", "project.name", "activate.pdf", "label.size", "optional.text", "slurm.loop.nb", "analysis.kind") # objects names exactly in the same order as in the bash code and recovered in args tempo.arg.names <- c("path.lib", "path.in", "path.out", "path.function1", "project.name", "label.size", "optional.text", "slurm.loop.nb", "analysis.kind") # objects names exactly in the same order as in the bash code and recovered in args
if(length(args) != length(tempo.arg.names)){ if(length(args) != length(tempo.arg.names)){
tempo.cat <- paste0("\n\n================\n\nERROR: THE NUMBER OF ELEMENTS IN args (", length(args),") IS DIFFERENT FROM THE NUMBER OF ELEMENTS IN tempo.arg.names (", length(tempo.arg.names),")\nargs:", paste0(args, collapse = ","), "\ntempo.arg.names:", paste0(tempo.arg.names, collapse = ","), "\n\n================\n\n") tempo.cat <- paste0("\n\n================\n\nERROR: THE NUMBER OF ELEMENTS IN args (", length(args),") IS DIFFERENT FROM THE NUMBER OF ELEMENTS IN tempo.arg.names (", length(tempo.arg.names),")\nargs:", paste0(args, collapse = ","), "\ntempo.arg.names:", paste0(tempo.arg.names, collapse = ","), "\n\n================\n\n")
stop(tempo.cat) stop(tempo.cat)
...@@ -57,7 +57,7 @@ erase.graphs <- TRUE ...@@ -57,7 +57,7 @@ erase.graphs <- TRUE
script <- "code ini v1.0.0" script <- "code ini v1.0.0"
project.name <-"rogge12231" project.name <-"rogge12231"
path.lib <- "C:/Users/Gael/Documents/R/win-library/3.5/" # absolute path of the library folder. Write "none" if not required path.lib <- "C:/Users/Gael/Documents/R/win-library/3.5/" # absolute path of the library folder. Write "none" if not required
path.in <- "Z:/rogge12231/rogge_12231_1550076020/" # absolute path of the data folder path.in <- "Z:/rogge12231/rogge_12231_1550160832/" # absolute path of the data folder
path.out <- "C:/Users/Gael/Desktop/" # absolute path of the output folder path.out <- "C:/Users/Gael/Desktop/" # absolute path of the output folder
path.function1 <- "C:/Users/Gael/Documents/Git_versions_to_use/cute_little_R_functions-v4.5.0/cute_little_R_functions.R" # Define the absolute pathway of the folder containing functions created by Gael Millot path.function1 <- "C:/Users/Gael/Documents/Git_versions_to_use/cute_little_R_functions-v4.5.0/cute_little_R_functions.R" # Define the absolute pathway of the folder containing functions created by Gael Millot
project.name <- "rogge_project" project.name <- "rogge_project"
...@@ -160,7 +160,6 @@ if(tempo$problem == FALSE & ! dir.exists(path.out)){ ...@@ -160,7 +160,6 @@ if(tempo$problem == FALSE & ! dir.exists(path.out)){
# path.function1 fully tested above # path.function1 fully tested above
tempo <- fun_param_check(data = path.function1, class = "character", length = 1) ; eval(ee) tempo <- fun_param_check(data = path.function1, class = "character", length = 1) ; eval(ee)
tempo <- fun_param_check(data = project.name, class = "character", length = 1) ; eval(ee) tempo <- fun_param_check(data = project.name, class = "character", length = 1) ; eval(ee)
tempo <- fun_param_check(data = activate.pdf, class = "logical", length = 1) ; eval(ee)
tempo <- fun_param_check(data = label.size, typeof = "integer", length = 1, double.as.integer.allowed = TRUE, neg.values = FALSE) ; eval(ee) tempo <- fun_param_check(data = label.size, typeof = "integer", length = 1, double.as.integer.allowed = TRUE, neg.values = FALSE) ; eval(ee)
tempo <- fun_param_check(data = optional.text, class = "character", length = 1) ; eval(ee) tempo <- fun_param_check(data = optional.text, class = "character", length = 1) ; eval(ee)
tempo <- fun_param_check(data = slurm.loop.nb, typeof = "integer", length = 1, double.as.integer.allowed = TRUE, neg.values = FALSE) ; eval(ee) tempo <- fun_param_check(data = slurm.loop.nb, typeof = "integer", length = 1, double.as.integer.allowed = TRUE, neg.values = FALSE) ; eval(ee)
...@@ -198,7 +197,8 @@ if(optional.text == "no.txt"){ ...@@ -198,7 +197,8 @@ if(optional.text == "no.txt"){
}else{ }else{
fun_export_data(path = path.out, data = paste0("OPTIONAL TEXT:\n", optional.text), output = log.file) fun_export_data(path = path.out, data = paste0("OPTIONAL TEXT:\n", optional.text), output = log.file)
} }
par.ini <- fun_open_window(pdf.disp = activate.pdf, path.fun = path.out, pdf.name.file = paste0("loop", slurm.loop.nb, "_graphs"), width.fun = 7, height.fun = 7, paper = "special", no.pdf.overwrite = TRUE, return.output = TRUE)$ini.par activate.pdf <- TRUE
par.ini <- fun_open_window(pdf.disp = activate.pdf, path.fun = path.out, pdf.name.file = paste0("final_graphs"), width.fun = 7, height.fun = 7, paper = "special", no.pdf.overwrite = TRUE, return.output = TRUE)$ini.par
pdf.nb <- dev.cur() pdf.nb <- dev.cur()
################ Data import ################ Data import
...@@ -340,6 +340,21 @@ backup.name <- c(backup.name, "final.gene.list") ...@@ -340,6 +340,21 @@ backup.name <- c(backup.name, "final.gene.list")
save(list=c(backup.name), file = paste0(path.out, "compiled_data.RData")) save(list=c(backup.name), file = paste0(path.out, "compiled_data.RData"))
fun_export_data(path = path.out, data = paste0("THE COMPILED DATA ARE SAVED IN: ", paste0(path.out, "compiled_data.RData")), output = log.file) fun_export_data(path = path.out, data = paste0("THE COMPILED DATA ARE SAVED IN: ", paste0(path.out, "compiled_data.RData")), output = log.file)
################ 2 plotting the heatmap adnd corplot
fun_export_data(path = path.out, data = "################################ HEATMAP AND CORRELATION PLOT ", output = log.file, sep = 4)
fun_export_data(path = path.out, data = paste0("HEATMAP AND CORRELATION PLOT COME FROM INITIAL DATASET (NO BOOTSTRAP INFLUENCE)"), output = log.file)
tempo <- dev.set(pdf.nb) # assign to avoid the message
print(loop1_heatmap.plot) # same for all the loops
corrplot(loop1_corr.plot, tl.col = "black", tl.cex = label.size / 10) # same for all the loops
backup.name <- c(backup.name, "loop1_heatmap.plot", "loop1_corr.plot")
################ 3 Differential analysis with limma ################ 3 Differential analysis with limma
fun_export_data(path = path.out, data = "################################ LIMMA ANALYSIS", output = log.file, sep = 4) fun_export_data(path = path.out, data = "################################ LIMMA ANALYSIS", output = log.file, sep = 4)
...@@ -349,7 +364,6 @@ if(nrow(final.ttab) == 0 | nrow(na.omit(final.ttab)) == 0){ ...@@ -349,7 +364,6 @@ if(nrow(final.ttab) == 0 | nrow(na.omit(final.ttab)) == 0){
fun_export_data(path = path.out, data = final.ttab, output = log.file) fun_export_data(path = path.out, data = final.ttab, output = log.file)
}else if(analysis.kind == "valid_boot"){ }else if(analysis.kind == "valid_boot"){
fun_export_data(path = path.out, data = paste0("BEWARE: NO LOOP FOR THE LIMMA ANALYSIS (", analysis.kind, " KIND OF ANALYSIS)"), output = log.file) fun_export_data(path = path.out, data = paste0("BEWARE: NO LOOP FOR THE LIMMA ANALYSIS (", analysis.kind, " KIND OF ANALYSIS)"), output = log.file)
fun_export_data(path = path.out, data = final.ttab, output = log.file)
}else{ }else{
tempo1 <- c(table(rownames(final.ttab))) tempo1 <- c(table(rownames(final.ttab)))
tempo2 <- data.frame( tempo2 <- data.frame(
...@@ -401,8 +415,12 @@ if(nrow(final.ttab) == 0 | nrow(na.omit(final.ttab)) == 0){ ...@@ -401,8 +415,12 @@ if(nrow(final.ttab) == 0 | nrow(na.omit(final.ttab)) == 0){
} }
print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + ")))) print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))
} }
fun_export_data(path = path.out, data = paste0("SEE THE final.ttab AND THE final.ttab.freq OBJECTS IN THE compiled_data.RData FILE PRESENT IN: ", path.out), output = log.file)
backup.name <- c(backup.name, "final.ttab.freq") backup.name <- c(backup.name, "final.ttab.freq")
################ 4 Machine Learning based analysis ################ 4 Machine Learning based analysis
######## 4.1 Discovery: Learning the Random Forest ######## 4.1 Discovery: Learning the Random Forest
...@@ -465,6 +483,7 @@ if(nrow(final.mod.gene.names) == 0 | nrow(na.omit(final.mod.gene.names)) == 0){ ...@@ -465,6 +483,7 @@ if(nrow(final.mod.gene.names) == 0 | nrow(na.omit(final.mod.gene.names)) == 0){
} }
print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + ")))) print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))
} }
fun_export_data(path = path.out, data = paste0("SEE THE final.mod.gene.names AND THE final.mod.gene.names.freq OBJECTS IN THE compiled_data.RData FILE PRESENT IN: ", path.out), output = log.file)
backup.name <- c(backup.name, "final.mod.gene.names.freq") backup.name <- c(backup.name, "final.mod.gene.names.freq")
...@@ -556,6 +575,7 @@ if(nrow(final.gene.importance) == 0 | nrow(na.omit(final.gene.importance)) == 0) ...@@ -556,6 +575,7 @@ if(nrow(final.gene.importance) == 0 | nrow(na.omit(final.gene.importance)) == 0)
} }
print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + ")))) print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))
} }
fun_export_data(path = path.out, data = paste0("SEE THE final.gene.importance AND THE final.gene.importance.median OBJECTS IN THE compiled_data.RData FILE PRESENT IN: ", path.out), output = log.file)
backup.name <- c(backup.name, "final.gene.importance.median") backup.name <- c(backup.name, "final.gene.importance.median")
...@@ -580,16 +600,17 @@ if(nrow(final.select.gene.curve) == 0 | nrow(na.omit(final.select.gene.curve)) = ...@@ -580,16 +600,17 @@ if(nrow(final.select.gene.curve) == 0 | nrow(na.omit(final.select.gene.curve)) =
fun_export_data(path = path.out, data = "OPTIMAL NUMBER OF GENES: ", output = log.file) fun_export_data(path = path.out, data = "OPTIMAL NUMBER OF GENES: ", output = log.file)
fun_export_data(path = path.out, data = final.select.gene.curve.median, output = log.file) fun_export_data(path = path.out, data = final.select.gene.curve.median, output = log.file)
# horiz barplot + CI # line + CI area
bar.width <- 0.5 bar.width <- 0.5
log.scale <- FALSE log.scale <- FALSE
amplif <- label.size amplif <- label.size
tempo.gg.name <- "gg.indiv.plot." tempo.gg.name <- "gg.indiv.plot."
tempo.gg.count <- 0 tempo.gg.count <- 0
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = final.select.gene.curve.median, mapping = ggplot2::aes(x = X, y = MEDIAN))) # reorder from higher to lower assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = final.select.gene.curve.median, mapping = ggplot2::aes(x = X, y = MEDIAN)))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_point(color = "red", shape = 16, size = 2)) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_point(color = "red", shape = 16, size = 2))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_line(color = "red")) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_line(color = "red"))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_errorbar(ggplot2::aes(ymin = CI95.inf, ymax = CI95.sup), position = ggplot2::position_dodge(), color = "black", width = 0)) # assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_errorbar(ggplot2::aes(ymin = CI95.inf, ymax = CI95.sup), position = ggplot2::position_dodge(), color = "black", width = 0))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_ribbon(ggplot2::aes(ymin = CI95.inf, ymax = CI95.sup), color = "red", alpha = 0.3))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle(paste0("OPTIMAL NUMBER OF GENES (n = ", slurm.loop.nb,")"))) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle(paste0("OPTIMAL NUMBER OF GENES (n = ", slurm.loop.nb,")")))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::xlab("NUMBER OF GENES")) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::xlab("NUMBER OF GENES"))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ylab("MEDIAN +/- 95CI")) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ylab("MEDIAN +/- 95CI"))
...@@ -607,30 +628,73 @@ if(nrow(final.select.gene.curve) == 0 | nrow(na.omit(final.select.gene.curve)) = ...@@ -607,30 +628,73 @@ if(nrow(final.select.gene.curve) == 0 | nrow(na.omit(final.select.gene.curve)) =
} }
print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + ")))) print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))
} }
fun_export_data(path = path.out, data = paste0("SEE THE final.select.gene.curve AND THE final.select.gene.curve.median OBJECTS IN THE compiled_data.RData FILE PRESENT IN: ", path.out), output = log.file)
backup.name <- c(backup.name, "final.select.gene.curve.median") backup.name <- c(backup.name, "final.select.gene.curve.median")
#gene heatmap and corrplot
print(loop1_heatmap.plot) # same for all the loops
corrplot(loop1_corr.plot, tl.col = "black", tl.cex = label.size / 10) # same for all the loops
backup.name <- c(backup.name, "loop1_heatmap.plot", "loop1_corr.plot")
######## 4.2 Validate the model ######## 4.2 Validate the model
# #
fun_export_data(path = path.out, data = "################################ VALIDATION", output = log.file, sep = 4) fun_export_data(path = path.out, data = "################################ VALIDATION", output = log.file, sep = 4)
fun_export_data(path = path.out, data = "################ ROC CURVES", output = log.file)
data.kind <- c("genes", "crp", "genes.crp") # beware, the order is important
ana.kind <- c("rf", "logreg", "rpart")
for(i0 in 1:length(data.kind)){
for(i1 in 1:length(ana.kind)){
fun_export_data(path = path.out, data = paste0("######## ", data.kind[i0], " AND ", ana.kind[i1], " ANALYSIS"), output = log.file)
if(nrow(get(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1]))) == 0 | nrow(na.omit(get(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1])))) == 0){
fun_export_data(path = path.out, data = paste0("NO ROC CURVE FOR THE ", data.kind[i0], " AND ", ana.kind[i1], " ANALYSIS"), output = log.file)
fun_export_data(path = path.out, data = get(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1])), output = log.file)
}else{
tempo1 <- aggregate(get(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1]))$x, list(get(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1]))$CUTOFF), median, na.rm = TRUE)
names(tempo1) <- c("CUTOFF", "X_MEDIAN")
tempo2 <- aggregate(get(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1]))$y, list(get(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1]))$CUTOFF), median, na.rm = TRUE)
names(tempo2) <- c("CUTOFF", "Y_MEDIAN")
assign(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1], ".median"), cbind(tempo1, tempo2["Y_MEDIAN"]))
fun_export_data(path = path.out, data = "MEDIAN ROC CURVE: ", output = log.file)
fun_export_data(path = path.out, data = final.select.gene.curve.median, output = log.file)
# roc curves
bar.width <- 0.5
log.scale <- FALSE
amplif <- label.size
tempo.gg.name <- "gg.indiv.plot."
tempo.gg.count <- 0
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = get(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1], ".median")), mapping = ggplot2::aes(x = X_MEDIAN, y = Y_MEDIAN)))
for(i2 in 1:slurm.loop.nb){
# roc curves
if(nrow(get(paste0("loop", i2, "_data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1]))) > 0){
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_line(data = get(paste0("loop", i2, "_data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1])), mapping = ggplot2::aes(x = x, y = y), color = "grey", alpha = 0.5))
}
}
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_line(color = "red"))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle(paste0("OPTIMAL NUMBER OF GENES (n = ", slurm.loop.nb,")")))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::xlab("1 - SPECIFICITY"))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ylab("SENSITIVITY"))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme_classic(base_size = amplif))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::scale_y_continuous(expand = c(0,0), breaks = scales::extended_breaks(10)))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::coord_cartesian(xlim = c(0, 1), ylim = c(0, 1))) # fix limits
if(activate.pdf == TRUE){
tempo <- dev.set(pdf.nb) # assign to avoid the message
}else{
windows(5, 5)
}
print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))
}
fun_export_data(path = path.out, data = paste0("SEE THE final.select.gene.curve AND THE final.select.gene.curve.median OBJECTS IN THE compiled_data.RData FILE PRESENT IN: ", path.out), output = log.file)
backup.name <- c(backup.name, "final.select.gene.curve.median")
###### ???Add build
}
}
tempo <- dev.set(pdf.nb) # assign to avoid the message tempo <- dev.set(pdf.nb) # assign to avoid the message
...@@ -669,88 +733,7 @@ backup.name <- c(backup.name, "data.roc1.genes.rf", "data.roc2.crp.rf", "data.ro ...@@ -669,88 +733,7 @@ backup.name <- c(backup.name, "data.roc1.genes.rf", "data.roc2.crp.rf", "data.ro
gridExtra::grid.arrange(roc1.genes.rf, roc2.crp.rf, roc3.genes.crp.rf, nrow = 2) gridExtra::grid.arrange(roc1.genes.rf, roc2.crp.rf, roc3.genes.crp.rf, nrow = 2)
# ``` # ```
######## 4.4 Logistic regression
fun_export_data(path = path.out, data = "################ LOGISTIC REGRESSION", output = log.file)
tempo <- dev.set(pdf.nb) # assign to avoid the message
roc1.genes.logreg <- plotROCCurves(generateThreshVsPerfData(pred1.genes.logreg, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 1.genes (only genes) \n AUC=%.2f", performance(pred1.genes.logreg, auc))) +
theme(
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size),
axis.text = ggplot2::element_text(size = label.size),
axis.title = ggplot2::element_text(size = label.size),
legend.text = ggplot2::element_text(size = label.size),
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
roc2.crp.logreg <- plotROCCurves(generateThreshVsPerfData(pred2.crp.logreg, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 2.crp (only CRP) \n AUC=%.2f", performance(pred2.crp.logreg, auc))) +
theme(
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size),
axis.text = ggplot2::element_text(size = label.size),
axis.title = ggplot2::element_text(size = label.size),
legend.text = ggplot2::element_text(size = label.size),
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
roc3.genes.crp.logreg <- plotROCCurves(generateThreshVsPerfData(pred3.genes.crp.logreg, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 3.genes.crp (genes + CRP) \n AUC=%.2f", performance(pred3.genes.crp.logreg, auc))) +
theme(
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size),
axis.text = ggplot2::element_text(size = label.size),
axis.title = ggplot2::element_text(size = label.size),
legend.text = ggplot2::element_text(size = label.size),
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
data.roc1.genes.logreg <- ggplot2::ggplot_build(roc1.genes.logreg)$data[[1]]
data.roc2.crp.logreg <- ggplot2::ggplot_build(roc2.crp.logreg)$data[[1]]
data.roc3.genes.crp.logreg <- ggplot2::ggplot_build(roc3.genes.crp.logreg)$data[[1]]
backup.name <- c(backup.name, "data.roc1.genes.logreg", "data.roc2.crp.logreg", "data.roc3.genes.crp.logreg")
gridExtra::grid.arrange(roc1.genes.logreg, roc2.crp.logreg, roc3.genes.crp.logreg, nrow = 2)
# ```
#
######## 4.5 RPART
fun_export_data(path = path.out, data = "################ RPART", output = log.file)
tempo <- dev.set(pdf.nb) # assign to avoid the message
roc1.genes.rpart <- plotROCCurves(generateThreshVsPerfData(pred1.genes.rpart, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 1.genes (only genes) \n AUC=%.2f", performance(pred1.genes.rpart, auc))) +
theme(
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size),
axis.text = ggplot2::element_text(size = label.size),
axis.title = ggplot2::element_text(size = label.size),
legend.text = ggplot2::element_text(size = label.size),
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
roc2.crp.rpart <- plotROCCurves(generateThreshVsPerfData(pred2.crp.rpart, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 2.crp (only CRP) \n AUC=%.2f", performance(pred2.crp.rpart, auc))) +
theme(
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size),
axis.text = ggplot2::element_text(size = label.size),
axis.title = ggplot2::element_text(size = label.size),
legend.text = ggplot2::element_text(size = label.size),
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
roc3.genes.crp.rpart <- plotROCCurves(generateThreshVsPerfData(pred3.genes.crp.rpart, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 3.genes.crp (genes + CRP) \n AUC=%.2f", performance(pred3.genes.crp.rpart, auc))) +
theme(
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size),
axis.text = ggplot2::element_text(size = label.size),
axis.title = ggplot2::element_text(size = label.size),
legend.text = ggplot2::element_text(size = label.size),
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
data.roc1.genes.rpart <- ggplot2::ggplot_build(roc1.genes.rpart)$data[[1]]
data.roc2.crp.rpart <- ggplot2::ggplot_build(roc2.crp.rpart)$data[[1]]
data.roc3.genes.crp.rpart <- ggplot2::ggplot_build(roc3.genes.crp.rpart)$data[[1]]
backup.name <- c(backup.name, "data.roc1.genes.rpart", "data.roc2.crp.rpart", "data.roc3.genes.crp.rpart")
gridExtra::grid.arrange(roc1.genes.rpart, roc2.crp.rpart, roc3.genes.crp.rpart, nrow = 2)
######## 4.6 Plot top feature boxplots ######## 4.6 Plot top feature boxplots
......
...@@ -13,7 +13,7 @@ erase.graphs <- TRUE # write TRUE to erase all the graphic windows in R before s ...@@ -13,7 +13,7 @@ erase.graphs <- TRUE # write TRUE to erase all the graphic windows in R before s
sink(stdout(), type = "message") sink(stdout(), type = "message")
script <- commandArgs(trailingOnly = FALSE)[1] # recover script name, e.g., r_341_conf $check_lod_gael_conf script <- commandArgs(trailingOnly = FALSE)[1] # recover script name, e.g., r_341_conf $check_lod_gael_conf
args <- commandArgs(trailingOnly = TRUE) # recover arguments written after the call of the Rscript, ie after r_341_conf $check_lod_gael_conf args <- commandArgs(trailingOnly = TRUE) # recover arguments written after the call of the Rscript, ie after r_341_conf $check_lod_gael_conf
tempo.arg.names <- c("path.lib", "path.in", "path.out", "path.function1", "file.name1", "ml.bootstrap.nb", "project.name", "activate.pdf", "label.size", "optional.text", "slurm.loop.nb", "analysis.kind", "cross.valid.ratio", "random.seed") # objects names exactly in the same order as in the bash code and recovered in args tempo.arg.names <- c("path.lib", "path.in", "path.out", "path.function1", "file.name1", "ml.bootstrap.nb", "project.name", "label.size", "optional.text", "slurm.loop.nb", "analysis.kind", "cross.valid.ratio", "random.seed") # objects names exactly in the same order as in the bash code and recovered in args
if(length(args) != length(tempo.arg.names)){ if(length(args) != length(tempo.arg.names)){
tempo.cat <- paste0("\n\n================\n\nERROR: THE NUMBER OF ELEMENTS IN args (", length(args),") IS DIFFERENT FROM THE NUMBER OF ELEMENTS IN tempo.arg.names (", length(tempo.arg.names),")\nargs:", paste0(args, collapse = ","), "\ntempo.arg.names:", paste0(tempo.arg.names, collapse = ","), "\n\n================\n\n") tempo.cat <- paste0("\n\n================\n\nERROR: THE NUMBER OF ELEMENTS IN args (", length(args),") IS DIFFERENT FROM THE NUMBER OF ELEMENTS IN tempo.arg.names (", length(tempo.arg.names),")\nargs:", paste0(args, collapse = ","), "\ntempo.arg.names:", paste0(tempo.arg.names, collapse = ","), "\n\n================\n\n")
stop(tempo.cat) stop(tempo.cat)
...@@ -63,7 +63,6 @@ path.out <- "/pasteur/homes/gmillot/rogge12231/" # absolute path of the output f ...@@ -63,7 +63,6 @@ path.out <- "/pasteur/homes/gmillot/rogge12231/" # absolute path of the output f
path.function1 <- "/pasteur/homes/gmillot/Git_versions_to_use/cute_little_R_functions-v4.4.0/cute_little_R_functions.R" # Define the absolute pathway of the folder containing functions created by Gael Millot path.function1 <- "/pasteur/homes/gmillot/Git_versions_to_use/cute_little_R_functions-v4.4.0/cute_little_R_functions.R" # Define the absolute pathway of the folder containing functions created by Gael Millot
file.name1 <- "supplementary_data_file_test.csv" # name of the data file to import in path.in file.name1 <- "supplementary_data_file_test.csv" # name of the data file to import in path.in
ml.bootstrap.nb <- 3 ml.bootstrap.nb <- 3
activate.pdf = TRUE
label.size <- 6 label.size <- 6
optional.text <- "" optional.text <- ""
slurm.loop.nb <- 1 slurm.loop.nb <- 1
...@@ -86,7 +85,6 @@ path.function1 <- "https://gitlab.pasteur.fr/gmillot/cute_little_R_functions/raw ...@@ -86,7 +85,6 @@ path.function1 <- "https://gitlab.pasteur.fr/gmillot/cute_little_R_functions/raw
# path.function1 <- "C:/Users/Gael/Documents/Git_versions_to_use/cute_little_R_functions-v4.5.0/cute_little_R_functions.R" # Define the absolute pathway of the folder containing functions created by Gael Millot # path.function1 <- "C:/Users/Gael/Documents/Git_versions_to_use/cute_little_R_functions-v4.5.0/cute_little_R_functions.R" # Define the absolute pathway of the folder containing functions created by Gael Millot
file.name1 <- "supplementary_data_file_test.csv" # name of the data file to import in path.in file.name1 <- "supplementary_data_file_test.csv" # name of the data file to import in path.in
ml.bootstrap.nb <- 3 ml.bootstrap.nb <- 3
activate.pdf = TRUE
label.size <- 6 label.size <- 6
optional.text <- "" optional.text <- ""
slurm.loop.nb <- 1 slurm.loop.nb <- 1
...@@ -94,7 +92,7 @@ analysis.kind <- "valid_boot" ...@@ -94,7 +92,7 @@ analysis.kind <- "valid_boot"
cross.valid.ratio <- 0.8 cross.valid.ratio <- 0.8
random.seed <- TRUE random.seed <- TRUE
' '
eval(parse(text = debug2)) ; cat(paste0("\n\n================\n\nERROR: ACTIVE DEBUG VALUES\n\n================\n\n")) ; stop() # eval(parse(text = debug2)) ; cat(paste0("\n\n================\n\nERROR: ACTIVE DEBUG VALUES\n\n================\n\n")) ; stop()
# data.frame(PARAM = tempo.arg.names, ARG = args) # for debug mode # data.frame(PARAM = tempo.arg.names, ARG = args) # for debug mode
...@@ -202,7 +200,6 @@ if(tempo$problem == FALSE & ! dir.exists(path.out)){ ...@@ -202,7 +200,6 @@ if(tempo$problem == FALSE & ! dir.exists(path.out)){
tempo <- fun_param_check(data = path.function1, class = "character", length = 1) ; eval(ee) tempo <- fun_param_check(data = path.function1, class = "character", length = 1) ; eval(ee)
tempo <- fun_param_check(data = ml.bootstrap.nb, typeof = "integer", length = 1, double.as.integer.allowed = TRUE, neg.values = FALSE) ; eval(ee) tempo <- fun_param_check(data = ml.bootstrap.nb, typeof = "integer", length = 1, double.as.integer.allowed = TRUE, neg.values = FALSE) ; eval(ee)
tempo <- fun_param_check(data = project.name, class = "character", length = 1) ; eval(ee) tempo <- fun_param_check(data = project.name, class = "character", length = 1) ; eval(ee)
tempo <- fun_param_check(data = activate.pdf, class = "logical", length = 1) ; eval(ee)
tempo <- fun_param_check(data = label.size, typeof = "integer", length = 1, double.as.integer.allowed = TRUE, neg.values = FALSE) ; eval(ee) tempo <- fun_param_check(data = label.size, typeof = "integer", length = 1, double.as.integer.allowed = TRUE, neg.values = FALSE) ; eval(ee)
tempo <- fun_param_check(data = optional.text, class = "character", length = 1) ; eval(ee) tempo <- fun_param_check(data = optional.text, class = "character", length = 1) ; eval(ee)
tempo <- fun_param_check(data = slurm.loop.nb, typeof = "integer", length = 1, double.as.integer.allowed = TRUE, neg.values = FALSE) ; eval(ee) tempo <- fun_param_check(data = slurm.loop.nb, typeof = "integer", length = 1, double.as.integer.allowed = TRUE, neg.values = FALSE) ; eval(ee)
...@@ -242,7 +239,7 @@ if(optional.text == "no.txt"){ ...@@ -242,7 +239,7 @@ if(optional.text == "no.txt"){
}else{ }else{
fun_export_data(path = path.out, data = paste0("OPTIONAL TEXT:\n", optional.text), output = log.file) fun_export_data(path = path.out, data = paste0("OPTIONAL TEXT:\n", optional.text), output = log.file)
} }
par.ini <- fun_open_window(pdf.disp = activate.pdf, path.fun = path.out, pdf.name.file = paste0("loop", slurm.loop.nb, "_graphs"), width.fun = 7, height.fun = 7, paper = "special", no.pdf.overwrite = TRUE, return.output = TRUE)$ini.par par.ini <- fun_open_window(pdf.disp = TRUE, path.fun = path.out, pdf.name.file = paste0("loop", slurm.loop.nb, "_graphs"), width.fun = 7, height.fun = 7, paper = "special", no.pdf.overwrite = TRUE, return.output = TRUE)$ini.par
pdf.nb <- dev.cur() pdf.nb <- dev.cur()
################ randomness ignition ################ randomness ignition
...@@ -443,7 +440,7 @@ if(any((analysis.kind == "longit" & slurm.loop.nb == 1) | (analysis.kind == "val ...@@ -443,7 +440,7 @@ if(any((analysis.kind == "longit" & slurm.loop.nb == 1) | (analysis.kind == "val
) )
tempo <- dev.set(pdf.nb) # assign to avoid the message tempo <- dev.set(pdf.nb) # assign to avoid the message
print(plt.plot) print(plt.plot)
select.gene.curve <- ggplot2::ggplot_build(plt.plot)$data[[2]] select.gene.curve <- ggplot2::ggplot_build(plt.plot)$data[[2]][, c("x", "y")]
backup.name <- c(backup.name, "select.gene.curve") backup.name <- c(backup.name, "select.gene.curve")
# ``` # ```
# #
...@@ -516,6 +513,7 @@ if(any((analysis.kind == "longit" & slurm.loop.nb == 1) | (analysis.kind == "val ...@@ -516,6 +513,7 @@ if(any((analysis.kind == "longit" & slurm.loop.nb == 1) | (analysis.kind == "val
subdat <- df.tmp[, sort(mod.gene.names)] subdat <- df.tmp[, sort(mod.gene.names)]
ann_colors = list("ASDAS R/NR" = c(R = "steelblue", NR = "tomato")) ann_colors = list("ASDAS R/NR" = c(R = "steelblue", NR = "tomato"))
heatmap.plot <- pheatmap(t(scale(subdat)), silent = TRUE, annotation_col = annot.rows, cluster_cols = FALSE, show_colnames = FALSE, border_color = NA, color = colorRampPalette(c("red", "black", "green"))(499), annotation_colors = ann_colors, fontsize_row = label.size, fontsize_col = label.size) heatmap.plot <- pheatmap(t(scale(subdat)), silent = TRUE, annotation_col = annot.rows, cluster_cols = FALSE, show_colnames = FALSE, border_color = NA, color = colorRampPalette(c("red", "black", "green"))(499), annotation_colors = ann_colors, fontsize_row = label.size, fontsize_col = label.size)
tempo <- dev.set(pdf.nb) # assign to avoid the message
print(heatmap.plot) print(heatmap.plot)
backup.name <- c(backup.name, "heatmap.plot") backup.name <- c(backup.name, "heatmap.plot")
# ``` # ```
...@@ -523,7 +521,9 @@ if(any((analysis.kind == "longit" & slurm.loop.nb == 1) | (analysis.kind == "val ...@@ -523,7 +521,9 @@ if(any((analysis.kind == "longit" & slurm.loop.nb == 1) | (analysis.kind == "val
# The correlation of the selected features is reprsented hereafter. We see that some of the selected features are strongly correlated. # The correlation of the selected features is reprsented hereafter. We see that some of the selected features are strongly correlated.
# #
# ```{r corrplot} # ```{r corrplot}
tempo <- dev.set(pdf.nb) # assign to avoid the message
corr.plot <- corrplot(cor(subdat), tl.col = "black", tl.cex = label.size / 10) corr.plot <- corrplot(cor(subdat), tl.col = "black", tl.cex = label.size / 10)
backup.name <- c(backup.name, "corr.plot")
if(analysis.kind == "valid_boot"){ if(analysis.kind == "valid_boot"){
save(list=c("dat.train", "dat.valid", "train.task", "lrn.rf", "mod.gene.names"), file = paste0(path.out, "loop", slurm.loop.nb, "_discov_data.RData")) save(list=c("dat.train", "dat.valid", "train.task", "lrn.rf", "mod.gene.names"), file = paste0(path.out, "loop", slurm.loop.nb, "_discov_data.RData"))
...@@ -672,9 +672,12 @@ roc3.genes.crp.rf <- plotROCCurves(generateThreshVsPerfData(pred3.genes.crp.rf, ...@@ -672,9 +672,12 @@ roc3.genes.crp.rf <- plotROCCurves(generateThreshVsPerfData(pred3.genes.crp.rf,
legend.title = ggplot2::element_text(size = label.size), legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size) strip.text = ggplot2::element_text(size = label.size)
) )
data.roc1.genes.rf <- ggplot2::ggplot_build(roc1.genes.rf)$data[[1]] data.roc1.genes.rf <- ggplot2::ggplot_build(roc1.genes.rf)$data[[1]][, c("x", "y")]
data.roc2.crp.rf <- ggplot2::ggplot_build(roc2.crp.rf)$data[[1]] data.roc1.genes.rf <- data.frame(data.roc1.genes.rf, CUTOFF = 1:nrow(data.roc1.genes.rf))
data.roc3.genes.crp.rf <- ggplot2::ggplot_build(roc3.genes.crp.rf)$data[[1]] data.roc2.crp.rf <- ggplot2::ggplot_build(roc2.crp.rf)$data[[1]][, c("x", "y")]
data.roc2.crp.rf <- data.frame(data.roc2.crp.rf, CUTOFF = 1:nrow(data.roc2.crp.rf))
data.roc3.genes.crp.rf <- ggplot2::ggplot_build(roc3.genes.crp.rf)$data[[1]][, c("x", "y")]
data.roc3.genes.crp.rf <- data.frame(data.roc3.genes.crp.rf, CUTOFF = 1:nrow(data.roc3.genes.crp.rf))
backup.name <- c(backup.name, "data.roc1.genes.rf", "data.roc2.crp.rf", "data.roc3.genes.crp.rf") backup.name <- c(backup.name, "data.roc1.genes.rf", "data.roc2.crp.rf", "data.roc3.genes.crp.rf")
gridExtra::grid.arrange(roc1.genes.rf, roc2.crp.rf, roc3.genes.crp.rf, nrow = 2) gridExtra::grid.arrange(roc1.genes.rf, roc2.crp.rf, roc3.genes.crp.rf, nrow = 2)
...@@ -745,9 +748,12 @@ roc3.genes.crp.logreg <- plotROCCurves(generateThreshVsPerfData(pred3.genes.crp. ...@@ -745,9 +748,12 @@ roc3.genes.crp.logreg <- plotROCCurves(generateThreshVsPerfData(pred3.genes.crp.
legend.title = ggplot2::element_text(size = label.size), legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size) strip.text = ggplot2::element_text(size = label.size)
) )
data.roc1.genes.logreg <- ggplot2::ggplot_build(roc1.genes.logreg)$data[[1]] data.roc1.genes.logreg <- ggplot2::<