Commit 8fd104bf authored by Gael  MILLOT's avatar Gael MILLOT
Browse files

interm

parent fd03eca6
......@@ -6,8 +6,8 @@
#_______________________________________________________________________________________________
# SOFTWARES (LOCALS OR IN MODULES)
shopt -s expand_aliases # to be sure that alias are expended to the different environments
alias R_conf='module load gcc/4.7.4 R/3.5.0 ; Rscript'
# alias R_conf='module load gcc/4.7.4 ; /pasteur/homes/gmillot/softwares/R/R-3.5.2/lib64/R/bin/Rscript'
alias R_conf='module load gcc/4.9.4 R/3.5.0 ; Rscript'
# alias R_conf='module load gcc/4.9.4 ; /pasteur/homes/gmillot/softwares/R/R-3.5.2/lib64/R/bin/Rscript'
#_______________________________________________________________________________________________
# SCRIPTS TO RUN
......@@ -57,7 +57,6 @@ PROJECT_NAME_CONF="rogge_12231"
################ graphical parameters
R_PDF_DISPLAY_CONF="TRUE" # will be converted in the R script
LABEL_SIZE=4 # size of the lables in graphs
R_OPT_TXT_CONF="no.txt" # optional text to add
......
......@@ -13,7 +13,7 @@ erase.graphs <- TRUE # write TRUE to erase all the graphic windows in R before s
sink(stdout(), type = "message") # diverts R output to a connection
script <- commandArgs(trailingOnly = FALSE)[1] # recover script name, e.g., r_341_conf $check_lod_gael_conf
args <- commandArgs(trailingOnly = TRUE) # recover arguments written after the call of the Rscript, ie after r_341_conf $check_lod_gael_conf
tempo.arg.names <- c("path.lib", "path.in", "path.out", "path.function1", "project.name", "activate.pdf", "label.size", "optional.text", "slurm.loop.nb", "analysis.kind") # objects names exactly in the same order as in the bash code and recovered in args
tempo.arg.names <- c("path.lib", "path.in", "path.out", "path.function1", "project.name", "label.size", "optional.text", "slurm.loop.nb", "analysis.kind") # objects names exactly in the same order as in the bash code and recovered in args
if(length(args) != length(tempo.arg.names)){
tempo.cat <- paste0("\n\n================\n\nERROR: THE NUMBER OF ELEMENTS IN args (", length(args),") IS DIFFERENT FROM THE NUMBER OF ELEMENTS IN tempo.arg.names (", length(tempo.arg.names),")\nargs:", paste0(args, collapse = ","), "\ntempo.arg.names:", paste0(tempo.arg.names, collapse = ","), "\n\n================\n\n")
stop(tempo.cat)
......@@ -57,7 +57,7 @@ erase.graphs <- TRUE
script <- "code ini v1.0.0"
project.name <-"rogge12231"
path.lib <- "C:/Users/Gael/Documents/R/win-library/3.5/" # absolute path of the library folder. Write "none" if not required
path.in <- "Z:/rogge12231/rogge_12231_1550076020/" # absolute path of the data folder
path.in <- "Z:/rogge12231/rogge_12231_1550160832/" # absolute path of the data folder
path.out <- "C:/Users/Gael/Desktop/" # absolute path of the output folder
path.function1 <- "C:/Users/Gael/Documents/Git_versions_to_use/cute_little_R_functions-v4.5.0/cute_little_R_functions.R" # Define the absolute pathway of the folder containing functions created by Gael Millot
project.name <- "rogge_project"
......@@ -160,7 +160,6 @@ if(tempo$problem == FALSE & ! dir.exists(path.out)){
# path.function1 fully tested above
tempo <- fun_param_check(data = path.function1, class = "character", length = 1) ; eval(ee)
tempo <- fun_param_check(data = project.name, class = "character", length = 1) ; eval(ee)
tempo <- fun_param_check(data = activate.pdf, class = "logical", length = 1) ; eval(ee)
tempo <- fun_param_check(data = label.size, typeof = "integer", length = 1, double.as.integer.allowed = TRUE, neg.values = FALSE) ; eval(ee)
tempo <- fun_param_check(data = optional.text, class = "character", length = 1) ; eval(ee)
tempo <- fun_param_check(data = slurm.loop.nb, typeof = "integer", length = 1, double.as.integer.allowed = TRUE, neg.values = FALSE) ; eval(ee)
......@@ -198,7 +197,8 @@ if(optional.text == "no.txt"){
}else{
fun_export_data(path = path.out, data = paste0("OPTIONAL TEXT:\n", optional.text), output = log.file)
}
par.ini <- fun_open_window(pdf.disp = activate.pdf, path.fun = path.out, pdf.name.file = paste0("loop", slurm.loop.nb, "_graphs"), width.fun = 7, height.fun = 7, paper = "special", no.pdf.overwrite = TRUE, return.output = TRUE)$ini.par
activate.pdf <- TRUE
par.ini <- fun_open_window(pdf.disp = activate.pdf, path.fun = path.out, pdf.name.file = paste0("final_graphs"), width.fun = 7, height.fun = 7, paper = "special", no.pdf.overwrite = TRUE, return.output = TRUE)$ini.par
pdf.nb <- dev.cur()
################ Data import
......@@ -340,6 +340,21 @@ backup.name <- c(backup.name, "final.gene.list")
save(list=c(backup.name), file = paste0(path.out, "compiled_data.RData"))
fun_export_data(path = path.out, data = paste0("THE COMPILED DATA ARE SAVED IN: ", paste0(path.out, "compiled_data.RData")), output = log.file)
################ 2 plotting the heatmap adnd corplot
fun_export_data(path = path.out, data = "################################ HEATMAP AND CORRELATION PLOT ", output = log.file, sep = 4)
fun_export_data(path = path.out, data = paste0("HEATMAP AND CORRELATION PLOT COME FROM INITIAL DATASET (NO BOOTSTRAP INFLUENCE)"), output = log.file)
tempo <- dev.set(pdf.nb) # assign to avoid the message
print(loop1_heatmap.plot) # same for all the loops
corrplot(loop1_corr.plot, tl.col = "black", tl.cex = label.size / 10) # same for all the loops
backup.name <- c(backup.name, "loop1_heatmap.plot", "loop1_corr.plot")
################ 3 Differential analysis with limma
fun_export_data(path = path.out, data = "################################ LIMMA ANALYSIS", output = log.file, sep = 4)
......@@ -349,7 +364,6 @@ if(nrow(final.ttab) == 0 | nrow(na.omit(final.ttab)) == 0){
fun_export_data(path = path.out, data = final.ttab, output = log.file)
}else if(analysis.kind == "valid_boot"){
fun_export_data(path = path.out, data = paste0("BEWARE: NO LOOP FOR THE LIMMA ANALYSIS (", analysis.kind, " KIND OF ANALYSIS)"), output = log.file)
fun_export_data(path = path.out, data = final.ttab, output = log.file)
}else{
tempo1 <- c(table(rownames(final.ttab)))
tempo2 <- data.frame(
......@@ -401,8 +415,12 @@ if(nrow(final.ttab) == 0 | nrow(na.omit(final.ttab)) == 0){
}
print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))
}
fun_export_data(path = path.out, data = paste0("SEE THE final.ttab AND THE final.ttab.freq OBJECTS IN THE compiled_data.RData FILE PRESENT IN: ", path.out), output = log.file)
backup.name <- c(backup.name, "final.ttab.freq")
################ 4 Machine Learning based analysis
######## 4.1 Discovery: Learning the Random Forest
......@@ -465,6 +483,7 @@ if(nrow(final.mod.gene.names) == 0 | nrow(na.omit(final.mod.gene.names)) == 0){
}
print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))
}
fun_export_data(path = path.out, data = paste0("SEE THE final.mod.gene.names AND THE final.mod.gene.names.freq OBJECTS IN THE compiled_data.RData FILE PRESENT IN: ", path.out), output = log.file)
backup.name <- c(backup.name, "final.mod.gene.names.freq")
......@@ -556,6 +575,7 @@ if(nrow(final.gene.importance) == 0 | nrow(na.omit(final.gene.importance)) == 0)
}
print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))
}
fun_export_data(path = path.out, data = paste0("SEE THE final.gene.importance AND THE final.gene.importance.median OBJECTS IN THE compiled_data.RData FILE PRESENT IN: ", path.out), output = log.file)
backup.name <- c(backup.name, "final.gene.importance.median")
......@@ -580,16 +600,17 @@ if(nrow(final.select.gene.curve) == 0 | nrow(na.omit(final.select.gene.curve)) =
fun_export_data(path = path.out, data = "OPTIMAL NUMBER OF GENES: ", output = log.file)
fun_export_data(path = path.out, data = final.select.gene.curve.median, output = log.file)
# horiz barplot + CI
# line + CI area
bar.width <- 0.5
log.scale <- FALSE
amplif <- label.size
tempo.gg.name <- "gg.indiv.plot."
tempo.gg.count <- 0
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = final.select.gene.curve.median, mapping = ggplot2::aes(x = X, y = MEDIAN))) # reorder from higher to lower
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = final.select.gene.curve.median, mapping = ggplot2::aes(x = X, y = MEDIAN)))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_point(color = "red", shape = 16, size = 2))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_line(color = "red"))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_errorbar(ggplot2::aes(ymin = CI95.inf, ymax = CI95.sup), position = ggplot2::position_dodge(), color = "black", width = 0))
# assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_errorbar(ggplot2::aes(ymin = CI95.inf, ymax = CI95.sup), position = ggplot2::position_dodge(), color = "black", width = 0))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_ribbon(ggplot2::aes(ymin = CI95.inf, ymax = CI95.sup), color = "red", alpha = 0.3))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle(paste0("OPTIMAL NUMBER OF GENES (n = ", slurm.loop.nb,")")))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::xlab("NUMBER OF GENES"))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ylab("MEDIAN +/- 95CI"))
......@@ -607,30 +628,73 @@ if(nrow(final.select.gene.curve) == 0 | nrow(na.omit(final.select.gene.curve)) =
}
print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))
}
fun_export_data(path = path.out, data = paste0("SEE THE final.select.gene.curve AND THE final.select.gene.curve.median OBJECTS IN THE compiled_data.RData FILE PRESENT IN: ", path.out), output = log.file)
backup.name <- c(backup.name, "final.select.gene.curve.median")
#gene heatmap and corrplot
print(loop1_heatmap.plot) # same for all the loops
corrplot(loop1_corr.plot, tl.col = "black", tl.cex = label.size / 10) # same for all the loops
backup.name <- c(backup.name, "loop1_heatmap.plot", "loop1_corr.plot")
######## 4.2 Validate the model
#
fun_export_data(path = path.out, data = "################################ VALIDATION", output = log.file, sep = 4)
fun_export_data(path = path.out, data = "################ ROC CURVES", output = log.file)
data.kind <- c("genes", "crp", "genes.crp") # beware, the order is important
ana.kind <- c("rf", "logreg", "rpart")
for(i0 in 1:length(data.kind)){
for(i1 in 1:length(ana.kind)){
fun_export_data(path = path.out, data = paste0("######## ", data.kind[i0], " AND ", ana.kind[i1], " ANALYSIS"), output = log.file)
if(nrow(get(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1]))) == 0 | nrow(na.omit(get(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1])))) == 0){
fun_export_data(path = path.out, data = paste0("NO ROC CURVE FOR THE ", data.kind[i0], " AND ", ana.kind[i1], " ANALYSIS"), output = log.file)
fun_export_data(path = path.out, data = get(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1])), output = log.file)
}else{
tempo1 <- aggregate(get(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1]))$x, list(get(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1]))$CUTOFF), median, na.rm = TRUE)
names(tempo1) <- c("CUTOFF", "X_MEDIAN")
tempo2 <- aggregate(get(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1]))$y, list(get(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1]))$CUTOFF), median, na.rm = TRUE)
names(tempo2) <- c("CUTOFF", "Y_MEDIAN")
assign(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1], ".median"), cbind(tempo1, tempo2["Y_MEDIAN"]))
fun_export_data(path = path.out, data = "MEDIAN ROC CURVE: ", output = log.file)
fun_export_data(path = path.out, data = final.select.gene.curve.median, output = log.file)
# roc curves
bar.width <- 0.5
log.scale <- FALSE
amplif <- label.size
tempo.gg.name <- "gg.indiv.plot."
tempo.gg.count <- 0
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = get(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1], ".median")), mapping = ggplot2::aes(x = X_MEDIAN, y = Y_MEDIAN)))
for(i2 in 1:slurm.loop.nb){
# roc curves
if(nrow(get(paste0("loop", i2, "_data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1]))) > 0){
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_line(data = get(paste0("loop", i2, "_data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1])), mapping = ggplot2::aes(x = x, y = y), color = "grey", alpha = 0.5))
}
}
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_line(color = "red"))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle(paste0("OPTIMAL NUMBER OF GENES (n = ", slurm.loop.nb,")")))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::xlab("1 - SPECIFICITY"))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ylab("SENSITIVITY"))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme_classic(base_size = amplif))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::scale_y_continuous(expand = c(0,0), breaks = scales::extended_breaks(10)))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::coord_cartesian(xlim = c(0, 1), ylim = c(0, 1))) # fix limits
if(activate.pdf == TRUE){
tempo <- dev.set(pdf.nb) # assign to avoid the message
}else{
windows(5, 5)
}
print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))
}
fun_export_data(path = path.out, data = paste0("SEE THE final.select.gene.curve AND THE final.select.gene.curve.median OBJECTS IN THE compiled_data.RData FILE PRESENT IN: ", path.out), output = log.file)
backup.name <- c(backup.name, "final.select.gene.curve.median")
###### ???Add build
}
}
tempo <- dev.set(pdf.nb) # assign to avoid the message
......@@ -669,88 +733,7 @@ backup.name <- c(backup.name, "data.roc1.genes.rf", "data.roc2.crp.rf", "data.ro
gridExtra::grid.arrange(roc1.genes.rf, roc2.crp.rf, roc3.genes.crp.rf, nrow = 2)
# ```
######## 4.4 Logistic regression
fun_export_data(path = path.out, data = "################ LOGISTIC REGRESSION", output = log.file)
tempo <- dev.set(pdf.nb) # assign to avoid the message
roc1.genes.logreg <- plotROCCurves(generateThreshVsPerfData(pred1.genes.logreg, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 1.genes (only genes) \n AUC=%.2f", performance(pred1.genes.logreg, auc))) +
theme(
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size),
axis.text = ggplot2::element_text(size = label.size),
axis.title = ggplot2::element_text(size = label.size),
legend.text = ggplot2::element_text(size = label.size),
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
roc2.crp.logreg <- plotROCCurves(generateThreshVsPerfData(pred2.crp.logreg, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 2.crp (only CRP) \n AUC=%.2f", performance(pred2.crp.logreg, auc))) +
theme(
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size),
axis.text = ggplot2::element_text(size = label.size),
axis.title = ggplot2::element_text(size = label.size),
legend.text = ggplot2::element_text(size = label.size),
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
roc3.genes.crp.logreg <- plotROCCurves(generateThreshVsPerfData(pred3.genes.crp.logreg, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 3.genes.crp (genes + CRP) \n AUC=%.2f", performance(pred3.genes.crp.logreg, auc))) +
theme(
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size),
axis.text = ggplot2::element_text(size = label.size),
axis.title = ggplot2::element_text(size = label.size),
legend.text = ggplot2::element_text(size = label.size),
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
data.roc1.genes.logreg <- ggplot2::ggplot_build(roc1.genes.logreg)$data[[1]]
data.roc2.crp.logreg <- ggplot2::ggplot_build(roc2.crp.logreg)$data[[1]]
data.roc3.genes.crp.logreg <- ggplot2::ggplot_build(roc3.genes.crp.logreg)$data[[1]]
backup.name <- c(backup.name, "data.roc1.genes.logreg", "data.roc2.crp.logreg", "data.roc3.genes.crp.logreg")
gridExtra::grid.arrange(roc1.genes.logreg, roc2.crp.logreg, roc3.genes.crp.logreg, nrow = 2)
# ```
#
######## 4.5 RPART
fun_export_data(path = path.out, data = "################ RPART", output = log.file)
tempo <- dev.set(pdf.nb) # assign to avoid the message
roc1.genes.rpart <- plotROCCurves(generateThreshVsPerfData(pred1.genes.rpart, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 1.genes (only genes) \n AUC=%.2f", performance(pred1.genes.rpart, auc))) +
theme(
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size),
axis.text = ggplot2::element_text(size = label.size),
axis.title = ggplot2::element_text(size = label.size),
legend.text = ggplot2::element_text(size = label.size),
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
roc2.crp.rpart <- plotROCCurves(generateThreshVsPerfData(pred2.crp.rpart, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 2.crp (only CRP) \n AUC=%.2f", performance(pred2.crp.rpart, auc))) +
theme(
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size),
axis.text = ggplot2::element_text(size = label.size),
axis.title = ggplot2::element_text(size = label.size),
legend.text = ggplot2::element_text(size = label.size),
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
roc3.genes.crp.rpart <- plotROCCurves(generateThreshVsPerfData(pred3.genes.crp.rpart, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 3.genes.crp (genes + CRP) \n AUC=%.2f", performance(pred3.genes.crp.rpart, auc))) +
theme(
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size),
axis.text = ggplot2::element_text(size = label.size),
axis.title = ggplot2::element_text(size = label.size),
legend.text = ggplot2::element_text(size = label.size),
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
data.roc1.genes.rpart <- ggplot2::ggplot_build(roc1.genes.rpart)$data[[1]]
data.roc2.crp.rpart <- ggplot2::ggplot_build(roc2.crp.rpart)$data[[1]]
data.roc3.genes.crp.rpart <- ggplot2::ggplot_build(roc3.genes.crp.rpart)$data[[1]]
backup.name <- c(backup.name, "data.roc1.genes.rpart", "data.roc2.crp.rpart", "data.roc3.genes.crp.rpart")
gridExtra::grid.arrange(roc1.genes.rpart, roc2.crp.rpart, roc3.genes.crp.rpart, nrow = 2)
######## 4.6 Plot top feature boxplots
......
......@@ -13,7 +13,7 @@ erase.graphs <- TRUE # write TRUE to erase all the graphic windows in R before s
sink(stdout(), type = "message")
script <- commandArgs(trailingOnly = FALSE)[1] # recover script name, e.g., r_341_conf $check_lod_gael_conf
args <- commandArgs(trailingOnly = TRUE) # recover arguments written after the call of the Rscript, ie after r_341_conf $check_lod_gael_conf
tempo.arg.names <- c("path.lib", "path.in", "path.out", "path.function1", "file.name1", "ml.bootstrap.nb", "project.name", "activate.pdf", "label.size", "optional.text", "slurm.loop.nb", "analysis.kind", "cross.valid.ratio", "random.seed") # objects names exactly in the same order as in the bash code and recovered in args
tempo.arg.names <- c("path.lib", "path.in", "path.out", "path.function1", "file.name1", "ml.bootstrap.nb", "project.name", "label.size", "optional.text", "slurm.loop.nb", "analysis.kind", "cross.valid.ratio", "random.seed") # objects names exactly in the same order as in the bash code and recovered in args
if(length(args) != length(tempo.arg.names)){
tempo.cat <- paste0("\n\n================\n\nERROR: THE NUMBER OF ELEMENTS IN args (", length(args),") IS DIFFERENT FROM THE NUMBER OF ELEMENTS IN tempo.arg.names (", length(tempo.arg.names),")\nargs:", paste0(args, collapse = ","), "\ntempo.arg.names:", paste0(tempo.arg.names, collapse = ","), "\n\n================\n\n")
stop(tempo.cat)
......@@ -63,7 +63,6 @@ path.out <- "/pasteur/homes/gmillot/rogge12231/" # absolute path of the output f
path.function1 <- "/pasteur/homes/gmillot/Git_versions_to_use/cute_little_R_functions-v4.4.0/cute_little_R_functions.R" # Define the absolute pathway of the folder containing functions created by Gael Millot
file.name1 <- "supplementary_data_file_test.csv" # name of the data file to import in path.in
ml.bootstrap.nb <- 3
activate.pdf = TRUE
label.size <- 6
optional.text <- ""
slurm.loop.nb <- 1
......@@ -86,7 +85,6 @@ path.function1 <- "https://gitlab.pasteur.fr/gmillot/cute_little_R_functions/raw
# path.function1 <- "C:/Users/Gael/Documents/Git_versions_to_use/cute_little_R_functions-v4.5.0/cute_little_R_functions.R" # Define the absolute pathway of the folder containing functions created by Gael Millot
file.name1 <- "supplementary_data_file_test.csv" # name of the data file to import in path.in
ml.bootstrap.nb <- 3
activate.pdf = TRUE
label.size <- 6
optional.text <- ""
slurm.loop.nb <- 1
......@@ -94,7 +92,7 @@ analysis.kind <- "valid_boot"
cross.valid.ratio <- 0.8
random.seed <- TRUE
'
eval(parse(text = debug2)) ; cat(paste0("\n\n================\n\nERROR: ACTIVE DEBUG VALUES\n\n================\n\n")) ; stop()
# eval(parse(text = debug2)) ; cat(paste0("\n\n================\n\nERROR: ACTIVE DEBUG VALUES\n\n================\n\n")) ; stop()
# data.frame(PARAM = tempo.arg.names, ARG = args) # for debug mode
......@@ -202,7 +200,6 @@ if(tempo$problem == FALSE & ! dir.exists(path.out)){
tempo <- fun_param_check(data = path.function1, class = "character", length = 1) ; eval(ee)
tempo <- fun_param_check(data = ml.bootstrap.nb, typeof = "integer", length = 1, double.as.integer.allowed = TRUE, neg.values = FALSE) ; eval(ee)
tempo <- fun_param_check(data = project.name, class = "character", length = 1) ; eval(ee)
tempo <- fun_param_check(data = activate.pdf, class = "logical", length = 1) ; eval(ee)
tempo <- fun_param_check(data = label.size, typeof = "integer", length = 1, double.as.integer.allowed = TRUE, neg.values = FALSE) ; eval(ee)
tempo <- fun_param_check(data = optional.text, class = "character", length = 1) ; eval(ee)
tempo <- fun_param_check(data = slurm.loop.nb, typeof = "integer", length = 1, double.as.integer.allowed = TRUE, neg.values = FALSE) ; eval(ee)
......@@ -242,7 +239,7 @@ if(optional.text == "no.txt"){
}else{
fun_export_data(path = path.out, data = paste0("OPTIONAL TEXT:\n", optional.text), output = log.file)
}
par.ini <- fun_open_window(pdf.disp = activate.pdf, path.fun = path.out, pdf.name.file = paste0("loop", slurm.loop.nb, "_graphs"), width.fun = 7, height.fun = 7, paper = "special", no.pdf.overwrite = TRUE, return.output = TRUE)$ini.par
par.ini <- fun_open_window(pdf.disp = TRUE, path.fun = path.out, pdf.name.file = paste0("loop", slurm.loop.nb, "_graphs"), width.fun = 7, height.fun = 7, paper = "special", no.pdf.overwrite = TRUE, return.output = TRUE)$ini.par
pdf.nb <- dev.cur()
################ randomness ignition
......@@ -443,7 +440,7 @@ if(any((analysis.kind == "longit" & slurm.loop.nb == 1) | (analysis.kind == "val
)
tempo <- dev.set(pdf.nb) # assign to avoid the message
print(plt.plot)
select.gene.curve <- ggplot2::ggplot_build(plt.plot)$data[[2]]
select.gene.curve <- ggplot2::ggplot_build(plt.plot)$data[[2]][, c("x", "y")]
backup.name <- c(backup.name, "select.gene.curve")
# ```
#
......@@ -516,6 +513,7 @@ if(any((analysis.kind == "longit" & slurm.loop.nb == 1) | (analysis.kind == "val
subdat <- df.tmp[, sort(mod.gene.names)]
ann_colors = list("ASDAS R/NR" = c(R = "steelblue", NR = "tomato"))
heatmap.plot <- pheatmap(t(scale(subdat)), silent = TRUE, annotation_col = annot.rows, cluster_cols = FALSE, show_colnames = FALSE, border_color = NA, color = colorRampPalette(c("red", "black", "green"))(499), annotation_colors = ann_colors, fontsize_row = label.size, fontsize_col = label.size)
tempo <- dev.set(pdf.nb) # assign to avoid the message
print(heatmap.plot)
backup.name <- c(backup.name, "heatmap.plot")
# ```
......@@ -523,7 +521,9 @@ if(any((analysis.kind == "longit" & slurm.loop.nb == 1) | (analysis.kind == "val
# The correlation of the selected features is reprsented hereafter. We see that some of the selected features are strongly correlated.
#
# ```{r corrplot}
tempo <- dev.set(pdf.nb) # assign to avoid the message
corr.plot <- corrplot(cor(subdat), tl.col = "black", tl.cex = label.size / 10)
backup.name <- c(backup.name, "corr.plot")
if(analysis.kind == "valid_boot"){
save(list=c("dat.train", "dat.valid", "train.task", "lrn.rf", "mod.gene.names"), file = paste0(path.out, "loop", slurm.loop.nb, "_discov_data.RData"))
......@@ -672,9 +672,12 @@ roc3.genes.crp.rf <- plotROCCurves(generateThreshVsPerfData(pred3.genes.crp.rf,
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
data.roc1.genes.rf <- ggplot2::ggplot_build(roc1.genes.rf)$data[[1]]
data.roc2.crp.rf <- ggplot2::ggplot_build(roc2.crp.rf)$data[[1]]
data.roc3.genes.crp.rf <- ggplot2::ggplot_build(roc3.genes.crp.rf)$data[[1]]
data.roc1.genes.rf <- ggplot2::ggplot_build(roc1.genes.rf)$data[[1]][, c("x", "y")]
data.roc1.genes.rf <- data.frame(data.roc1.genes.rf, CUTOFF = 1:nrow(data.roc1.genes.rf))
data.roc2.crp.rf <- ggplot2::ggplot_build(roc2.crp.rf)$data[[1]][, c("x", "y")]
data.roc2.crp.rf <- data.frame(data.roc2.crp.rf, CUTOFF = 1:nrow(data.roc2.crp.rf))
data.roc3.genes.crp.rf <- ggplot2::ggplot_build(roc3.genes.crp.rf)$data[[1]][, c("x", "y")]
data.roc3.genes.crp.rf <- data.frame(data.roc3.genes.crp.rf, CUTOFF = 1:nrow(data.roc3.genes.crp.rf))
backup.name <- c(backup.name, "data.roc1.genes.rf", "data.roc2.crp.rf", "data.roc3.genes.crp.rf")
gridExtra::grid.arrange(roc1.genes.rf, roc2.crp.rf, roc3.genes.crp.rf, nrow = 2)
......@@ -745,9 +748,12 @@ roc3.genes.crp.logreg <- plotROCCurves(generateThreshVsPerfData(pred3.genes.crp.
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
data.roc1.genes.logreg <- ggplot2::ggplot_build(roc1.genes.logreg)$data[[1]]
data.roc2.crp.logreg <- ggplot2::ggplot_build(roc2.crp.logreg)$data[[1]]
data.roc3.genes.crp.logreg <- ggplot2::ggplot_build(roc3.genes.crp.logreg)$data[[1]]
data.roc1.genes.logreg <- ggplot2::ggplot_build(roc1.genes.logreg)$data[[1]][, c("x", "y")]
data.roc1.genes.logreg <- data.frame(data.roc1.genes.logreg, CUTOFF = 1:nrow(data.roc1.genes.logreg))
data.roc2.crp.logreg <- ggplot2::ggplot_build(roc2.crp.logreg)$data[[1]][, c("x", "y")]
data.roc2.crp.logreg <- data.frame(data.roc2.crp.logreg, CUTOFF = 1:nrow(data.roc2.crp.logreg))
data.roc3.genes.crp.logreg <- ggplot2::ggplot_build(roc3.genes.crp.logreg)$data[[1]][, c("x", "y")]
data.roc3.genes.crp.logreg <- data.frame(data.roc3.genes.crp.logreg, CUTOFF = 1:nrow(data.roc3.genes.crp.logreg))
backup.name <- c(backup.name, "data.roc1.genes.logreg", "data.roc2.crp.logreg", "data.roc3.genes.crp.logreg")
gridExtra::grid.arrange(roc1.genes.logreg, roc2.crp.logreg, roc3.genes.crp.logreg, nrow = 2)
......@@ -818,9 +824,12 @@ roc3.genes.crp.rpart <- plotROCCurves(generateThreshVsPerfData(pred3.genes.crp.r
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
data.roc1.genes.rpart <- ggplot2::ggplot_build(roc1.genes.rpart)$data[[1]]
data.roc2.crp.rpart <- ggplot2::ggplot_build(roc2.crp.rpart)$data[[1]]
data.roc3.genes.crp.rpart <- ggplot2::ggplot_build(roc3.genes.crp.rpart)$data[[1]]
data.roc1.genes.rpart <- ggplot2::ggplot_build(roc1.genes.rpart)$data[[1]][, c("x", "y")]
data.roc1.genes.rpart <- data.frame(data.roc1.genes.rpart, CUTOFF = 1:nrow(data.roc1.genes.rpart))
data.roc2.crp.rpart <- ggplot2::ggplot_build(roc2.crp.rpart)$data[[1]][, c("x", "y")]
data.roc2.crp.rpart <- data.frame(data.roc2.crp.rpart, CUTOFF = 1:nrow(data.roc2.crp.rpart))
data.roc3.genes.crp.rpart <- ggplot2::ggplot_build(roc3.genes.crp.rpart)$data[[1]][, c("x", "y")]
data.roc3.genes.crp.rpart <- data.frame(data.roc3.genes.crp.rpart, CUTOFF = 1:nrow(data.roc3.genes.crp.rpart))
backup.name <- c(backup.name, "data.roc1.genes.rpart", "data.roc2.crp.rpart", "data.roc3.genes.crp.rpart")
gridExtra::grid.arrange(roc1.genes.rpart, roc2.crp.rpart, roc3.genes.crp.rpart, nrow = 2)
......
......@@ -239,7 +239,6 @@ CONF_VAR_CHECK=(
"LOOP_NB_CONF"
"R_RANDOM_SEED"
"PROJECT_NAME_CONF"
"R_PDF_DISPLAY_CONF"
"LABEL_SIZE"
"R_OPT_TXT_CONF"
# SLURM PARAMETERS
......@@ -357,7 +356,7 @@ while [[ $COUNT < $(($LOOP_NB_CONF + 1)) ]] ; do
source $CONFIG_FILE # never forget this because another environment
OUTPUT_DIR_PATH_tempo2="${OUTPUT_DIR_PATH_tempo}/loop${COUNT}/"
# next line cannot be put outside (which would have been convenient -> put into the SUP_VAR_tempo for display. But SUP_VAR_tempo for sbatch do not like spaces)
R_PROC="R_conf ${r_main_conf} $PATH_LIB_CONF $PATH_IN_CONF ${OUTPUT_DIR_PATH_tempo2} $r_main_functions_conf $FILE_NAME1_CONF $ML_BOOTSTRAP_NB_CONF $PROJECT_NAME_CONF $R_PDF_DISPLAY_CONF $LABEL_SIZE $R_OPT_TXT_CONF $COUNT $R_ANALYSIS_KIND $CROSS_VALID_RATIO $R_RANDOM_SEED"
R_PROC="R_conf ${r_main_conf} $PATH_LIB_CONF $PATH_IN_CONF ${OUTPUT_DIR_PATH_tempo2} $r_main_functions_conf $FILE_NAME1_CONF $ML_BOOTSTRAP_NB_CONF $PROJECT_NAME_CONF $LABEL_SIZE $R_OPT_TXT_CONF $COUNT $R_ANALYSIS_KIND $CROSS_VALID_RATIO $R_RANDOM_SEED"
R_PROC2="${R_PROC} &> ${OUTPUT_DIR_PATH_tempo2}loop${COUNT}_r_console_messages.txt" # or "$R_PROC > ${OUTPUT_DIR_PATH_tempo2}loop${COUNT}_r_console_messages.txt 2>&1" # to add the estderror in the stdout
eval "$R_PROC2"
' | sbatch -p $DEDICATED_CONF --job-name=wait_loop1 --qos $QOS_CONF --time $MAX_RUNNING_TIME_CONF -c $NB_CPU_PER_TASK_CONF --mem-per-cpu $MEM_PER_CPU_CONF --mail-type END,FAIL --mail-user $MAIL_CONF --export $SUP_VAR_tempo --wait | tee -a ${OUTPUT_DIR_PATH_tempo}/loop${COUNT}/loop${COUNT}_${PROJECT_NAME_CONF}_slurm_jobID.txt # write all th echo from the $PROC alaso into a log file
......@@ -372,7 +371,7 @@ while [[ $COUNT < $(($LOOP_NB_CONF + 1)) ]] ; do
source $CONFIG_FILE # never forget this because another environment
OUTPUT_DIR_PATH_tempo2="${OUTPUT_DIR_PATH_tempo}/loop${SLURM_ARRAY_TASK_ID}/"
# next line cannot be put outside (which would have been convenient -> put into the SUP_VAR_tempo for display. But SUP_VAR_tempo for sbatch do not like spaces)
R_PROC="R_conf ${r_main_conf} $PATH_LIB_CONF $PATH_IN_CONF ${OUTPUT_DIR_PATH_tempo2} $r_main_functions_conf $FILE_NAME1_CONF $ML_BOOTSTRAP_NB_CONF $PROJECT_NAME_CONF $R_PDF_DISPLAY_CONF $LABEL_SIZE $R_OPT_TXT_CONF ${SLURM_ARRAY_TASK_ID} $R_ANALYSIS_KIND $CROSS_VALID_RATIO $R_RANDOM_SEED" # beware $COUNT replaced by ${SLURM_ARRAY_TASK_ID} because job array
R_PROC="R_conf ${r_main_conf} $PATH_LIB_CONF $PATH_IN_CONF ${OUTPUT_DIR_PATH_tempo2} $r_main_functions_conf $FILE_NAME1_CONF $ML_BOOTSTRAP_NB_CONF $PROJECT_NAME_CONF $LABEL_SIZE $R_OPT_TXT_CONF ${SLURM_ARRAY_TASK_ID} $R_ANALYSIS_KIND $CROSS_VALID_RATIO $R_RANDOM_SEED" # beware $COUNT replaced by ${SLURM_ARRAY_TASK_ID} because job array
R_PROC2="${R_PROC} &> ${OUTPUT_DIR_PATH_tempo2}loop${SLURM_ARRAY_TASK_ID}_r_console_messages.txt" # or "$R_PROC > ${OUTPUT_DIR_PATH_tempo2}loop${SLURM_ARRAY_TASK_ID}_r_console_messages.txt 2>&1" # to add the estderror in the stdout
eval "$R_PROC2"
' | sbatch -p $DEDICATED_CONF --array=2-$LOOP_NB_CONF --job-name=wait_loop_all --qos $QOS_CONF --time $MAX_RUNNING_TIME_CONF -c $NB_CPU_PER_TASK_CONF --mem-per-cpu $MEM_PER_CPU_CONF --mail-type END,FAIL --mail-user $MAIL_CONF --export $SUP_VAR_tempo --wait | tee -a $(for((i = 2 ; i <= $LOOP_NB_CONF ; i++)) ; do echo ${OUTPUT_DIR_PATH_tempo}/loop${i}/loop${i}_${PROJECT_NAME_CONF}_slurm_jobID.txt ; done) # tee is dispached in all the dir of the job array
......@@ -384,7 +383,7 @@ while [[ $COUNT < $(($LOOP_NB_CONF + 1)) ]] ; do
source $CONFIG_FILE # never forget this because another environment
OUTPUT_DIR_PATH_tempo2="${OUTPUT_DIR_PATH_tempo}/loop${SLURM_ARRAY_TASK_ID}/"
# next line cannot be put outside (which would have been convenient -> put into the SUP_VAR_tempo for display. But SUP_VAR_tempo for sbatch do not like spaces)
R_PROC="R_conf ${r_main_conf} $PATH_LIB_CONF $PATH_IN_CONF ${OUTPUT_DIR_PATH_tempo2} $r_main_functions_conf $FILE_NAME1_CONF $ML_BOOTSTRAP_NB_CONF $PROJECT_NAME_CONF $R_PDF_DISPLAY_CONF $LABEL_SIZE $R_OPT_TXT_CONF ${SLURM_ARRAY_TASK_ID} $R_ANALYSIS_KIND $CROSS_VALID_RATIO $R_RANDOM_SEED"
R_PROC="R_conf ${r_main_conf} $PATH_LIB_CONF $PATH_IN_CONF ${OUTPUT_DIR_PATH_tempo2} $r_main_functions_conf $FILE_NAME1_CONF $ML_BOOTSTRAP_NB_CONF $PROJECT_NAME_CONF $LABEL_SIZE $R_OPT_TXT_CONF ${SLURM_ARRAY_TASK_ID} $R_ANALYSIS_KIND $CROSS_VALID_RATIO $R_RANDOM_SEED"
R_PROC2="${R_PROC} &> ${OUTPUT_DIR_PATH_tempo2}loop${SLURM_ARRAY_TASK_ID}_r_console_messages.txt" # or "$R_PROC > ${OUTPUT_DIR_PATH_tempo2}loop${SLURM_ARRAY_TASK_ID}_r_console_messages.txt 2>&1" # to add the estderror in the stdout
eval "$R_PROC2"
' | sbatch -p $DEDICATED_CONF --array=1-$LOOP_NB_CONF --job-name=wait_loop_all --qos $QOS_CONF --time $MAX_RUNNING_TIME_CONF -c $NB_CPU_PER_TASK_CONF --mem-per-cpu $MEM_PER_CPU_CONF --mail-type END,FAIL --mail-user $MAIL_CONF --export $SUP_VAR_tempo --wait | tee -a $(for((i = 1 ; i <= $LOOP_NB_CONF ; i++)) ; do echo ${OUTPUT_DIR_PATH_tempo}/loop${i}/loop${i}_${PROJECT_NAME_CONF}_slurm_jobID.txt ; done) # tee is dispached in all the dir of the job array
......@@ -403,13 +402,13 @@ if [[ $R_ANALYSIS_KIND =~ longit || $LOOP_NB_CONF == 1 ]] ; then
else
OUTPUT_DIR_PATH_FINAL="${OUTPUT_DIR_PATH_tempo}/final_res"
mkdir ${OUTPUT_DIR_PATH_FINAL}
SUP_VAR+="OUTPUT_DIR_PATH_FINAL=$OUTPUT_DIR_PATH_FINAL"
SUP_VAR+=",OUTPUT_DIR_PATH_FINAL=$OUTPUT_DIR_PATH_FINAL"
echo -e '#!/bin/sh
# write the previous line exactly like this, with no comments, otherwise do not work
source $CONFIG_FILE # never forget this because another environment
# next line cannot be put outside (which would have been convenient -> put into the SUP_VAR_tempo for display. But SUP_VAR_tempo for sbatch do not like spaces)
R_PROC="R_conf ${r_compil_conf} $PATH_LIB_CONF $OUTPUT_DIR_PATH_tempo $OUTPUT_DIR_PATH_FINAL $r_main_functions_conf $PROJECT_NAME_CONF $R_PDF_DISPLAY_CONF $LABEL_SIZE $R_OPT_TXT_CONF $LOOP_NB_CONF $R_ANALYSIS_KIND"
R_PROC2="${R_PROC} &> ${OUTPUT_DIR_PATH_FINAL/r_console_messages.txt" # or "$R_PROC > ${OUTPUT_DIR_PATH_FINAL/r_console_messages.txt 2>&1" # to add the estderror in the stdout
R_PROC="R_conf ${r_compil_conf} $PATH_LIB_CONF ${OUTPUT_DIR_PATH_tempo}/ ${OUTPUT_DIR_PATH_FINAL}/ $r_main_functions_conf $PROJECT_NAME_CONF $LABEL_SIZE $R_OPT_TXT_CONF $LOOP_NB_CONF $R_ANALYSIS_KIND"
R_PROC2="${R_PROC} &> ${OUTPUT_DIR_PATH_FINAL}/r_console_messages.txt" # or "$R_PROC > ${OUTPUT_DIR_PATH_FINAL}/r_console_messages.txt 2>&1" # to add the estderror in the stdout
eval "$R_PROC2"
' | sbatch -p $DEDICATED_CONF --job-name=compil --qos $QOS_CONF --time $MAX_RUNNING_TIME_CONF -c $NB_CPU_PER_TASK_CONF --mem-per-cpu $MEM_PER_CPU_CONF --mail-type END,FAIL --mail-user $MAIL_CONF --export $SUP_VAR | tee -a ${OUTPUT_DIR_PATH_FINAL}/${PROJECT_NAME_CONF}_slurm_jobID.txt # write all th echo from the $PROC alaso into a log file
fi
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment