Commit fd03eca6 authored by Gael  MILLOT's avatar Gael MILLOT
Browse files

interm

parent d8f8965e
#!/bin/bash # shebang (#! https://en.wikipedia.org/wiki/Shebang_%28Unix%29) indicating to the shell what program to interpret the script with, when executed, probably optional here. #!/bin/bash # shebang (#! https://en.wikipedia.org/wiki/Shebang_%28Unix%29) indicating to the shell what program to interpret the script with, when executed, probably optional here.
# export allow the variable to be use in subprocesses. Without export, the variable is only available in the current process. Example ANNOVAR_CONF=/bioinfo/local/build/annovar_20130729 instead of export ANNOVAR_CONF=/bioinfo/local/build/annovar_20130729 # export allow the variable to be use in subprocesses. Without export, the variable is only available in the current process. Example ANNOVAR_CONF=/bioinfo/local/build/annovar_20130729 instead of export ANNOVAR_CONF=/bioinfo/local/build/annovar_20130729
# _conf: lowercases for alias and scripts, and uppercases for variables # _conf: lowercases for alias and scripts, and uppercases for variables
...@@ -15,7 +14,7 @@ alias R_conf='module load gcc/4.7.4 R/3.5.0 ; Rscript' ...@@ -15,7 +14,7 @@ alias R_conf='module load gcc/4.7.4 R/3.5.0 ; Rscript'
export r_main_functions_conf=/pasteur/homes/gmillot/Git_versions_to_use/cute_little_R_functions-v4.5.0/cute_little_R_functions.R export r_main_functions_conf=/pasteur/homes/gmillot/Git_versions_to_use/cute_little_R_functions-v4.5.0/cute_little_R_functions.R
# export r_main_functions_conf=https://gitlab.pasteur.fr/gmillot/cute_little_R_functions/raw/v4.5.0/cute_little_R_functions.R # export r_main_functions_conf=https://gitlab.pasteur.fr/gmillot/cute_little_R_functions/raw/v4.5.0/cute_little_R_functions.R
export bash_main_functions_conf=/pasteur/homes/gmillot/Git_versions_to_use/little_bash_functions-v1.0.0/little_bash_functions-v1.0.0.sh export bash_main_functions_conf=/pasteur/homes/gmillot/Git_versions_to_use/little_bash_functions-v1.1.0/little_bash_functions.sh
export r_main_conf=/pasteur/homes/gmillot/Git_versions_to_use/rogge_12231-v1.0.0/rogge_12231_main_analysis.R export r_main_conf=/pasteur/homes/gmillot/Git_versions_to_use/rogge_12231-v1.0.0/rogge_12231_main_analysis.R
export r_compil_conf=/pasteur/homes/gmillot/Git_versions_to_use/rogge_12231-v1.0.0/rogge_12231_data_compilation.R export r_compil_conf=/pasteur/homes/gmillot/Git_versions_to_use/rogge_12231-v1.0.0/rogge_12231_data_compilation.R
...@@ -39,7 +38,7 @@ FILE_NAME1_CONF="supplementary_data_file_test.csv" # name of the data file to im ...@@ -39,7 +38,7 @@ FILE_NAME1_CONF="supplementary_data_file_test.csv" # name of the data file to im
ML_BOOTSTRAP_NB_CONF=3 ML_BOOTSTRAP_NB_CONF=3
LOOP_NB_CONF=3 LOOP_NB_CONF=3
R_RANDOM_SEED="FALSE" #♥ if FALSE, set.seed(1) is systematically used at the beginning of the R script, otherwise, the seed is random (and saved in the RData output) R_RANDOM_SEED="TRUE" #♥ if FALSE, set.seed(1) is systematically used at the beginning of the R script, otherwise, the seed is random (and saved in the RData output)
################ kind of analysis ################ kind of analysis
...@@ -48,7 +47,7 @@ R_RANDOM_SEED="FALSE" #♥ if FALSE, set.seed(1) is systematically used at the b ...@@ -48,7 +47,7 @@ R_RANDOM_SEED="FALSE" #♥ if FALSE, set.seed(1) is systematically used at the b
# with discovery set 67 indiv (df.nano$cohort_id != "cohortR") and validation set 9 indiv (df.nano$cohort_id == "cohortR") # with discovery set 67 indiv (df.nano$cohort_id != "cohortR") and validation set 9 indiv (df.nano$cohort_id == "cohortR")
# "valid_boot" limma and rf training are run once but bootstrap of the validation set 9 indiv (df.nano$cohort_id == "cohortR") using LOOP_NB_CONF parameter # "valid_boot" limma and rf training are run once but bootstrap of the validation set 9 indiv (df.nano$cohort_id == "cohortR") using LOOP_NB_CONF parameter
# "full_cross_validation" rows of the dataset are randomly split in two (no replacement), according to CROSS_VALID_RATIO, forming the discovery and validation set # "full_cross_validation" rows of the dataset are randomly split in two (no replacement), according to CROSS_VALID_RATIO, forming the discovery and validation set
R_ANALYSIS_KIND="longit" R_ANALYSIS_KIND="full_cross_validation"
CROSS_VALID_RATIO=0.8 # proportion (nb indiv randomly selected (wo replacement) for the discovery set) / (total number of indiv) CROSS_VALID_RATIO=0.8 # proportion (nb indiv randomly selected (wo replacement) for the discovery set) / (total number of indiv)
# -> the validation set is formed by the remaining indiv, with proportion 1 - CROSS_VALID_RATIO # -> the validation set is formed by the remaining indiv, with proportion 1 - CROSS_VALID_RATIO
......
...@@ -57,11 +57,11 @@ erase.graphs <- TRUE ...@@ -57,11 +57,11 @@ erase.graphs <- TRUE
script <- "code ini v1.0.0" script <- "code ini v1.0.0"
project.name <-"rogge12231" project.name <-"rogge12231"
path.lib <- "C:/Users/Gael/Documents/R/win-library/3.5/" # absolute path of the library folder. Write "none" if not required path.lib <- "C:/Users/Gael/Documents/R/win-library/3.5/" # absolute path of the library folder. Write "none" if not required
path.in <- "Z:/rogge12231/rogge_12231_1549993246/" # absolute path of the data folder path.in <- "Z:/rogge12231/rogge_12231_1550076020/" # absolute path of the data folder
path.out <- "C:/Users/Gael/Desktop/" # absolute path of the output folder path.out <- "C:/Users/Gael/Desktop/" # absolute path of the output folder
path.function1 <- "C:/Users/Gael/Documents/Git_versions_to_use/cute_little_R_functions-v4.5.0/cute_little_R_functions.R" # Define the absolute pathway of the folder containing functions created by Gael Millot path.function1 <- "C:/Users/Gael/Documents/Git_versions_to_use/cute_little_R_functions-v4.5.0/cute_little_R_functions.R" # Define the absolute pathway of the folder containing functions created by Gael Millot
project.name <- "rogge_project" project.name <- "rogge_project"
activate.pdf = TRUE activate.pdf = FALSE
label.size <-12 label.size <-12
optional.text <- "" optional.text <- ""
slurm.loop.nb <- 3 slurm.loop.nb <- 3
...@@ -104,12 +104,18 @@ for(i0 in 1:length(req.package.list)){ ...@@ -104,12 +104,18 @@ for(i0 in 1:length(req.package.list)){
if(length(path.function1) != 1){ if(length(path.function1) != 1){
stop(paste0("\n\n============\n\nERROR: path.function1 PARAMETER MUST BE LENGTH 1: ", paste(path.function1, collapse = " "), "\n\n============\n\n")) stop(paste0("\n\n============\n\nERROR: path.function1 PARAMETER MUST BE LENGTH 1: ", paste(path.function1, collapse = " "), "\n\n============\n\n"))
}else if(grepl(x = path.function1, pattern = "^http") & ( ! RCurl::url.exists(path.function1))){ }else if(grepl(x = path.function1, pattern = "^http")){
if( ! RCurl::url.exists(path.function1)){
stop(paste0("\n\n============\n\nERROR: HTTP INDICATED IN THE path.function1 PARAMETER DOES NOT EXISTS: ", path.function1, "\n\n============\n\n")) stop(paste0("\n\n============\n\nERROR: HTTP INDICATED IN THE path.function1 PARAMETER DOES NOT EXISTS: ", path.function1, "\n\n============\n\n"))
}else if(( ! grepl(x = path.function1, pattern = "^http")) & ( ! file.exists(path.function1))){ }else{
source(path.function1) # source the fun_ functions used below
}
}else if( ! grepl(x = path.function1, pattern = "^http")){
if( ! file.exists(path.function1)){
stop(paste0("\n\n============\n\nERROR: FILE INDICATED IN THE path.function1 PARAMETER DOES NOT EXISTS: ", path.function1, "\n\n============\n\n")) stop(paste0("\n\n============\n\nERROR: FILE INDICATED IN THE path.function1 PARAMETER DOES NOT EXISTS: ", path.function1, "\n\n============\n\n"))
}else{ }else{
source(path.function1) # source the fun_ functions used below source(path.function1) # source the fun_ functions used below
}
} }
################################ End Functions ################################ End Functions
...@@ -195,8 +201,6 @@ if(optional.text == "no.txt"){ ...@@ -195,8 +201,6 @@ if(optional.text == "no.txt"){
par.ini <- fun_open_window(pdf.disp = activate.pdf, path.fun = path.out, pdf.name.file = paste0("loop", slurm.loop.nb, "_graphs"), width.fun = 7, height.fun = 7, paper = "special", no.pdf.overwrite = TRUE, return.output = TRUE)$ini.par par.ini <- fun_open_window(pdf.disp = activate.pdf, path.fun = path.out, pdf.name.file = paste0("loop", slurm.loop.nb, "_graphs"), width.fun = 7, height.fun = 7, paper = "special", no.pdf.overwrite = TRUE, return.output = TRUE)$ini.par
pdf.nb <- dev.cur() pdf.nb <- dev.cur()
################ Data import ################ Data import
for(i0 in 1:slurm.loop.nb){ for(i0 in 1:slurm.loop.nb){
...@@ -217,7 +221,6 @@ for(i0 in 1:slurm.loop.nb){ ...@@ -217,7 +221,6 @@ for(i0 in 1:slurm.loop.nb){
# > class(final.gene.list) # > class(final.gene.list)
# [1] "data.frame" # [1] "data.frame"
################ Data compilation ################ Data compilation
backup.name <- NULL backup.name <- NULL
...@@ -226,7 +229,7 @@ if(analysis.kind == "full_cross_validation"){ ...@@ -226,7 +229,7 @@ if(analysis.kind == "full_cross_validation"){
final.ttab <- data.frame(NULL, stringsAsFactors = FALSE) final.ttab <- data.frame(NULL, stringsAsFactors = FALSE)
for(i0 in 1:slurm.loop.nb){ for(i0 in 1:slurm.loop.nb){
if(nrow(get(paste0("loop", i0, "_ttab"))) > 0){ if(nrow(get(paste0("loop", i0, "_ttab"))) > 0){
final.ttab <- rbind(final.ttab, get(paste0("loop", i0, "_ttab"))) final.ttab <- rbind(final.ttab, data.frame(get(paste0("loop", i0, "_ttab")), LOOP_NB = paste0("loop", i0), stringsAsFactors = FALSE))
}else{ }else{
final.ttab <- rbind(final.ttab, data.frame(logFC = NA, AveExpr = NA, t = NA, P.Value = NA, adj.P.Val = NA, B = NA, LOOP_NB = paste0("loop", i0), stringsAsFactors = FALSE)) final.ttab <- rbind(final.ttab, data.frame(logFC = NA, AveExpr = NA, t = NA, P.Value = NA, adj.P.Val = NA, B = NA, LOOP_NB = paste0("loop", i0), stringsAsFactors = FALSE))
} }
...@@ -237,6 +240,22 @@ if(analysis.kind == "full_cross_validation"){ ...@@ -237,6 +240,22 @@ if(analysis.kind == "full_cross_validation"){
} }
backup.name <- c(backup.name, "final.ttab") backup.name <- c(backup.name, "final.ttab")
# select.gene.curve
if(analysis.kind == "full_cross_validation"){
final.select.gene.curve <- data.frame(NULL, stringsAsFactors = FALSE)
for(i0 in 1:slurm.loop.nb){
if(nrow(get(paste0("loop", i0, "_select.gene.curve"))) > 0){
final.select.gene.curve <- rbind(final.select.gene.curve, data.frame(get(paste0("loop", i0, "_select.gene.curve")), LOOP_NB = paste0("loop", i0), stringsAsFactors = FALSE))
}else{
final.select.gene.curve <- rbind(final.select.gene.curve, data.frame(x = NA, y = NA, PANEL = NA, group = NA, shape = NA, colour = NA, size = NA, fill = NA, alpha = NA, stroke = NA, LOOP_NB = paste0("loop", i0), stringsAsFactors = FALSE))
}
}
}else{
final.select.gene.curve <- get(paste0("loop1_select.gene.curve"))
fun_export_data(path = path.out, data = paste0("final.select.gene.curve OBJECT CONTAINS ONLY LOOP1 RESULTS BECAUSE NO LOOP PERFORMED FOR THIS USING ", analysis.kind, " ANALYSIS"), output = log.file)
}
backup.name <- c(backup.name, "final.select.gene.curve")
# mod.gene.names # mod.gene.names
if(analysis.kind == "full_cross_validation"){ if(analysis.kind == "full_cross_validation"){
final.mod.gene.names <- data.frame(NULL, stringsAsFactors = FALSE) final.mod.gene.names <- data.frame(NULL, stringsAsFactors = FALSE)
...@@ -253,6 +272,22 @@ if(analysis.kind == "full_cross_validation"){ ...@@ -253,6 +272,22 @@ if(analysis.kind == "full_cross_validation"){
} }
backup.name <- c(backup.name, "final.mod.gene.names") backup.name <- c(backup.name, "final.mod.gene.names")
# gene.importance
if(analysis.kind == "full_cross_validation"){
final.gene.importance <- data.frame(NULL, stringsAsFactors = FALSE)
for(i0 in 1:slurm.loop.nb){
if(nrow(get(paste0("loop", i0, "_gene.importance"))) > 0){
final.gene.importance <- rbind(final.gene.importance, data.frame(get(paste0("loop", i0, "_gene.importance")), LOOP_NB = paste0("loop", i0), stringsAsFactors = FALSE))
}else{
final.gene.importance <- rbind(final.gene.importance, data.frame(features = NA, importance = NA, LOOP_NB = paste0("loop", i0), stringsAsFactors = FALSE))
}
}
}else{
final.gene.importance <- get(paste0("loop1_gene.importance"))
fun_export_data(path = path.out, data = paste0("final.gene.importance OBJECT CONTAINS ONLY LOOP1 RESULTS BECAUSE NO LOOP PERFORMED FOR THIS USING ", analysis.kind, " ANALYSIS"), output = log.file)
}
backup.name <- c(backup.name, "final.gene.importance")
# loop1_confmat1.genes.rf, data.pred1.genes.logreg, data.roc1.genes.logreg # loop1_confmat1.genes.rf, data.pred1.genes.logreg, data.roc1.genes.logreg
data.kind <- c("genes", "crp", "genes.crp") # beware, the order is important data.kind <- c("genes", "crp", "genes.crp") # beware, the order is important
ana.kind <- c("rf", "logreg", "rpart") ana.kind <- c("rf", "logreg", "rpart")
...@@ -327,34 +362,29 @@ if(nrow(final.ttab) == 0 | nrow(na.omit(final.ttab)) == 0){ ...@@ -327,34 +362,29 @@ if(nrow(final.ttab) == 0 | nrow(na.omit(final.ttab)) == 0){
fun_export_data(path = path.out, data = "FREQUENCIES AND P VALUES MEDIANS & CI: ", output = log.file) fun_export_data(path = path.out, data = "FREQUENCIES AND P VALUES MEDIANS & CI: ", output = log.file)
fun_export_data(path = path.out, data = final.ttab.freq, output = log.file) fun_export_data(path = path.out, data = final.ttab.freq, output = log.file)
# barplot # horiz barplot
bar.width <- 0.5 bar.width <- 0.5
log.scale <- FALSE log.scale <- FALSE
amplif <- label.size amplif <- label.size
tempo.gg.name <- "gg.indiv.plot." tempo.gg.name <- "gg.indiv.plot."
tempo.gg.count <- 0 tempo.gg.count <- 0
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = final.ttab.freq, mapping = ggplot2::aes(x = reorder(GENE, -FREQ), y = FREQ))) # reorder from higher to lower if(nrow(final.ttab.freq) > 20){
tempo1 <- table(final.ttab.freq$FREQ)
tempo2 <- tempo1[order(names(tempo1), decreasing = TRUE)]
tempo3 <- as.numeric(names(cumsum(tempo2)[cumsum(tempo2) <= 20]))
tempo.plot <- final.ttab.freq[round(final.ttab.freq$FREQ, 7) %in% round(tempo3, 7), ]
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = tempo.plot, mapping = ggplot2::aes(x = reorder(GENE, FREQ), y = FREQ))) # reorder from higher to lower
tempo.txt <- paste0("BEWARE: ONLY THE ", nrow(tempo.plot), " MOST FREQUENT GENES PLOTTED, AMONG ", nrow(final.ttab.freq))
}else{
tempo.plot <- final.ttab.freq
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = tempo.plot, mapping = ggplot2::aes(x = reorder(GENE, FREQ), y = FREQ))) # reorder from higher to lower
tempo.txt <- ""
}
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_bar(stat = "identity", position = "dodge", color = "black", fill = grey(0.8), width = bar.width)) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_bar(stat = "identity", position = "dodge", color = "black", fill = grey(0.8), width = bar.width))
# assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_errorbar(ggplot2::aes(ymin = PROP.CI95.inf, ymax = PROP.CI95.sup), position = ggplot2::position_dodge(width = bar.width), color = "black", width = 0)) # assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_errorbar(ggplot2::aes(ymin = PROP.CI95.inf, ymax = PROP.CI95.sup), position = ggplot2::position_dodge(width = bar.width), color = "black", width = 0))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle(paste0("LIMMA GENE LIST\n(PROP OF TIMES THE GENE IS SIGNIFICANT FOR ", slurm.loop.nb, " LOOPS)"))) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle(paste0("LIMMA GENE LIST\n(PROP OF TIMES THE GENE IS SIGNIFICANT FOR ", slurm.loop.nb, " LOOPS)\n", tempo.txt)))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::xlab("")) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::xlab(""))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ylab("PROPORTION")) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ylab("PROPORTION"))
caca <- 'assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme(
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = amplif * 0.5),
axis.text = ggplot2::element_text(size = amplif * 0.75),
axis.title = ggplot2::element_text(size = amplif),
legend.text = ggplot2::element_text(size = amplif),
legend.title = ggplot2::element_text(size = amplif),
strip.text = ggplot2::element_text(size = amplif),
legend.key = ggplot2::element_blank(),
panel.background = ggplot2::element_rect(fill = "grey95"),
panel.border = ggplot2::element_rect(colour = "black", fill = NA),
panel.grid.major.x = ggplot2::element_line(colour = "grey75"),
panel.grid.major.y = ggplot2::element_line(colour = "grey75"),
panel.grid.minor.x = ggplot2::element_blank(),
panel.grid.minor.y = ggplot2::element_blank(),
strip.background = ggplot2::element_rect(fill = "white", colour = "black")
))'
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme_classic(base_size = amplif)) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme_classic(base_size = amplif))
coord <- ggplot2::ggplot_build(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))$data # to have the summary statistics of the plot. is interesting: x = coord[[2]]$x, y = coord[[2]]$ymax_final coord <- ggplot2::ggplot_build(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))$data # to have the summary statistics of the plot. is interesting: x = coord[[2]]$x, y = coord[[2]]$ymax_final
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotate(geom = "text", x = coord[[1]]$x, y = coord[[1]]$ymax, label = round(coord[[1]]$y, 3), size = amplif/2, color = "black", vjust = "center", hjust = "right")) # beware: no need of order() for labels because coord[[1]]$x set the order assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotate(geom = "text", x = coord[[1]]$x, y = coord[[1]]$ymax, label = round(coord[[1]]$y, 3), size = amplif/2, color = "black", vjust = "center", hjust = "right")) # beware: no need of order() for labels because coord[[1]]$x set the order
...@@ -371,6 +401,7 @@ if(nrow(final.ttab) == 0 | nrow(na.omit(final.ttab)) == 0){ ...@@ -371,6 +401,7 @@ if(nrow(final.ttab) == 0 | nrow(na.omit(final.ttab)) == 0){
} }
print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + ")))) print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))
} }
backup.name <- c(backup.name, "final.ttab.freq")
################ 4 Machine Learning based analysis ################ 4 Machine Learning based analysis
...@@ -383,32 +414,119 @@ fun_export_data(path = path.out, data = "################################ RANDOM ...@@ -383,32 +414,119 @@ fun_export_data(path = path.out, data = "################################ RANDOM
fun_export_data(path = path.out, data = "################ LIST OF GENES SELECTED DURING THE MODEL ELABORATION", output = log.file, sep = 4) fun_export_data(path = path.out, data = "################ LIST OF GENES SELECTED DURING THE MODEL ELABORATION", output = log.file, sep = 4)
if(nrow(final.mod.gene.names) == 0 | nrow(na.omit(final.mod.gene.names)) == 0){ if(nrow(final.mod.gene.names) == 0 | nrow(na.omit(final.mod.gene.names)) == 0){
fun_export_data(path = path.out, data = paste0("NO GENE LIST RESULTS FROM THE LIMMA ANALYSIS (P VALUES ABOVE 0.05 AFTER CORRECTION FOR INSTANCE)"), output = log.file) fun_export_data(path = path.out, data = paste0("NO GENE LIST RESULTS FROM THE RANDOM FOREST TRAINING"), output = log.file)
fun_export_data(path = path.out, data = final.mod.gene.names, output = log.file) fun_export_data(path = path.out, data = final.mod.gene.names, output = log.file)
}else if(analysis.kind == "valid_boot"){ }else if(analysis.kind == "valid_boot"){
fun_export_data(path = path.out, data = paste0("BEWARE: NO LOOP FOR THE LIMMA ANALYSIS (", analysis.kind, " KIND OF ANALYSIS)"), output = log.file) fun_export_data(path = path.out, data = paste0("BEWARE: NO LOOP FOR THE THE RANDOM FOREST TRAINING (", analysis.kind, " KIND OF ANALYSIS)"), output = log.file)
fun_export_data(path = path.out, data = final.mod.gene.names, output = log.file) fun_export_data(path = path.out, data = final.mod.gene.names, output = log.file)
}else{ }else{
tempo1 <- c(table(final.mod.gene.names$GENE)) tempo1 <- c(table(final.mod.gene.names$GENE))
final.mod.gene.names.freq <- data.frame(GENE = names(tempo1), NB = unname(tempo1), FREQ = unname(tempo1)/sum(tempo1)) final.mod.gene.names.freq <- data.frame(GENE = names(tempo1), NB = unname(tempo1), FREQ = unname(tempo1)/sum(tempo1))
rownames(final.mod.gene.names.freq) <- NULL rownames(final.mod.gene.names.freq) <- NULL
fun_export_data(path = path.out, data = "FREQUENCIE: ", output = log.file) fun_export_data(path = path.out, data = "FREQUENCIES: ", output = log.file)
fun_export_data(path = path.out, data = final.mod.gene.names.freq, output = log.file) fun_export_data(path = path.out, data = final.mod.gene.names.freq, output = log.file)
# barplot # horiz barplot
bar.width <- 0.5 bar.width <- 0.5
log.scale <- FALSE log.scale <- FALSE
amplif <- label.size amplif <- label.size
tempo.gg.name <- "gg.indiv.plot." tempo.gg.name <- "gg.indiv.plot."
tempo.gg.count <- 0 tempo.gg.count <- 0
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = final.mod.gene.names.freq, mapping = ggplot2::aes(x = reorder(GENE, -FREQ), y = FREQ))) # reorder from higher to lower if(nrow(final.mod.gene.names.freq) > 20){
tempo1 <- table(final.mod.gene.names.freq$FREQ)
tempo2 <- tempo1[order(names(tempo1), decreasing = TRUE)]
tempo3 <- as.numeric(names(cumsum(tempo2)[cumsum(tempo2) <= 20]))
tempo.plot <- final.mod.gene.names.freq[round(final.mod.gene.names.freq$FREQ, 7) %in% round(tempo3, 7), ]
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = tempo.plot, mapping = ggplot2::aes(x = reorder(GENE, FREQ), y = FREQ))) # reorder from higher to lower
tempo.txt <- paste0("BEWARE: ONLY THE ", nrow(tempo.plot), " MOST FREQUENT GENES PLOTTED, AMONG ", nrow(final.mod.gene.names.freq))
}else{
tempo.plot <- final.mod.gene.names.freq
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = tempo.plot, mapping = ggplot2::aes(x = reorder(GENE, FREQ), y = FREQ))) # reorder from higher to lower
tempo.txt <- ""
}
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_bar(stat = "identity", position = "dodge", color = "black", fill = grey(0.8), width = bar.width)) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_bar(stat = "identity", position = "dodge", color = "black", fill = grey(0.8), width = bar.width))
# assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_errorbar(ggplot2::aes(ymin = PROP.CI95.inf, ymax = PROP.CI95.sup), position = ggplot2::position_dodge(width = bar.width), color = "black", width = 0)) # assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_errorbar(ggplot2::aes(ymin = PROP.CI95.inf, ymax = PROP.CI95.sup), position = ggplot2::position_dodge(width = bar.width), color = "black", width = 0))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle(paste0("RANDOM FOREST MODELISATION GENE LIST\n(PROP OF TIMES THE GENES HAVE BEEN SELECTED FOR ", slurm.loop.nb, " LOOPS)"))) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle(paste0("RANDOM FOREST MODELISATION GENE LIST\n(PROP OF TIMES THE GENES HAVE BEEN SELECTED FOR ", slurm.loop.nb, " LOOPS)\n", tempo.txt)))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::xlab("")) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::xlab(""))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ylab("PROPORTION")) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ylab("PROPORTION"))
caca <- 'assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme( assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme_classic(base_size = amplif))
coord <- ggplot2::ggplot_build(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))$data # to have the summary statistics of the plot. is interesting: x = coord[[2]]$x, y = coord[[2]]$ymax_final
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotate(geom = "text", x = coord[[1]]$x, y = coord[[1]]$ymax, label = round(coord[[1]]$y, 3), size = amplif/2, color = "black", vjust = "center", hjust = "right")) # beware: no need of order() for labels because coord[[1]]$x set the order
if(log.scale == TRUE){
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotation_logticks(sides = "l"))
}else{
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::scale_y_continuous(expand = c(0,0), breaks = scales::extended_breaks(10)))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::coord_flip(ylim = c(0, max(final.mod.gene.names.freq$FREQ, na.rm = TRUE) * 1.02))) # fix limits
}
if(activate.pdf == TRUE){
tempo <- dev.set(pdf.nb) # assign to avoid the message
}else{
windows(5, 5)
}
print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))
}
backup.name <- c(backup.name, "final.mod.gene.names.freq")
#gene.importance
fun_export_data(path = path.out, data = "################ GENE IMPORTANCE DURING THE MODEL ELABORATION", output = log.file, sep = 4)
if(nrow(final.gene.importance) == 0 | nrow(na.omit(final.gene.importance)) == 0){
fun_export_data(path = path.out, data = paste0("NO GENE IMPORTANCE RESULTS FROM THE RANDOM FOREST TRAINING"), output = log.file)
fun_export_data(path = path.out, data = final.gene.importance, output = log.file)
}else if(analysis.kind == "valid_boot"){
fun_export_data(path = path.out, data = paste0("BEWARE: NO LOOP FOR THE THE RANDOM FOREST TRAINING (", analysis.kind, " KIND OF ANALYSIS)"), output = log.file)
fun_export_data(path = path.out, data = final.gene.importance, output = log.file)
}else{
tempo1 <- aggregate(final.gene.importance$importance, list(final.gene.importance$features), median, na.rm = TRUE)
names(tempo1) <- c("GENE", "MEDIAN")
tempo2 <- aggregate(final.gene.importance$features, list(final.gene.importance$features), function(x){length(x[ ! is.na(x)])}) # nb
if(identical(tempo1$GENE, tempo2$Group.1)){
tempo1 <- data.frame(tempo1, NB = tempo2$x)
}else{
cat(paste0("\n\n============\n\nERROR: IN THE PLOT OF THE final.gene.importance OBJECT, tempo1 AND tempo2 SHOULD HAVE THE SAME GENE COLUMN: \n\n============\n\n"))
tempo1
tempo2
cat("\n\n============\n\n")
stop()
}
final.gene.importance.median <- cbind(tempo1,
CI95.inf = aggregate(final.gene.importance$importance, list(final.gene.importance$features), quantile, probs = 0.025, na.rm = TRUE)[, 2],
CI95.sup = aggregate(final.gene.importance$importance, list(final.gene.importance$features), quantile, probs = 0.975, na.rm = TRUE)[, 2],
stringsAsFactors = FALSE
)
fun_export_data(path = path.out, data = "IMPORTANCE: ", output = log.file)
fun_export_data(path = path.out, data = final.gene.importance.median, output = log.file)
# horiz barplot + CI
bar.width <- 0.5
log.scale <- FALSE
amplif <- label.size
tempo.gg.name <- "gg.indiv.plot."
tempo.gg.count <- 0
if(nrow(final.gene.importance.median) > 20){
tempo1 <- table(final.gene.importance.median$MEDIAN)
tempo2 <- tempo1[order(names(tempo1), decreasing = TRUE)]
tempo3 <- as.numeric(names(cumsum(tempo2)[cumsum(tempo2) <= 20]))
tempo.plot <- final.gene.importance.median[round(final.gene.importance.median$MEDIAN, 7) %in% round(tempo3, 7), ]
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = tempo.plot, mapping = ggplot2::aes(x = reorder(GENE, -MEDIAN), y = MEDIAN))) # reorder from higher to lower
tempo.txt <- paste0("BEWARE: ONLY THE ", nrow(tempo.plot), " MOST IMPORTANT GENES PLOTTED, AMONG ", nrow(final.gene.importance.median))
}else{
tempo.plot <- final.gene.importance.median
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = tempo.plot, mapping = ggplot2::aes(x = reorder(GENE, -MEDIAN), y = MEDIAN))) # reorder from higher to lower
tempo.txt <- ""
}
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_bar(stat = "identity", position = "dodge", color = "black", fill = grey(0.8), width = bar.width))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_errorbar(ggplot2::aes(ymin = CI95.inf, ymax = CI95.sup), position = ggplot2::position_dodge(width = bar.width), color = "black", width = 0))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle(paste0("GENE IMPORTANCE (NUMBERS ARE NUMBER OF TIMES THE GENE IS SEEN FOR ", slurm.loop.nb, " LOOPS)")))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::xlab(""))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ylab("MEDIAN +/- 95CI"))
# assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme_classic(base_size = amplif))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme(
# plot.margin = margin(up.space.mds, right.space.mds, down.space.mds, left.space.mds, "inches"),
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = amplif * 0.5), plot.title = ggplot2::element_text(hjust=1, vjust=1, size = amplif * 0.5),
axis.text.x = element_text(angle = 90, hjust = 1),
axis.text = ggplot2::element_text(size = amplif * 0.75), axis.text = ggplot2::element_text(size = amplif * 0.75),
axis.title = ggplot2::element_text(size = amplif), axis.title = ggplot2::element_text(size = amplif),
legend.text = ggplot2::element_text(size = amplif), legend.text = ggplot2::element_text(size = amplif),
...@@ -422,15 +540,14 @@ if(nrow(final.mod.gene.names) == 0 | nrow(na.omit(final.mod.gene.names)) == 0){ ...@@ -422,15 +540,14 @@ if(nrow(final.mod.gene.names) == 0 | nrow(na.omit(final.mod.gene.names)) == 0){
panel.grid.minor.x = ggplot2::element_blank(), panel.grid.minor.x = ggplot2::element_blank(),
panel.grid.minor.y = ggplot2::element_blank(), panel.grid.minor.y = ggplot2::element_blank(),
strip.background = ggplot2::element_rect(fill = "white", colour = "black") strip.background = ggplot2::element_rect(fill = "white", colour = "black")
))' ))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme_classic(base_size = amplif))
coord <- ggplot2::ggplot_build(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))$data # to have the summary statistics of the plot. is interesting: x = coord[[2]]$x, y = coord[[2]]$ymax_final coord <- ggplot2::ggplot_build(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))$data # to have the summary statistics of the plot. is interesting: x = coord[[2]]$x, y = coord[[2]]$ymax_final
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotate(geom = "text", x = coord[[1]]$x, y = coord[[1]]$ymax, label = round(coord[[1]]$y, 3), size = amplif/2, color = "black", vjust = "center", hjust = "right")) # beware: no need of order() for labels because coord[[1]]$x set the order assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotate(geom = "text", x = coord[[1]]$x, y = coord[[1]]$ymax, label = tempo.plot$NB, size = amplif/2, color = "black", vjust = "center", hjust = "right", angle = 90)) # beware: no need of order() for labels because coord[[1]]$x set the order
if(log.scale == TRUE){ if(log.scale == TRUE){
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotation_logticks(sides = "l")) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotation_logticks(sides = "l"))
}else{ }else{
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::scale_y_continuous(expand = c(0,0), breaks = scales::extended_breaks(10))) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::scale_y_continuous(expand = c(0,0), breaks = scales::extended_breaks(10)))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::coord_flip(ylim = c(0, max(final.mod.gene.names.freq$FREQ, na.rm = TRUE) * 1.02))) # fix limits assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::coord_cartesian(ylim = c(0, max(final.gene.importance.median$CI95.sup, na.rm = TRUE) * 1.02))) # fix limits
} }
if(activate.pdf == TRUE){ if(activate.pdf == TRUE){
tempo <- dev.set(pdf.nb) # assign to avoid the message tempo <- dev.set(pdf.nb) # assign to avoid the message
...@@ -439,13 +556,64 @@ if(nrow(final.mod.gene.names) == 0 | nrow(na.omit(final.mod.gene.names)) == 0){ ...@@ -439,13 +556,64 @@ if(nrow(final.mod.gene.names) == 0 | nrow(na.omit(final.mod.gene.names)) == 0){
} }
print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + ")))) print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))
} }
backup.name <- c(backup.name, "final.gene.importance.median")
#select.gene.curve
fun_export_data(path = path.out, data = "################ OPTIMAL NUMBER OF GENES DURING THE MODEL ELABORATION", output = log.file, sep = 4)
if(nrow(final.select.gene.curve) == 0 | nrow(na.omit(final.select.gene.curve)) == 0){
fun_export_data(path = path.out, data = paste0("NO OPTIMAL RESULTS FROM THE RANDOM FOREST TRAINING"), output = log.file)
fun_export_data(path = path.out, data = final.select.gene.curve, output = log.file)
}else if(analysis.kind == "valid_boot"){
fun_export_data(path = path.out, data = paste0("BEWARE: NO LOOP FOR THE THE RANDOM FOREST TRAINING (", analysis.kind, " KIND OF ANALYSIS)"), output = log.file)
fun_export_data(path = path.out, data = final.select.gene.curve, output = log.file)
}else{
tempo1 <- aggregate(final.select.gene.curve$y, list(final.select.gene.curve$x), median, na.rm = TRUE)
names(tempo1) <- c("X", "MEDIAN")
final.select.gene.curve.median <- cbind(tempo1,
CI95.inf = aggregate(final.select.gene.curve$y, list(final.select.gene.curve$x), quantile, probs = 0.025, na.rm = TRUE)[, 2],
CI95.sup = aggregate(final.select.gene.curve$y, list(final.select.gene.curve$x), quantile, probs = 0.975, na.rm = TRUE)[, 2],
stringsAsFactors = FALSE
)
fun_export_data(path = path.out, data = "OPTIMAL NUMBER OF GENES: ", output = log.file)
fun_export_data(path = path.out, data = final.select.gene.curve.median, output = log.file)
# horiz barplot + CI
bar.width <- 0.5
log.scale <- FALSE
amplif <- label.size
tempo.gg.name <- "gg.indiv.plot."
tempo.gg.count <- 0
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = final.select.gene.curve.median, mapping = ggplot2::aes(x = X, y = MEDIAN))) # reorder from higher to lower
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_point(color = "red", shape = 16, size = 2))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_line(color = "red"))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_errorbar(ggplot2::aes(ymin = CI95.inf, ymax = CI95.sup), position = ggplot2::position_dodge(), color = "black", width = 0))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle(paste0("OPTIMAL NUMBER OF GENES (n = ", slurm.loop.nb,")")))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::xlab("NUMBER OF GENES"))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ylab("MEDIAN +/- 95CI"))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme_classic(base_size = amplif))
if(log.scale == TRUE){
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotation_logticks(sides = "l"))
}else{
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::scale_y_continuous(expand = c(0,0), breaks = scales::extended_breaks(10)))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::coord_cartesian(ylim = c(0, max(final.select.gene.curve.median$CI95.sup, na.rm = TRUE) * 1.02))) # fix limits
}
if(activate.pdf == TRUE){
tempo <- dev.set(pdf.nb) # assign to avoid the message
}else{
windows(5, 5)
}
print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))
}
backup.name <- c(backup.name, "final.select.gene.curve.median")
#gene heatmap and corrplot
print(loop1_heatmap.plot) # same for all the loops
corrplot(loop1_corr.plot, tl.col = "black", tl.cex = label.size / 10) # same for all the loops
backup.name <- c(backup.name, "loop1_heatmap.plot", "loop1_corr.plot")
...@@ -457,40 +625,6 @@ if(nrow(final.mod.gene.names) == 0 | nrow(na.omit(final.mod.gene.names)) == 0){ ...@@ -457,40 +625,6 @@ if(nrow(final.mod.gene.names) == 0 | nrow(na.omit(final.mod.gene.names)) == 0){
tempo <- dev.set(pdf.nb) # assign to avoid the message
plt + geom_point(colour = "red") +
ggtitle("Evaluating optimal number of features") +
theme_bw() +
labs(x="Number of selected features", y="Mean misclassification error on the test sets") +
theme(
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size),
axis.text = ggplot2::element_text(size = label.size),
axis.title = ggplot2::element_text(size = label.size),
legend.text = ggplot2::element_text(size = label.size),
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)