Commit fd03eca6 authored by Gael  MILLOT's avatar Gael MILLOT
Browse files

interm

parent d8f8965e
#!/bin/bash # shebang (#! https://en.wikipedia.org/wiki/Shebang_%28Unix%29) indicating to the shell what program to interpret the script with, when executed, probably optional here.
# export allow the variable to be use in subprocesses. Without export, the variable is only available in the current process. Example ANNOVAR_CONF=/bioinfo/local/build/annovar_20130729 instead of export ANNOVAR_CONF=/bioinfo/local/build/annovar_20130729
# _conf: lowercases for alias and scripts, and uppercases for variables
......@@ -15,7 +14,7 @@ alias R_conf='module load gcc/4.7.4 R/3.5.0 ; Rscript'
export r_main_functions_conf=/pasteur/homes/gmillot/Git_versions_to_use/cute_little_R_functions-v4.5.0/cute_little_R_functions.R
# export r_main_functions_conf=https://gitlab.pasteur.fr/gmillot/cute_little_R_functions/raw/v4.5.0/cute_little_R_functions.R
export bash_main_functions_conf=/pasteur/homes/gmillot/Git_versions_to_use/little_bash_functions-v1.0.0/little_bash_functions-v1.0.0.sh
export bash_main_functions_conf=/pasteur/homes/gmillot/Git_versions_to_use/little_bash_functions-v1.1.0/little_bash_functions.sh
export r_main_conf=/pasteur/homes/gmillot/Git_versions_to_use/rogge_12231-v1.0.0/rogge_12231_main_analysis.R
export r_compil_conf=/pasteur/homes/gmillot/Git_versions_to_use/rogge_12231-v1.0.0/rogge_12231_data_compilation.R
......@@ -39,7 +38,7 @@ FILE_NAME1_CONF="supplementary_data_file_test.csv" # name of the data file to im
ML_BOOTSTRAP_NB_CONF=3
LOOP_NB_CONF=3
R_RANDOM_SEED="FALSE" #♥ if FALSE, set.seed(1) is systematically used at the beginning of the R script, otherwise, the seed is random (and saved in the RData output)
R_RANDOM_SEED="TRUE" #♥ if FALSE, set.seed(1) is systematically used at the beginning of the R script, otherwise, the seed is random (and saved in the RData output)
################ kind of analysis
......@@ -48,7 +47,7 @@ R_RANDOM_SEED="FALSE" #♥ if FALSE, set.seed(1) is systematically used at the b
# with discovery set 67 indiv (df.nano$cohort_id != "cohortR") and validation set 9 indiv (df.nano$cohort_id == "cohortR")
# "valid_boot" limma and rf training are run once but bootstrap of the validation set 9 indiv (df.nano$cohort_id == "cohortR") using LOOP_NB_CONF parameter
# "full_cross_validation" rows of the dataset are randomly split in two (no replacement), according to CROSS_VALID_RATIO, forming the discovery and validation set
R_ANALYSIS_KIND="longit"
R_ANALYSIS_KIND="full_cross_validation"
CROSS_VALID_RATIO=0.8 # proportion (nb indiv randomly selected (wo replacement) for the discovery set) / (total number of indiv)
# -> the validation set is formed by the remaining indiv, with proportion 1 - CROSS_VALID_RATIO
......
......@@ -57,11 +57,11 @@ erase.graphs <- TRUE
script <- "code ini v1.0.0"
project.name <-"rogge12231"
path.lib <- "C:/Users/Gael/Documents/R/win-library/3.5/" # absolute path of the library folder. Write "none" if not required
path.in <- "Z:/rogge12231/rogge_12231_1549993246/" # absolute path of the data folder
path.in <- "Z:/rogge12231/rogge_12231_1550076020/" # absolute path of the data folder
path.out <- "C:/Users/Gael/Desktop/" # absolute path of the output folder
path.function1 <- "C:/Users/Gael/Documents/Git_versions_to_use/cute_little_R_functions-v4.5.0/cute_little_R_functions.R" # Define the absolute pathway of the folder containing functions created by Gael Millot
project.name <- "rogge_project"
activate.pdf = TRUE
activate.pdf = FALSE
label.size <-12
optional.text <- ""
slurm.loop.nb <- 3
......@@ -104,12 +104,18 @@ for(i0 in 1:length(req.package.list)){
if(length(path.function1) != 1){
stop(paste0("\n\n============\n\nERROR: path.function1 PARAMETER MUST BE LENGTH 1: ", paste(path.function1, collapse = " "), "\n\n============\n\n"))
}else if(grepl(x = path.function1, pattern = "^http") & ( ! RCurl::url.exists(path.function1))){
}else if(grepl(x = path.function1, pattern = "^http")){
if( ! RCurl::url.exists(path.function1)){
stop(paste0("\n\n============\n\nERROR: HTTP INDICATED IN THE path.function1 PARAMETER DOES NOT EXISTS: ", path.function1, "\n\n============\n\n"))
}else if(( ! grepl(x = path.function1, pattern = "^http")) & ( ! file.exists(path.function1))){
}else{
source(path.function1) # source the fun_ functions used below
}
}else if( ! grepl(x = path.function1, pattern = "^http")){
if( ! file.exists(path.function1)){
stop(paste0("\n\n============\n\nERROR: FILE INDICATED IN THE path.function1 PARAMETER DOES NOT EXISTS: ", path.function1, "\n\n============\n\n"))
}else{
}else{
source(path.function1) # source the fun_ functions used below
}
}
################################ End Functions
......@@ -195,8 +201,6 @@ if(optional.text == "no.txt"){
par.ini <- fun_open_window(pdf.disp = activate.pdf, path.fun = path.out, pdf.name.file = paste0("loop", slurm.loop.nb, "_graphs"), width.fun = 7, height.fun = 7, paper = "special", no.pdf.overwrite = TRUE, return.output = TRUE)$ini.par
pdf.nb <- dev.cur()
################ Data import
for(i0 in 1:slurm.loop.nb){
......@@ -217,7 +221,6 @@ for(i0 in 1:slurm.loop.nb){
# > class(final.gene.list)
# [1] "data.frame"
################ Data compilation
backup.name <- NULL
......@@ -226,7 +229,7 @@ if(analysis.kind == "full_cross_validation"){
final.ttab <- data.frame(NULL, stringsAsFactors = FALSE)
for(i0 in 1:slurm.loop.nb){
if(nrow(get(paste0("loop", i0, "_ttab"))) > 0){
final.ttab <- rbind(final.ttab, get(paste0("loop", i0, "_ttab")))
final.ttab <- rbind(final.ttab, data.frame(get(paste0("loop", i0, "_ttab")), LOOP_NB = paste0("loop", i0), stringsAsFactors = FALSE))
}else{
final.ttab <- rbind(final.ttab, data.frame(logFC = NA, AveExpr = NA, t = NA, P.Value = NA, adj.P.Val = NA, B = NA, LOOP_NB = paste0("loop", i0), stringsAsFactors = FALSE))
}
......@@ -237,6 +240,22 @@ if(analysis.kind == "full_cross_validation"){
}
backup.name <- c(backup.name, "final.ttab")
# select.gene.curve
if(analysis.kind == "full_cross_validation"){
final.select.gene.curve <- data.frame(NULL, stringsAsFactors = FALSE)
for(i0 in 1:slurm.loop.nb){
if(nrow(get(paste0("loop", i0, "_select.gene.curve"))) > 0){
final.select.gene.curve <- rbind(final.select.gene.curve, data.frame(get(paste0("loop", i0, "_select.gene.curve")), LOOP_NB = paste0("loop", i0), stringsAsFactors = FALSE))
}else{
final.select.gene.curve <- rbind(final.select.gene.curve, data.frame(x = NA, y = NA, PANEL = NA, group = NA, shape = NA, colour = NA, size = NA, fill = NA, alpha = NA, stroke = NA, LOOP_NB = paste0("loop", i0), stringsAsFactors = FALSE))
}
}
}else{
final.select.gene.curve <- get(paste0("loop1_select.gene.curve"))
fun_export_data(path = path.out, data = paste0("final.select.gene.curve OBJECT CONTAINS ONLY LOOP1 RESULTS BECAUSE NO LOOP PERFORMED FOR THIS USING ", analysis.kind, " ANALYSIS"), output = log.file)
}
backup.name <- c(backup.name, "final.select.gene.curve")
# mod.gene.names
if(analysis.kind == "full_cross_validation"){
final.mod.gene.names <- data.frame(NULL, stringsAsFactors = FALSE)
......@@ -253,6 +272,22 @@ if(analysis.kind == "full_cross_validation"){
}
backup.name <- c(backup.name, "final.mod.gene.names")
# gene.importance
if(analysis.kind == "full_cross_validation"){
final.gene.importance <- data.frame(NULL, stringsAsFactors = FALSE)
for(i0 in 1:slurm.loop.nb){
if(nrow(get(paste0("loop", i0, "_gene.importance"))) > 0){
final.gene.importance <- rbind(final.gene.importance, data.frame(get(paste0("loop", i0, "_gene.importance")), LOOP_NB = paste0("loop", i0), stringsAsFactors = FALSE))
}else{
final.gene.importance <- rbind(final.gene.importance, data.frame(features = NA, importance = NA, LOOP_NB = paste0("loop", i0), stringsAsFactors = FALSE))
}
}
}else{
final.gene.importance <- get(paste0("loop1_gene.importance"))
fun_export_data(path = path.out, data = paste0("final.gene.importance OBJECT CONTAINS ONLY LOOP1 RESULTS BECAUSE NO LOOP PERFORMED FOR THIS USING ", analysis.kind, " ANALYSIS"), output = log.file)
}
backup.name <- c(backup.name, "final.gene.importance")
# loop1_confmat1.genes.rf, data.pred1.genes.logreg, data.roc1.genes.logreg
data.kind <- c("genes", "crp", "genes.crp") # beware, the order is important
ana.kind <- c("rf", "logreg", "rpart")
......@@ -327,34 +362,29 @@ if(nrow(final.ttab) == 0 | nrow(na.omit(final.ttab)) == 0){
fun_export_data(path = path.out, data = "FREQUENCIES AND P VALUES MEDIANS & CI: ", output = log.file)
fun_export_data(path = path.out, data = final.ttab.freq, output = log.file)
# barplot
# horiz barplot
bar.width <- 0.5
log.scale <- FALSE
amplif <- label.size
tempo.gg.name <- "gg.indiv.plot."
tempo.gg.count <- 0
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = final.ttab.freq, mapping = ggplot2::aes(x = reorder(GENE, -FREQ), y = FREQ))) # reorder from higher to lower
if(nrow(final.ttab.freq) > 20){
tempo1 <- table(final.ttab.freq$FREQ)
tempo2 <- tempo1[order(names(tempo1), decreasing = TRUE)]
tempo3 <- as.numeric(names(cumsum(tempo2)[cumsum(tempo2) <= 20]))
tempo.plot <- final.ttab.freq[round(final.ttab.freq$FREQ, 7) %in% round(tempo3, 7), ]
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = tempo.plot, mapping = ggplot2::aes(x = reorder(GENE, FREQ), y = FREQ))) # reorder from higher to lower
tempo.txt <- paste0("BEWARE: ONLY THE ", nrow(tempo.plot), " MOST FREQUENT GENES PLOTTED, AMONG ", nrow(final.ttab.freq))
}else{
tempo.plot <- final.ttab.freq
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = tempo.plot, mapping = ggplot2::aes(x = reorder(GENE, FREQ), y = FREQ))) # reorder from higher to lower
tempo.txt <- ""
}
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_bar(stat = "identity", position = "dodge", color = "black", fill = grey(0.8), width = bar.width))
# assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_errorbar(ggplot2::aes(ymin = PROP.CI95.inf, ymax = PROP.CI95.sup), position = ggplot2::position_dodge(width = bar.width), color = "black", width = 0))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle(paste0("LIMMA GENE LIST\n(PROP OF TIMES THE GENE IS SIGNIFICANT FOR ", slurm.loop.nb, " LOOPS)")))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle(paste0("LIMMA GENE LIST\n(PROP OF TIMES THE GENE IS SIGNIFICANT FOR ", slurm.loop.nb, " LOOPS)\n", tempo.txt)))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::xlab(""))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ylab("PROPORTION"))
caca <- 'assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme(
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = amplif * 0.5),
axis.text = ggplot2::element_text(size = amplif * 0.75),
axis.title = ggplot2::element_text(size = amplif),
legend.text = ggplot2::element_text(size = amplif),
legend.title = ggplot2::element_text(size = amplif),
strip.text = ggplot2::element_text(size = amplif),
legend.key = ggplot2::element_blank(),
panel.background = ggplot2::element_rect(fill = "grey95"),
panel.border = ggplot2::element_rect(colour = "black", fill = NA),
panel.grid.major.x = ggplot2::element_line(colour = "grey75"),
panel.grid.major.y = ggplot2::element_line(colour = "grey75"),
panel.grid.minor.x = ggplot2::element_blank(),
panel.grid.minor.y = ggplot2::element_blank(),
strip.background = ggplot2::element_rect(fill = "white", colour = "black")
))'
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme_classic(base_size = amplif))
coord <- ggplot2::ggplot_build(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))$data # to have the summary statistics of the plot. is interesting: x = coord[[2]]$x, y = coord[[2]]$ymax_final
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotate(geom = "text", x = coord[[1]]$x, y = coord[[1]]$ymax, label = round(coord[[1]]$y, 3), size = amplif/2, color = "black", vjust = "center", hjust = "right")) # beware: no need of order() for labels because coord[[1]]$x set the order
......@@ -371,6 +401,7 @@ if(nrow(final.ttab) == 0 | nrow(na.omit(final.ttab)) == 0){
}
print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))
}
backup.name <- c(backup.name, "final.ttab.freq")
################ 4 Machine Learning based analysis
......@@ -383,32 +414,119 @@ fun_export_data(path = path.out, data = "################################ RANDOM
fun_export_data(path = path.out, data = "################ LIST OF GENES SELECTED DURING THE MODEL ELABORATION", output = log.file, sep = 4)
if(nrow(final.mod.gene.names) == 0 | nrow(na.omit(final.mod.gene.names)) == 0){
fun_export_data(path = path.out, data = paste0("NO GENE LIST RESULTS FROM THE LIMMA ANALYSIS (P VALUES ABOVE 0.05 AFTER CORRECTION FOR INSTANCE)"), output = log.file)
fun_export_data(path = path.out, data = paste0("NO GENE LIST RESULTS FROM THE RANDOM FOREST TRAINING"), output = log.file)
fun_export_data(path = path.out, data = final.mod.gene.names, output = log.file)
}else if(analysis.kind == "valid_boot"){
fun_export_data(path = path.out, data = paste0("BEWARE: NO LOOP FOR THE LIMMA ANALYSIS (", analysis.kind, " KIND OF ANALYSIS)"), output = log.file)
fun_export_data(path = path.out, data = paste0("BEWARE: NO LOOP FOR THE THE RANDOM FOREST TRAINING (", analysis.kind, " KIND OF ANALYSIS)"), output = log.file)
fun_export_data(path = path.out, data = final.mod.gene.names, output = log.file)
}else{
tempo1 <- c(table(final.mod.gene.names$GENE))
final.mod.gene.names.freq <- data.frame(GENE = names(tempo1), NB = unname(tempo1), FREQ = unname(tempo1)/sum(tempo1))
rownames(final.mod.gene.names.freq) <- NULL
fun_export_data(path = path.out, data = "FREQUENCIE: ", output = log.file)
fun_export_data(path = path.out, data = "FREQUENCIES: ", output = log.file)
fun_export_data(path = path.out, data = final.mod.gene.names.freq, output = log.file)
# barplot
# horiz barplot
bar.width <- 0.5
log.scale <- FALSE
amplif <- label.size
tempo.gg.name <- "gg.indiv.plot."
tempo.gg.count <- 0
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = final.mod.gene.names.freq, mapping = ggplot2::aes(x = reorder(GENE, -FREQ), y = FREQ))) # reorder from higher to lower
if(nrow(final.mod.gene.names.freq) > 20){
tempo1 <- table(final.mod.gene.names.freq$FREQ)
tempo2 <- tempo1[order(names(tempo1), decreasing = TRUE)]
tempo3 <- as.numeric(names(cumsum(tempo2)[cumsum(tempo2) <= 20]))
tempo.plot <- final.mod.gene.names.freq[round(final.mod.gene.names.freq$FREQ, 7) %in% round(tempo3, 7), ]
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = tempo.plot, mapping = ggplot2::aes(x = reorder(GENE, FREQ), y = FREQ))) # reorder from higher to lower
tempo.txt <- paste0("BEWARE: ONLY THE ", nrow(tempo.plot), " MOST FREQUENT GENES PLOTTED, AMONG ", nrow(final.mod.gene.names.freq))
}else{
tempo.plot <- final.mod.gene.names.freq
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = tempo.plot, mapping = ggplot2::aes(x = reorder(GENE, FREQ), y = FREQ))) # reorder from higher to lower
tempo.txt <- ""
}
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_bar(stat = "identity", position = "dodge", color = "black", fill = grey(0.8), width = bar.width))
# assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_errorbar(ggplot2::aes(ymin = PROP.CI95.inf, ymax = PROP.CI95.sup), position = ggplot2::position_dodge(width = bar.width), color = "black", width = 0))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle(paste0("RANDOM FOREST MODELISATION GENE LIST\n(PROP OF TIMES THE GENES HAVE BEEN SELECTED FOR ", slurm.loop.nb, " LOOPS)")))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle(paste0("RANDOM FOREST MODELISATION GENE LIST\n(PROP OF TIMES THE GENES HAVE BEEN SELECTED FOR ", slurm.loop.nb, " LOOPS)\n", tempo.txt)))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::xlab(""))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ylab("PROPORTION"))
caca <- 'assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme(
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme_classic(base_size = amplif))
coord <- ggplot2::ggplot_build(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))$data # to have the summary statistics of the plot. is interesting: x = coord[[2]]$x, y = coord[[2]]$ymax_final
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotate(geom = "text", x = coord[[1]]$x, y = coord[[1]]$ymax, label = round(coord[[1]]$y, 3), size = amplif/2, color = "black", vjust = "center", hjust = "right")) # beware: no need of order() for labels because coord[[1]]$x set the order
if(log.scale == TRUE){
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotation_logticks(sides = "l"))
}else{
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::scale_y_continuous(expand = c(0,0), breaks = scales::extended_breaks(10)))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::coord_flip(ylim = c(0, max(final.mod.gene.names.freq$FREQ, na.rm = TRUE) * 1.02))) # fix limits
}
if(activate.pdf == TRUE){
tempo <- dev.set(pdf.nb) # assign to avoid the message
}else{
windows(5, 5)
}
print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))
}
backup.name <- c(backup.name, "final.mod.gene.names.freq")
#gene.importance
fun_export_data(path = path.out, data = "################ GENE IMPORTANCE DURING THE MODEL ELABORATION", output = log.file, sep = 4)
if(nrow(final.gene.importance) == 0 | nrow(na.omit(final.gene.importance)) == 0){
fun_export_data(path = path.out, data = paste0("NO GENE IMPORTANCE RESULTS FROM THE RANDOM FOREST TRAINING"), output = log.file)
fun_export_data(path = path.out, data = final.gene.importance, output = log.file)
}else if(analysis.kind == "valid_boot"){
fun_export_data(path = path.out, data = paste0("BEWARE: NO LOOP FOR THE THE RANDOM FOREST TRAINING (", analysis.kind, " KIND OF ANALYSIS)"), output = log.file)
fun_export_data(path = path.out, data = final.gene.importance, output = log.file)
}else{
tempo1 <- aggregate(final.gene.importance$importance, list(final.gene.importance$features), median, na.rm = TRUE)
names(tempo1) <- c("GENE", "MEDIAN")
tempo2 <- aggregate(final.gene.importance$features, list(final.gene.importance$features), function(x){length(x[ ! is.na(x)])}) # nb
if(identical(tempo1$GENE, tempo2$Group.1)){
tempo1 <- data.frame(tempo1, NB = tempo2$x)
}else{
cat(paste0("\n\n============\n\nERROR: IN THE PLOT OF THE final.gene.importance OBJECT, tempo1 AND tempo2 SHOULD HAVE THE SAME GENE COLUMN: \n\n============\n\n"))
tempo1
tempo2
cat("\n\n============\n\n")
stop()
}
final.gene.importance.median <- cbind(tempo1,
CI95.inf = aggregate(final.gene.importance$importance, list(final.gene.importance$features), quantile, probs = 0.025, na.rm = TRUE)[, 2],
CI95.sup = aggregate(final.gene.importance$importance, list(final.gene.importance$features), quantile, probs = 0.975, na.rm = TRUE)[, 2],
stringsAsFactors = FALSE
)
fun_export_data(path = path.out, data = "IMPORTANCE: ", output = log.file)
fun_export_data(path = path.out, data = final.gene.importance.median, output = log.file)
# horiz barplot + CI
bar.width <- 0.5
log.scale <- FALSE
amplif <- label.size
tempo.gg.name <- "gg.indiv.plot."
tempo.gg.count <- 0
if(nrow(final.gene.importance.median) > 20){
tempo1 <- table(final.gene.importance.median$MEDIAN)
tempo2 <- tempo1[order(names(tempo1), decreasing = TRUE)]
tempo3 <- as.numeric(names(cumsum(tempo2)[cumsum(tempo2) <= 20]))
tempo.plot <- final.gene.importance.median[round(final.gene.importance.median$MEDIAN, 7) %in% round(tempo3, 7), ]
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = tempo.plot, mapping = ggplot2::aes(x = reorder(GENE, -MEDIAN), y = MEDIAN))) # reorder from higher to lower
tempo.txt <- paste0("BEWARE: ONLY THE ", nrow(tempo.plot), " MOST IMPORTANT GENES PLOTTED, AMONG ", nrow(final.gene.importance.median))
}else{
tempo.plot <- final.gene.importance.median
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = tempo.plot, mapping = ggplot2::aes(x = reorder(GENE, -MEDIAN), y = MEDIAN))) # reorder from higher to lower
tempo.txt <- ""
}
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_bar(stat = "identity", position = "dodge", color = "black", fill = grey(0.8), width = bar.width))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_errorbar(ggplot2::aes(ymin = CI95.inf, ymax = CI95.sup), position = ggplot2::position_dodge(width = bar.width), color = "black", width = 0))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle(paste0("GENE IMPORTANCE (NUMBERS ARE NUMBER OF TIMES THE GENE IS SEEN FOR ", slurm.loop.nb, " LOOPS)")))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::xlab(""))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ylab("MEDIAN +/- 95CI"))
# assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme_classic(base_size = amplif))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme(
# plot.margin = margin(up.space.mds, right.space.mds, down.space.mds, left.space.mds, "inches"),
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = amplif * 0.5),
axis.text.x = element_text(angle = 90, hjust = 1),
axis.text = ggplot2::element_text(size = amplif * 0.75),
axis.title = ggplot2::element_text(size = amplif),
legend.text = ggplot2::element_text(size = amplif),
......@@ -422,15 +540,14 @@ if(nrow(final.mod.gene.names) == 0 | nrow(na.omit(final.mod.gene.names)) == 0){
panel.grid.minor.x = ggplot2::element_blank(),
panel.grid.minor.y = ggplot2::element_blank(),
strip.background = ggplot2::element_rect(fill = "white", colour = "black")
))'
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme_classic(base_size = amplif))
))
coord <- ggplot2::ggplot_build(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))$data # to have the summary statistics of the plot. is interesting: x = coord[[2]]$x, y = coord[[2]]$ymax_final
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotate(geom = "text", x = coord[[1]]$x, y = coord[[1]]$ymax, label = round(coord[[1]]$y, 3), size = amplif/2, color = "black", vjust = "center", hjust = "right")) # beware: no need of order() for labels because coord[[1]]$x set the order
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotate(geom = "text", x = coord[[1]]$x, y = coord[[1]]$ymax, label = tempo.plot$NB, size = amplif/2, color = "black", vjust = "center", hjust = "right", angle = 90)) # beware: no need of order() for labels because coord[[1]]$x set the order
if(log.scale == TRUE){
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotation_logticks(sides = "l"))
}else{
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::scale_y_continuous(expand = c(0,0), breaks = scales::extended_breaks(10)))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::coord_flip(ylim = c(0, max(final.mod.gene.names.freq$FREQ, na.rm = TRUE) * 1.02))) # fix limits
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::coord_cartesian(ylim = c(0, max(final.gene.importance.median$CI95.sup, na.rm = TRUE) * 1.02))) # fix limits
}
if(activate.pdf == TRUE){
tempo <- dev.set(pdf.nb) # assign to avoid the message
......@@ -439,13 +556,64 @@ if(nrow(final.mod.gene.names) == 0 | nrow(na.omit(final.mod.gene.names)) == 0){
}
print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))
}
backup.name <- c(backup.name, "final.gene.importance.median")
#select.gene.curve
fun_export_data(path = path.out, data = "################ OPTIMAL NUMBER OF GENES DURING THE MODEL ELABORATION", output = log.file, sep = 4)
if(nrow(final.select.gene.curve) == 0 | nrow(na.omit(final.select.gene.curve)) == 0){
fun_export_data(path = path.out, data = paste0("NO OPTIMAL RESULTS FROM THE RANDOM FOREST TRAINING"), output = log.file)
fun_export_data(path = path.out, data = final.select.gene.curve, output = log.file)
}else if(analysis.kind == "valid_boot"){
fun_export_data(path = path.out, data = paste0("BEWARE: NO LOOP FOR THE THE RANDOM FOREST TRAINING (", analysis.kind, " KIND OF ANALYSIS)"), output = log.file)
fun_export_data(path = path.out, data = final.select.gene.curve, output = log.file)
}else{
tempo1 <- aggregate(final.select.gene.curve$y, list(final.select.gene.curve$x), median, na.rm = TRUE)
names(tempo1) <- c("X", "MEDIAN")
final.select.gene.curve.median <- cbind(tempo1,
CI95.inf = aggregate(final.select.gene.curve$y, list(final.select.gene.curve$x), quantile, probs = 0.025, na.rm = TRUE)[, 2],
CI95.sup = aggregate(final.select.gene.curve$y, list(final.select.gene.curve$x), quantile, probs = 0.975, na.rm = TRUE)[, 2],
stringsAsFactors = FALSE
)
fun_export_data(path = path.out, data = "OPTIMAL NUMBER OF GENES: ", output = log.file)
fun_export_data(path = path.out, data = final.select.gene.curve.median, output = log.file)
# horiz barplot + CI
bar.width <- 0.5
log.scale <- FALSE
amplif <- label.size
tempo.gg.name <- "gg.indiv.plot."
tempo.gg.count <- 0
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = final.select.gene.curve.median, mapping = ggplot2::aes(x = X, y = MEDIAN))) # reorder from higher to lower
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_point(color = "red", shape = 16, size = 2))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_line(color = "red"))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_errorbar(ggplot2::aes(ymin = CI95.inf, ymax = CI95.sup), position = ggplot2::position_dodge(), color = "black", width = 0))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle(paste0("OPTIMAL NUMBER OF GENES (n = ", slurm.loop.nb,")")))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::xlab("NUMBER OF GENES"))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ylab("MEDIAN +/- 95CI"))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme_classic(base_size = amplif))
if(log.scale == TRUE){
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotation_logticks(sides = "l"))
}else{
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::scale_y_continuous(expand = c(0,0), breaks = scales::extended_breaks(10)))
assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::coord_cartesian(ylim = c(0, max(final.select.gene.curve.median$CI95.sup, na.rm = TRUE) * 1.02))) # fix limits
}
if(activate.pdf == TRUE){
tempo <- dev.set(pdf.nb) # assign to avoid the message
}else{
windows(5, 5)
}
print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))
}
backup.name <- c(backup.name, "final.select.gene.curve.median")
#gene heatmap and corrplot
print(loop1_heatmap.plot) # same for all the loops
corrplot(loop1_corr.plot, tl.col = "black", tl.cex = label.size / 10) # same for all the loops
backup.name <- c(backup.name, "loop1_heatmap.plot", "loop1_corr.plot")
......@@ -457,40 +625,6 @@ if(nrow(final.mod.gene.names) == 0 | nrow(na.omit(final.mod.gene.names)) == 0){
tempo <- dev.set(pdf.nb) # assign to avoid the message
plt + geom_point(colour = "red") +
ggtitle("Evaluating optimal number of features") +
theme_bw() +
labs(x="Number of selected features", y="Mean misclassification error on the test sets") +
theme(
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size),
axis.text = ggplot2::element_text(size = label.size),
axis.title = ggplot2::element_text(size = label.size),
legend.text = ggplot2::element_text(size = label.size),
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
tempo <- dev.set(pdf.nb) # assign to avoid the message
ggplot2::ggplot(data = df_imp, aes(x=features, y=importance)) + geom_bar(stat = "identity") + theme_bw() +
theme(
axis.text.x = element_text(angle=90, vjust=0.5, hjust=1),
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size),
axis.text = ggplot2::element_text(size = label.size),
axis.title = ggplot2::element_text(size = label.size),
legend.text = ggplot2::element_text(size = label.size),
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
pheatmap(t(scale(subdat)), annotation_col = annot.rows, cluster_cols = FALSE, show_colnames = FALSE, border_color = NA, color = colorRampPalette(c("red", "black", "green"))(499), annotation_colors = ann_colors, fontsize_row = label.size, fontsize_col = label.size)
corrplot(cor(subdat), tl.col = "black", tl.cex = label.size / 10)
......
</
......@@ -94,7 +94,7 @@ analysis.kind <- "valid_boot"
cross.valid.ratio <- 0.8
random.seed <- TRUE
'
# eval(parse(text = debug2)) ; cat(paste0("\n\n================\n\nERROR: ACTIVE DEBUG VALUES\n\n================\n\n")) ; stop()
eval(parse(text = debug2)) ; cat(paste0("\n\n================\n\nERROR: ACTIVE DEBUG VALUES\n\n================\n\n")) ; stop()
# data.frame(PARAM = tempo.arg.names, ARG = args) # for debug mode
......@@ -120,7 +120,8 @@ req.package.list <- c(
"ggplot2",
"pander",
"gridExtra",
"lubridate"
"lubridate",
"RCurl"
)
if(path.lib == "none"){
path.lib <- .libPaths() # .libPaths(new = path.lib) # or .libPaths(new = c(.libPaths(), path.lib))
......@@ -143,12 +144,18 @@ for(i0 in 1:length(req.package.list)){
if(length(path.function1) != 1){
stop(paste0("\n\n============\n\nERROR: path.function1 PARAMETER MUST BE LENGTH 1: ", paste(path.function1, collapse = " "), "\n\n============\n\n"))
}else if(grepl(x = path.function1, pattern = "^http") & ( ! RCurl::url.exists(path.function1))){
}else if(grepl(x = path.function1, pattern = "^http")){
if( ! RCurl::url.exists(path.function1)){
stop(paste0("\n\n============\n\nERROR: HTTP INDICATED IN THE path.function1 PARAMETER DOES NOT EXISTS: ", path.function1, "\n\n============\n\n"))
}else if(( ! grepl(x = path.function1, pattern = "^http")) & ( ! file.exists(path.function1))){
}else{
source(path.function1) # source the fun_ functions used below
}
}else if( ! grepl(x = path.function1, pattern = "^http")){
if( ! file.exists(path.function1)){
stop(paste0("\n\n============\n\nERROR: FILE INDICATED IN THE path.function1 PARAMETER DOES NOT EXISTS: ", path.function1, "\n\n============\n\n"))
}else{
}else{
source(path.function1) # source the fun_ functions used below
}
}
################################ End Functions
......@@ -422,8 +429,7 @@ if(any((analysis.kind == "longit" & slurm.loop.nb == 1) | (analysis.kind == "val
reff <- generateHyperParsEffectData(rtune, trafo = FALSE, include.diagnostics = FALSE)
plt <- plotHyperParsEffect(reff, x = "fw.abs", y = "mmce.test.mean")
# add our own touches to the plot
tempo <- dev.set(pdf.nb) # assign to avoid the message
plt + geom_point(colour = "red") +
plt.plot <- plt + geom_point(colour = "red") +
ggtitle("Evaluating optimal number of features") +
theme_bw() +
labs(x="Number of selected features", y="Mean misclassification error on the test sets") +
......@@ -435,7 +441,10 @@ if(any((analysis.kind == "longit" & slurm.loop.nb == 1) | (analysis.kind == "val
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
tempo <- dev.set(pdf.nb) # assign to avoid the message
print(plt.plot)
select.gene.curve <- ggplot2::ggplot_build(plt.plot)$data[[2]]
backup.name <- c(backup.name, "select.gene.curve")
# ```
#
# At the end of the learning procedure, there is one optimal value for the number of genes to include in the model. We use the same strategy as in the learning procedure to select this number of genes and train the random forest algorithm on them. The name of the features is given below.
......@@ -480,8 +489,7 @@ if(any((analysis.kind == "longit" & slurm.loop.nb == 1) | (analysis.kind == "val
feature_importance <- getFeatureImportance(mod$learner.model$next.model)
df_imp <- data.frame(features = names(feature_importance$res),
importance = t(feature_importance$res), stringsAsFactors = FALSE)
tempo <- dev.set(pdf.nb) # assign to avoid the message
ggplot2::ggplot(data = df_imp, aes(x=features, y=importance)) + geom_bar(stat = "identity") + theme_bw() +
importance.plot <- ggplot2::ggplot(data = df_imp, aes(x=features, y=importance)) + geom_bar(stat = "identity") + theme_bw() +
theme(
axis.text.x = element_text(angle=90, vjust=0.5, hjust=1),
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size),
......@@ -491,6 +499,10 @@ if(any((analysis.kind == "longit" & slurm.loop.nb == 1) | (analysis.kind == "val
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
tempo <- dev.set(pdf.nb) # assign to avoid the message
print(importance.plot)
gene.importance <- df_imp
backup.name <- c(backup.name, "gene.importance")
# ```
#
#
......@@ -503,13 +515,16 @@ if(any((analysis.kind == "longit" & slurm.loop.nb == 1) | (analysis.kind == "val
colnames(annot.rows) <- "ASDAS R/NR"
subdat <- df.tmp[, sort(mod.gene.names)]
ann_colors = list("ASDAS R/NR" = c(R = "steelblue", NR = "tomato"))
pheatmap(t(scale(subdat)), annotation_col = annot.rows, cluster_cols = FALSE, show_colnames = FALSE, border_color = NA, color = colorRampPalette(c("red", "black", "green"))(499), annotation_colors = ann_colors, fontsize_row = label.size, fontsize_col = label.size)
heatmap.plot <- pheatmap(t(scale(subdat)), silent = TRUE, annotation_col = annot.rows, cluster_cols = FALSE, show_colnames = FALSE, border_color = NA, color = colorRampPalette(c("red", "black", "green"))(499), annotation_colors = ann_colors, fontsize_row = label.size, fontsize_col = label.size)
print(heatmap.plot)
backup.name <- c(backup.name, "heatmap.plot")
# ```
#