diff --git a/README.md b/README.md index 586531ffe43971175d208c0a7b4f5f45f1688eee..92d1dadee2835aae1b4f65d869222c91a93a80ff 100644 --- a/README.md +++ b/README.md @@ -34,11 +34,12 @@ rogge_12231.conf config file. Parameters need to be set by the user inside this rogge_12231_workflow.sh file allowing checkings and SLURM job submission rogge_12231_main_analysis.R R script run by SLURM rogge_12231_data_compilation.R R script run by SLURM that compil the data from the previous loops - The four files must be in the same directory before running +rogge_12231_manual_graph_adjustment.R optional R script to run in R, after SLURM job, to improve the representation of the graphs. Open the script and set the parameter inside before running + -#### OUTPUT DESCRIPTION +#### TARS OUTPUT DESCRIPTION ## For each job submitted to SLURM: workflow_log messages printed by the rogge_12231_workflow.sh script @@ -59,6 +60,11 @@ Check for updated versions (most recent tags) at https://gitlab.pasteur.fr/gmill #### WHAT'S NEW IN +## v2.0.0 + +rogge_12231_manual_graph_adjustment.R file added + + ## v1.0.0 Everything diff --git a/rogge_12231_data_compilation.R b/rogge_12231_data_compilation.R index 5505eb402e5274d8afdae91459153f470815bb48..087fc1ce7ff048b55b0c30d13bae9b70892de8c1 100644 --- a/rogge_12231_data_compilation.R +++ b/rogge_12231_data_compilation.R @@ -638,7 +638,7 @@ if(nrow(final.mod.gene.names) == 0 | nrow(na.omit(final.mod.gene.names)) == 0){ } print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + ")))) fun_export_data(path = path.out, data = paste0("SEE THE final.mod.gene.names AND THE final.mod.gene.names.freq OBJECTS IN THE compiled_data.RData FILE PRESENT IN: ", path.out), output = log.file) - backup.name <- c(backup.name, "final.mod.gene.names.freq") + backup.name <- c(backup.name, "final.mod.gene.names.freq", "optimal.gene.nb", "selected.gene.names") } @@ -792,10 +792,9 @@ if(analysis.kind == "valid_boot"){ tempo <- dev.set(pdf.nb) # assign to avoid the message corrplot::corrplot(cor(nano.rf.selected.genes), tl.col = "black", tl.cex = label.size / 10) fun_export_data(path = path.out, data = paste0("SEE THE nano.rf.selected.genes OBJECTS IN THE compiled_data.RData FILE PRESENT IN: ", path.out), output = log.file) - backup.name <- c(backup.name, "nano.rf.selected.genes") } } - +backup.name <- c(backup.name, "nano.rf.selected.genes", "dat") ######## 4.2 Validate the model @@ -837,6 +836,7 @@ for(i0 in 1:length(data.kind)){ for(i2 in 1:slurm.loop.nb){ # roc curves if(nrow(get(paste0("loop", i2, "_data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1]))) > 0){ + backup.name <- c(backup.name, paste0("loop", i2, "_data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1])) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_line(data = get(paste0("loop", i2, "_data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1]))[order(get(paste0("loop", i2, "_data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1]))$x, get(paste0("loop", i2, "_data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1]))$y, decreasing = FALSE), ], mapping = ggplot2::aes(x = x, y = y), color = "grey", alpha = 0.5)) } } @@ -891,6 +891,7 @@ boxdat <- data.frame(ASDAS = df.nano$response_ASDAS_R_NR, ) boxdat_melt <- reshape2::melt(boxdat, id.vars = c("ASDAS", "Cohort.name", "RF.COHORTE"), variable.name = "Gene") boxdat_melt$Gene <- factor(boxdat_melt$Gene, levels = unique(boxdat_melt$Gene)) +backup.name <- c(backup.name, "boxdat_melt") # boxplot bar.width <- 0.5 diff --git a/rogge_12231_manual_graph_adjustment.R b/rogge_12231_manual_graph_adjustment.R index 74545c060c4fc0d9d0a30adbe0d29cf5ba293b78..3179ff01ce0c6d20c75dc5cce7722095200f27a1 100644 --- a/rogge_12231_manual_graph_adjustment.R +++ b/rogge_12231_manual_graph_adjustment.R @@ -1,42 +1,139 @@ +################################################################ +## ## +## Rogge_12231 Manual Graph Adjustment v1.0.0 ## +## ## +## ## +## ## +################################################################ +################################ Aim +# Manual adjustment of the rogge_12231_data_compilation.R output for valid_boot and full_cross_validation analyses +# With this code, it is possible to redraw the graph output from a cluster job +################################ End Aim + + +################################ Introduction + + +# Compatible with R v3.5.1 +# Packages requirement: see below + + +################################ End Introduction + + +################################ Credits + + +# Gael A Millot | Hub-C3BI, Institut Pasteur, Paris, France + + +################################ End Credits + ################################ Initialization -erase.objects <- TRUE # write TRUE to erase all the existing objects in R before starting the algorithm and FALSE otherwise. Beginners should use TRUE + +# R version checking +if(version$version.string != "R version 3.5.1 (2018-07-02)"){ + cat("\n\nBEWARE: THE ", version$version.string, " IS NOT THE 3.5.1 RECOMMANDED\n\n") +} +# other initializations +erase.objects = TRUE # write TRUE to erase all the existing objects in R before starting the algorithm and FALSE otherwise. Beginners should use TRUE if(erase.objects == TRUE){ rm(list=ls()) erase.objects = TRUE } -erase.graphs <- TRUE # write TRUE to erase all the graphic windows in R before starting the algorithm and FALSE otherwise +erase.graphs = TRUE # write TRUE to erase all the graphic windows in R before starting the algorithm and FALSE otherwise + ################################ End Initialization +################################ Parameters that need to be set by the user -################################ Recording of the initial parameters +################ File names and locations -################################ End Recording of the initial parameters +data.file <- "Z:\\rogge12231\\first attempt\\rogge_12231_1550598741\\final_res\\compiled_data.RData" # RData file and absolute path to source +path.function1 <- "https://gitlab.pasteur.fr/gmillot/cute_little_R_functions/raw/v4.5.0/cute_little_R_functions.R" # /pasteur/homes/gmillot/Git_versions_to_use/cute_little_R_functions-v4.5.0/cute_little_R_functions.R +path.out <- "C:/Users/Gael/Desktop/" + +################ Parameters -################################ DEBUG +project.name <- "rogge_12231" +analysis.kind <- "valid_boot" # kind of analysis. See .conf file +slurm.loop.nb <- 100 # indicate the number of loops used +optional.text <- "" # write here an optional text to include in results and graphs +################ Graphical and display parameters -activate.pdf <- TRUE ; cat(paste0("\n\n================\n\nERROR: ACTIVE GRAPH FILE PARAMETERS\n\n================\n\n")) # graph file parameter -cut.off.freq.for.selected.genes <- 0.6 ; cat(paste0("\n\n================\n\nERROR: ACTIVE GRAPH FILE PARAMETERS\n\n================\n\n"))# graph file parameter +activate.pdf <- TRUE # write TRUE for pdf display and FALSE for R display +window.width <- 7 +window.height <- 7 +amplif <- 14 # increase or decrease the value to increase or decrease the size of the axis numbers and legend +bar.width <- 0.5 # width of bars +log.scale <- FALSE # log scale ? +# boxplots +facet.nrow <- 4 # nomber of rows in the boxplot facet +facet.ncol <- 4 # nomber of columns in the boxplot facet -################################ End DEBUG +################################ End Parameters that need to be set by the user + + +################################ Recording of the initial parameters + + +param.list <- c( + "erase.objects", + "erase.graphs", + "data.file", + "path.function1", + "path.out", + "project.name", + "analysis.kind", + "slurm.loop.nb", + "optional.text", + "activate.pdf", + "window.width", + "window.height", + "amplif", + "bar.width", + "log.scale", + "facet.nrow", + "facet.ncol" +) +if(any(duplicated(param.list))){ + stop(paste0("\n\n================\n\nERROR: THE param.list OBJECT CONTAINS DUPLICATED ELEMENTS\n\n================\n\n")) # message for developers +} +if(erase.objects == TRUE){ + created.object.control <- ls()[ ! ls() %in% "param.list"] + if( ! (all(created.object.control %in% param.list) & all(param.list %in% created.object.control))){ + stop(paste0("\n\n================\n\nERROR: INCONSISTENCIES BETWEEN THE param.list ELEMENTS AND THE CREATED OBJECTS\nTHE CREATED OBJECTS (created.object.control) NOT PRESENT IN THE param.list ARE: ", paste(created.object.control[ ! created.object.control %in% param.list], collapse = " "), "\nTHE param.list ELEMENTS NOT PRESENT IN THE CREATED OBJECTS (created.object.control) ARE: ", paste(param.list[ ! param.list %in% created.object.control], collapse = " "), "\n\n================\n\n")) # message for developers + } +} +char.length <- nchar(param.list) +space.add <- max(char.length) - char.length + 5 +param.ini.settings <- NULL +for(i in 1:length(param.list)){ + param.ini.settings <- c(param.ini.settings, paste0("\n", param.list[i], paste0(rep(" ", space.add[i]), collapse = ""), paste0(get(param.list[i]), collapse = ","))) +} + + +################################ End Recording of the initial parameters + ################################ Packages verification and import @@ -50,24 +147,20 @@ req.package.list <- c( "lubridate", "RCurl" # for url.exists ) -if(path.lib == "none"){ - path.lib <- .libPaths() # -}else{ - # .libPaths(new = ) add path to default path - .libPaths(new = sub(x = path.lib, pattern = "/$|\\\\$", replacement = "")) # .libPaths() does not support / at the end of a submitted path. Thus check and replace last / or \\ in path -} for(i0 in 1:length(req.package.list)){ - if( ! req.package.list[i0] %in% rownames(installed.packages(lib.loc = path.lib))){ - stop(paste0("\n\n================\n\nERROR: PACKAGE ", req.package.list[i0], " MUST BE INSTALLED IN THE MENTIONNED DIRECTORY:\n", paste(path.lib, collapse = "\n"), "\n CHECK ALSO IN :\n", paste(.libPaths(), collapse = "\n"), "\n\n================\n\n")) + if( ! req.package.list[i0] %in% rownames(installed.packages())){ + stop(paste0("\n\n================\n\nERROR: PACKAGE ", req.package.list[i0], " MUST BE INSTALLED IN THE MENTIONNED DIRECTORY:\n", paste(.libPaths(), collapse = "\n"), "\n\n================\n\n")) }else{ - suppressPackageStartupMessages(library(req.package.list[i0], lib.loc = path.lib, quietly = TRUE, character.only = TRUE)) + suppressPackageStartupMessages(library(req.package.list[i0], quietly = TRUE, character.only = TRUE)) } } ################################ End Packages verification and import + ################################ Functions + if(length(path.function1) != 1){ stop(paste0("\n\n============\n\nERROR: path.function1 PARAMETER MUST BE LENGTH 1: ", paste(path.function1, collapse = " "), "\n\n============\n\n")) }else if(grepl(x = path.function1, pattern = "^http")){ @@ -86,8 +179,10 @@ if(length(path.function1) != 1){ ################################ End Functions + ################################ Main code + ################ Pre-ignition checking arg.check <- NULL # for function debbuging @@ -97,25 +192,12 @@ ee <- expression(arg.check <- c(arg.check, tempo$problem) , checked.arg.names <- # initializations tempo <- fun_param_check(data = erase.objects, class = "logical", length = 1) ; eval(ee) tempo <- fun_param_check(data = erase.graphs, class = "logical", length = 1) ; eval(ee) -tempo <- fun_param_check(data = script, class = "character", length = 1) ; eval(ee) -tempo <- fun_param_check(data = args, class = "character", length = length(tempo.arg.names)) ; eval(ee) -tempo <- fun_param_check(data = tempo.arg.names, class = "character", length = length(tempo.arg.names)) ; eval(ee) # imported objects -tempo <- fun_param_check(data = path.lib, class = "character", length = 1) ; eval(ee) -if(tempo$problem == FALSE & path.lib != "none"){ - if( ! dir.exists(path.lib)){ - cat(paste0("\n\n============\n\nERROR: DIRECTORY PATH INDICATED IN THE path.in PARAMETER DOES NOT EXISTS: ", path.in, "\n\n============\n\n")) - arg.check <- TRUE - } -} -tempo <- fun_param_check(data = path.in, class = "character", length = 1) ; eval(ee) -if(tempo$problem == FALSE & ! dir.exists(path.in)){ - cat(paste0("\n\n============\n\nERROR: DIRECTORY PATH INDICATED IN THE path.in PARAMETER DOES NOT EXISTS: ", path.in, "\n\n============\n\n")) - arg.check <- TRUE -} -if(tempo$problem == FALSE & ! ("loop1_res_data.RData" %in% list.files(paste0(path.in, "loop1/")))){ - cat(paste0("\n\n============\n\nERROR: loop1_res_data.RData SHOULD BE PRESENT IN ", paste0(path.in, "loop1/"), ":\nCHECK THE path.in PARAMETER\n\n============\n\n")) + +tempo <- fun_param_check(data = data.file, class = "character", length = 1) ; eval(ee) +if(tempo$problem == FALSE & ! file.exists(data.file)){ + cat(paste0("\n\n============\n\nERROR: FILE NAME AND PATH INDICATED IN THE data.file PARAMETER DOES NOT EXISTS: ", data.file, "\n\n============\n\n")) arg.check <- TRUE } tempo <- fun_param_check(data = path.out, class = "character", length = 1) ; eval(ee) @@ -126,10 +208,17 @@ if(tempo$problem == FALSE & ! dir.exists(path.out)){ # path.function1 fully tested above tempo <- fun_param_check(data = path.function1, class = "character", length = 1) ; eval(ee) tempo <- fun_param_check(data = project.name, class = "character", length = 1) ; eval(ee) -tempo <- fun_param_check(data = label.size, typeof = "integer", length = 1, double.as.integer.allowed = TRUE, neg.values = FALSE) ; eval(ee) -tempo <- fun_param_check(data = optional.text, class = "character", length = 1) ; eval(ee) -tempo <- fun_param_check(data = slurm.loop.nb, typeof = "integer", length = 1, double.as.integer.allowed = TRUE, neg.values = FALSE) ; eval(ee) tempo <- fun_param_check(data = analysis.kind, options = c("longit", "valid_boot", "full_cross_validation"), length = 1) ; eval(ee) +tempo <- fun_param_check(data = slurm.loop.nb, typeof = "integer", length = 1, double.as.integer.allowed = TRUE, neg.values = FALSE) ; eval(ee) +tempo <- fun_param_check(data = optional.text, class = "character", length = 1) ; eval(ee) +tempo <- fun_param_check(data = activate.pdf, class = "logical", length = 1) ; eval(ee) +tempo <- fun_param_check(data = window.width, class = "numeric", length = 1) ; eval(ee) +tempo <- fun_param_check(data = window.height, class = "numeric", length = 1) ; eval(ee) +tempo <- fun_param_check(data = amplif, typeof = "integer", length = 1, double.as.integer.allowed = TRUE, neg.values = FALSE) ; eval(ee) +tempo <- fun_param_check(data = bar.width, class = "numeric", length = 1) ; eval(ee) +tempo <- fun_param_check(data = log.scale, class = "logical", length = 1) ; eval(ee) +tempo <- fun_param_check(data = facet.nrow, typeof = "integer", length = 1, double.as.integer.allowed = TRUE, neg.values = FALSE) ; eval(ee) +tempo <- fun_param_check(data = facet.ncol, typeof = "integer", length = 1, double.as.integer.allowed = TRUE, neg.values = FALSE) ; eval(ee) if(any(arg.check) == TRUE){ stop() } @@ -139,16 +228,10 @@ if(any(arg.check) == TRUE){ ini.time <- as.numeric(Sys.time()) # time of process begin, converted into seconds analysis.nb <- trunc(ini.time) # to provide a specific number ot each analysis log.file <- paste0("r_", project.name, "_", analysis.nb,"_report.txt") -# name.dir <- paste0(project.name, "_", analysis.nb) -# path.out<-paste0(path.out, name.dir) -# suppressWarnings(dir.create(path.out)) -backup.name <- NULL # names of the object to save +name.dir <- paste0(project.name, "_", analysis.nb) +path.out<-paste0(path.out, name.dir) +suppressWarnings(dir.create(path.out)) fun_export_data(data = paste0("\n\n################################ ", log.file, " ################"), output = log.file, no.overwrite = FALSE, path = path.out, sep = 4) -fun_export_data(path = path.out, data = "################################ INITIAL DATA", output = log.file) -fun_export_data(path = path.out, data = paste0("SCRIPT USED: ", script), output = log.file) -fun_export_data(path = path.out, data = paste0("KIND OF ANALYSIS PERFORMED: ", analysis.kind), output = log.file) -fun_export_data(path = path.out, data = paste0("NUMBER OF LOOPS USED FOR COMPILATION: ", slurm.loop.nb), output = log.file) -fun_export_data(path = path.out, data = paste0("FOR INFO: THE RESPONSE USED IS THE COLUMN: response_ASDAS_R_NR"), output = log.file) ################ Graphical parameter ignition @@ -165,7 +248,7 @@ if(optional.text == "no.txt"){ }else{ fun_export_data(path = path.out, data = paste0("OPTIONAL TEXT:\n", optional.text), output = log.file) } -par.ini <- fun_open_window(pdf.disp = activate.pdf, path.fun = path.out, pdf.name.file = paste0("final_graphs"), width.fun = 7, height.fun = 7, paper = "special", no.pdf.overwrite = TRUE, return.output = TRUE)$ini.par +par.ini <- fun_open_window(pdf.disp = activate.pdf, path.fun = path.out, pdf.name.file = paste0("final_graphs"), width.fun = window.width, height.fun = window.height, paper = "special", no.pdf.overwrite = TRUE, return.output = TRUE)$ini.par if(activate.pdf == TRUE){ pdf.nb <- dev.cur() }else{ @@ -174,39 +257,17 @@ if(activate.pdf == TRUE){ ################ Data import -load() - - - - +load(data.file) ################ 3 Differential analysis with limma fun_export_data(path = path.out, data = "################################ LIMMA ANALYSIS", output = log.file) -if(nrow(final.ttab) == 0 | nrow(na.omit(final.ttab)) == 0){ +if( ! any(ls() %in% "final.ttab.freq")){ + fun_export_data(path = path.out, data = paste0("NO final.ttab.freq RESULTS FROM THE LIMMA ANALYSIS"), output = log.file) +}else if(nrow(final.ttab.freq) == 0 | nrow(na.omit(final.ttab.freq)) == 0){ fun_export_data(path = path.out, data = paste0("NO GENE LIST RESULTS FROM THE LIMMA ANALYSIS (P VALUES ABOVE 0.05 AFTER CORRECTION FOR INSTANCE)"), output = log.file) - fun_export_data(path = path.out, data = final.ttab, output = log.file) -}else if(analysis.kind == "valid_boot"){ - fun_export_data(path = path.out, data = paste0("BEWARE: NO COMPILATION FOR THE LIMMA ANALYSIS (", analysis.kind, " KIND OF ANALYSIS)"), output = log.file) }else{ - tempo1 <- c(table(rownames(na.omit(final.ttab)))) - tempo2 <- aggregate(final.ttab$adj.P.Val, list(rownames(final.ttab)), median, na.rm = TRUE) - names(tempo2) <- c("GENE", "ADJ_P_VALUE_MEDIAN") - tempo2 <- data.frame( - tempo2, - ADJ_P_VALUE_CI95_INF = aggregate(final.ttab$adj.P.Val, list(rownames(final.ttab)), quantile, probs = 0.025, na.rm = TRUE)$x, - ADJ_P_VALUE_CI95_SUP = aggregate(final.ttab$adj.P.Val, list(rownames(final.ttab)), quantile, probs = 0.975, na.rm = TRUE)$x - ) - final.ttab.freq <- data.frame(tempo2, NB = unname(tempo1), FREQ = unname(tempo1)/slurm.loop.nb) - rownames(final.ttab.freq) <- NULL - fun_export_data(path = path.out, data = "FREQUENCIES (RELATED TO NB OF TIME GENES HAVE BEEN SELECTED PER LOOP) AND P VALUES MEDIANS & CI: ", output = log.file) - fun_export_data(path = path.out, data = final.ttab.freq, output = log.file) - - # horiz barplot - bar.width <- 0.5 - log.scale <- FALSE - amplif <- label.size tempo.gg.name <- "gg.indiv.plot." tempo.gg.count <- 0 if(nrow(final.ttab.freq) > 20){ @@ -214,21 +275,41 @@ if(nrow(final.ttab) == 0 | nrow(na.omit(final.ttab)) == 0){ tempo2 <- tempo1[order(names(tempo1), decreasing = TRUE)] tempo3 <- as.numeric(names(cumsum(tempo2)[cumsum(tempo2) <= 20])) tempo.plot <- final.ttab.freq[round(final.ttab.freq$FREQ, 7) %in% round(tempo3, 7), ] - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = tempo.plot, mapping = ggplot2::aes(x = reorder(GENE, FREQ), y = FREQ))) # reorder from higher to lower - tempo.txt <- paste0("BEWARE: ONLY THE ", nrow(tempo.plot), " MOST FREQUENT GENES PLOTTED, AMONG ", nrow(final.ttab.freq)) + if(length(tempo3) == 0){ + tempo.plot <- final.ttab.freq + tempo.txt <- paste0("BEWARE: UNIQUE FREQUENCY ", names(tempo2), " FOR ALL THE ", unname(tempo2), " GENES -> CRAZY PLOT") + }else{ + tempo.txt <- paste0("BEWARE: ONLY THE ", nrow(tempo.plot), " MOST FREQUENT GENES PLOTTED, AMONG ", nrow(final.ttab.freq)) + } }else{ tempo.plot <- final.ttab.freq - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = tempo.plot, mapping = ggplot2::aes(x = reorder(GENE, FREQ), y = FREQ))) # reorder from higher to lower tempo.txt <- "" } + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = tempo.plot, mapping = ggplot2::aes(x = reorder(GENE, FREQ), y = FREQ))) # reorder from higher to lower assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_bar(stat = "identity", position = "dodge", color = "black", fill = grey(0.8), width = bar.width)) - # assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_errorbar(ggplot2::aes(ymin = PROP.CI95.inf, ymax = PROP.CI95.sup), position = ggplot2::position_dodge(width = bar.width), color = "black", width = 0)) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle(paste0("LIMMA GENE LIST\n(PROP OF TIMES THE GENE IS SIGNIFICANT FOR ", slurm.loop.nb, " LOOPS)\n", tempo.txt))) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::xlab("")) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ylab("PROPORTION")) - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme_classic(base_size = amplif)) + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme( + # plot.margin = margin(up.space.mds, right.space.mds, down.space.mds, left.space.mds, "inches"), + plot.title = ggplot2::element_text(hjust=1, vjust=1, size = amplif * 0.5), + axis.text.x = ggplot2::element_text(angle = 90, hjust = 1), + axis.text = ggplot2::element_text(size = amplif * 0.75), + axis.title = ggplot2::element_text(size = amplif), + legend.text = ggplot2::element_text(size = amplif), + legend.title = ggplot2::element_text(size = amplif), + strip.text = ggplot2::element_text(size = amplif), + legend.key = ggplot2::element_blank(), + panel.background = ggplot2::element_rect(fill = "white"), + panel.border = ggplot2::element_rect(colour = "black", fill = NA), + panel.grid.major.x = ggplot2::element_line(colour = "grey75"), + panel.grid.major.y = ggplot2::element_line(colour = "grey75"), + panel.grid.minor.x = ggplot2::element_blank(), + panel.grid.minor.y = ggplot2::element_blank(), + strip.background = ggplot2::element_rect(fill = "white", colour = "black") + )) coord <- ggplot2::ggplot_build(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))$data # to have the summary statistics of the plot. is interesting: x = coord[[2]]$x, y = coord[[2]]$ymax_final - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotate(geom = "text", x = coord[[1]]$x, y = coord[[1]]$ymax * 1.05, label = round(coord[[1]]$y, 3), size = amplif/2, color = "black", vjust = "center", hjust = "right")) # beware: no need of order() for labels because coord[[1]]$x set the order + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotate(geom = "text", x = coord[[1]]$x, y = coord[[1]]$ymax * 1.05, label = round(coord[[1]]$y, 3), size = amplif/3, color = "black", vjust = "center", hjust = "left")) # beware: no need of order() for labels because coord[[1]]$x set the order if(log.scale == TRUE){ assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotation_logticks(sides = "l")) }else{ @@ -238,11 +319,9 @@ if(nrow(final.ttab) == 0 | nrow(na.omit(final.ttab)) == 0){ if(activate.pdf == TRUE){ tempo <- dev.set(pdf.nb) # assign to avoid the message }else{ - windows(5, 5) + fun_open_window(pdf.disp = activate.pdf, width.fun = window.width, height.fun = window.height) } print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + ")))) - fun_export_data(path = path.out, data = paste0("SEE THE final.ttab AND THE final.ttab.freq OBJECTS IN THE compiled_data.RData FILE PRESENT IN: ", path.out), output = log.file) - backup.name <- c(backup.name, "final.ttab.freq") } @@ -255,50 +334,98 @@ if(nrow(final.ttab) == 0 | nrow(na.omit(final.ttab)) == 0){ fun_export_data(path = path.out, data = "################################ RANDOM FOREST LEARNING USING THE DISCOVERY SET", output = log.file) -#mod.gene.names +#select.gene.curve (nb of features) -fun_export_data(path = path.out, data = "################ LIST OF GENES SELECTED DURING THE MODEL ELABORATION", output = log.file) +fun_export_data(path = path.out, data = "################ OPTIMAL NUMBER OF GENES DURING THE MODEL ELABORATION", output = log.file) -if(nrow(final.mod.gene.names) == 0 | nrow(na.omit(final.mod.gene.names)) == 0){ - fun_export_data(path = path.out, data = paste0("NO GENE LIST RESULTS FROM THE RANDOM FOREST TRAINING"), output = log.file) - fun_export_data(path = path.out, data = final.mod.gene.names, output = log.file) -}else if(analysis.kind == "valid_boot"){ - fun_export_data(path = path.out, data = paste0("BEWARE: NO COMPILATION FOR THE THE RANDOM FOREST TRAINING (", analysis.kind, " KIND OF ANALYSIS)"), output = log.file) - # fun_export_data(path = path.out, data = final.mod.gene.names, output = log.file) +if( ! any(ls() %in% "final.select.gene.curve.median")){ + fun_export_data(path = path.out, data = paste0("NO final.select.gene.curve.median RESULTS FROM THE RANDOM FOREST TRAINING"), output = log.file) +}else if(nrow(final.select.gene.curve.median) == 0 | nrow(na.omit(final.select.gene.curve.median)) == 0){ + fun_export_data(path = path.out, data = paste0("NO OPTIMAL RESULTS FROM THE RANDOM FOREST TRAINING"), output = log.file) }else{ - tempo1 <- c(table(final.mod.gene.names$GENE, useNA = "no")) - final.mod.gene.names.freq <- data.frame(GENE = names(tempo1), NB = unname(tempo1), FREQ = unname(tempo1)/slurm.loop.nb) - final.mod.gene.names.freq <- final.mod.gene.names.freq[order(final.mod.gene.names.freq$FREQ, decreasing = TRUE), ] - rownames(final.mod.gene.names.freq) <- NULL - fun_export_data(path = path.out, data = "FREQUENCIES: ", output = log.file) - fun_export_data(path = path.out, data = final.mod.gene.names.freq, output = log.file) - - # horiz barplot - bar.width <- 0.5 - log.scale <- FALSE - amplif <- label.size tempo.gg.name <- "gg.indiv.plot." tempo.gg.count <- 0 - if(nrow(final.mod.gene.names.freq) > 20){ - tempo1 <- table(final.mod.gene.names.freq$FREQ, useNA = "no") - tempo2 <- tempo1[order(names(tempo1), decreasing = TRUE)] - tempo3 <- as.numeric(names(cumsum(tempo2)[cumsum(tempo2) <= 20])) - tempo.plot <- final.mod.gene.names.freq[round(final.mod.gene.names.freq$FREQ, 7) %in% round(tempo3, 7), ] - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = tempo.plot, mapping = ggplot2::aes(x = reorder(GENE, FREQ), y = FREQ))) # reorder from higher to lower - tempo.txt <- paste0("BEWARE: ONLY THE ", nrow(tempo.plot), " MOST FREQUENT GENES PLOTTED, AMONG ", nrow(final.mod.gene.names.freq)) + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = final.select.gene.curve.median, mapping = ggplot2::aes(x = X, y = MEDIAN))) + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_point(color = "red", shape = 16, size = 2)) + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_line(color = "red")) + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_ribbon(ggplot2::aes(ymin = CI95.inf, ymax = CI95.sup), color = NA, fill = "red", alpha = 0.1)) + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle(paste0("OPTIMAL NUMBER OF GENES (n = ", slurm.loop.nb,")"))) + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::xlab("NUMBER OF GENES")) + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ylab("MEDIAN & 95CI")) + # assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme_classic(base_size = amplif)) + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme( + # plot.margin = margin(up.space.mds, right.space.mds, down.space.mds, left.space.mds, "inches"), + plot.title = ggplot2::element_text(hjust=1, vjust=1, size = amplif * 0.5), + axis.text.x = ggplot2::element_text(angle = 90, hjust = 1), + axis.text = ggplot2::element_text(size = amplif * 0.75), + axis.title = ggplot2::element_text(size = amplif), + legend.text = ggplot2::element_text(size = amplif), + legend.title = ggplot2::element_text(size = amplif), + strip.text = ggplot2::element_text(size = amplif), + legend.key = ggplot2::element_blank(), + panel.background = ggplot2::element_rect(fill = "white"), + panel.border = ggplot2::element_rect(colour = "black", fill = NA), + panel.grid.major.x = ggplot2::element_line(colour = "white"), + panel.grid.major.y = ggplot2::element_line(colour = "grey75"), + panel.grid.minor.x = ggplot2::element_blank(), + panel.grid.minor.y = ggplot2::element_blank(), + strip.background = ggplot2::element_rect(fill = "white", colour = "black") + )) + if(log.scale == TRUE){ + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotation_logticks(sides = "l")) }else{ - tempo.plot <- final.mod.gene.names.freq - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = tempo.plot, mapping = ggplot2::aes(x = reorder(GENE, FREQ), y = FREQ))) # reorder from higher to lower - tempo.txt <- "" + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::scale_x_continuous(expand = c(0,0))) + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::scale_y_continuous(expand = c(0,0), breaks = scales::extended_breaks(10))) + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::coord_cartesian(xlim= c(min(final.select.gene.curve.median$X, na.rm = TRUE), max(final.select.gene.curve.median$X, na.rm = TRUE)), ylim = c(0, max(final.select.gene.curve.median$CI95.sup, na.rm = TRUE) * 1.1))) # fix limits + } + if(activate.pdf == TRUE){ + tempo <- dev.set(pdf.nb) # assign to avoid the message + }else{ + fun_open_window(pdf.disp = activate.pdf, width.fun = window.width, height.fun = window.height) } + print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + ")))) + +} + + +#mod.gene.names (final gene list) + +fun_export_data(path = path.out, data = "################ LIST OF GENES SELECTED DURING THE MODEL ELABORATION", output = log.file) + +if( ! any(ls() %in% "final.mod.gene.names.freq")){ + fun_export_data(path = path.out, data = paste0("NO final.mod.gene.names.freq RESULTS FROM THE RANDOM FOREST TRAINING"), output = log.file) +}else if(nrow(final.mod.gene.names.freq) == 0 | nrow(na.omit(final.mod.gene.names.freq)) == 0){ + fun_export_data(path = path.out, data = paste0("NO GENE LIST RESULTS FROM THE RANDOM FOREST TRAINING"), output = log.file) +}else{ + tempo.gg.name <- "gg.indiv.plot." + tempo.gg.count <- 0 + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = final.mod.gene.names.freq, mapping = ggplot2::aes(x = reorder(GENE, FREQ), y = FREQ))) # reorder from higher to lower assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_bar(stat = "identity", position = "dodge", color = "black", fill = grey(0.8), width = bar.width)) # assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_errorbar(ggplot2::aes(ymin = PROP.CI95.inf, ymax = PROP.CI95.sup), position = ggplot2::position_dodge(width = bar.width), color = "black", width = 0)) - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle(paste0("RANDOM FOREST MODELISATION GENE LIST\n(PROP OF TIMES THE GENES HAVE BEEN SELECTED FOR ", slurm.loop.nb, " LOOPS)\n", tempo.txt))) + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle(paste0("RANDOM FOREST MODELISATION GENE LIST\n(PROP OF TIMES THE GENES HAVE BEEN SELECTED FOR ", slurm.loop.nb, " LOOPS)"))) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::xlab("")) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ylab("PROPORTION")) - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme_classic(base_size = amplif)) + # assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme_classic(base_size = amplif)) + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme( + # plot.margin = margin(up.space.mds, right.space.mds, down.space.mds, left.space.mds, "inches"), + plot.title = ggplot2::element_text(hjust=1, vjust=1, size = amplif * 0.5), + axis.text.x = ggplot2::element_text(angle = 90, hjust = 1), + axis.text = ggplot2::element_text(size = amplif * 0.75), + axis.title = ggplot2::element_text(size = amplif), + legend.text = ggplot2::element_text(size = amplif), + legend.title = ggplot2::element_text(size = amplif), + strip.text = ggplot2::element_text(size = amplif), + legend.key = ggplot2::element_blank(), + panel.background = ggplot2::element_rect(fill = "white"), + panel.border = ggplot2::element_rect(colour = "black", fill = NA), + panel.grid.major.x = ggplot2::element_line(colour = "grey75"), + panel.grid.major.y = ggplot2::element_line(colour = "grey75"), + panel.grid.minor.x = ggplot2::element_blank(), + panel.grid.minor.y = ggplot2::element_blank(), + strip.background = ggplot2::element_rect(fill = "white", colour = "black") + )) coord <- ggplot2::ggplot_build(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))$data # to have the summary statistics of the plot. is interesting: x = coord[[2]]$x, y = coord[[2]]$ymax_final - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotate(geom = "text", x = coord[[1]]$x, y = coord[[1]]$ymax * 1.05, label = round(coord[[1]]$y, 3), size = amplif/2, color = "black", vjust = "center", hjust = "right")) # beware: no need of order() for labels because coord[[1]]$x set the order + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotate(geom = "text", x = coord[[1]]$x, y = coord[[1]]$ymax * 1.05, label = round(coord[[1]]$y, 3), size = amplif/3, color = "black", vjust = "center", hjust = "left")) # beware: no need of order() for labels because coord[[1]]$x set the order if(log.scale == TRUE){ assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotation_logticks(sides = "l")) }else{ @@ -308,11 +435,9 @@ if(nrow(final.mod.gene.names) == 0 | nrow(na.omit(final.mod.gene.names)) == 0){ if(activate.pdf == TRUE){ tempo <- dev.set(pdf.nb) # assign to avoid the message }else{ - windows(5, 5) + fun_open_window(pdf.disp = activate.pdf, width.fun = window.width, height.fun = window.height) } print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + ")))) - fun_export_data(path = path.out, data = paste0("SEE THE final.mod.gene.names AND THE final.mod.gene.names.freq OBJECTS IN THE compiled_data.RData FILE PRESENT IN: ", path.out), output = log.file) - backup.name <- c(backup.name, "final.mod.gene.names.freq") } @@ -320,37 +445,11 @@ if(nrow(final.mod.gene.names) == 0 | nrow(na.omit(final.mod.gene.names)) == 0){ fun_export_data(path = path.out, data = "################ GENE IMPORTANCE DURING THE MODEL ELABORATION", output = log.file) -if(nrow(final.gene.importance) == 0 | nrow(na.omit(final.gene.importance)) == 0){ - fun_export_data(path = path.out, data = paste0("NO GENE IMPORTANCE RESULTS FROM THE RANDOM FOREST TRAINING"), output = log.file) - fun_export_data(path = path.out, data = final.gene.importance, output = log.file) -}else if(analysis.kind == "valid_boot"){ - fun_export_data(path = path.out, data = paste0("BEWARE: NO COMPILATION FOR THE THE RANDOM FOREST TRAINING (", analysis.kind, " KIND OF ANALYSIS)"), output = log.file) - # fun_export_data(path = path.out, data = final.gene.importance, output = log.file) +if( ! any(ls() %in% "final.gene.importance.median")){ + fun_export_data(path = path.out, data = paste0("NO final.gene.importance.median RESULTS FROM THE RANDOM FOREST TRAINING"), output = log.file) +}else if(nrow(final.gene.importance.median) == 0 | nrow(na.omit(final.gene.importance.median)) == 0){ + fun_export_data(path = path.out, data = paste0("NO GENE LIST RESULTS FROM THE RANDOM FOREST TRAINING"), output = log.file) }else{ - tempo1 <- aggregate(final.gene.importance$importance, list(final.gene.importance$features), median, na.rm = TRUE) - names(tempo1) <- c("GENE", "MEDIAN") - tempo2 <- aggregate(final.gene.importance$features, list(final.gene.importance$features), function(x){length(x[ ! is.na(x)])}) # nb - if(identical(tempo1$GENE, tempo2$Group.1)){ - tempo1 <- data.frame(tempo1, NB = tempo2$x) - }else{ - cat(paste0("\n\n============\n\nERROR: IN THE PLOT OF THE final.gene.importance OBJECT, tempo1 AND tempo2 SHOULD HAVE THE SAME GENE COLUMN: \n\n============\n\n")) - tempo1 - tempo2 - cat("\n\n============\n\n") - stop() - } - final.gene.importance.median <- cbind(tempo1, - CI95.inf = aggregate(final.gene.importance$importance, list(final.gene.importance$features), quantile, probs = 0.025, na.rm = TRUE)[, 2], - CI95.sup = aggregate(final.gene.importance$importance, list(final.gene.importance$features), quantile, probs = 0.975, na.rm = TRUE)[, 2], - stringsAsFactors = FALSE - ) - fun_export_data(path = path.out, data = "IMPORTANCE: ", output = log.file) - fun_export_data(path = path.out, data = final.gene.importance.median, output = log.file) - - # barplot + CI - bar.width <- 0.5 - log.scale <- FALSE - amplif <- label.size tempo.gg.name <- "gg.indiv.plot." tempo.gg.count <- 0 if(nrow(final.gene.importance.median) > 20){ @@ -370,9 +469,7 @@ if(nrow(final.gene.importance) == 0 | nrow(na.omit(final.gene.importance)) == 0) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle(paste0("GENE IMPORTANCE (NUMBERS ARE NUMBER OF TIMES THE GENE IS SEEN FOR ", slurm.loop.nb, " LOOPS)"))) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::xlab("")) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ylab("MEDIAN & 95CI")) -# assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme_classic(base_size = amplif)) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme( - # plot.margin = margin(up.space.mds, right.space.mds, down.space.mds, left.space.mds, "inches"), plot.title = ggplot2::element_text(hjust=1, vjust=1, size = amplif * 0.5), axis.text.x = ggplot2::element_text(angle = 90, hjust = 1), axis.text = ggplot2::element_text(size = amplif * 0.75), @@ -395,113 +492,53 @@ if(nrow(final.gene.importance) == 0 | nrow(na.omit(final.gene.importance)) == 0) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotation_logticks(sides = "l")) }else{ assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::scale_y_continuous(expand = c(0,0), breaks = scales::extended_breaks(10))) - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::coord_cartesian(ylim = c(0, max(final.gene.importance.median$CI95.sup, na.rm = TRUE) * 1.1))) # fix limits + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::coord_cartesian(ylim = c(0, max(tempo.plot$CI95.sup, na.rm = TRUE) * 1.1))) # fix limits } if(activate.pdf == TRUE){ tempo <- dev.set(pdf.nb) # assign to avoid the message }else{ - windows(5, 5) + fun_open_window(pdf.disp = activate.pdf, width.fun = window.width, height.fun = window.height) } print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + ")))) - fun_export_data(path = path.out, data = paste0("SEE THE final.gene.importance AND THE final.gene.importance.median OBJECTS IN THE compiled_data.RData FILE PRESENT IN: ", path.out), output = log.file) - backup.name <- c(backup.name, "final.gene.importance.median") } -#select.gene.curve - -fun_export_data(path = path.out, data = "################ OPTIMAL NUMBER OF GENES DURING THE MODEL ELABORATION", output = log.file) +# plotting the heatmap adnd corplot +fun_export_data(path = path.out, data = "################ HEATMAP AND CORRELATION PLOT ", output = log.file) -if(nrow(final.select.gene.curve) == 0 | nrow(na.omit(final.select.gene.curve)) == 0){ - fun_export_data(path = path.out, data = paste0("NO OPTIMAL RESULTS FROM THE RANDOM FOREST TRAINING"), output = log.file) - fun_export_data(path = path.out, data = final.select.gene.curve, output = log.file) -}else if(analysis.kind == "valid_boot"){ - fun_export_data(path = path.out, data = paste0("BEWARE: NO COMPILATION FOR THE THE RANDOM FOREST TRAINING (", analysis.kind, " KIND OF ANALYSIS)"), output = log.file) - # fun_export_data(path = path.out, data = final.select.gene.curve, output = log.file) +if( ! (any(ls() %in% "nano.rf.selected.genes") | any(ls() %in% "optimal.gene.nb") | any(ls() %in% "selected.gene.names"))){ + fun_export_data(path = path.out, data = paste0("NO nano.rf.selected.genes RESULTS FROM THE RANDOM FOREST TRAINING"), output = log.file) +}else if(nrow(nano.rf.selected.genes) == 0 | nrow(na.omit(nano.rf.selected.genes)) == 0 | length(optimal.gene.nb) == 0){ + fun_export_data(path = path.out, data = paste0("NO GENE LIST RESULTS FROM THE RANDOM FOREST TRAINING"), output = log.file) }else{ - tempo1 <- aggregate(final.select.gene.curve$y, list(final.select.gene.curve$x), median, na.rm = TRUE) - names(tempo1) <- c("X", "MEDIAN") - final.select.gene.curve.median <- cbind(tempo1, - CI95.inf = aggregate(final.select.gene.curve$y, list(final.select.gene.curve$x), quantile, probs = 0.025, na.rm = TRUE)[, 2], - CI95.sup = aggregate(final.select.gene.curve$y, list(final.select.gene.curve$x), quantile, probs = 0.975, na.rm = TRUE)[, 2], - stringsAsFactors = FALSE - ) - fun_export_data(path = path.out, data = "OPTIMAL NUMBER OF GENES: ", output = log.file) - fun_export_data(path = path.out, data = final.select.gene.curve.median, output = log.file) - - # line + CI area - bar.width <- 0.5 - log.scale <- FALSE - amplif <- label.size - tempo.gg.name <- "gg.indiv.plot." - tempo.gg.count <- 0 - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = final.select.gene.curve.median, mapping = ggplot2::aes(x = X, y = MEDIAN))) - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_point(color = "red", shape = 16, size = 2)) - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_line(color = "red")) - # assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_errorbar(ggplot2::aes(ymin = CI95.inf, ymax = CI95.sup), position = ggplot2::position_dodge(), color = "black", width = 0)) - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_ribbon(ggplot2::aes(ymin = CI95.inf, ymax = CI95.sup), color = NA, fill = "red", alpha = 0.1)) - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle(paste0("OPTIMAL NUMBER OF GENES (n = ", slurm.loop.nb,")"))) - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::xlab("NUMBER OF GENES")) - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ylab("MEDIAN & 95CI")) - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme_classic(base_size = amplif)) - if(log.scale == TRUE){ - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotation_logticks(sides = "l")) + tempo.data <- dat[order(dat$Y), ] + annot.rows <- dat[, "Y", drop = FALSE] # allow to keep the data.frame structure + colnames(annot.rows) <- "ASDAS R/NR" + if(length(na.omit(match(selected.gene.names, names(tempo.data)))) != length(selected.gene.names)){ + tempo.cat <- cat(paste0("\n\n============\n\nERROR: THE COLUMN NAMES OF tempo.data OBJECT SHOULD MATCH ALL THE NAMES IN selected.gene.names.\n + PROBLEM IN selected.gene.names:\n", selected.gene.names[ ! selected.gene.names %in% names(tempo.data)], " + PROBLEM IN names(tempo.data):\n", names(tempo.data)[ ! names(tempo.data) %in% selected.gene.names], " + \n\n============\n\n")) + fun_export_data(path = path.out, data = tempo.cat, output = log.file) + stop(tempo.cat) }else{ - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::scale_y_continuous(expand = c(0,0), breaks = scales::extended_breaks(10))) - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::coord_cartesian(xlim= c(min(final.select.gene.curve.median$X, na.rm = TRUE), max(final.select.gene.curve.median$X, na.rm = TRUE)), ylim = c(0, max(final.select.gene.curve.median$CI95.sup, na.rm = TRUE) * 1.1))) # fix limits + fun_export_data(path = path.out, data = "IN THE nano.rf.selected.genes OBJECT, COLUMN ARE ORDERED AS IN THE FREQ OF THE GENE LIST selected.gene.names", output = log.file) + nano.rf.selected.genes <- tempo.data[, match(selected.gene.names, names(tempo.data))] } - if(activate.pdf == TRUE){ - tempo <- dev.set(pdf.nb) # assign to avoid the message - }else{ - windows(5, 5) - } - print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + ")))) - fun_export_data(path = path.out, data = paste0("SEE THE final.select.gene.curve AND THE final.select.gene.curve.median OBJECTS IN THE compiled_data.RData FILE PRESENT IN: ", path.out), output = log.file) - backup.name <- c(backup.name, "final.select.gene.curve.median") + ann_colors = list("ASDAS R/NR" = c(R = "steelblue", NR = "tomato")) + tempo <- dev.set(pdf.nb) # assign to avoid the message + heatmap.plot <- pheatmap(t(scale(nano.rf.selected.genes)), silent = TRUE, annotation_col = annot.rows, cluster_cols = FALSE, show_colnames = FALSE, border_color = NA, color = colorRampPalette(c("red", "black", "green"))(499), annotation_colors = ann_colors, fontsize_row = amplif, fontsize_col = amplif) + print(ggplot2::ggplot()+ggplot2::theme_classic()) + print(heatmap.plot) + tempo <- dev.set(pdf.nb) # assign to avoid the message + corrplot::corrplot(cor(nano.rf.selected.genes), tl.col = "black", tl.cex = amplif / 10) } -# plotting the heatmap adnd corplot -fun_export_data(path = path.out, data = "################ HEATMAP AND CORRELATION PLOT ", output = log.file) -if(analysis.kind == "valid_boot"){ - if(nrow(final.mod.gene.names) == 0 | nrow(na.omit(final.mod.gene.names)) == 0){ - fun_export_data(path = path.out, data = paste0("NO GENE LIST FROM THE RANDOM FOREST TRAINING"), output = log.file) - fun_export_data(path = path.out, data = final.mod.gene.names, output = log.file) - }else{ - fun_export_data(path = path.out, data = paste0("BEWARE: NO COMPILATION FOR THE THE RANDOM FOREST TRAINING (", analysis.kind, " KIND OF ANALYSIS)"), output = log.file) - # fun_export_data(path = path.out, data = final.mod.gene.names, output = log.file) - tempo.data <- dat[order(dat$Y), ] - annot.rows <- dat[, "Y", drop = FALSE] # allow to keep the data.frame structure - colnames(annot.rows) <- "ASDAS R/NR" - nano.rf.selected.genes <- tempo.data[, final.mod.gene.names$GENE] - } -}else{ - if(nrow(final.mod.gene.names.freq) == 0 | nrow(na.omit(final.mod.gene.names.freq)) == 0){ - fun_export_data(path = path.out, data = paste0("NO GENE LIST FROM THE RANDOM FOREST TRAINING"), output = log.file) - fun_export_data(path = path.out, data = final.mod.gene.names.freq, output = log.file) - }else{ - fun_export_data(path = path.out, data = paste0("HEATMAP AND CORRELATION HAVE BEEN PLOTTED USING THE final.mod.gene.names.freq FILE AND THE FREQUENCY CUTOFF ", cut.off.freq.for.selected.genes), output = log.file) - tempo.data <- dat[order(dat$Y), ] - annot.rows <- dat[, "Y", drop = FALSE] # allow to keep the data.frame structure - colnames(annot.rows) <- "ASDAS R/NR" - nano.rf.selected.genes <- tempo.data[, final.mod.gene.names.freq$GENE[final.mod.gene.names.freq$FREQ >= cut.off.freq.for.selected.genes]] - ann_colors = list("ASDAS R/NR" = c(R = "steelblue", NR = "tomato")) - # pheatmap(t(scale(nano.rf.selected.genes)), silent = TRUE, annotation_col = annot.rows, cluster_cols = FALSE, show_colnames = FALSE, border_color = NA, color = colorRampPalette(c("red", "black", "green"))(499), annotation_colors = ann_colors, fontsize_row = label.size, fontsize_col = label.size) - tempo <- dev.set(pdf.nb) # assign to avoid the message - heatmap.plot <- pheatmap(t(scale(nano.rf.selected.genes)), silent = TRUE, annotation_col = annot.rows, cluster_cols = FALSE, show_colnames = FALSE, border_color = NA, color = colorRampPalette(c("red", "black", "green"))(499), annotation_colors = ann_colors, fontsize_row = label.size, fontsize_col = label.size) - print(ggplot2::ggplot()+ggplot2::theme_classic()) - print(heatmap.plot) - tempo <- dev.set(pdf.nb) # assign to avoid the message - corrplot::corrplot(cor(nano.rf.selected.genes), tl.col = "black", tl.cex = label.size / 10) - fun_export_data(path = path.out, data = paste0("SEE THE nano.rf.selected.genes OBJECTS IN THE compiled_data.RData FILE PRESENT IN: ", path.out), output = log.file) - backup.name <- c(backup.name, "nano.rf.selected.genes") - } -} - + ######## 4.2 Validate the model # fun_export_data(path = path.out, data = "################################ VALIDATION", output = log.file) -fun_export_data(path = path.out, data = "################ ROC CURVES", output = log.file) data.kind <- c("genes", "crp", "genes.crp") # beware, the order is important ana.kind <- c("rf", "logreg", "rpart") for(i0 in 1:length(data.kind)){ @@ -528,25 +565,22 @@ for(i0 in 1:length(data.kind)){ )) # roc curves plot - bar.width <- 0.5 - log.scale <- FALSE - amplif <- label.size tempo.gg.name <- "gg.indiv.plot." tempo.gg.count <- 0 assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = get(tempo.name.median), mapping = ggplot2::aes(x = X_MEDIAN, y = Y_MEDIAN))) - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_segment(data = data.frame(x = 0, y = 0, xend = 1, yend = 1), mapping = ggplot2::aes(x = x, y = y, xend = xend, yend = yend), color = "black", linetype = "dashed")) for(i2 in 1:slurm.loop.nb){ # roc curves if(nrow(get(paste0("loop", i2, "_data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1]))) > 0){ assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_line(data = get(paste0("loop", i2, "_data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1]))[order(get(paste0("loop", i2, "_data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1]))$x, get(paste0("loop", i2, "_data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1]))$y, decreasing = FALSE), ], mapping = ggplot2::aes(x = x, y = y), color = "grey", alpha = 0.5)) } } - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_line(color = "red")) + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_segment(data = data.frame(x = 0, y = 0, xend = 1, yend = 1), mapping = ggplot2::aes(x = x, y = y, xend = xend, yend = yend), color = "black", linetype = "dashed")) + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_line(color = "red", size =0.75)) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle(paste0("ROC CURVE ", data.kind[i0], " AND ", ana.kind[i1], " (n = ", slurm.loop.nb,")\nAUC = ", - round(get(paste0("final.auc", i0, ".", data.kind[i0], ".", ana.kind[i1], ".median"))$MEDIAN, 3), " (", - round(get(paste0("final.auc", i0, ".", data.kind[i0], ".", ana.kind[i1], ".median"))$CI95.inf, 3), " - ", - round(get(paste0("final.auc", i0, ".", data.kind[i0], ".", ana.kind[i1], ".median"))$CI95.sup, 3), ")" - ))) + round(get(paste0("final.auc", i0, ".", data.kind[i0], ".", ana.kind[i1], ".median"))$MEDIAN, 3), " (", + round(get(paste0("final.auc", i0, ".", data.kind[i0], ".", ana.kind[i1], ".median"))$CI95.inf, 3), " - ", + round(get(paste0("final.auc", i0, ".", data.kind[i0], ".", ana.kind[i1], ".median"))$CI95.sup, 3), ")" + ))) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::xlab("FALSE POSITIVE RATE (1 - SPECIFICITY)")) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ylab("TRUE POSITIVE RATE (SENSITIVITY)")) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme_classic(base_size = amplif)) @@ -556,54 +590,25 @@ for(i0 in 1:length(data.kind)){ if(activate.pdf == TRUE){ tempo <- dev.set(pdf.nb) # assign to avoid the message }else{ - windows(5, 5) + fun_open_window(pdf.disp = activate.pdf, width.fun = window.width, height.fun = window.height) } print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + ")))) } - # fun_export_data(path = path.out, data = paste0("SEE THE ", tempo.name.median, " AND THE ", paste0("final.auc", i0, ".", data.kind[i0], ".", ana.kind[i1], ".median"), " OBJECTS IN THE compiled_data.RData FILE PRESENT IN: ", path.out), output = log.file) - backup.name <- c(backup.name, tempo.name.median, paste0("final.auc", i0, ".", data.kind[i0], ".", ana.kind[i1], ".median")) - - # confusion matrix - fun_export_data(path = path.out, data = paste0("CONFUSION MATRIX (CI95):"), output = log.file) - tempo.name <- paste0("final.confmat", i0, ".", data.kind[i0], ".", ana.kind[i1]) - tempo1 <- apply(get(tempo.name)[1:4], 2, median, na.rm = TRUE) - tempo2 <- apply(get(tempo.name)[1:4], 2, quantile, probs = 0.025, na.rm = TRUE) - tempo3 <- apply(get(tempo.name)[1:4], 2, quantile, probs = 0.975, na.rm = TRUE) - tempo4 <- paste(tempo1, tempo2, sep = " (") - tempo4 <- paste(tempo4, tempo3, sep = " - ") - tempo4 <- paste0(tempo4, ")") - assign(paste0("final.confmat", i0, ".", data.kind[i0], ".", ana.kind[i1], ".median"), as.table(matrix(tempo4, ncol = 2, dimnames = list(TRUTH = c("NR", "R"), PREDICTION = c("NR", "R"))))) - fun_export_data(path = path.out, data = get(paste0("final.confmat", i0, ".", data.kind[i0], ".", ana.kind[i1], ".median")), output = log.file, rownames.kept = TRUE) - fun_export_data(path = path.out, data = paste0("SEE THE ", tempo.name.median, " AND THE ", paste0("final.auc", i0, ".", data.kind[i0], ".", ana.kind[i1], ".median"), " OBJECT IN THE compiled_data.RData FILE PRESENT IN: ", path.out), output = log.file) - backup.name <- c(backup.name, tempo.name.median, paste0("final.confmat", i0, ".", data.kind[i0], ".", ana.kind[i1], ".median")) } } - ######## 4.6 Plot top feature boxplots fun_export_data(path = path.out, data = "################ TOP PREDICTOR GENES", output = log.file) -boxdat <- data.frame(ASDAS = df.nano$response_ASDAS_R_NR, - Cohort.name = df.nano$cohort_id, - RF.COHORTE = df.nano$training_validation, - nano.rf.selected.genes -) -boxdat_melt <- reshape2::melt(boxdat, id.vars = c("ASDAS", "Cohort.name", "RF.COHORTE"), variable.name = "Gene") # boxplot -bar.width <- 0.5 -log.scale <- FALSE -amplif <- label.size -facet.nrow = 4 -facet.ncol = 4 -boxdat_melt$Gene <- as.character(boxdat_melt$Gene) -pages.print.nb <- ceiling(length(unique(boxdat_melt$Gene)) / (facet.nrow * facet.ncol)) +pages.print.nb <- ceiling(length(levels(boxdat_melt$Gene)) / (facet.nrow * facet.ncol)) increm <- facet.nrow * facet.ncol for(i0 in 1:pages.print.nb){ select <- (increm * i0 - increm + 1):(increm * i0) - if(max(select) > length(unique(boxdat_melt$Gene))){ - select <- (increm * i0 - increm + 1):length(unique(boxdat_melt$Gene)) + if(max(select) > length(levels(boxdat_melt$Gene))){ + select <- (increm * i0 - increm + 1):length(levels(boxdat_melt$Gene)) } tempo.gg.name <- "gg.indiv.plot." tempo.gg.count <- 0 @@ -611,137 +616,103 @@ for(i0 in 1:pages.print.nb){ if(i0 == 1){ fun_export_data(path = path.out, data = paste0("WITH ", analysis.kind, " KIND OF ANALYSIS, BOXPLOTS ARE PROVIDED USING THE TRAINING AND THE VALIDATING COHORTES, AS DEFINED IN THE training_validation COLUMN OF THE df.nano DATA FRAME", path.out), output = log.file) } - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = boxdat_melt[boxdat_melt$Gene %in% unique(boxdat_melt$Gene)[select], ], mapping = ggplot2::aes(x=ASDAS, y=value, colour=ASDAS, shape = RF.COHORTE, linetype = RF.COHORTE))) + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = boxdat_melt[boxdat_melt$Gene %in% levels(boxdat_melt$Gene)[select], ], mapping = ggplot2::aes(x=ASDAS, y=value, colour=ASDAS, shape = RF.COHORTE, linetype = RF.COHORTE))) }else if(analysis.kind == "full_cross_validation"){ if(i0 == 1){ - fun_export_data(path = path.out, data = paste0("WITH ", analysis.kind, " KIND OF ANALYSIS, BOXPLOTS CANNOT BE DIFFERENCIATED BETWEEN THE TRAINING AND THE VALIDATING COHORTES", path.out), output = log.file) + fun_export_data(path = path.out, data = paste0("WITH ", analysis.kind, " KIND OF ANALYSIS, BOXPLOTS CANNOT BE DIFFERENCIATED BETWEEN THE TRAINING AND THE VALIDATING COHORTES"), output = log.file) } - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = boxdat_melt[boxdat_melt$Gene %in% unique(boxdat_melt$Gene)[select], ], mapping = ggplot2::aes(x=ASDAS, y=value, colour=ASDAS))) + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = boxdat_melt[boxdat_melt$Gene %in% levels(boxdat_melt$Gene)[select], ], mapping = ggplot2::aes(x=ASDAS, y=value, colour=ASDAS))) }else{ stop(paste0("\n\n============\n\nERROR 1 IN R CODE\n\n============\n\n")) } assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggbeeswarm::geom_beeswarm(dodge.width = 0.75, alpha = 0.7)) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_boxplot(outlier.shape = NA, fill = NA)) - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::facet_wrap(~ Gene, nrow = facet.nrow, ncol = facet.ncol, scales = "fixed", shrink = FALSE)) # scales = "fixed", shrink = FALSE to have same scale range + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::facet_wrap(~ Gene, nrow = facet.nrow, ncol = facet.ncol, scales = "fixed", shrink = FALSE, drop = TRUE)) # scales = "fixed", shrink = FALSE to have same scale range assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle("SELECTED GENES (EACH DOT IS A PATIENT")) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ylab("NANOSTRING VALUE")) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme_bw()) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme( - plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size), - axis.text = ggplot2::element_text(size = label.size), - axis.title = ggplot2::element_text(size = label.size), - legend.text = ggplot2::element_text(size = label.size), - legend.title = ggplot2::element_text(size = label.size), - strip.text = ggplot2::element_text(size = label.size) + plot.title = ggplot2::element_text(hjust=1, vjust=1, size = amplif), + axis.text = ggplot2::element_text(size = amplif), + axis.title = ggplot2::element_text(size = amplif), + legend.text = ggplot2::element_text(size = amplif), + legend.title = ggplot2::element_text(size = amplif), + strip.text = ggplot2::element_text(size = amplif) )) if(activate.pdf == TRUE){ tempo <- dev.set(pdf.nb) # assign to avoid the message }else{ - windows(5, 5) + fun_open_window(pdf.disp = activate.pdf, width.fun = window.width, height.fun = window.height) } print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + ")))) } -fun_export_data(path = path.out, data = paste0("SEE THE nano.rf.selected.genes OBJECTS IN THE compiled_data.RData FILE PRESENT IN: ", path.out), output = log.file) ######## 4.7 Frequencies associated to each predictor fun_export_data(path = path.out, data = "################ FINAL LIST OF PREDICTOR GENES AND FREQUENCIES", output = log.file) -if(nrow(final.gene.list) == 0 | nrow(na.omit(final.gene.list)) == 0){ - fun_export_data(path = path.out, data = paste0("NO FINAL LIST OF PREDICTOR GENES AND FREQUENCIES (NO ROWS)"), output = log.file) - fun_export_data(path = path.out, data = final.gene.list, output = log.file) +if( ! any(ls() %in% "final.gene.list.median")){ + fun_export_data(path = path.out, data = paste0("NO final.gene.list.median RESULTS FROM THE RANDOM FOREST VALIDATION"), output = log.file) +}else if(nrow(final.gene.list.median) == 0 | nrow(na.omit(final.gene.list.median)) == 0){ + fun_export_data(path = path.out, data = paste0("NO FINAL LIST OF PREDICTOR GENES AND FREQUENCIES (NO ROWS)"), output = log.file) }else{ - tempo1 <- aggregate(final.gene.list$Freq, list(final.gene.list$sfeats), median, na.rm = TRUE) - names(tempo1) <- c("GENE", "MEDIAN") - tempo2 <- aggregate(final.gene.list$sfeats, list(final.gene.list$sfeats), function(x){length(x[ ! is.na(x)])}) # nb - if(identical(tempo1$GENE, tempo2$Group.1)){ - tempo1 <- data.frame(tempo1, NB = tempo2$x) - }else{ - cat(paste0("\n\n============\n\nERROR: IN THE PLOT OF THE final.gene.list OBJECT, tempo1 AND tempo2 SHOULD HAVE THE SAME GENE COLUMN: \n\n============\n\n")) - tempo1 - tempo2 - cat("\n\n============\n\n") - stop() - } - final.gene.list.median <- cbind(tempo1, - CI95.inf = aggregate(final.gene.list$Freq, list(final.gene.list$sfeats), quantile, probs = 0.025, na.rm = TRUE)[, 2], - CI95.sup = aggregate(final.gene.list$Freq, list(final.gene.list$sfeats), quantile, probs = 0.975, na.rm = TRUE)[, 2], - stringsAsFactors = FALSE - ) - final.gene.list.median <- final.gene.list.median[order(final.gene.list.median$MEDIAN, final.gene.list.median$NB, decreasing = TRUE), ] - fun_export_data(path = path.out, data = "FINAL LIST OF PREDICTOR GENES AND FREQUENCIES: ", output = log.file) - fun_export_data(path = path.out, data = final.gene.list.median, output = log.file) - - # horiz barplot + CI - bar.width <- 0.5 - log.scale <- FALSE - amplif <- label.size - tempo.gg.name <- "gg.indiv.plot." - tempo.gg.count <- 0 - if(nrow(final.gene.list.median) > 20){ - tempo1 <- table(final.gene.list.median$MEDIAN, useNA = "no") - tempo2 <- tempo1[order(names(tempo1), decreasing = TRUE)] - tempo3 <- as.numeric(names(cumsum(tempo2)[cumsum(tempo2) <= 20])) - tempo.plot <- final.gene.list.median[round(final.gene.list.median$MEDIAN, 7) %in% round(tempo3, 7), ] - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = tempo.plot, mapping = ggplot2::aes(x = reorder(GENE, -MEDIAN), y = MEDIAN))) # reorder from higher to lower - tempo.txt <- paste0("BEWARE: ONLY THE ", nrow(tempo.plot), " MOST IMPORTANT GENES PLOTTED, AMONG ", nrow(final.gene.list.median)) - }else{ - tempo.plot <- final.gene.list.median - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = tempo.plot, mapping = ggplot2::aes(x = reorder(GENE, -MEDIAN), y = MEDIAN))) # reorder from higher to lower - tempo.txt <- "" - } - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_bar(stat = "identity", position = "dodge", color = "black", fill = grey(0.8), width = bar.width)) - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_errorbar(ggplot2::aes(ymin = CI95.inf, ymax = CI95.sup), position = ggplot2::position_dodge(width = bar.width), color = "black", width = 0)) - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle(paste0("FINAL LIST OF PREDICTOR GENES AND FREQUENCIES (NUMBERS ARE NUMBER OF TIMES THE GENE IS SEEN FOR ", slurm.loop.nb, " LOOPS)"))) - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::xlab("")) - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ylab("MEDIAN & 95CI")) - # assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme_classic(base_size = amplif)) - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme( - # plot.margin = margin(up.space.mds, right.space.mds, down.space.mds, left.space.mds, "inches"), - plot.title = ggplot2::element_text(hjust=1, vjust=1, size = amplif * 0.5), - axis.text.x = ggplot2::element_text(angle = 90, hjust = 1), - axis.text = ggplot2::element_text(size = amplif * 0.75), - axis.title = ggplot2::element_text(size = amplif), - legend.text = ggplot2::element_text(size = amplif), - legend.title = ggplot2::element_text(size = amplif), - strip.text = ggplot2::element_text(size = amplif), - legend.key = ggplot2::element_blank(), - panel.background = ggplot2::element_rect(fill = "grey95"), - panel.border = ggplot2::element_rect(colour = "black", fill = NA), - panel.grid.major.x = ggplot2::element_line(colour = "grey75"), - panel.grid.major.y = ggplot2::element_line(colour = "grey75"), - panel.grid.minor.x = ggplot2::element_blank(), - panel.grid.minor.y = ggplot2::element_blank(), - strip.background = ggplot2::element_rect(fill = "white", colour = "black") - )) - coord <- ggplot2::ggplot_build(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))$data # to have the summary statistics of the plot. is interesting: x = coord[[2]]$x, y = coord[[2]]$ymax_final - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotate(geom = "text", x = coord[[2]]$x, y = coord[[2]]$ymax * 1.05, label = tempo.plot$NB, size = amplif/2, color = "black", vjust = "bottom", hjust = "center", angle = 00)) # beware: no need of order() for labels because coord[[1]]$x set the order - if(log.scale == TRUE){ - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotation_logticks(sides = "l")) - }else{ - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::scale_y_continuous(expand = c(0,0), breaks = scales::extended_breaks(10))) - assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::coord_cartesian(ylim = c(0, max(final.gene.list.median$CI95.sup, na.rm = TRUE) * 1.1))) # fix limits - } - if(activate.pdf == TRUE){ - tempo <- dev.set(pdf.nb) # assign to avoid the message - }else{ - windows(5, 5) - } - print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + ")))) - fun_export_data(path = path.out, data = paste0("SEE THE final.gene.list AND THE final.gene.list.median OBJECTS IN THE compiled_data.RData FILE PRESENT IN: ", path.out), output = log.file) - backup.name <- c(backup.name, "final.gene.list.median") + tempo.gg.name <- "gg.indiv.plot." + tempo.gg.count <- 0 + if(nrow(final.gene.list.median) > 20){ + tempo1 <- table(final.gene.list.median$MEDIAN, useNA = "no") + tempo2 <- tempo1[order(names(tempo1), decreasing = TRUE)] + tempo3 <- as.numeric(names(cumsum(tempo2)[cumsum(tempo2) <= 20])) + tempo.plot <- final.gene.list.median[round(final.gene.list.median$MEDIAN, 7) %in% round(tempo3, 7), ] + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = tempo.plot, mapping = ggplot2::aes(x = reorder(GENE, -MEDIAN), y = MEDIAN))) # reorder from higher to lower + tempo.txt <- paste0("BEWARE: ONLY THE ", nrow(tempo.plot), " MOST IMPORTANT GENES PLOTTED, AMONG ", nrow(final.gene.list.median)) + }else{ + tempo.plot <- final.gene.list.median + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = tempo.plot, mapping = ggplot2::aes(x = reorder(GENE, -MEDIAN), y = MEDIAN))) # reorder from higher to lower + tempo.txt <- "" + } + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_bar(stat = "identity", position = "dodge", color = "black", fill = grey(0.8), width = bar.width)) + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_errorbar(ggplot2::aes(ymin = CI95.inf, ymax = CI95.sup), position = ggplot2::position_dodge(width = bar.width), color = "black", width = 0)) + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle(paste0("FINAL LIST OF PREDICTOR GENES AND FREQUENCIES (NUMBERS ARE NUMBER OF TIMES THE GENE IS SEEN FOR ", slurm.loop.nb, " LOOPS)\n", tempo.txt))) + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::xlab("")) + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ylab("MEDIAN & 95CI")) + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme( + plot.title = ggplot2::element_text(hjust=1, vjust=1, size = amplif * 0.5), + axis.text.x = ggplot2::element_text(angle = 90, hjust = 1), + axis.text = ggplot2::element_text(size = amplif * 0.75), + axis.title = ggplot2::element_text(size = amplif), + legend.text = ggplot2::element_text(size = amplif), + legend.title = ggplot2::element_text(size = amplif), + strip.text = ggplot2::element_text(size = amplif), + legend.key = ggplot2::element_blank(), + panel.background = ggplot2::element_rect(fill = "grey95"), + panel.border = ggplot2::element_rect(colour = "black", fill = NA), + panel.grid.major.x = ggplot2::element_line(colour = "grey75"), + panel.grid.major.y = ggplot2::element_line(colour = "grey75"), + panel.grid.minor.x = ggplot2::element_blank(), + panel.grid.minor.y = ggplot2::element_blank(), + strip.background = ggplot2::element_rect(fill = "white", colour = "black") + )) + coord <- ggplot2::ggplot_build(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))$data # to have the summary statistics of the plot. is interesting: x = coord[[2]]$x, y = coord[[2]]$ymax_final + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotate(geom = "text", x = coord[[2]]$x, y = coord[[2]]$ymax * 1.05, label = tempo.plot$NB, size = amplif/3, color = "black", vjust = "bottom", hjust = "center", angle = 00)) # beware: no need of order() for labels because coord[[1]]$x set the order + if(log.scale == TRUE){ + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotation_logticks(sides = "l")) + }else{ + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::scale_y_continuous(expand = c(0,0), breaks = scales::extended_breaks(10))) + assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::coord_cartesian(ylim = c(0, max(tempo.plot$CI95.sup, na.rm = TRUE) * 1.1))) # fix limits + } + if(activate.pdf == TRUE){ + tempo <- dev.set(pdf.nb) # assign to avoid the message + }else{ + fun_open_window(pdf.disp = activate.pdf, width.fun = window.width, height.fun = window.height) + } + print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + ")))) } - - - ################################ End Main code ################################ Post Main code - - ############ Pdf window closing fun_close_specif_window()