################################ Initialization erase.objects <- TRUE # write TRUE to erase all the existing objects in R before starting the algorithm and FALSE otherwise. Beginners should use TRUE if(erase.objects == TRUE){ rm(list=ls()) erase.objects = TRUE } erase.graphs <- TRUE # write TRUE to erase all the graphic windows in R before starting the algorithm and FALSE otherwise ################################ End Initialization sink(stdout(), type = "message") # diverts R output to a connection script <- commandArgs(trailingOnly = FALSE)[1] # recover script name, e.g., r_341_conf $check_lod_gael_conf args <- commandArgs(trailingOnly = TRUE) # recover arguments written after the call of the Rscript, ie after r_341_conf $check_lod_gael_conf tempo.arg.names <- c("path.lib", "path.in", "path.out", "path.function1", "project.name", "label.size", "optional.text", "slurm.loop.nb", "analysis.kind") # objects names exactly in the same order as in the bash code and recovered in args if(length(args) != length(tempo.arg.names)){ tempo.cat <- paste0("\n\n================\n\nERROR: THE NUMBER OF ELEMENTS IN args (", length(args),") IS DIFFERENT FROM THE NUMBER OF ELEMENTS IN tempo.arg.names (", length(tempo.arg.names),")\nargs:", paste0(args, collapse = ","), "\ntempo.arg.names:", paste0(tempo.arg.names, collapse = ","), "\n\n================\n\n") stop(tempo.cat) } for(i in 1:length(tempo.arg.names)){ assign(tempo.arg.names[i], args[i]) if(is.character(get(tempo.arg.names[i])) == TRUE & grepl(x = get(tempo.arg.names[i]), pattern = "^[1234567890]+$")){ # convert into numeric assign(tempo.arg.names[i], as.integer(get(tempo.arg.names[i]))) }else if(is.character(get(tempo.arg.names[i])) == TRUE & grepl(x = get(tempo.arg.names[i]), pattern = "^[1234567890.+e-]+$")){ assign(tempo.arg.names[i], as.numeric(get(tempo.arg.names[i]))) }else if(is.character(get(tempo.arg.names[i])) == TRUE & grepl(x = get(tempo.arg.names[i]), pattern = "^(TRUE|FALSE)$")){ # convert into logical assign(tempo.arg.names[i], as.logical(get(tempo.arg.names[i]))) } } ################################ Recording of the initial parameters param.list <- c( "script", "args", "tempo.arg.names", tempo.arg.names ) if(any(duplicated(param.list))){ stop(paste0("\n\n================\n\nERROR: THE param.list OBJECT CONTAINS DUPLICATED ELEMENTS\n\n================\n\n")) # message for developers } char.length <- nchar(param.list) space.add <- max(char.length) - char.length + 5 param.ini.settings <- NULL for(i in 1:length(param.list)){ param.ini.settings <- c(param.ini.settings, paste0("\n", param.list[i], paste0(rep(" ", space.add[i]), collapse = ""), paste0(get(param.list[i]), collapse = ","))) } ################################ End Recording of the initial parameters ################################ DEBUG debug2 <- ' rm(list = ls()) erase.objects <- TRUE erase.graphs <- TRUE script <- "code ini v1.0.0" project.name <-"rogge12231" path.lib <- "C:/Users/Gael/Documents/R/win-library/3.5/" # absolute path of the library folder. Write "none" if not required path.in <- "Z:/rogge12231/rogge_12231_1550160832/" # absolute path of the data folder path.out <- "C:/Users/Gael/Desktop/" # absolute path of the output folder path.function1 <- "C:/Users/Gael/Documents/Git_versions_to_use/cute_little_R_functions-v4.5.0/cute_little_R_functions.R" # Define the absolute pathway of the folder containing functions created by Gael Millot project.name <- "rogge_project" activate.pdf = FALSE label.size <-12 optional.text <- "" slurm.loop.nb <- 3 analysis.kind <- "full_cross_validation" ' eval(parse(text = debug2)) ; cat(paste0("\n\n================\n\nERROR: ACTIVE DEBUG VALUES\n\n================\n\n")) ; stop() # data.frame(PARAM = tempo.arg.names, ARG = args) # for debug mode ################################ End DEBUG ################################ Packages verification and import # packages are imported even if functions are used using package.name::function() req.package.list <- c( "pheatmap", "corrplot", "ggplot2", "gridExtra", "lubridate", "RCurl" # for url.exists ) if(path.lib == "none"){ path.lib <- .libPaths() # }else{ # .libPaths(new = ) add path to default path .libPaths(new = sub(x = path.lib, pattern = "/$|\\\\$", replacement = "")) # .libPaths() does not support / at the end of a submitted path. Thus check and replace last / or \\ in path } for(i0 in 1:length(req.package.list)){ if( ! req.package.list[i0] %in% rownames(installed.packages(lib.loc = path.lib))){ stop(paste0("\n\n================\n\nERROR: PACKAGE ", req.package.list[i0], " MUST BE INSTALLED IN THE MENTIONNED DIRECTORY:\n", paste(path.lib, collapse = "\n"), "\n CHECK ALSO IN :\n", paste(.libPaths(), collapse = "\n"), "\n\n================\n\n")) }else{ suppressPackageStartupMessages(library(req.package.list[i0], lib.loc = path.lib, quietly = TRUE, character.only = TRUE)) } } ################################ End Packages verification and import ################################ Functions if(length(path.function1) != 1){ stop(paste0("\n\n============\n\nERROR: path.function1 PARAMETER MUST BE LENGTH 1: ", paste(path.function1, collapse = " "), "\n\n============\n\n")) }else if(grepl(x = path.function1, pattern = "^http")){ if( ! RCurl::url.exists(path.function1)){ stop(paste0("\n\n============\n\nERROR: HTTP INDICATED IN THE path.function1 PARAMETER DOES NOT EXISTS: ", path.function1, "\n\n============\n\n")) }else{ source(path.function1) # source the fun_ functions used below } }else if( ! grepl(x = path.function1, pattern = "^http")){ if( ! file.exists(path.function1)){ stop(paste0("\n\n============\n\nERROR: FILE INDICATED IN THE path.function1 PARAMETER DOES NOT EXISTS: ", path.function1, "\n\n============\n\n")) }else{ source(path.function1) # source the fun_ functions used below } } ################################ End Functions ################################ Main code ################ Pre-ignition checking arg.check <- NULL # for function debbuging checked.arg.names <- NULL # for function debbuging ee <- expression(arg.check <- c(arg.check, tempo$problem) , checked.arg.names <- c(checked.arg.names, tempo$param.name)) # initializations tempo <- fun_param_check(data = erase.objects, class = "logical", length = 1) ; eval(ee) tempo <- fun_param_check(data = erase.graphs, class = "logical", length = 1) ; eval(ee) tempo <- fun_param_check(data = script, class = "character", length = 1) ; eval(ee) tempo <- fun_param_check(data = args, class = "character", length = length(tempo.arg.names)) ; eval(ee) tempo <- fun_param_check(data = tempo.arg.names, class = "character", length = length(tempo.arg.names)) ; eval(ee) # imported objects tempo <- fun_param_check(data = path.lib, class = "character", length = 1) ; eval(ee) if(tempo$problem == FALSE & path.lib != "none"){ if( ! dir.exists(path.lib)){ cat(paste0("\n\n============\n\nERROR: DIRECTORY PATH INDICATED IN THE path.in PARAMETER DOES NOT EXISTS: ", path.in, "\n\n============\n\n")) arg.check <- TRUE } } tempo <- fun_param_check(data = path.in, class = "character", length = 1) ; eval(ee) if(tempo$problem == FALSE & ! dir.exists(path.in)){ cat(paste0("\n\n============\n\nERROR: DIRECTORY PATH INDICATED IN THE path.in PARAMETER DOES NOT EXISTS: ", path.in, "\n\n============\n\n")) arg.check <- TRUE } if(tempo$problem == FALSE & ! ("loop1_res_data.RData" %in% list.files(paste0(path.in, "loop1/")))){ cat(paste0("\n\n============\n\nERROR: loop1_res_data.RData SHOULD BE PRESENT IN ", paste0(path.in, "loop1/"), ":\nCHECK THE path.in PARAMETER\n\n============\n\n")) arg.check <- TRUE } tempo <- fun_param_check(data = path.out, class = "character", length = 1) ; eval(ee) if(tempo$problem == FALSE & ! dir.exists(path.out)){ cat(paste0("\n\n============\n\nERROR: DIRECTORY PATH INDICATED IN THE path.out PARAMETER DOES NOT EXISTS: ", path.out, "\n\n============\n\n")) arg.check <- TRUE } # path.function1 fully tested above tempo <- fun_param_check(data = path.function1, class = "character", length = 1) ; eval(ee) tempo <- fun_param_check(data = project.name, class = "character", length = 1) ; eval(ee) tempo <- fun_param_check(data = label.size, typeof = "integer", length = 1, double.as.integer.allowed = TRUE, neg.values = FALSE) ; eval(ee) tempo <- fun_param_check(data = optional.text, class = "character", length = 1) ; eval(ee) tempo <- fun_param_check(data = slurm.loop.nb, typeof = "integer", length = 1, double.as.integer.allowed = TRUE, neg.values = FALSE) ; eval(ee) tempo <- fun_param_check(data = analysis.kind, options = c("longit", "valid_boot", "full_cross_validation"), length = 1) ; eval(ee) if(any(arg.check) == TRUE){ stop() } ################ Ignition ini.time <- as.numeric(Sys.time()) # time of process begin, converted into seconds analysis.nb <- trunc(ini.time) # to provide a specific number ot each analysis log.file <- paste0("r_", project.name, "_", analysis.nb,"_report.txt") # name.dir <- paste0(project.name, "_", analysis.nb) # path.out<-paste0(path.out, name.dir) # suppressWarnings(dir.create(path.out)) backup.name <- NULL # names of the object to save fun_export_data(data = paste0("\n\n################################ ", log.file, " ################"), output = log.file, no.overwrite = FALSE, path = path.out, sep = 4) fun_export_data(path = path.out, data = "################################ INITIAL DATA", output = log.file, sep = 4) fun_export_data(path = path.out, data = paste0("SCRIPT USED: ", script), output = log.file) fun_export_data(path = path.out, data = paste0("KIND OF ANALYSIS PERFORMED: ", analysis.kind), output = log.file) fun_export_data(path = path.out, data = paste0("NUMBER OF LOOPS USED FOR COMPILATION: ", slurm.loop.nb), output = log.file) ################ Graphical parameter ignition zone.ini <- matrix(1, ncol=1) # to reset the layout if required if(erase.graphs == TRUE){ graphics.off() }else{ tempo.cat <- paste0("BEWARE: GRAPHICS HAVE NOT BEEN ERASED. GRAPHICAL PARAMETERS MAY HAVE NOT BEEN REINITIALIZED") fun_export_data(path = path.out, data = tempo.cat, output = log.file) } if(optional.text == "no.txt"){ optional.text <- "" }else{ fun_export_data(path = path.out, data = paste0("OPTIONAL TEXT:\n", optional.text), output = log.file) } activate.pdf <- TRUE par.ini <- fun_open_window(pdf.disp = activate.pdf, path.fun = path.out, pdf.name.file = paste0("final_graphs"), width.fun = 7, height.fun = 7, paper = "special", no.pdf.overwrite = TRUE, return.output = TRUE)$ini.par pdf.nb <- dev.cur() ################ Data import for(i0 in 1:slurm.loop.nb){ load(paste0(path.in, "loop", i0, "/loop", i0, "_res_data.RData")) } # > class(loop1_ttab) # [1] "data.frame" # > class(mod.gene.names) # [1] "character" # > class(confmat1.genes.logreg$result) # [1] "matrix" # class(data.pred1.genes.logreg) # [1] "data.frame" # class(data.roc1.genes.rpart) # [1] "data.frame" # > class(final.gene.list) # [1] "data.frame" ################ Data compilation backup.name <- NULL # ttab if(analysis.kind == "full_cross_validation"){ final.ttab <- data.frame(NULL, stringsAsFactors = FALSE) for(i0 in 1:slurm.loop.nb){ if(nrow(get(paste0("loop", i0, "_ttab"))) > 0){ final.ttab <- rbind(final.ttab, data.frame(get(paste0("loop", i0, "_ttab")), LOOP_NB = paste0("loop", i0), stringsAsFactors = FALSE)) }else{ final.ttab <- rbind(final.ttab, data.frame(logFC = NA, AveExpr = NA, t = NA, P.Value = NA, adj.P.Val = NA, B = NA, LOOP_NB = paste0("loop", i0), stringsAsFactors = FALSE)) } } }else{ final.ttab <- get(paste0("loop1_ttab")) fun_export_data(path = path.out, data = paste0("final.ttab OBJECT CONTAINS ONLY LOOP1 RESULTS BECAUSE NO LOOP PERFORMED FOR THIS USING ", analysis.kind, " ANALYSIS"), output = log.file) } backup.name <- c(backup.name, "final.ttab") # select.gene.curve if(analysis.kind == "full_cross_validation"){ final.select.gene.curve <- data.frame(NULL, stringsAsFactors = FALSE) for(i0 in 1:slurm.loop.nb){ if(nrow(get(paste0("loop", i0, "_select.gene.curve"))) > 0){ final.select.gene.curve <- rbind(final.select.gene.curve, data.frame(get(paste0("loop", i0, "_select.gene.curve")), LOOP_NB = paste0("loop", i0), stringsAsFactors = FALSE)) }else{ final.select.gene.curve <- rbind(final.select.gene.curve, data.frame(x = NA, y = NA, PANEL = NA, group = NA, shape = NA, colour = NA, size = NA, fill = NA, alpha = NA, stroke = NA, LOOP_NB = paste0("loop", i0), stringsAsFactors = FALSE)) } } }else{ final.select.gene.curve <- get(paste0("loop1_select.gene.curve")) fun_export_data(path = path.out, data = paste0("final.select.gene.curve OBJECT CONTAINS ONLY LOOP1 RESULTS BECAUSE NO LOOP PERFORMED FOR THIS USING ", analysis.kind, " ANALYSIS"), output = log.file) } backup.name <- c(backup.name, "final.select.gene.curve") # mod.gene.names if(analysis.kind == "full_cross_validation"){ final.mod.gene.names <- data.frame(NULL, stringsAsFactors = FALSE) for(i0 in 1:slurm.loop.nb){ if(length(get(paste0("loop", i0, "_mod.gene.names"))) > 0){ final.mod.gene.names <- rbind(final.mod.gene.names, data.frame(GENE = get(paste0("loop", i0, "_mod.gene.names")), LOOP_NB = paste0("loop", i0), stringsAsFactors = FALSE)) }else{ final.mod.gene.names <- rbind(final.mod.gene.names, data.frame(GENE = NA, LOOP_NB = paste0("loop", i0), stringsAsFactors = FALSE)) } } }else{ final.mod.gene.names <- data.frame(GENE = get(paste0("loop1_mod.gene.names")), LOOP_NB = paste0("loop", i0), stringsAsFactors = FALSE) fun_export_data(path = path.out, data = paste0("final.mod.gene.names OBJECT CONTAINS ONLY LOOP1 RESULTS BECAUSE NO LOOP PERFORMED FOR THIS USING ", analysis.kind, " ANALYSIS"), output = log.file) } backup.name <- c(backup.name, "final.mod.gene.names") # gene.importance if(analysis.kind == "full_cross_validation"){ final.gene.importance <- data.frame(NULL, stringsAsFactors = FALSE) for(i0 in 1:slurm.loop.nb){ if(nrow(get(paste0("loop", i0, "_gene.importance"))) > 0){ final.gene.importance <- rbind(final.gene.importance, data.frame(get(paste0("loop", i0, "_gene.importance")), LOOP_NB = paste0("loop", i0), stringsAsFactors = FALSE)) }else{ final.gene.importance <- rbind(final.gene.importance, data.frame(features = NA, importance = NA, LOOP_NB = paste0("loop", i0), stringsAsFactors = FALSE)) } } }else{ final.gene.importance <- get(paste0("loop1_gene.importance")) fun_export_data(path = path.out, data = paste0("final.gene.importance OBJECT CONTAINS ONLY LOOP1 RESULTS BECAUSE NO LOOP PERFORMED FOR THIS USING ", analysis.kind, " ANALYSIS"), output = log.file) } backup.name <- c(backup.name, "final.gene.importance") # loop1_confmat1.genes.rf, data.pred1.genes.logreg, data.roc1.genes.logreg data.kind <- c("genes", "crp", "genes.crp") # beware, the order is important ana.kind <- c("rf", "logreg", "rpart") for(i0 in 1:length(data.kind)){ for(i1 in 1:length(ana.kind)){ assign(paste0("final.confmat", i0, ".", data.kind[i0], ".", ana.kind[i1]), data.frame(NULL, stringsAsFactors = FALSE)) assign(paste0("final.data.pred", i0, ".", data.kind[i0], ".", ana.kind[i1]), data.frame(NULL, stringsAsFactors = FALSE)) assign(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1]), data.frame(NULL, stringsAsFactors = FALSE)) for(i2 in 1:slurm.loop.nb){ # matrix confusion if(nrow(get(paste0("loop", i2, "_confmat", i0, ".", data.kind[i0], ".", ana.kind[i1]))$result) > 0){ # transform the table into a 1 line data frame tempo1 <- get(paste0("loop", i2, "_confmat", i0, ".", data.kind[i0], ".", ana.kind[i1])) tempo2 <- c(c(tempo1$result[1:2, 1:2]), tempo1$task.desc$size) tempo3 <- c("TRUE_NR_PRED_NR", "TRUE_R_PRED_NR", "TRUE_NR_PRED_R", "TRUE_R_PRED_R", "SIZE") tempo4 <- eval(parse(text = paste0("data.frame(", paste(tempo3, tempo2, sep = "=", collapse = ","), ", LOOP_NB = 'loop", i2, "', stringsAsFactors = FALSE)"))) assign(paste0("final.confmat", i0, ".", data.kind[i0], ".", ana.kind[i1]), rbind(get(paste0("final.confmat", i0, ".", data.kind[i0], ".", ana.kind[i1])), tempo4)) }else{ assign(paste0("final.confmat", i0, ".", data.kind[i0], ".", ana.kind[i1]), rbind(get(paste0("final.confmat", i0, ".", data.kind[i0], ".", ana.kind[i1])), data.frame(TRUE_NR_PRED_NR = NA, TRUE_R_PRED_NR = NA, TRUE_NR_PRED_R = NA, TRUE_R_PRED_R = NA, SIZE = NA, LOOP_NB = paste0("loop", i0), stringsAsFactors = FALSE))) } # predictions if(nrow(get(paste0("loop", i2, "_data.pred", i0, ".", data.kind[i0], ".", ana.kind[i1]))) > 0){ assign(paste0("final.data.pred", i0, ".", data.kind[i0], ".", ana.kind[i1]), rbind(get(paste0("final.data.pred", i0, ".", data.kind[i0], ".", ana.kind[i1])), data.frame(get(paste0("loop", i2, "_data.pred", i0, ".", data.kind[i0], ".", ana.kind[i1])), LOOP_NB = paste0("loop", i2), stringsAsFactors = FALSE))) }else{ assign(paste0("final.data.pred", i0, ".", data.kind[i0], ".", ana.kind[i1]), rbind(get(paste0("final.data.pred", i0, ".", data.kind[i0], ".", ana.kind[i1])), data.frame(truth = NA, prob.NR = NA, prob.R = NA, response = NA, LOOP_NB = paste0("loop", i0), stringsAsFactors = FALSE))) } # roc curves if(nrow(get(paste0("loop", i2, "_data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1]))) > 0){ assign(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1]), rbind(get(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1])), data.frame(get(paste0("loop", i2, "_data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1])), LOOP_NB = paste0("loop", i2), stringsAsFactors = FALSE))) }else{ assign(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1]), rbind(get(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1])), data.frame(x = NA, y = NA, PANEL = NA, group = NA, coloup = NA, sizep = NA, linetypep = NA, alphap = NA, LOOP_NB = paste0("loop", i0), stringsAsFactors = FALSE))) } } backup.name <- c(backup.name, paste0("final.confmat", i0, ".", data.kind[i0], ".", ana.kind[i1])) backup.name <- c(backup.name, paste0("final.data.pred", i0, ".", data.kind[i0], ".", ana.kind[i1])) backup.name <- c(backup.name, paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1])) } } # final.gene.list final.gene.list <- data.frame(NULL, stringsAsFactors = FALSE) for(i0 in 1:slurm.loop.nb){ if(length(get(paste0("loop", i0, "_final.gene.list"))) > 0){ final.gene.list <- rbind(final.gene.list, data.frame(get(paste0("loop", i0, "_final.gene.list")), LOOP_NB = paste0("loop", i0), stringsAsFactors = FALSE)) }else{ final.gene.list <- rbind(final.gene.list, data.frame(sfeats = NA, Freq = NA, LOOP_NB = paste0("loop", i0), stringsAsFactors = FALSE)) } } backup.name <- c(backup.name, "final.gene.list") save(list=c(backup.name), file = paste0(path.out, "compiled_data.RData")) fun_export_data(path = path.out, data = paste0("THE COMPILED DATA ARE SAVED IN: ", paste0(path.out, "compiled_data.RData")), output = log.file) ################ 2 plotting the heatmap adnd corplot fun_export_data(path = path.out, data = "################################ HEATMAP AND CORRELATION PLOT ", output = log.file, sep = 4) fun_export_data(path = path.out, data = paste0("HEATMAP AND CORRELATION PLOT COME FROM INITIAL DATASET (NO BOOTSTRAP INFLUENCE)"), output = log.file) tempo <- dev.set(pdf.nb) # assign to avoid the message print(loop1_heatmap.plot) # same for all the loops corrplot(loop1_corr.plot, tl.col = "black", tl.cex = label.size / 10) # same for all the loops backup.name <- c(backup.name, "loop1_heatmap.plot", "loop1_corr.plot") ################ 3 Differential analysis with limma fun_export_data(path = path.out, data = "################################ LIMMA ANALYSIS", output = log.file, sep = 4) if(nrow(final.ttab) == 0 | nrow(na.omit(final.ttab)) == 0){ fun_export_data(path = path.out, data = paste0("NO GENE LIST RESULTS FROM THE LIMMA ANALYSIS (P VALUES ABOVE 0.05 AFTER CORRECTION FOR INSTANCE)"), output = log.file) fun_export_data(path = path.out, data = final.ttab, output = log.file) }else if(analysis.kind == "valid_boot"){ fun_export_data(path = path.out, data = paste0("BEWARE: NO LOOP FOR THE LIMMA ANALYSIS (", analysis.kind, " KIND OF ANALYSIS)"), output = log.file) }else{ tempo1 <- c(table(rownames(final.ttab))) tempo2 <- data.frame( ADJ_P_VALUE_MEDIAN = quantile(final.ttab$adj.P.Val, probs = 0.5, na.rm = TRUE), ADJ_P_VALUE_CI95_INF = quantile(final.ttab$adj.P.Val, probs = 0.025, na.rm = TRUE), ADJ_P_VALUE_CI95_SUP = quantile(final.ttab$adj.P.Val, probs = 0.975, na.rm = TRUE) ) final.ttab.freq <- data.frame(GENE = names(tempo1), NB = unname(tempo1), FREQ = unname(tempo1)/sum(tempo1), tempo2) rownames(final.ttab.freq) <- NULL fun_export_data(path = path.out, data = "FREQUENCIES AND P VALUES MEDIANS & CI: ", output = log.file) fun_export_data(path = path.out, data = final.ttab.freq, output = log.file) # horiz barplot bar.width <- 0.5 log.scale <- FALSE amplif <- label.size tempo.gg.name <- "gg.indiv.plot." tempo.gg.count <- 0 if(nrow(final.ttab.freq) > 20){ tempo1 <- table(final.ttab.freq$FREQ) tempo2 <- tempo1[order(names(tempo1), decreasing = TRUE)] tempo3 <- as.numeric(names(cumsum(tempo2)[cumsum(tempo2) <= 20])) tempo.plot <- final.ttab.freq[round(final.ttab.freq$FREQ, 7) %in% round(tempo3, 7), ] assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = tempo.plot, mapping = ggplot2::aes(x = reorder(GENE, FREQ), y = FREQ))) # reorder from higher to lower tempo.txt <- paste0("BEWARE: ONLY THE ", nrow(tempo.plot), " MOST FREQUENT GENES PLOTTED, AMONG ", nrow(final.ttab.freq)) }else{ tempo.plot <- final.ttab.freq assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = tempo.plot, mapping = ggplot2::aes(x = reorder(GENE, FREQ), y = FREQ))) # reorder from higher to lower tempo.txt <- "" } assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_bar(stat = "identity", position = "dodge", color = "black", fill = grey(0.8), width = bar.width)) # assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_errorbar(ggplot2::aes(ymin = PROP.CI95.inf, ymax = PROP.CI95.sup), position = ggplot2::position_dodge(width = bar.width), color = "black", width = 0)) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle(paste0("LIMMA GENE LIST\n(PROP OF TIMES THE GENE IS SIGNIFICANT FOR ", slurm.loop.nb, " LOOPS)\n", tempo.txt))) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::xlab("")) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ylab("PROPORTION")) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme_classic(base_size = amplif)) coord <- ggplot2::ggplot_build(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))$data # to have the summary statistics of the plot. is interesting: x = coord[[2]]$x, y = coord[[2]]$ymax_final assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotate(geom = "text", x = coord[[1]]$x, y = coord[[1]]$ymax, label = round(coord[[1]]$y, 3), size = amplif/2, color = "black", vjust = "center", hjust = "right")) # beware: no need of order() for labels because coord[[1]]$x set the order if(log.scale == TRUE){ assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotation_logticks(sides = "l")) }else{ assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::scale_y_continuous(expand = c(0,0), breaks = scales::extended_breaks(10))) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::coord_flip(ylim = c(0, max(final.ttab.freq$FREQ, na.rm = TRUE) * 1.02))) # fix limits } if(activate.pdf == TRUE){ tempo <- dev.set(pdf.nb) # assign to avoid the message }else{ windows(5, 5) } print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + ")))) } fun_export_data(path = path.out, data = paste0("SEE THE final.ttab AND THE final.ttab.freq OBJECTS IN THE compiled_data.RData FILE PRESENT IN: ", path.out), output = log.file) backup.name <- c(backup.name, "final.ttab.freq") ################ 4 Machine Learning based analysis ######## 4.1 Discovery: Learning the Random Forest fun_export_data(path = path.out, data = "################################ RANDOM FOREST LEARNING USING THE DISCOVERY SET", output = log.file, sep = 4) #mod.gene.names fun_export_data(path = path.out, data = "################ LIST OF GENES SELECTED DURING THE MODEL ELABORATION", output = log.file, sep = 4) if(nrow(final.mod.gene.names) == 0 | nrow(na.omit(final.mod.gene.names)) == 0){ fun_export_data(path = path.out, data = paste0("NO GENE LIST RESULTS FROM THE RANDOM FOREST TRAINING"), output = log.file) fun_export_data(path = path.out, data = final.mod.gene.names, output = log.file) }else if(analysis.kind == "valid_boot"){ fun_export_data(path = path.out, data = paste0("BEWARE: NO LOOP FOR THE THE RANDOM FOREST TRAINING (", analysis.kind, " KIND OF ANALYSIS)"), output = log.file) fun_export_data(path = path.out, data = final.mod.gene.names, output = log.file) }else{ tempo1 <- c(table(final.mod.gene.names$GENE)) final.mod.gene.names.freq <- data.frame(GENE = names(tempo1), NB = unname(tempo1), FREQ = unname(tempo1)/sum(tempo1)) rownames(final.mod.gene.names.freq) <- NULL fun_export_data(path = path.out, data = "FREQUENCIES: ", output = log.file) fun_export_data(path = path.out, data = final.mod.gene.names.freq, output = log.file) # horiz barplot bar.width <- 0.5 log.scale <- FALSE amplif <- label.size tempo.gg.name <- "gg.indiv.plot." tempo.gg.count <- 0 if(nrow(final.mod.gene.names.freq) > 20){ tempo1 <- table(final.mod.gene.names.freq$FREQ) tempo2 <- tempo1[order(names(tempo1), decreasing = TRUE)] tempo3 <- as.numeric(names(cumsum(tempo2)[cumsum(tempo2) <= 20])) tempo.plot <- final.mod.gene.names.freq[round(final.mod.gene.names.freq$FREQ, 7) %in% round(tempo3, 7), ] assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = tempo.plot, mapping = ggplot2::aes(x = reorder(GENE, FREQ), y = FREQ))) # reorder from higher to lower tempo.txt <- paste0("BEWARE: ONLY THE ", nrow(tempo.plot), " MOST FREQUENT GENES PLOTTED, AMONG ", nrow(final.mod.gene.names.freq)) }else{ tempo.plot <- final.mod.gene.names.freq assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = tempo.plot, mapping = ggplot2::aes(x = reorder(GENE, FREQ), y = FREQ))) # reorder from higher to lower tempo.txt <- "" } assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_bar(stat = "identity", position = "dodge", color = "black", fill = grey(0.8), width = bar.width)) # assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_errorbar(ggplot2::aes(ymin = PROP.CI95.inf, ymax = PROP.CI95.sup), position = ggplot2::position_dodge(width = bar.width), color = "black", width = 0)) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle(paste0("RANDOM FOREST MODELISATION GENE LIST\n(PROP OF TIMES THE GENES HAVE BEEN SELECTED FOR ", slurm.loop.nb, " LOOPS)\n", tempo.txt))) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::xlab("")) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ylab("PROPORTION")) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme_classic(base_size = amplif)) coord <- ggplot2::ggplot_build(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))$data # to have the summary statistics of the plot. is interesting: x = coord[[2]]$x, y = coord[[2]]$ymax_final assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotate(geom = "text", x = coord[[1]]$x, y = coord[[1]]$ymax, label = round(coord[[1]]$y, 3), size = amplif/2, color = "black", vjust = "center", hjust = "right")) # beware: no need of order() for labels because coord[[1]]$x set the order if(log.scale == TRUE){ assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotation_logticks(sides = "l")) }else{ assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::scale_y_continuous(expand = c(0,0), breaks = scales::extended_breaks(10))) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::coord_flip(ylim = c(0, max(final.mod.gene.names.freq$FREQ, na.rm = TRUE) * 1.02))) # fix limits } if(activate.pdf == TRUE){ tempo <- dev.set(pdf.nb) # assign to avoid the message }else{ windows(5, 5) } print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + ")))) } fun_export_data(path = path.out, data = paste0("SEE THE final.mod.gene.names AND THE final.mod.gene.names.freq OBJECTS IN THE compiled_data.RData FILE PRESENT IN: ", path.out), output = log.file) backup.name <- c(backup.name, "final.mod.gene.names.freq") #gene.importance fun_export_data(path = path.out, data = "################ GENE IMPORTANCE DURING THE MODEL ELABORATION", output = log.file, sep = 4) if(nrow(final.gene.importance) == 0 | nrow(na.omit(final.gene.importance)) == 0){ fun_export_data(path = path.out, data = paste0("NO GENE IMPORTANCE RESULTS FROM THE RANDOM FOREST TRAINING"), output = log.file) fun_export_data(path = path.out, data = final.gene.importance, output = log.file) }else if(analysis.kind == "valid_boot"){ fun_export_data(path = path.out, data = paste0("BEWARE: NO LOOP FOR THE THE RANDOM FOREST TRAINING (", analysis.kind, " KIND OF ANALYSIS)"), output = log.file) fun_export_data(path = path.out, data = final.gene.importance, output = log.file) }else{ tempo1 <- aggregate(final.gene.importance$importance, list(final.gene.importance$features), median, na.rm = TRUE) names(tempo1) <- c("GENE", "MEDIAN") tempo2 <- aggregate(final.gene.importance$features, list(final.gene.importance$features), function(x){length(x[ ! is.na(x)])}) # nb if(identical(tempo1$GENE, tempo2$Group.1)){ tempo1 <- data.frame(tempo1, NB = tempo2$x) }else{ cat(paste0("\n\n============\n\nERROR: IN THE PLOT OF THE final.gene.importance OBJECT, tempo1 AND tempo2 SHOULD HAVE THE SAME GENE COLUMN: \n\n============\n\n")) tempo1 tempo2 cat("\n\n============\n\n") stop() } final.gene.importance.median <- cbind(tempo1, CI95.inf = aggregate(final.gene.importance$importance, list(final.gene.importance$features), quantile, probs = 0.025, na.rm = TRUE)[, 2], CI95.sup = aggregate(final.gene.importance$importance, list(final.gene.importance$features), quantile, probs = 0.975, na.rm = TRUE)[, 2], stringsAsFactors = FALSE ) fun_export_data(path = path.out, data = "IMPORTANCE: ", output = log.file) fun_export_data(path = path.out, data = final.gene.importance.median, output = log.file) # horiz barplot + CI bar.width <- 0.5 log.scale <- FALSE amplif <- label.size tempo.gg.name <- "gg.indiv.plot." tempo.gg.count <- 0 if(nrow(final.gene.importance.median) > 20){ tempo1 <- table(final.gene.importance.median$MEDIAN) tempo2 <- tempo1[order(names(tempo1), decreasing = TRUE)] tempo3 <- as.numeric(names(cumsum(tempo2)[cumsum(tempo2) <= 20])) tempo.plot <- final.gene.importance.median[round(final.gene.importance.median$MEDIAN, 7) %in% round(tempo3, 7), ] assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = tempo.plot, mapping = ggplot2::aes(x = reorder(GENE, -MEDIAN), y = MEDIAN))) # reorder from higher to lower tempo.txt <- paste0("BEWARE: ONLY THE ", nrow(tempo.plot), " MOST IMPORTANT GENES PLOTTED, AMONG ", nrow(final.gene.importance.median)) }else{ tempo.plot <- final.gene.importance.median assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = tempo.plot, mapping = ggplot2::aes(x = reorder(GENE, -MEDIAN), y = MEDIAN))) # reorder from higher to lower tempo.txt <- "" } assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_bar(stat = "identity", position = "dodge", color = "black", fill = grey(0.8), width = bar.width)) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_errorbar(ggplot2::aes(ymin = CI95.inf, ymax = CI95.sup), position = ggplot2::position_dodge(width = bar.width), color = "black", width = 0)) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle(paste0("GENE IMPORTANCE (NUMBERS ARE NUMBER OF TIMES THE GENE IS SEEN FOR ", slurm.loop.nb, " LOOPS)"))) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::xlab("")) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ylab("MEDIAN +/- 95CI")) # assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme_classic(base_size = amplif)) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme( # plot.margin = margin(up.space.mds, right.space.mds, down.space.mds, left.space.mds, "inches"), plot.title = ggplot2::element_text(hjust=1, vjust=1, size = amplif * 0.5), axis.text.x = element_text(angle = 90, hjust = 1), axis.text = ggplot2::element_text(size = amplif * 0.75), axis.title = ggplot2::element_text(size = amplif), legend.text = ggplot2::element_text(size = amplif), legend.title = ggplot2::element_text(size = amplif), strip.text = ggplot2::element_text(size = amplif), legend.key = ggplot2::element_blank(), panel.background = ggplot2::element_rect(fill = "grey95"), panel.border = ggplot2::element_rect(colour = "black", fill = NA), panel.grid.major.x = ggplot2::element_line(colour = "grey75"), panel.grid.major.y = ggplot2::element_line(colour = "grey75"), panel.grid.minor.x = ggplot2::element_blank(), panel.grid.minor.y = ggplot2::element_blank(), strip.background = ggplot2::element_rect(fill = "white", colour = "black") )) coord <- ggplot2::ggplot_build(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + "))))$data # to have the summary statistics of the plot. is interesting: x = coord[[2]]$x, y = coord[[2]]$ymax_final assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotate(geom = "text", x = coord[[1]]$x, y = coord[[1]]$ymax, label = tempo.plot$NB, size = amplif/2, color = "black", vjust = "center", hjust = "right", angle = 90)) # beware: no need of order() for labels because coord[[1]]$x set the order if(log.scale == TRUE){ assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotation_logticks(sides = "l")) }else{ assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::scale_y_continuous(expand = c(0,0), breaks = scales::extended_breaks(10))) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::coord_cartesian(ylim = c(0, max(final.gene.importance.median$CI95.sup, na.rm = TRUE) * 1.02))) # fix limits } if(activate.pdf == TRUE){ tempo <- dev.set(pdf.nb) # assign to avoid the message }else{ windows(5, 5) } print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + ")))) } fun_export_data(path = path.out, data = paste0("SEE THE final.gene.importance AND THE final.gene.importance.median OBJECTS IN THE compiled_data.RData FILE PRESENT IN: ", path.out), output = log.file) backup.name <- c(backup.name, "final.gene.importance.median") #select.gene.curve fun_export_data(path = path.out, data = "################ OPTIMAL NUMBER OF GENES DURING THE MODEL ELABORATION", output = log.file, sep = 4) if(nrow(final.select.gene.curve) == 0 | nrow(na.omit(final.select.gene.curve)) == 0){ fun_export_data(path = path.out, data = paste0("NO OPTIMAL RESULTS FROM THE RANDOM FOREST TRAINING"), output = log.file) fun_export_data(path = path.out, data = final.select.gene.curve, output = log.file) }else if(analysis.kind == "valid_boot"){ fun_export_data(path = path.out, data = paste0("BEWARE: NO LOOP FOR THE THE RANDOM FOREST TRAINING (", analysis.kind, " KIND OF ANALYSIS)"), output = log.file) fun_export_data(path = path.out, data = final.select.gene.curve, output = log.file) }else{ tempo1 <- aggregate(final.select.gene.curve$y, list(final.select.gene.curve$x), median, na.rm = TRUE) names(tempo1) <- c("X", "MEDIAN") final.select.gene.curve.median <- cbind(tempo1, CI95.inf = aggregate(final.select.gene.curve$y, list(final.select.gene.curve$x), quantile, probs = 0.025, na.rm = TRUE)[, 2], CI95.sup = aggregate(final.select.gene.curve$y, list(final.select.gene.curve$x), quantile, probs = 0.975, na.rm = TRUE)[, 2], stringsAsFactors = FALSE ) fun_export_data(path = path.out, data = "OPTIMAL NUMBER OF GENES: ", output = log.file) fun_export_data(path = path.out, data = final.select.gene.curve.median, output = log.file) # line + CI area bar.width <- 0.5 log.scale <- FALSE amplif <- label.size tempo.gg.name <- "gg.indiv.plot." tempo.gg.count <- 0 assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = final.select.gene.curve.median, mapping = ggplot2::aes(x = X, y = MEDIAN))) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_point(color = "red", shape = 16, size = 2)) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_line(color = "red")) # assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_errorbar(ggplot2::aes(ymin = CI95.inf, ymax = CI95.sup), position = ggplot2::position_dodge(), color = "black", width = 0)) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_ribbon(ggplot2::aes(ymin = CI95.inf, ymax = CI95.sup), color = "red", alpha = 0.3)) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle(paste0("OPTIMAL NUMBER OF GENES (n = ", slurm.loop.nb,")"))) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::xlab("NUMBER OF GENES")) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ylab("MEDIAN +/- 95CI")) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme_classic(base_size = amplif)) if(log.scale == TRUE){ assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::annotation_logticks(sides = "l")) }else{ assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::scale_y_continuous(expand = c(0,0), breaks = scales::extended_breaks(10))) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::coord_cartesian(ylim = c(0, max(final.select.gene.curve.median$CI95.sup, na.rm = TRUE) * 1.02))) # fix limits } if(activate.pdf == TRUE){ tempo <- dev.set(pdf.nb) # assign to avoid the message }else{ windows(5, 5) } print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + ")))) } fun_export_data(path = path.out, data = paste0("SEE THE final.select.gene.curve AND THE final.select.gene.curve.median OBJECTS IN THE compiled_data.RData FILE PRESENT IN: ", path.out), output = log.file) backup.name <- c(backup.name, "final.select.gene.curve.median") ######## 4.2 Validate the model # fun_export_data(path = path.out, data = "################################ VALIDATION", output = log.file, sep = 4) fun_export_data(path = path.out, data = "################ ROC CURVES", output = log.file) data.kind <- c("genes", "crp", "genes.crp") # beware, the order is important ana.kind <- c("rf", "logreg", "rpart") for(i0 in 1:length(data.kind)){ for(i1 in 1:length(ana.kind)){ fun_export_data(path = path.out, data = paste0("######## ", data.kind[i0], " AND ", ana.kind[i1], " ANALYSIS"), output = log.file) if(nrow(get(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1]))) == 0 | nrow(na.omit(get(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1])))) == 0){ fun_export_data(path = path.out, data = paste0("NO ROC CURVE FOR THE ", data.kind[i0], " AND ", ana.kind[i1], " ANALYSIS"), output = log.file) fun_export_data(path = path.out, data = get(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1])), output = log.file) }else{ tempo1 <- aggregate(get(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1]))$x, list(get(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1]))$CUTOFF), median, na.rm = TRUE) names(tempo1) <- c("CUTOFF", "X_MEDIAN") tempo2 <- aggregate(get(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1]))$y, list(get(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1]))$CUTOFF), median, na.rm = TRUE) names(tempo2) <- c("CUTOFF", "Y_MEDIAN") assign(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1], ".median"), cbind(tempo1, tempo2["Y_MEDIAN"])) fun_export_data(path = path.out, data = "MEDIAN ROC CURVE: ", output = log.file) fun_export_data(path = path.out, data = final.select.gene.curve.median, output = log.file) # roc curves bar.width <- 0.5 log.scale <- FALSE amplif <- label.size tempo.gg.name <- "gg.indiv.plot." tempo.gg.count <- 0 assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggplot(data = get(paste0("final.data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1], ".median")), mapping = ggplot2::aes(x = X_MEDIAN, y = Y_MEDIAN))) for(i2 in 1:slurm.loop.nb){ # roc curves if(nrow(get(paste0("loop", i2, "_data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1]))) > 0){ assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_line(data = get(paste0("loop", i2, "_data.roc", i0, ".", data.kind[i0], ".", ana.kind[i1])), mapping = ggplot2::aes(x = x, y = y), color = "grey", alpha = 0.5)) } } assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::geom_line(color = "red")) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ggtitle(paste0("OPTIMAL NUMBER OF GENES (n = ", slurm.loop.nb,")"))) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::xlab("1 - SPECIFICITY")) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::ylab("SENSITIVITY")) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), m.gg <- ggplot2::theme_classic(base_size = amplif)) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::scale_y_continuous(expand = c(0,0), breaks = scales::extended_breaks(10))) assign(paste0(tempo.gg.name, tempo.gg.count <- tempo.gg.count + 1), ggplot2::coord_cartesian(xlim = c(0, 1), ylim = c(0, 1))) # fix limits if(activate.pdf == TRUE){ tempo <- dev.set(pdf.nb) # assign to avoid the message }else{ windows(5, 5) } print(eval(parse(text = paste(paste0(tempo.gg.name, 1:tempo.gg.count), collapse = " + ")))) } fun_export_data(path = path.out, data = paste0("SEE THE final.select.gene.curve AND THE final.select.gene.curve.median OBJECTS IN THE compiled_data.RData FILE PRESENT IN: ", path.out), output = log.file) backup.name <- c(backup.name, "final.select.gene.curve.median") ###### ???Add build } } tempo <- dev.set(pdf.nb) # assign to avoid the message roc1.genes.rf <- plotROCCurves(generateThreshVsPerfData(pred1.genes.rf, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 1.genes (only genes) \n AUC=%.2f", performance(pred1.genes.rf, auc))) + theme( plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size), axis.text = ggplot2::element_text(size = label.size), axis.title = ggplot2::element_text(size = label.size), legend.text = ggplot2::element_text(size = label.size), legend.title = ggplot2::element_text(size = label.size), strip.text = ggplot2::element_text(size = label.size) ) roc2.crp.rf <- plotROCCurves(generateThreshVsPerfData(pred2.crp.rf, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 2.crp (only CRP) \n AUC=%.2f", performance(pred2.crp.rf, auc))) + theme( plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size), axis.text = ggplot2::element_text(size = label.size), axis.title = ggplot2::element_text(size = label.size), legend.text = ggplot2::element_text(size = label.size), legend.title = ggplot2::element_text(size = label.size), strip.text = ggplot2::element_text(size = label.size) ) roc3.genes.crp.rf <- plotROCCurves(generateThreshVsPerfData(pred3.genes.crp.rf, measures = list(fpr, tpr, mmce))) + theme_bw() + ggtitle(sprintf("Model 3.genes.crp (genes + CRP) \n AUC=%.2f", performance(pred3.genes.crp.rf, auc))) + theme( plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size), axis.text = ggplot2::element_text(size = label.size), axis.title = ggplot2::element_text(size = label.size), legend.text = ggplot2::element_text(size = label.size), legend.title = ggplot2::element_text(size = label.size), strip.text = ggplot2::element_text(size = label.size) ) data.roc1.genes.rf <- ggplot2::ggplot_build(roc1.genes.rf)$data[[1]] data.roc2.crp.rf <- ggplot2::ggplot_build(roc2.crp.rf)$data[[1]] data.roc3.genes.crp.rf <- ggplot2::ggplot_build(roc3.genes.crp.rf)$data[[1]] backup.name <- c(backup.name, "data.roc1.genes.rf", "data.roc2.crp.rf", "data.roc3.genes.crp.rf") gridExtra::grid.arrange(roc1.genes.rf, roc2.crp.rf, roc3.genes.crp.rf, nrow = 2) # ``` ######## 4.6 Plot top feature boxplots ggbox <- ggplot(data = boxdat_melt, aes(x=Y, y=value, colour=Y, shape = CV, linetype = CV)) + geom_beeswarm(dodge.width = 0.75, alpha = 0.7) + geom_boxplot(outlier.shape = NA, fill = NA) + facet_wrap(~ Gene, ncol = 5) + theme_bw() + theme( plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size), axis.text = ggplot2::element_text(size = label.size), axis.title = ggplot2::element_text(size = label.size), legend.text = ggplot2::element_text(size = label.size), legend.title = ggplot2::element_text(size = label.size), strip.text = ggplot2::element_text(size = label.size) ) tempo <- dev.set(pdf.nb) # assign to avoid the message ggbox ######## 4.7 Frequencies associated to each predictor ################################ End Main code ################################ Post Main code save(list=c(backup.name), file = paste0(path.out, "loop", slurm.loop.nb, "_res_data.RData")) ############ Pdf window closing fun_close_specif_window() ############ total computing time fin.time <- as.numeric(Sys.time()) # time of process end cat("TOTAL COMPUTATION TIME: ", as.character(lubridate::seconds_to_period(round(fin.time - ini.time))), "\n\n") fun_export_data(path = path.out, data = paste0("\n\n\n\n################################ TOTAL COMPUTING TIME\n\n", as.character(lubridate::seconds_to_period(round(fin.time - ini.time)))), output = log.file, sep = 4) ############ Parameter printing fun_export_data("\n\n\n\n################################ INITIAL SETTINGS OF PARAMETERS", output = log.file, path = path.out) fun_export_data(path = path.out, data = param.ini.settings, output = log.file, vector.cat = TRUE) fun_export_data("\n\n\n\n################################ SYSTEM AND PACKAGES", output = log.file, path = path.out) fun_export_data(path = path.out, data = sessionInfo(), output = log.file, vector.cat = TRUE) ################################ End Post Main code