#!/bin/bash # shebang (#! https://en.wikipedia.org/wiki/Shebang_%28Unix%29) indicating to the shell what program to interpret the script with, when executed, probably optional here.
# export allows the variable to be used in subprocesses. Without export, the variable is only available in the current process. Example: ANNOVAR_CONF=/bioinfo/local/build/annovar_20130729 instead of export ANNOVAR_CONF=/bioinfo/local/build/annovar_20130729
# _conf: lowercase for aliases and scripts, and uppercase for variables
@@ -39,7 +38,7 @@ FILE_NAME1_CONF="supplementary_data_file_test.csv" # name of the data file to im
# Bootstrap / loop counts read by the downstream scripts.
ML_BOOTSTRAP_NB_CONF=3
LOOP_NB_CONF=3
# NOTE: a space is required before "#". When "#" is glued to the value, bash does not
# start a comment there: the "#..." text becomes part of the value and the rest of the
# line is parsed as shell code.
# When the same variable is assigned several times, the last assignment is the effective one.
R_RANDOM_SEED="FALSE" #♥ if FALSE, set.seed(1) is systematically used at the beginning of the R script, otherwise, the seed is random (and saved in the RData output)
R_RANDOM_SEED="TRUE" #♥ if FALSE, set.seed(1) is systematically used at the beginning of the R script, otherwise, the seed is random (and saved in the RData output)
################ kind of analysis
...
...
@@ -48,7 +47,7 @@ R_RANDOM_SEED="FALSE" #♥ if FALSE, set.seed(1) is systematically used at the b
# with discovery set 67 indiv (df.nano$cohort_id != "cohortR") and validation set 9 indiv (df.nano$cohort_id == "cohortR")
# "valid_boot": limma and rf training are run once, but the validation set of 9 indiv (df.nano$cohort_id == "cohortR") is bootstrapped using the LOOP_NB_CONF parameter
# "full_cross_validation": rows of the dataset are randomly split in two (no replacement), according to CROSS_VALID_RATIO, forming the discovery and validation sets
# Kind of analysis to run. When the variable is assigned several times, the last
# assignment is the effective one.
R_ANALYSIS_KIND="longit"
R_ANALYSIS_KIND="full_cross_validation"
# NOTE: a space is required before "#". Written as 0.8#, bash does not start a comment:
# the "#" becomes part of the value and the rest of the line is parsed as shell code.
CROSS_VALID_RATIO=0.8 # proportion (nb indiv randomly selected (wo replacement) for the discovery set) / (total number of indiv)
# -> the validation set is formed by the remaining indiv, with proportion 1 - CROSS_VALID_RATIO
# User settings of the R script. When a parameter is assigned several times below,
# the last assignment is the effective one.
path.lib <- "C:/Users/Gael/Documents/R/win-library/3.5/" # absolute path of the library folder. Write "none" if not required
path.in <- "Z:/rogge12231/rogge_12231_1549993246/" # absolute path of the data folder
path.in <- "Z:/rogge12231/rogge_12231_1550076020/" # absolute path of the data folder
path.out <- "C:/Users/Gael/Desktop/" # absolute path of the output folder
path.function1 <- "C:/Users/Gael/Documents/Git_versions_to_use/cute_little_R_functions-v4.5.0/cute_little_R_functions.R" # absolute path of the R file containing functions created by Gael Millot
project.name <- "rogge_project"
# NOTE(review): presumably toggles PDF output of the plots -- confirm where activate.pdf is read
activate.pdf <- TRUE
activate.pdf <- FALSE
label.size <- 12
optional.text <- ""
slurm.loop.nb <- 3 # number of loops, reported in the plot titles ("... FOR <slurm.loop.nb> LOOPS")
...
...
@@ -104,12 +104,18 @@ for(i0 in 1:length(req.package.list)){
if(length(path.function1)!=1){
stop(paste0("\n\n============\n\nERROR: path.function1 PARAMETER MUST BE LENGTH 1: ",paste(path.function1,collapse=" "),"\n\n============\n\n"))
fun_export_data(path=path.out,data=paste0("final.select.gene.curve OBJECT CONTAINS ONLY LOOP1 RESULTS BECAUSE NO LOOP PERFORMED FOR THIS USING ",analysis.kind," ANALYSIS"),output=log.file)
fun_export_data(path=path.out,data=paste0("final.gene.importance OBJECT CONTAINS ONLY LOOP1 RESULTS BECAUSE NO LOOP PERFORMED FOR THIS USING ",analysis.kind," ANALYSIS"),output=log.file)
assign(paste0(tempo.gg.name,tempo.gg.count<-tempo.gg.count+1),ggplot2::ggplot(data=final.ttab.freq,mapping=ggplot2::aes(x=reorder(GENE,-FREQ),y=FREQ)))# reorder from higher to lower
assign(paste0(tempo.gg.name,tempo.gg.count<-tempo.gg.count+1),ggplot2::ggplot(data=tempo.plot,mapping=ggplot2::aes(x=reorder(GENE,FREQ),y=FREQ)))# reorder from higher to lower
tempo.txt<-paste0("BEWARE: ONLY THE ",nrow(tempo.plot)," MOST FREQUENT GENES PLOTTED, AMONG ",nrow(final.ttab.freq))
}else{
tempo.plot<-final.ttab.freq
assign(paste0(tempo.gg.name,tempo.gg.count<-tempo.gg.count+1),ggplot2::ggplot(data=tempo.plot,mapping=ggplot2::aes(x=reorder(GENE,FREQ),y=FREQ)))# reorder from higher to lower
assign(paste0(tempo.gg.name,tempo.gg.count<-tempo.gg.count+1),ggplot2::ggtitle(paste0("LIMMA GENE LIST\n(PROP OF TIMES THE GENE IS SIGNIFICANT FOR ",slurm.loop.nb," LOOPS)")))
assign(paste0(tempo.gg.name,tempo.gg.count<-tempo.gg.count+1),ggplot2::ggtitle(paste0("LIMMA GENE LIST\n(PROP OF TIMES THE GENE IS SIGNIFICANT FOR ",slurm.loop.nb," LOOPS)\n",tempo.txt)))
coord<-ggplot2::ggplot_build(eval(parse(text=paste(paste0(tempo.gg.name,1:tempo.gg.count),collapse=" + "))))$data# to have the summary statistics of the plot. is interesting: x = coord[[2]]$x, y = coord[[2]]$ymax_final
assign(paste0(tempo.gg.name,tempo.gg.count<-tempo.gg.count+1),ggplot2::annotate(geom="text",x=coord[[1]]$x,y=coord[[1]]$ymax,label=round(coord[[1]]$y,3),size=amplif/2,color="black",vjust="center",hjust="right"))# beware: no need of order() for labels because coord[[1]]$x set the order
fun_export_data(path=path.out,data=paste0("NO GENE LIST RESULTS FROM THE LIMMA ANALYSIS (P VALUES ABOVE 0.05 AFTER CORRECTION FOR INSTANCE)"),output=log.file)
fun_export_data(path=path.out,data=paste0("NO GENE LIST RESULTS FROM THE RANDOM FOREST TRAINING"),output=log.file)
assign(paste0(tempo.gg.name,tempo.gg.count<-tempo.gg.count+1),ggplot2::ggplot(data=final.mod.gene.names.freq,mapping=ggplot2::aes(x=reorder(GENE,-FREQ),y=FREQ)))# reorder from higher to lower
assign(paste0(tempo.gg.name,tempo.gg.count<-tempo.gg.count+1),ggplot2::ggplot(data=tempo.plot,mapping=ggplot2::aes(x=reorder(GENE,FREQ),y=FREQ)))# reorder from higher to lower
tempo.txt<-paste0("BEWARE: ONLY THE ",nrow(tempo.plot)," MOST FREQUENT GENES PLOTTED, AMONG ",nrow(final.mod.gene.names.freq))
}else{
tempo.plot<-final.mod.gene.names.freq
assign(paste0(tempo.gg.name,tempo.gg.count<-tempo.gg.count+1),ggplot2::ggplot(data=tempo.plot,mapping=ggplot2::aes(x=reorder(GENE,FREQ),y=FREQ)))# reorder from higher to lower
assign(paste0(tempo.gg.name,tempo.gg.count<-tempo.gg.count+1),ggplot2::ggtitle(paste0("RANDOM FOREST MODELISATION GENE LIST\n(PROP OF TIMES THE GENES HAVE BEEN SELECTED FOR ",slurm.loop.nb," LOOPS)")))
assign(paste0(tempo.gg.name,tempo.gg.count<-tempo.gg.count+1),ggplot2::ggtitle(paste0("RANDOM FOREST MODELISATION GENE LIST\n(PROP OF TIMES THE GENES HAVE BEEN SELECTED FOR ",slurm.loop.nb," LOOPS)\n",tempo.txt)))
coord<-ggplot2::ggplot_build(eval(parse(text=paste(paste0(tempo.gg.name,1:tempo.gg.count),collapse=" + "))))$data# to have the summary statistics of the plot. is interesting: x = coord[[2]]$x, y = coord[[2]]$ymax_final
assign(paste0(tempo.gg.name,tempo.gg.count<-tempo.gg.count+1),ggplot2::annotate(geom="text",x=coord[[1]]$x,y=coord[[1]]$ymax,label=round(coord[[1]]$y,3),size=amplif/2,color="black",vjust="center",hjust="right"))# beware: no need of order() for labels because coord[[1]]$x set the order
tempo2<-aggregate(final.gene.importance$features,list(final.gene.importance$features),function(x){length(x[!is.na(x)])})# nb
if(identical(tempo1$GENE,tempo2$Group.1)){
tempo1<-data.frame(tempo1,NB=tempo2$x)
}else{
cat(paste0("\n\n============\n\nERROR: IN THE PLOT OF THE final.gene.importance OBJECT, tempo1 AND tempo2 SHOULD HAVE THE SAME GENE COLUMN: \n\n============\n\n"))
assign(paste0(tempo.gg.name,tempo.gg.count<-tempo.gg.count+1),ggplot2::ggplot(data=tempo.plot,mapping=ggplot2::aes(x=reorder(GENE,-MEDIAN),y=MEDIAN)))# reorder from higher to lower
tempo.txt<-paste0("BEWARE: ONLY THE ",nrow(tempo.plot)," MOST IMPORTANT GENES PLOTTED, AMONG ",nrow(final.gene.importance.median))
}else{
tempo.plot<-final.gene.importance.median
assign(paste0(tempo.gg.name,tempo.gg.count<-tempo.gg.count+1),ggplot2::ggplot(data=tempo.plot,mapping=ggplot2::aes(x=reorder(GENE,-MEDIAN),y=MEDIAN)))# reorder from higher to lower
assign(paste0(tempo.gg.name,tempo.gg.count<-tempo.gg.count+1),ggplot2::ggtitle(paste0("GENE IMPORTANCE (NUMBERS ARE NUMBER OF TIMES THE GENE IS SEEN FOR ",slurm.loop.nb," LOOPS)")))
coord<-ggplot2::ggplot_build(eval(parse(text=paste(paste0(tempo.gg.name,1:tempo.gg.count),collapse=" + "))))$data# to have the summary statistics of the plot. is interesting: x = coord[[2]]$x, y = coord[[2]]$ymax_final
assign(paste0(tempo.gg.name,tempo.gg.count<-tempo.gg.count+1),ggplot2::annotate(geom="text",x=coord[[1]]$x,y=coord[[1]]$ymax,label=tempo.plot$NB,size=amplif/2,color="black",vjust="center",hjust="right",angle=90))# beware: no need of order() for labels because coord[[1]]$x set the order
assign(paste0(tempo.gg.name,tempo.gg.count<-tempo.gg.count+1),ggplot2::ggplot(data=final.select.gene.curve.median,mapping=ggplot2::aes(x=X,y=MEDIAN)))# reorder from higher to lower
# At the end of the learning procedure, there is one optimal value for the number of genes to include in the model. We use the same strategy as in the learning procedure to select this number of genes and train the random forest algorithm on them. The names of the features are given below.