Commit fdbfa047 authored by Gael  MILLOT's avatar Gael MILLOT
Browse files

interm

parent 62f37620
......@@ -32,9 +32,10 @@ STARTTIME_SH=$(date) ; JOB_ID_SH=$(date -d "$STARTTIME_SH" +%s) ; sh /pasteur/ho
rogge_12231.conf config file. Parameters need to be set by the user inside this file before running
rogge_12231_workflow.sh file allowing checkings and SLURM job submission
rogge_12231_whole_VGVR.R R script run by SLURM
rogge_12231_main_analysis.R R script run by SLURM
rogge_12231_data_compilation.R R script run by SLURM that compiles the data from the previous loops
The three files must be in the same directory before running
The four files must be in the same directory before running
#### OUTPUT DESCRIPTION
......
......@@ -16,7 +16,8 @@ alias R_conf='module load gcc/4.7.4 R/3.5.0 ; Rscript'
export r_main_functions_conf=/pasteur/homes/gmillot/Git_versions_to_use/cute_little_R_functions-v4.5.0/cute_little_R_functions.R
export bash_main_functions_conf=/pasteur/homes/gmillot/Git_versions_to_use/little_bash_functions-v1.0.0/little_bash_functions-v1.0.0.sh
# export r_ini_check_conf=/pasteur/homes/gmillot/rogge12231/rogge_12231_ini.R
export r_main_conf=/pasteur/homes/gmillot/Git_versions_to_use/rogge_12231-v1.0.0/rogge_12231_whole_VGVR.R
export r_main_conf=/pasteur/homes/gmillot/Git_versions_to_use/rogge_12231-v1.0.0/rogge_12231_main_analysis.R
export r_compil_conf=/pasteur/homes/gmillot/Git_versions_to_use/rogge_12231-v1.0.0/rogge_12231_data_compilation.R
#_______________________________________________________________________________________________
# GENERAL PARAMETERS
......@@ -40,6 +41,7 @@ NAME_SOURCE_FILE1_CONF="cute_little_R_functions.R"
ML_BOOTSTRAP_NB_CONF=3
LOOP_NB_CONF=3
R_RANDOM_SEED="FALSE" # if FALSE, set.seed(1) is systematically used at the beginning of the R script; otherwise, the seed is random (and saved in the RData output)
################ kind of analysis
......
This diff is collapsed.
################################ Initialization
erase.objects <- TRUE # write TRUE to erase all the existing objects in R before starting the algorithm and FALSE otherwise. Beginners should use TRUE
......@@ -18,7 +13,7 @@ erase.graphs <- TRUE # write TRUE to erase all the graphic windows in R before s
sink(stdout(), type = "message")
script <- commandArgs(trailingOnly = FALSE)[1] # recover script name, e.g., r_341_conf $check_lod_gael_conf
args <- commandArgs(trailingOnly = TRUE) # recover arguments written after the call of the Rscript, ie after r_341_conf $check_lod_gael_conf
tempo.arg.names <- c("path.lib", "path.in", "path.out", "path.function1", "file.name1", "name.source.file1", "ml.bootstrap.nb", "project.name", "activate.pdf", "label.size", "optional.text", "slurm.loop.nb", "analysis.kind", "cross.valid.ratio") # objects names exactly in the same order as in the bash code and recovered in args
tempo.arg.names <- c("path.lib", "path.in", "path.out", "path.function1", "file.name1", "name.source.file1", "ml.bootstrap.nb", "project.name", "activate.pdf", "label.size", "optional.text", "slurm.loop.nb", "analysis.kind", "cross.valid.ratio", "random.seed") # objects names exactly in the same order as in the bash code and recovered in args
if(length(args) != length(tempo.arg.names)){
tempo.cat <- paste0("\n\n================\n\nERROR: THE NUMBER OF ELEMENTS IN args (", length(args),") IS DIFFERENT FROM THE NUMBER OF ELEMENTS IN tempo.arg.names (", length(tempo.arg.names),")\nargs:", paste0(args, collapse = ","), "\ntempo.arg.names:", paste0(tempo.arg.names, collapse = ","), "\n\n================\n\n")
stop(tempo.cat)
......@@ -52,9 +47,7 @@ for(i in 1:length(param.list)){
}
################################ End Recording of the initial parameters
################################ Parameters that need to be set by the user
################ DEBUG
################################ DEBUG
debug1 <- '
rm(list = ls())
......@@ -75,11 +68,10 @@ optional.text <- ""
slurm.loop.nb <- 1
analysis.kind <- "longit"
cross.valid.ratio <- 0.8
random.seed <- TRUE
'
# eval(parse(text = debug1)) ; cat(paste0("\n\n================\n\n================\n\n================\n\nERROR: ACTIVE DEBUG VALUES\n\n================\n\n================\n\n================\n\n")) ; stop()
debug2 <- '
rm(list = ls())
erase.objects <- TRUE
......@@ -99,6 +91,7 @@ optional.text <- ""
slurm.loop.nb <- 1
analysis.kind <- "longit"
cross.valid.ratio <- 0.8
random.seed <- TRUE
'
# eval(parse(text = debug2)) ; cat(paste0("\n\n================\n\nERROR: ACTIVE DEBUG VALUES\n\n================\n\n")) ; stop()
......@@ -106,13 +99,8 @@ cross.valid.ratio <- 0.8
################################ End DEBUG
################################ Packages verification and import
# packages are imported even if functions are used using package.name::function()
req.package.list <- c(
"dplyr",
......@@ -199,6 +187,7 @@ tempo <- fun_param_check(data = optional.text, class = "character", length = 1)
tempo <- fun_param_check(data = slurm.loop.nb, typeof = "integer", length = 1, double.as.integer.allowed = TRUE, neg.values = FALSE) ; eval(ee)
tempo <- fun_param_check(data = analysis.kind, options = c("longit", "valid_boot", "full_cross_validation"), length = 1) ; eval(ee)
tempo <- fun_param_check(data = cross.valid.ratio, typeof = "double", length = 1, prop = TRUE) ; eval(ee)
tempo <- fun_param_check(data = random.seed, class = "logical", length = 1) ; eval(ee)
if(any(arg.check) == TRUE){
stop()
}
......@@ -216,7 +205,7 @@ fun_export_data(data = paste0("\n\n################################ ", log.file,
# Write the "INITIAL DATA" section of the log file: script used, kind of analysis performed and response column
fun_export_data(path = path.out, data = "################################ INITIAL DATA", output = log.file, sep = 4)
fun_export_data(path = path.out, data = paste0("SCRIPT USED: ", script), output = log.file)
fun_export_data(path = path.out, data = paste0("KIND OF ANALYSIS PERFORMED: ", analysis.kind), output = log.file)
# analysis.kind is not pasted after the column name anymore: it is already logged on the previous line and the response column read in df.nano is response_ASDAS_R_NR
fun_export_data(path = path.out, data = "THE RESPONSE USED IS THE COLUMN: response_ASDAS_R_NR", output = log.file)
################ Graphical parameter ignition
......@@ -235,6 +224,17 @@ if(optional.text == "no.txt"){
par.ini <- fun_open_window(pdf.disp = activate.pdf, path.fun = path.out, pdf.name.file = paste0("loop", slurm.loop.nb, "_graphs"), width.fun = 7, height.fun = 7, paper = "special", no.pdf.overwrite = TRUE, return.output = TRUE)$ini.par
pdf.nb <- dev.cur()
################ randomness ignition
# Choose the seed of the random number generator, apply it and register it among the objects listed in backup.name
# random.seed: TRUE -> the seed is drawn at random, otherwise the fixed seed 1 is used
if(random.seed == TRUE){
    # sample.int() draws a single integer in 1:.Machine$integer.max (i.e., 1:(2^31-1)) without having to build and index the 2^31-element sequence 0:(2^31-1)
    # BEWARE: the value 0 cannot be drawn anymore
    used.set.seed <- sample.int(n = .Machine$integer.max, size = 1)
}else{
    used.set.seed <- 1
    cat(paste0("\n\n================\n\nBEWARE: NON RANDOM set.seed(1) FUNCTION ACTIVATED \n\n================\n\n"))
}
set.seed(used.set.seed)
backup.name <- c(backup.name, "used.set.seed") # the seed used is added to the names of the objects to back up
################ Data import
h1 <- unname(unlist(read.table(paste0(path.in, file.name1), nrows = 1, sep=";", dec=",", stringsAsFactors = FALSE)))
......@@ -269,9 +269,8 @@ dat <- data.frame(Y, lps, seb)
# Two sets of patients were defined in this analysis from the `r nrow(df.nano)` patients in the total cohort: a discovery set of `r length(train)` patients and a validation set of `r length(valid)` patients.
# The data needs to be the training cohort (also called discovery cohort)
set.seed(1) ; cat(paste0("\n\n================\n\nBEWARE: set.seed() FUNCTION ACTIVATED\n\n================\n\n"))
class.prop <- table(df.nano$response_ASDAS_R_NR)/sum(table(df.nano$response_ASDAS_R_NR))
sample.prop <- class.prop[match(df.nano$response_ASDAS_R_NR, names(class.prop))]
if(analysis.kind == "longit"){
train <- which(df.nano$cohort_id != "cohortR") # 67 rows of df.nano used for training
valid <- which(df.nano$cohort_id == "cohortR") # 9 rows of df.nano used for validation
......@@ -279,7 +278,7 @@ if(analysis.kind == "longit"){
train <- which(df.nano$cohort_id != "cohortR") # 67 rows of df.nano used for training
valid <- which(df.nano$cohort_id == "cohortR") # 9 rows of df.nano used for validation
if(length(valid) != 1){
valid <- base::sample(x = valid, size = length(valid), replace = TRUE)
valid <- base::sample(x = valid, size = length(valid), replace = TRUE) # no need of prob = sample.prop[valid] here because replace = TRUE will globally keep the proportions
}
}else if(analysis.kind == "full_cross_validation"){
train.nb <- round(nrow(df.nano) * cross.valid.ratio, 0)
......@@ -288,8 +287,8 @@ if(analysis.kind == "longit"){
}
fun_export_data(path = path.out, data = paste0("NUMBER OF INDIV IN THE DISCOVERY SET: ", train.nb, " (CROSS VALIDATION RATIO SET TO ", round(cross.valid.ratio, 2), ")"), output = log.file)
fun_export_data(path = path.out, data = paste0("REMAINING INDIV IN THE VALIDATION SET: ", nrow(df.nano) - train.nb), output = log.file)
train <- base::sample(x = 1:nrow(df.nano), size = train.nb, replace = FALSE)
valid <- 1:nrow(df.nano)[-train]
train <- base::sample(x = 1:nrow(df.nano), size = train.nb, replace = FALSE, prob = sample.prop)
valid <- (1:nrow(df.nano))[-train]
}
if(slurm.loop.nb > 1){
......@@ -315,7 +314,7 @@ if(any((analysis.kind == "longit" & slurm.loop.nb == 1) | (analysis.kind == "val
colnames(design) <- gsub("Y.limma", "", colnames(design)) # rename the identity matrix
# r contrasts
contrasts <- makeContrasts(R - NR, levels = design) # contrast matrix
contrasts <- limma::makeContrasts(R - NR, levels = design) # contrast matrix
# r fit
fit <- lmFit(X.limma, design)
......@@ -324,7 +323,11 @@ if(any((analysis.kind == "longit" & slurm.loop.nb == 1) | (analysis.kind == "val
ttab <- topTable(fit, coef = "R - NR", adjust.method = "BH", p.value = 0.05, number = nrow(X.limma))
fun_export_data(path = path.out, data = "################################ LIMMA ANALYSIS", output = log.file, sep = 4)
fun_export_data(path = path.out, data = 'PARAMETERS USED: topTable(fit, coef = "R - NR", adjust.method = "BH", p.value = 0.05, number = nrow(X))' , output = log.file)
fun_export_data(path = path.out, data = ttab , output = log.file)
if(nrow(ttab) > 0){
fun_export_data(path = path.out, data = ttab , output = log.file)
}else{
fun_export_data(path = path.out, data = "NO GENE LIST RETURNED WITH THE PARAMETERS USED (P VALUES ABOVE 0.05 AFTER CORRECTION FOR INSTANCE)" , output = log.file)
}
backup.name <- c(backup.name, "ttab")
}else if(analysis.kind == "valid_boot" & slurm.loop.nb > 1){
fun_export_data(path = path.out, data = paste0("SEE Loop1 FILE FOR THE LIMMA RESULTS"), output = log.file)
......@@ -867,7 +870,11 @@ backup.name <- c(backup.name, "final.gene.list")
################################ Post Main code
save(list=c(backup.name), file = paste0(path.out, "loop", slurm.loop.nb, "_res_data.RData"))
# Copy each object named in backup.name into a new object whose name is prefixed by "loop<slurm.loop.nb>_", then save these copies in the RData file of the loop
final.backup.name <- paste0("loop", slurm.loop.nb, "_", backup.name)
for(i0 in seq_along(backup.name)){ # seq_along() is the safe form of 1:length(), which would iterate over c(1, 0) with an empty vector
    assign(final.backup.name[i0], get(backup.name[i0]))
}
save(list = final.backup.name, file = paste0(path.out, "loop", slurm.loop.nb, "_res_data.RData"))
############ Pdf window closing
......
......@@ -148,7 +148,6 @@ EX1: `basename $0` -c /pasteur/homes/gmillot/rogge12231/
EOF
}
while getopts ":hc:r:" OPTION ; do
# add : after the option name to specify that something is required (-h has nothing required after)
# the first : before h is to induce getopts switching to "silent error reporting mode" (disable annoying messages).
......@@ -213,6 +212,7 @@ CONF_SCRIPTS=(
"r_main_functions_conf"
"bash_main_functions_conf"
"r_main_conf"
"r_compil_conf"
)
conf_scripts_Num=$(( ${#CONF_SCRIPTS[@]} - 1 )) # index of the last element of the array (total number of elements minus 1)
LOCAL_USER_VAR+=" CONF_SCRIPTS conf_scripts_Num " # do not forget the space before the variable name
......@@ -233,6 +233,7 @@ CONF_VAR_CHECK=(
"NAME_SOURCE_FILE1_CONF"
"ML_BOOTSTRAP_NB_CONF"
"LOOP_NB_CONF"
"R_RANDOM_SEED"
"PROJECT_NAME_CONF"
"R_PDF_DISPLAY_CONF"
"LABEL_SIZE"
......@@ -352,7 +353,7 @@ while [[ $COUNT < $(($LOOP_NB_CONF + 1)) ]] ; do
source $CONFIG_FILE # never forget this because another environment
OUTPUT_DIR_PATH_tempo2="${OUTPUT_DIR_PATH_tempo}/loop${COUNT}/"
# next line cannot be put outside (which would have been convenient -> put into the SUP_VAR_tempo for display. But SUP_VAR_tempo for sbatch do not like spaces)
R_PROC="R_conf ${r_main_conf} $PATH_LIB_CONF $PATH_IN_CONF ${OUTPUT_DIR_PATH_tempo2} $PATH_FUNCTION1_CONF $FILE_NAME1_CONF $NAME_SOURCE_FILE1_CONF $ML_BOOTSTRAP_NB_CONF $PROJECT_NAME_CONF $R_PDF_DISPLAY_CONF $LABEL_SIZE $R_OPT_TXT_CONF $COUNT $R_ANALYSIS_KIND $CROSS_VALID_RATIO"
R_PROC="R_conf ${r_main_conf} $PATH_LIB_CONF $PATH_IN_CONF ${OUTPUT_DIR_PATH_tempo2} $PATH_FUNCTION1_CONF $FILE_NAME1_CONF $NAME_SOURCE_FILE1_CONF $ML_BOOTSTRAP_NB_CONF $PROJECT_NAME_CONF $R_PDF_DISPLAY_CONF $LABEL_SIZE $R_OPT_TXT_CONF $COUNT $R_ANALYSIS_KIND $CROSS_VALID_RATIO $R_RANDOM_SEED"
R_PROC2="${R_PROC} &> ${OUTPUT_DIR_PATH_tempo2}loop${COUNT}_r_console_messages.txt" # or "$R_PROC > ${OUTPUT_DIR_PATH_tempo2}loop${COUNT}_r_console_messages.txt 2>&1" # to add the estderror in the stdout
eval "$R_PROC2"
' | sbatch -p $DEDICATED_CONF --job-name=wait_loop1 --qos $QOS_CONF --time $MAX_RUNNING_TIME_CONF -c $NB_CPU_PER_TASK_CONF --mem-per-cpu $MEM_PER_CPU_CONF --mail-type END,FAIL --mail-user $MAIL_CONF --export $SUP_VAR_tempo --wait | tee -a ${OUTPUT_DIR_PATH_tempo}/loop${COUNT}/loop${COUNT}_${PROJECT_NAME_CONF}_slurm_jobID.txt # write all th echo from the $PROC alaso into a log file
......@@ -367,7 +368,7 @@ while [[ $COUNT < $(($LOOP_NB_CONF + 1)) ]] ; do
source $CONFIG_FILE # never forget this because another environment
OUTPUT_DIR_PATH_tempo2="${OUTPUT_DIR_PATH_tempo}/loop${SLURM_ARRAY_TASK_ID}/"
# next line cannot be put outside (which would have been convenient -> put into the SUP_VAR_tempo for display. But SUP_VAR_tempo for sbatch do not like spaces)
R_PROC="R_conf ${r_main_conf} $PATH_LIB_CONF $PATH_IN_CONF ${OUTPUT_DIR_PATH_tempo2} $PATH_FUNCTION1_CONF $FILE_NAME1_CONF $NAME_SOURCE_FILE1_CONF $ML_BOOTSTRAP_NB_CONF $PROJECT_NAME_CONF $R_PDF_DISPLAY_CONF $LABEL_SIZE $R_OPT_TXT_CONF ${SLURM_ARRAY_TASK_ID} $R_ANALYSIS_KIND $CROSS_VALID_RATIO" # beware $COUNT replaced by ${SLURM_ARRAY_TASK_ID} because job array
R_PROC="R_conf ${r_main_conf} $PATH_LIB_CONF $PATH_IN_CONF ${OUTPUT_DIR_PATH_tempo2} $PATH_FUNCTION1_CONF $FILE_NAME1_CONF $NAME_SOURCE_FILE1_CONF $ML_BOOTSTRAP_NB_CONF $PROJECT_NAME_CONF $R_PDF_DISPLAY_CONF $LABEL_SIZE $R_OPT_TXT_CONF ${SLURM_ARRAY_TASK_ID} $R_ANALYSIS_KIND $CROSS_VALID_RATIO $R_RANDOM_SEED" # beware $COUNT replaced by ${SLURM_ARRAY_TASK_ID} because job array
R_PROC2="${R_PROC} &> ${OUTPUT_DIR_PATH_tempo2}loop${SLURM_ARRAY_TASK_ID}_r_console_messages.txt" # or "$R_PROC > ${OUTPUT_DIR_PATH_tempo2}loop${SLURM_ARRAY_TASK_ID}_r_console_messages.txt 2>&1" # to add the estderror in the stdout
eval "$R_PROC2"
' | sbatch -p $DEDICATED_CONF --array=2-$LOOP_NB_CONF --job-name=wait_loop_all --qos $QOS_CONF --time $MAX_RUNNING_TIME_CONF -c $NB_CPU_PER_TASK_CONF --mem-per-cpu $MEM_PER_CPU_CONF --mail-type END,FAIL --mail-user $MAIL_CONF --export $SUP_VAR_tempo --wait | tee -a $(for((i = 2 ; i <= $LOOP_NB_CONF ; i++)) ; do echo ${OUTPUT_DIR_PATH_tempo}/loop${i}/loop${i}_${PROJECT_NAME_CONF}_slurm_jobID.txt ; done) # tee is dispached in all the dir of the job array
......@@ -379,7 +380,7 @@ while [[ $COUNT < $(($LOOP_NB_CONF + 1)) ]] ; do
source $CONFIG_FILE # never forget this because another environment
OUTPUT_DIR_PATH_tempo2="${OUTPUT_DIR_PATH_tempo}/loop${SLURM_ARRAY_TASK_ID}/"
# next line cannot be put outside (which would have been convenient -> put into the SUP_VAR_tempo for display. But SUP_VAR_tempo for sbatch do not like spaces)
R_PROC="R_conf ${r_main_conf} $PATH_LIB_CONF $PATH_IN_CONF ${OUTPUT_DIR_PATH_tempo2} $PATH_FUNCTION1_CONF $FILE_NAME1_CONF $NAME_SOURCE_FILE1_CONF $ML_BOOTSTRAP_NB_CONF $PROJECT_NAME_CONF $R_PDF_DISPLAY_CONF $LABEL_SIZE $R_OPT_TXT_CONF ${SLURM_ARRAY_TASK_ID} $R_ANALYSIS_KIND $CROSS_VALID_RATIO"
R_PROC="R_conf ${r_main_conf} $PATH_LIB_CONF $PATH_IN_CONF ${OUTPUT_DIR_PATH_tempo2} $PATH_FUNCTION1_CONF $FILE_NAME1_CONF $NAME_SOURCE_FILE1_CONF $ML_BOOTSTRAP_NB_CONF $PROJECT_NAME_CONF $R_PDF_DISPLAY_CONF $LABEL_SIZE $R_OPT_TXT_CONF ${SLURM_ARRAY_TASK_ID} $R_ANALYSIS_KIND $CROSS_VALID_RATIO $R_RANDOM_SEED"
R_PROC2="${R_PROC} &> ${OUTPUT_DIR_PATH_tempo2}loop${SLURM_ARRAY_TASK_ID}_r_console_messages.txt" # or "$R_PROC > ${OUTPUT_DIR_PATH_tempo2}loop${SLURM_ARRAY_TASK_ID}_r_console_messages.txt 2>&1" # to add the estderror in the stdout
eval "$R_PROC2"
' | sbatch -p $DEDICATED_CONF --array=1-$LOOP_NB_CONF --job-name=wait_loop_all --qos $QOS_CONF --time $MAX_RUNNING_TIME_CONF -c $NB_CPU_PER_TASK_CONF --mem-per-cpu $MEM_PER_CPU_CONF --mail-type END,FAIL --mail-user $MAIL_CONF --export $SUP_VAR_tempo --wait | tee -a $(for((i = 1 ; i <= $LOOP_NB_CONF ; i++)) ; do echo ${OUTPUT_DIR_PATH_tempo}/loop${i}/loop${i}_${PROJECT_NAME_CONF}_slurm_jobID.txt ; done) # tee is dispached in all the dir of the job array
......@@ -393,6 +394,22 @@ echo -e '#!/bin/sh
LOCAL_USER_VAR+=" SUP_VAR_tempo" # do not forget the space before the variable name
# Compilation step: submit a SLURM job that runs ${r_compil_conf} with the directory of the loops as input and the final_res directory as output
# Skipped when R_ANALYSIS_KIND matches "longit" or when LOOP_NB_CONF is 1
if [[ $R_ANALYSIS_KIND =~ longit || $LOOP_NB_CONF == 1 ]] ; then
    echo -e "NO NEED TO COMPILE DATA SINCE NO LOOP PERFORMED\n" # -e required to interpret the final \n
else
    OUTPUT_DIR_PATH_FINAL="${OUTPUT_DIR_PATH_tempo}/final_res"
    mkdir -p "${OUTPUT_DIR_PATH_FINAL}" # -p: no error if the directory already exists
    # NOTE(review): sbatch --export expects a comma-separated list. The value is appended here without a leading comma, and the loop jobs above use SUP_VAR_tempo -> confirm that SUP_VAR is the intended variable and that it ends with a comma
    SUP_VAR+="OUTPUT_DIR_PATH_FINAL=$OUTPUT_DIR_PATH_FINAL"
    echo -e '#!/bin/sh
# write the previous line exactly like this, with no comments, otherwise do not work
source $CONFIG_FILE # never forget this because another environment
# next line cannot be put outside (which would have been convenient -> put into the SUP_VAR_tempo for display. But SUP_VAR_tempo for sbatch do not like spaces)
R_PROC="R_conf ${r_compil_conf} $PATH_LIB_CONF $OUTPUT_DIR_PATH_tempo $OUTPUT_DIR_PATH_FINAL $PATH_FUNCTION1_CONF $PROJECT_NAME_CONF $R_PDF_DISPLAY_CONF $LABEL_SIZE $R_OPT_TXT_CONF $LOOP_NB_CONF $R_ANALYSIS_KIND"
R_PROC2="${R_PROC} &> ${OUTPUT_DIR_PATH_FINAL}/r_console_messages.txt" # or "$R_PROC > ${OUTPUT_DIR_PATH_FINAL}/r_console_messages.txt 2>&1" # to add the estderror in the stdout
eval "$R_PROC2"
' | sbatch -p $DEDICATED_CONF --job-name=compil --qos $QOS_CONF --time $MAX_RUNNING_TIME_CONF -c $NB_CPU_PER_TASK_CONF --mem-per-cpu $MEM_PER_CPU_CONF --mail-type END,FAIL --mail-user $MAIL_CONF --export $SUP_VAR | tee -a ${OUTPUT_DIR_PATH_tempo}/loop${COUNT}/loop${COUNT}_${PROJECT_NAME_CONF}_slurm_jobID.txt # also write all the echo from the $PROC into a log file. NOTE(review): confirm that the loop${COUNT} directory exists at this point (value of COUNT after the loops not visible here)
fi
################ END MAIN CODE
################ LANDING
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment