Commit fd03eca6 authored by Gael  MILLOT's avatar Gael MILLOT
Browse files

interm

parent d8f8965e
#!/bin/bash # shebang (#! https://en.wikipedia.org/wiki/Shebang_%28Unix%29) indicating to the shell what program to interpret the script with, when executed, probably optional here.
# export allows the variable to be used in subprocesses. Without export, the variable is only available in the current process. Example: ANNOVAR_CONF=/bioinfo/local/build/annovar_20130729 instead of export ANNOVAR_CONF=/bioinfo/local/build/annovar_20130729
# _conf: lowercase for aliases and scripts, and uppercase for variables
......@@ -15,7 +14,7 @@ alias R_conf='module load gcc/4.7.4 R/3.5.0 ; Rscript'
export r_main_functions_conf=/pasteur/homes/gmillot/Git_versions_to_use/cute_little_R_functions-v4.5.0/cute_little_R_functions.R
# export r_main_functions_conf=https://gitlab.pasteur.fr/gmillot/cute_little_R_functions/raw/v4.5.0/cute_little_R_functions.R
export bash_main_functions_conf=/pasteur/homes/gmillot/Git_versions_to_use/little_bash_functions-v1.0.0/little_bash_functions-v1.0.0.sh
export bash_main_functions_conf=/pasteur/homes/gmillot/Git_versions_to_use/little_bash_functions-v1.1.0/little_bash_functions.sh
export r_main_conf=/pasteur/homes/gmillot/Git_versions_to_use/rogge_12231-v1.0.0/rogge_12231_main_analysis.R
export r_compil_conf=/pasteur/homes/gmillot/Git_versions_to_use/rogge_12231-v1.0.0/rogge_12231_data_compilation.R
......@@ -39,7 +38,7 @@ FILE_NAME1_CONF="supplementary_data_file_test.csv" # name of the data file to im
ML_BOOTSTRAP_NB_CONF=3
LOOP_NB_CONF=3
R_RANDOM_SEED="FALSE" #♥ if FALSE, set.seed(1) is systematically used at the beginning of the R script, otherwise, the seed is random (and saved in the RData output)
R_RANDOM_SEED="TRUE" #♥ if FALSE, set.seed(1) is systematically used at the beginning of the R script, otherwise, the seed is random (and saved in the RData output)
################ kind of analysis
......@@ -48,7 +47,7 @@ R_RANDOM_SEED="FALSE" #♥ if FALSE, set.seed(1) is systematically used at the b
# with discovery set 67 indiv (df.nano$cohort_id != "cohortR") and validation set 9 indiv (df.nano$cohort_id == "cohortR")
# "valid_boot" limma and rf training are run once but bootstrap of the validation set 9 indiv (df.nano$cohort_id == "cohortR") using LOOP_NB_CONF parameter
# "full_cross_validation" rows of the dataset are randomly split in two (no replacement), according to CROSS_VALID_RATIO, forming the discovery and validation set
R_ANALYSIS_KIND="longit"
R_ANALYSIS_KIND="full_cross_validation"
CROSS_VALID_RATIO=0.8 # proportion (nb indiv randomly selected (wo replacement) for the discovery set) / (total number of indiv)
# -> the validation set is formed by the remaining indiv, with proportion 1 - CROSS_VALID_RATIO
......
This diff is collapsed.
......@@ -94,7 +94,7 @@ analysis.kind <- "valid_boot"
cross.valid.ratio <- 0.8
random.seed <- TRUE
'
# eval(parse(text = debug2)) ; cat(paste0("\n\n================\n\nERROR: ACTIVE DEBUG VALUES\n\n================\n\n")) ; stop()
eval(parse(text = debug2)) ; cat(paste0("\n\n================\n\nERROR: ACTIVE DEBUG VALUES\n\n================\n\n")) ; stop()
# data.frame(PARAM = tempo.arg.names, ARG = args) # for debug mode
......@@ -120,7 +120,8 @@ req.package.list <- c(
"ggplot2",
"pander",
"gridExtra",
"lubridate"
"lubridate",
"RCurl"
)
if(path.lib == "none"){
path.lib <- .libPaths() # .libPaths(new = path.lib) # or .libPaths(new = c(.libPaths(), path.lib))
......@@ -143,12 +144,18 @@ for(i0 in 1:length(req.package.list)){
if(length(path.function1) != 1){
stop(paste0("\n\n============\n\nERROR: path.function1 PARAMETER MUST BE LENGTH 1: ", paste(path.function1, collapse = " "), "\n\n============\n\n"))
}else if(grepl(x = path.function1, pattern = "^http") & ( ! RCurl::url.exists(path.function1))){
}else if(grepl(x = path.function1, pattern = "^http")){
if( ! RCurl::url.exists(path.function1)){
stop(paste0("\n\n============\n\nERROR: HTTP INDICATED IN THE path.function1 PARAMETER DOES NOT EXISTS: ", path.function1, "\n\n============\n\n"))
}else if(( ! grepl(x = path.function1, pattern = "^http")) & ( ! file.exists(path.function1))){
stop(paste0("\n\n============\n\nERROR: FILE INDICATED IN THE path.function1 PARAMETER DOES NOT EXISTS: ", path.function1, "\n\n============\n\n"))
}else{
source(path.function1) # source the fun_ functions used below
}else{
source(path.function1) # source the fun_ functions used below
}
}else if( ! grepl(x = path.function1, pattern = "^http")){
if( ! file.exists(path.function1)){
stop(paste0("\n\n============\n\nERROR: FILE INDICATED IN THE path.function1 PARAMETER DOES NOT EXISTS: ", path.function1, "\n\n============\n\n"))
}else{
source(path.function1) # source the fun_ functions used below
}
}
################################ End Functions
......@@ -422,8 +429,7 @@ if(any((analysis.kind == "longit" & slurm.loop.nb == 1) | (analysis.kind == "val
reff <- generateHyperParsEffectData(rtune, trafo = FALSE, include.diagnostics = FALSE)
plt <- plotHyperParsEffect(reff, x = "fw.abs", y = "mmce.test.mean")
# add our own touches to the plot
tempo <- dev.set(pdf.nb) # assign to avoid the message
plt + geom_point(colour = "red") +
plt.plot <- plt + geom_point(colour = "red") +
ggtitle("Evaluating optimal number of features") +
theme_bw() +
labs(x="Number of selected features", y="Mean misclassification error on the test sets") +
......@@ -435,7 +441,10 @@ if(any((analysis.kind == "longit" & slurm.loop.nb == 1) | (analysis.kind == "val
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
tempo <- dev.set(pdf.nb) # assign to avoid the message
print(plt.plot)
select.gene.curve <- ggplot2::ggplot_build(plt.plot)$data[[2]]
backup.name <- c(backup.name, "select.gene.curve")
# ```
#
# At the end of the learning procedure, there is one optimal value for the number of genes to include in the model. We use the same strategy as in the learning procedure to select this number of genes and train the random forest algorithm on them. The names of the features are given below.
......@@ -480,8 +489,7 @@ if(any((analysis.kind == "longit" & slurm.loop.nb == 1) | (analysis.kind == "val
feature_importance <- getFeatureImportance(mod$learner.model$next.model)
df_imp <- data.frame(features = names(feature_importance$res),
importance = t(feature_importance$res), stringsAsFactors = FALSE)
tempo <- dev.set(pdf.nb) # assign to avoid the message
ggplot2::ggplot(data = df_imp, aes(x=features, y=importance)) + geom_bar(stat = "identity") + theme_bw() +
importance.plot <- ggplot2::ggplot(data = df_imp, aes(x=features, y=importance)) + geom_bar(stat = "identity") + theme_bw() +
theme(
axis.text.x = element_text(angle=90, vjust=0.5, hjust=1),
plot.title = ggplot2::element_text(hjust=1, vjust=1, size = label.size),
......@@ -491,6 +499,10 @@ if(any((analysis.kind == "longit" & slurm.loop.nb == 1) | (analysis.kind == "val
legend.title = ggplot2::element_text(size = label.size),
strip.text = ggplot2::element_text(size = label.size)
)
tempo <- dev.set(pdf.nb) # assign to avoid the message
print(importance.plot)
gene.importance <- df_imp
backup.name <- c(backup.name, "gene.importance")
# ```
#
#
......@@ -503,13 +515,16 @@ if(any((analysis.kind == "longit" & slurm.loop.nb == 1) | (analysis.kind == "val
colnames(annot.rows) <- "ASDAS R/NR"
subdat <- df.tmp[, sort(mod.gene.names)]
ann_colors = list("ASDAS R/NR" = c(R = "steelblue", NR = "tomato"))
pheatmap(t(scale(subdat)), annotation_col = annot.rows, cluster_cols = FALSE, show_colnames = FALSE, border_color = NA, color = colorRampPalette(c("red", "black", "green"))(499), annotation_colors = ann_colors, fontsize_row = label.size, fontsize_col = label.size)
heatmap.plot <- pheatmap(t(scale(subdat)), silent = TRUE, annotation_col = annot.rows, cluster_cols = FALSE, show_colnames = FALSE, border_color = NA, color = colorRampPalette(c("red", "black", "green"))(499), annotation_colors = ann_colors, fontsize_row = label.size, fontsize_col = label.size)
print(heatmap.plot)
backup.name <- c(backup.name, "heatmap.plot")
# ```
#
# The correlation of the selected features is represented hereafter. We see that some of the selected features are strongly correlated.
#
# ```{r corrplot}
corrplot(cor(subdat), tl.col = "black", tl.cex = label.size / 10)
corr.plot <- corrplot(cor(subdat), tl.col = "black", tl.cex = label.size / 10)
if(analysis.kind == "valid_boot"){
save(list=c("dat.train", "dat.valid", "train.task", "lrn.rf", "mod.gene.names"), file = paste0(path.out, "loop", slurm.loop.nb, "_discov_data.RData"))
}
......@@ -840,7 +855,7 @@ ggbox <- ggplot(data = boxdat_melt, aes(x=Y, y=value, colour=Y, shape = CV, line
strip.text = ggplot2::element_text(size = label.size)
)
tempo <- dev.set(pdf.nb) # assign to avoid the message
ggbox
print(ggbox)
# ```
#
######## 4.7 Frequencies associated to each predictor
......
#!/bin/bash
################ INITIALIZATION
#!/bin/bash
# echo -e "\nJOB COMMAND EXECUTED:\n$0\n" # to get the line that executes the job but does not work (gives /bioinfo/guests/gmillot/Gael_code/workflow_fastq_gael.sh)
# BEWARE: double __ is a reserved character string to deal with spaces in paths
module purge
......@@ -46,7 +45,8 @@ function single_path_with_regex_fun { # comes from little_bash_functions-v1.0.0/
# 0: single path detected is valid
# 1: error: $1 not provided
# 2: error: $2 provided or more than one path detected
# 3: single path detected does not exist
# 3: single url detected does not exist
# 4: single path detected does not exist
# EXAMPLES
# single_path_with_regex_fun /cygdrive/c/Users/Gael/Desktop/config_tars_lodscore_gael_2017121[[:digit:]].conf
# single_path_with_regex_fun /pasteur/homes/gmillot/dyslexia/code_gael/config_tars_lodscore_gael_2017121[[:digit:]].conf
......@@ -71,10 +71,16 @@ function single_path_with_regex_fun { # comes from little_bash_functions-v1.0.0/
return 2
else
shopt -s extglob # -s enable global extension, ie the recognition of special global pattern in path, like [[:digit:]]
if [[ ! ( -d ${ARG1_ARR[0]} || -f ${ARG1_ARR[0]} ) ]] ; then
if [[ $(echo ${ARG1_ARR[0]} | grep -cE '^http' ) == 1 ]] ; then # -cE to specify extended and -c to return the number of match (here 0 or one only)
if [[ $(wget ${ARG1_ARR[0]} >/dev/null 2>&1 ; echo $?) != 0 ]] ; then # check the valid url. wget $url >/dev/null 2>&1 prevent any action and print. echo $? print the result of the last command (0 = success, other number = failure)
echo -e "\n### ERROR ### SPECIFIED URL IN single_path_with_regex_fun DOES NOT EXISTS: ${ARG1_ARR[0]}\n";
shopt -u extglob # -u disable global extention, ie the recognition of special global pattern in path, like [[:digit:]]
return 3
fi
elif [[ ! ( -d ${ARG1_ARR[0]} || -f ${ARG1_ARR[0]} ) ]] ; then
echo -e "\n### ERROR ### SPECIFIED PATH IN single_path_with_regex_fun DOES NOT EXISTS: ${ARG1_ARR[0]}\n";
shopt -u extglob # -u disable global extention, ie the recognition of special global pattern in path, like [[:digit:]]
return 3
return 4
else
shopt -u extglob # -u disable global extention, ie the recognition of special global pattern in path, like [[:digit:]]
return 0
......@@ -393,7 +399,7 @@ echo -e '#!/bin/sh
LOCAL_USER_VAR+=" SUP_VAR_tempo" # do not forget the space before the variable name
if [[ $R_ANALYSIS_KIND =~ longit || $LOOP_NB_CONF == 1 ]] ; then
echo "\nNO NEED TO COMPILE DATA SINCE NO LOOP PERFORMED\n"
echo -e "\nNO NEED TO COMPILE DATA SINCE NO LOOP PERFORMED\n"
else
OUTPUT_DIR_PATH_FINAL="${OUTPUT_DIR_PATH_tempo}/final_res"
mkdir ${OUTPUT_DIR_PATH_FINAL}
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment