Commit a5f1c994 authored by Gael  MILLOT's avatar Gael MILLOT
Browse files

interm

parent e8f571b8
......@@ -57,19 +57,18 @@ erase.graphs <- TRUE
script <- "code ini v1.0.0"
project.name <-"rogge12231"
path.lib <- "none" # absolute path of the library folder. Write "none" if not required
path.in <- "Z:/rogge12231/rogge_12231_1550514780/" # absolute path of the data folder
path.in <- "Z:/rogge12231/attempt_using_v3/rogge_12231_1550913367/" # absolute path of the data folder
path.out <- "C:/Users/Gael/Desktop/" # absolute path of the output folder
path.function1 <- "C:/Users/Gael/Documents/Git_versions_to_use/cute_little_R_functions-v4.5.0/cute_little_R_functions.R" # Define the absolute pathway of the folder containing functions created by Gael Millot
project.name <- "rogge_project"
label.size <-6
optional.text <- ""
slurm.loop.nb <- 100
analysis.kind <- "valid_boot"
analysis.kind <- "full_cross_validation"
activate.pdf <- FALSE # graph file parameter
cut.off.freq.for.selected.genes <- 0.01 # graph file parameter
'
# eval(parse(text = debug2)) ; cat(paste0("\n\n================\n\nERROR: ACTIVE DEBUG VALUES\n\n================\n\n")) ; stop()
eval(parse(text = debug2)) ; cat(paste0("\n\n================\n\nERROR: ACTIVE DEBUG VALUES\n\n================\n\n")) ; stop()
# data.frame(PARAM = tempo.arg.names, ARG = args) # for debug mode
......@@ -245,9 +244,9 @@ if(analysis.kind == "full_cross_validation"){
final.ttab <- data.frame(NULL, stringsAsFactors = FALSE)
for(i0 in 1:slurm.loop.nb){
if(nrow(get(paste0("loop", i0, "_ttab"))) > 0){
final.ttab <- rbind(final.ttab, data.frame(get(paste0("loop", i0, "_ttab")), LOOP_NB = paste0("loop", i0), stringsAsFactors = FALSE))
final.ttab <- rbind(final.ttab, data.frame(get(paste0("loop", i0, "_ttab")), GENE = rownames(get(paste0("loop", i0, "_ttab"))), LOOP_NB = paste0("loop", i0), stringsAsFactors = FALSE))
}else{
final.ttab <- rbind(final.ttab, data.frame(logFC = NA, AveExpr = NA, t = NA, P.Value = NA, adj.P.Val = NA, B = NA, LOOP_NB = paste0("loop", i0), stringsAsFactors = FALSE))
final.ttab <- rbind(final.ttab, data.frame(logFC = NA, AveExpr = NA, t = NA, P.Value = NA, adj.P.Val = NA, B = NA, GENE = NA, LOOP_NB = paste0("loop", i0), stringsAsFactors = FALSE))
}
}
}else{
......@@ -380,13 +379,13 @@ if(nrow(final.ttab) == 0 | nrow(na.omit(final.ttab)) == 0){
fun_export_data(path = path.out, data = paste0("BEWARE: NO COMPILATION FOR THE LIMMA ANALYSIS (", analysis.kind, " KIND OF ANALYSIS)"), output = log.file)
}else{
tempo0 <- final.ttab[ ! is.na(final.ttab$adj.P.Val), ]
tempo1 <- c(table(rownames(tempo0)))
tempo2 <- aggregate(tempo0$adj.P.Val, list(rownames(tempo0)), median, na.rm = TRUE)
tempo1 <- c(table(tempo0$GENE))
tempo2 <- aggregate(tempo0$adj.P.Val, list(tempo0$GENE), median, na.rm = TRUE)
names(tempo2) <- c("GENE", "ADJ_P_VALUE_MEDIAN")
tempo2 <- data.frame(
tempo2,
ADJ_P_VALUE_CI95_INF = aggregate(tempo0$adj.P.Val, list(rownames(tempo0)), quantile, probs = 0.025, na.rm = TRUE)$x,
ADJ_P_VALUE_CI95_SUP = aggregate(tempo0$adj.P.Val, list(rownames(tempo0)), quantile, probs = 0.975, na.rm = TRUE)$x
ADJ_P_VALUE_CI95_INF = aggregate(tempo0$adj.P.Val, list(tempo0$GENE), quantile, probs = 0.025, na.rm = TRUE)$x,
ADJ_P_VALUE_CI95_SUP = aggregate(tempo0$adj.P.Val, list(tempo0$GENE), quantile, probs = 0.975, na.rm = TRUE)$x
)
final.ttab.freq <- data.frame(tempo2, NB = unname(tempo1), FREQ = unname(tempo1)/slurm.loop.nb)
rownames(final.ttab.freq) <- NULL
......
......@@ -10,7 +10,7 @@ erase.graphs <- TRUE # write TRUE to erase all the graphic windows in R before s
################################ End Initialization
sink(stdout(), type = "message")
sink(stdout(), type = "message") # redirect messages to standard output
script <- commandArgs(trailingOnly = FALSE)[4] # recover the script name, e.g., r_341_conf $check_lod_gael_conf. Order of the elements: 1) .exe R path, 2) --slave, 3) --no-restore, 4) --file and 5) --args
args <- commandArgs(trailingOnly = TRUE) # recover the arguments written after the call of the Rscript, i.e., after r_341_conf $check_lod_gael_conf
tempo.arg.names <- c("path.lib", "path.in", "path.out", "path.function1", "file.name1", "ml.bootstrap.nb", "project.name", "label.size", "optional.text", "slurm.loop.nb", "analysis.kind", "cross.valid.ratio", "random.seed") # object names, exactly in the same order as in the bash code and as recovered in args
......@@ -60,7 +60,7 @@ project.name <-"rogge12231"
path.lib <- "/pasteur/homes/gmillot/softwares/R/x86_64-pc-linux-gnu-library/3.5/" # absolute path of the library folder. Write "none" if not required
path.in <- "/pasteur/homes/gmillot/rogge12231/" # absolute path of the data folder
path.out <- "/pasteur/homes/gmillot/rogge12231/" # absolute path of the output folder
path.function1 <- "/pasteur/homes/gmillot/Git_versions_to_use/cute_little_R_functions-v4.4.0/cute_little_R_functions.R" # Define the absolute pathway of the folder containing functions created by Gael Millot
path.function1 <- "/pasteur/homes/gmillot/Git_versions_to_use/cute_little_R_functions-v4.5.0/cute_little_R_functions.R" # Define the absolute pathway of the folder containing functions created by Gael Millot
file.name1 <- "supplementary_data_file_test.csv" # name of the data file to import in path.in
ml.bootstrap.nb <- 3
label.size <- 6
......@@ -84,17 +84,17 @@ path.in <- "C:/Users/Gael/Documents/Hub projects/20190126 Las Rogge 12231/" # ab
path.out <- "C:/Users/Gael/Desktop/" # absolute path of the output folder
path.function1 <- "https://gitlab.pasteur.fr/gmillot/cute_little_R_functions/raw/v4.5.0/cute_little_R_functions.R"
# path.function1 <- "C:/Users/Gael/Documents/Git_versions_to_use/cute_little_R_functions-v4.5.0/cute_little_R_functions.R" # Define the absolute pathway of the folder containing functions created by Gael Millot
file.name1 <- "supplementary_data_file_test.csv" # name of the data file to import in path.in
# file.name1 <- "supplementary_data_file.csv" # name of the data file to import in path.in
file.name1 <- "194samples_67training_13replication_normalized_LR20022019.txt"
ml.bootstrap.nb <- 3
ml.bootstrap.nb <- 10
label.size <- 6
optional.text <- ""
slurm.loop.nb <- 1
analysis.kind <- "valid_boot"
analysis.kind <- "longit"
cross.valid.ratio <- 0.8
random.seed <- TRUE
random.seed <- FALSE
'
# eval(parse(text = debug2)) ; cat(paste0("\n\n================\n\nERROR: ACTIVE DEBUG VALUES\n\n================\n\n")) ; stop()
eval(parse(text = debug2)) ; cat(paste0("\n\n================\n\nERROR: ACTIVE DEBUG VALUES\n\n================\n\n")) ; stop()
# data.frame(PARAM = tempo.arg.names, ARG = args) # for debug mode
......@@ -388,6 +388,7 @@ fun_export_data(path = path.out, data = "################################ RANDOM
# required data: df.nano
## see dataiku https://www.dataiku.com/dss/features/machine-learning/
# Random Forests were used on the discovery set to
#
......@@ -407,9 +408,13 @@ fun_export_data(path = path.out, data = "################################ RANDOM
# lps <- log2(df.nano[, i_lps])
# seb <- log2(df.nano[, i_seb])
dat.train <- dat[train, ] # Y and nanostring genes columns of LPS and SEB for the training lines (67 indiv)
dat.valid <- dat[valid, ] # Y and nanostring genes columns of LPS and SEB for the validation lines (9 indiv)
if(any((analysis.kind == "longit" & slurm.loop.nb == 1) | (analysis.kind == "valid_boot" & slurm.loop.nb == 1) | (analysis.kind == "full_cross_validation"))){
dat.train <- dat[train, ] # Y and nanostring genes columns of LPS and SEB for the training lines (67 indiv)
dat.valid <- dat[valid, ] # Y and nanostring genes columns of LPS and SEB for the validation lines (9 indiv)
# dat.train <- dat[train, ] # Y and nanostring genes columns of LPS and SEB for the training lines (67 indiv)
# line below inactivated because I suspect that, when reimported, it creates problems
# dat.valid <- dat[valid, ] # Y and nanostring genes columns of LPS and SEB for the validation lines (9 indiv)
train.task <- makeClassifTask(data = dat.train, target = "Y") # prepare the classification of Y depending on the rest
# valid.task <- makeClassifTask(data = dat.valid, target = "Y")
# ```
......@@ -509,7 +514,7 @@ if(any((analysis.kind == "longit" & slurm.loop.nb == 1) | (analysis.kind == "val
# * **COMMENT_VR**: Once a method of selecting genes has been defined, it could be interesting to display the variable importance attached to each of these 25 genes once a unique RF model has been trained on the entire training dataset (67 patients). It could help to get a sense of the main drivers (example below).
#
# ```{r fig.width=9, fig.height=4}
feature_importance <- getFeatureImportance(mod$learner.model$next.model)
feature_importance <- getFeatureImportance(mod$learner.model$next.model) # BEWARE !! : this function uses randomness
df_imp <- data.frame(features = names(feature_importance$res),
importance = t(feature_importance$res), stringsAsFactors = FALSE)
importance.plot <- ggplot2::ggplot(data = df_imp, aes(x = reorder(features, -importance), y=importance)) + geom_bar(stat = "identity") +
......@@ -552,8 +557,10 @@ if(any((analysis.kind == "longit" & slurm.loop.nb == 1) | (analysis.kind == "val
backup.name <- c(backup.name, "corr.plot")
if(analysis.kind == "valid_boot"){
save(list=c("dat.train", "dat.valid", "train.task", "lrn.rf", "mod.gene.names"), file = paste0(path.out, "loop", slurm.loop.nb, "_discov_data.RData"))
}
# line below inactivated and replaced because I suspect that, when reimported, it creates problems
# save(list=c("dat.train", "dat.valid", "train.task", "lrn.rf", "mod.gene.names"), file = paste0(path.out, "loop", slurm.loop.nb, "_discov_data.RData"))
save(list=c("train.task", "lrn.rf", "mod.gene.names"), file = paste0(path.out, "loop", slurm.loop.nb, "_discov_data.RData"))
}
}else if(analysis.kind == "valid_boot" & slurm.loop.nb > 1){
if( ! file.exists(paste0(path.loop1, "loop1_discov_data.RData"))){
stop(paste0("\n\n============\n\nERROR: LOOP ", slurm.loop.nb," CANNOT FIND THE ", paste0(path.loop1, "loop1_discov_data.RData"), " FILE\nDUE TO UNACTIVE BLOCKAGE OF THE LOOP 1 IN THE workflow.sh\n\n============\n\n"))
......@@ -576,7 +583,17 @@ fun_export_data(path = path.out, data = "################################ VALIDA
# Now that the features of interest are selected, we can train the final models on the whole discovery set.
#
# ```{r train, warning=FALSE}
# ```{r train, warning=FALSE} #### 20190313 checked compared to ReproducibleCode_20190109.Rmd
# Seed selection: a seed drawn at random when random.seed is TRUE, otherwise the fixed seed 1 (announced on the console)
used.set.seed2 <- if(random.seed == TRUE){
    sample(x = 0:(2^31-1), size = 1)
}else{
    cat("\n\n================\n\nBEWARE: NON RANDOM set.seed(1) FUNCTION ACTIVATED \n\n================\n\n")
    1
}
set.seed(used.set.seed2) # the seed actually applied remains available in used.set.seed2
backup.name <- c(backup.name, "used.set.seed2") # append the name of the seed object to backup.name
dat.train.extended <- data.frame(dat.train, cvr[train,])
dat.train1.genes <- dat.train.extended[, c("Y", mod.gene.names)]
dat.train2.crp <- dat.train.extended[, c("Y", "CRP_M0")]
......@@ -605,7 +622,7 @@ mod3.genes.crp.logreg <- train(lrn.logreg, train.task3.genes.crp)
mod1.genes.rpart <- train(lrn.rpart, train.task1.genes)
mod2.crp.rpart <- train(lrn.rpart, train.task2.crp)
mod3.genes.crp.rpart <- train(lrn.rpart, train.task3.genes.crp)
# ```
# ``` #### end 20190313 checked compared to ReproducibleCode_20190109.Rmd
#
# Three different models are compared to one another.
#
......@@ -919,6 +936,16 @@ for(i0 in 1:pages.print.nb){
#
#
# ```{r learners_freq, results='asis', cache=TRUE}
# Seed selection: a seed drawn at random when random.seed is TRUE, otherwise the fixed seed 1 (announced on the console)
used.set.seed3 <- if(random.seed == TRUE){
    sample(x = 0:(2^31-1), size = 1)
}else{
    cat("\n\n================\n\nBEWARE: NON RANDOM set.seed(1) FUNCTION ACTIVATED \n\n================\n\n")
    1
}
set.seed(used.set.seed3) # the seed actually applied remains available in used.set.seed3
backup.name <- c(backup.name, "used.set.seed3") # append the name of the seed object to backup.name
lrn.rf.filter.25 <- mlr::makeFilterWrapper(learner = lrn.rf,
fw.method = "randomForest.importance",
fw.abs = 25)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment