Commit 50f64da5 authored by Gael  MILLOT's avatar Gael MILLOT
Browse files

v5.0.0 release

parent 0b9fd5a0
......@@ -25,7 +25,7 @@ Download the desired Tagged version, never the current master.
Open rogge_12231.conf
Set the parameters
Start the job using a command like:
STARTTIME_SH=$(date) ; JOB_ID_SH=$(date -d "$STARTTIME_SH" +%s) ; sh /pasteur/homes/gmillot/Git_versions_to_use/rogge_12231-v4.0.0/rogge_12231_workflow.sh -c /pasteur/homes/gmillot/Git_versions_to_use/rogge_12231-v4.0.0/rogge_12231.conf | tee /pasteur/homes/gmillot/rogge12231/${JOB_ID_SH}_workflow_log &
STARTTIME_SH=$(date) ; JOB_ID_SH=$(date -d "$STARTTIME_SH" +%s) ; sh /pasteur/homes/gmillot/Git_versions_to_use/rogge_12231-v5.0.0/rogge_12231_workflow.sh -c /pasteur/homes/gmillot/Git_versions_to_use/rogge_12231-v5.0.0/rogge_12231.conf | tee /pasteur/homes/gmillot/rogge12231/${JOB_ID_SH}_workflow_log &
#### FILE DESCRIPTION
......@@ -60,9 +60,14 @@ Check for updated versions (most recent tags) at https://gitlab.pasteur.fr/gmill
#### WHAT'S NEW IN
## v5.0.0
Gene filtering added
## v4.0.0
bug fixed for the Limma analysis and for the valid_boot kind of analysis
Bug fixed for the Limma analysis and for the valid_boot kind of analysis
## v3.0.0
......
......@@ -15,8 +15,8 @@ alias R_conf='module load gcc/4.9.4 R/3.5.0 ; Rscript'
export r_main_functions_conf=/pasteur/homes/gmillot/Git_versions_to_use/cute_little_R_functions-v4.5.0/cute_little_R_functions.R
# export r_main_functions_conf=https://gitlab.pasteur.fr/gmillot/cute_little_R_functions/raw/v4.5.0/cute_little_R_functions.R
export bash_main_functions_conf=/pasteur/homes/gmillot/Git_versions_to_use/little_bash_functions-v1.1.0/little_bash_functions.sh
export r_main_conf=/pasteur/homes/gmillot/Git_versions_to_use/rogge_12231-v1.0.0/rogge_12231_main_analysis.R
export r_compil_conf=/pasteur/homes/gmillot/Git_versions_to_use/rogge_12231-v1.0.0/rogge_12231_data_compilation.R
export r_main_conf=/pasteur/homes/gmillot/Git_versions_to_use/rogge_12231-v5.0.0/rogge_12231_main_analysis.R
export r_compil_conf=/pasteur/homes/gmillot/Git_versions_to_use/rogge_12231-v5.0.0/rogge_12231_data_compilation.R
#_______________________________________________________________________________________________
# GENERAL PARAMETERS
......
......@@ -89,10 +89,10 @@ file.name1 <- "194samples_67training_13replication_normalized_LR20022019.txt"
ml.bootstrap.nb <- 2
label.size <- 6
optional.text <- ""
slurm.loop.nb <- 2
analysis.kind <- "valid_boot"
slurm.loop.nb <- 1
analysis.kind <- "longit"
cross.valid.ratio <- 0.8
random.seed <- TRUE
random.seed <- FALSE
'
# eval(parse(text = debug2)) ; cat(paste0("\n\n================\n\nERROR: ACTIVE DEBUG VALUES\n\n================\n\n")) ; stop()
......@@ -121,7 +121,9 @@ req.package.list <- c(
"pander",
"gridExtra",
"lubridate",
"RCurl"
"RCurl",
"mixtools"
# "futile.logger" # to remove .log file of VennDiagram
)
if(all(path.lib == "none")){
path.lib <- .libPaths() # .libPaths(new = path.lib) # or .libPaths(new = c(.libPaths(), path.lib))
......@@ -247,15 +249,15 @@ pdf.nb <- dev.cur()
################ randomness ignition
if(random.seed == TRUE){
used.set.seed <- sample(x = 0:(2^31-1), size = 1)
used.set.seed1 <- sample(x = 0:(2^31-1), size = 1)
}else{
used.set.seed <- 1
used.set.seed1 <- 1
tempo.cat <- paste0("\n\n================\n\nBEWARE: NON RANDOM set.seed(1) FUNCTION ACTIVATED \n\n================\n\n")
cat(tempo.cat)
fun_export_data(path = path.out, data = tempo.cat, output = log.file)
}
set.seed(used.set.seed)
backup.name <- c(backup.name, "used.set.seed")
set.seed(used.set.seed1)
backup.name <- c(backup.name, "used.set.seed1")
################ Data import
......@@ -280,10 +282,16 @@ df.nano <- cbind(df.nano, d3)
d4 <- d4[25:length(d4)]
names(d4) <- paste0("SEB_", h[25:length(h)]) # SEB add
df.nano <- cbind(df.nano, d4)
# the next line is a request from by LARS ROGGE
genes_to_remove <- c('HLA_DRB1','HLA_DQA1','HLA_DQB1','HLA_DRA','HLA_DRB3',
'KIR_Activating_Subgroup_1','KIR_Activating_Subgroup_2',
'KIR_Inhibiting_Subgroup_1','KIR_Inhibiting_Subgroup_2')
df.nano <- df.nano[, ! (colnames(df.nano) %in% genes_to_remove)]
################ Data modification
fun_export_data(path = path.out, data = "################################ DATA MODIFICATION AND GENE FILTERING", output = log.file)
# h <- ifelse(is.na(h3), paste(h2, h1, sep="_"), h3)
# colnames(df.nano) <- h
# rownames(df.nano) <- paste0("row", rownames(df.nano))
......@@ -295,13 +303,136 @@ i_seb <- grep("^SEB_", colnames(df.nano))
# cli <- df.nano[, -c(i_lps, i_seb)] # clinical data columns
lps <- log2(df.nano[, i_lps]) # log2 of the LPS columns
seb <- log2(df.nano[, i_seb]) # log2 of the SEB columns
################ Gene filtering (code from DifferentialAnalysis_20190306.Rmd)
# ```{r mix, cache=TRUE}
# a <- c(unlist(lps), unlist(seb))
# fixed random seed required for mixtools::normalmixEM. BEWARE: I had to put a new random seed after this gene filtering step
used.set.seed_normalmix <- 12345
set.seed(used.set.seed_normalmix)
tempo.cat <- paste0("BEWARE: NON RANDOM set.seed(", used.set.seed_normalmix, ") SYSTEMATICALLY ACTIVATED FOR GENE FILTERING STEP")
fun_export_data(path = path.out, data = tempo.cat, output = log.file)
backup.name <- c(backup.name, "used.set.seed_normalmix")
a <- unlist(seb)
b <- unlist(lps)
garba <- capture.output(resa <- mixtools::normalmixEM(a, k=2, verb = FALSE))
garbb <- capture.output(resb <- mixtools::normalmixEM(b, k=2, verb = FALSE))
preda <- apply(resa$posterior, 1, which.max)
predb <- apply(resb$posterior, 1, which.max)
thresha <- min(max(a[preda == 1]), max(a[preda == 2]))
threshb <- min(max(b[predb == 1]), max(b[predb == 2]))
thresh <- mean(c(thresha, threshb))
dat_filt <- data.frame(logExpr = c(a, b), Stimulation = rep(c("SEB", "LPS"), c(length(a), length(b))))
gg_color_hue <- function(n) {
hues <- seq(15, 375, length = n + 1)
hcl(h = hues, l = 65, c = 100)[1:n]
}
twocols <- gg_color_hue(2)
distrib.graph <- ggplot(dat_filt, aes(x=logExpr, fill = Stimulation)) +
geom_density(color = NA, alpha = 0.5) +
theme_bw() +
labs(x="Log2 expression") +
geom_vline(xintercept = thresha, color=twocols[2], linetype = "dashed") +
geom_vline(xintercept = threshb, color=twocols[1], linetype = "dashed")
tempo <- dev.set(pdf.nb) # assign to avoid the message
print(distrib.graph)
# ```
# The value for the threshold is equal to `r round(thresha, 3)` (in log2 scale, which corresponds to `r sprintf("%0.3f", 2**thresha)` counts) for SEB. The value for the threshold is equal to `r threshb` (in log2 scale, which corresponds to `r round(2**threshb, 3)` counts) for LPS. We could use either both thresholds or (discutably) choose a common threshold for both tables, that could for example be equal to $\frac{`r round(thresha, 3)` + `r round(threshb, 3)`}{2} = `r round(thresh, 3)`$. The following barplot represents how many genes have a certain number of low values.
# ```{r count}
tab_seb <- table(colSums(seb < thresha))
tab_lps <- table(colSums(lps < threshb))
dat.count <- data.frame(i = as.numeric(c(names(tab_seb), names(tab_lps))),
count = c(tab_seb, tab_lps),
Stimulation = rep(c("SEB", "LPS"),
c(length(tab_seb), length(tab_lps))))
freq.graph <- ggplot(dat.count, aes(x = i, y=count, fill = Stimulation)) +
geom_col(position = "dodge") +
theme_bw() +
scale_y_log10() +
labs(x = "Number of low values",
y = "Number of genes")
tempo <- dev.set(pdf.nb) # assign to avoid the message
print(freq.graph)
# ```
# E.g., there are `r tab_seb["80"]` genes in the SEB table that are **below** the threshold for all the patients and `r tab_lps["80"]` genes in the LPS table that are in the same situation. Conversely, there are `r tab_seb["0"]` genes in the SEB table that are **above** the threshold for all the patients and `r tab_lps["0"]` genes in the LPS table that are in the same situation.
# Based on this preliminary analysis, we remove the genes that have more than 5 % of "low values" for each stimulation (LPS and SEB), the thresholds being computed separately for each table. The Venn diagram represented below summs up the structure of the selected genes. The genes that are outside the intersection are given below.
# ```{r remove_noise}
signal_seb <- which(colSums(seb < thresha) <= 0.05 * nrow(df.nano)) # "kept" genes list
signal_lps <- which(colSums(lps < threshb) <= 0.05 * nrow(df.nano)) # "kept" genes list
el_lps <- gsub("LPS_", "", names(signal_lps)) # kept gene list without prefix LPS
el_seb <- gsub("SEB_", "", names(signal_seb)) # kept gene list without prefix LPS
rm_signal_lps <- names(lps)[ ! names(lps) %in% names(signal_lps)]
rm_signal_seb <- names(seb)[ ! names(seb) %in% names(signal_seb)]
fun_export_data(path = path.out, data = paste0("KEPT LPS GENES: "), output = log.file, sep = 1)
fun_export_data(path = path.out, data = names(signal_lps), output = log.file)
fun_export_data(path = path.out, data = paste0("ELIMINATED LPS GENES: "), output = log.file, sep = 1)
fun_export_data(path = path.out, data = rm_signal_lps, output = log.file)
fun_export_data(path = path.out, data = paste0("KEPT SEB GENES: "), output = log.file, sep = 1)
fun_export_data(path = path.out, data = names(signal_seb), output = log.file)
fun_export_data(path = path.out, data = paste0("ELIMINATED SEB GENES: "), output = log.file, sep = 1)
fun_export_data(path = path.out, data = rm_signal_seb, output = log.file)
# futile.logger::flog.threshold(futile.logger::ERROR, name = "VennDiagramLogger") # to remove the .log file generated by VennDiagram
VD <- VennDiagram::venn.diagram(list(LPS = el_lps, SEB = el_seb),
filename = NULL,
fill = c("steelblue", "darkorchid"))
tempo <- dev.set(pdf.nb) # assign to avoid the message
grid.newpage() ; grid.draw(VD)
# setdiff(union(el_lps, el_seb), intersect(el_lps, el_seb))
fun_export_data(path = path.out, data = paste0("GENES UNIQUE IN LPS AFTER FILTERING: "), output = log.file, sep = 1)
fun_export_data(path = path.out, data = setdiff(el_lps, el_seb), output = log.file)
fun_export_data(path = path.out, data = paste0("GENES UNIQUE IN SEB AFTER FILTERING: "), output = log.file, sep = 1)
fun_export_data(path = path.out, data = setdiff(el_seb, el_lps), output = log.file)
# ```
# ```{r train_valid}
lps_filtered <- lps[, signal_lps]
seb_filtered <- seb[, signal_seb]
# saving data
df.nano.ini <- df.nano
backup.name <- c(backup.name, "df.nano.ini")
df.nano <- df.nano[, ! (colnames(df.nano) %in% c(rm_signal_lps, rm_signal_seb))]
backup.name <- c(backup.name, "df.nano")
# resetting the random.seed because of the fixed one used.set.seed_normalmix <- 12345 added for gene filtering
if(random.seed == TRUE){
used.set.seed2 <- sample(x = 0:(2^31-1), size = 1)
}else{
used.set.seed2 <- 1
tempo.cat <- paste0("\n\n================\n\nBEWARE: NON RANDOM set.seed(1) FUNCTION ACTIVATED \n\n================\n\n")
cat(tempo.cat)
fun_export_data(path = path.out, data = tempo.cat, output = log.file)
}
set.seed(used.set.seed2)
backup.name <- c(backup.name, "used.set.seed2")
################ End Gene filtering (code from DifferentialAnalysis_20190306.Rmd)
# colnames(lps) <- sapply(strsplit(colnames(lps), "_"), function(x) sprintf("%s-%s", paste0(x[-1], collapse = "_"), x[1])) # remane the column LPS_XXX into XXX-LPS
# colnames(seb) <- sapply(strsplit(colnames(seb), "_"), function(x) sprintf("%s-%s", paste0(x[-1], collapse = "_"), x[1]))
dat <- data.frame(Y, lps, seb)
# next line inactivated because of the new line next after for gene filtering
# dat <- data.frame(Y, lps, seb)
dat <- data.frame(Y, lps_filtered, seb_filtered)
# export df.nano lps seb cvr Y dat
if(slurm.loop.nb == 1){
# df.nano : the whole data table
# dat : df.nano with only
# dat : df.nano with only Y and genes
save(list=c("df.nano", "dat"), file = paste0(path.out, "complete.data.table.RData"))
}else{
fun_export_data(path = path.out, data = paste0("SEE loop1 DIRECTORY TO GET THE complete.data.table.RData CONTAINING THE df.nano FILE", optional.text), output = log.file)
......@@ -350,7 +481,9 @@ backup.name <- c(backup.name, "train", "valid")
# The table of the genes with an adjusted p-value lower than $.05$ is reported below.
if(any((analysis.kind == "longit" & slurm.loop.nb == 1) | (analysis.kind == "valid_boot" & slurm.loop.nb == 1) | (analysis.kind == "full_cross_validation"))){
X.limma <- t(cbind(lps, seb)[train, ]) # cbind the LPS and SEB nanostring data and t() -> dim: 830 67
# next line inactivated because of the new line next after for gene filtering
# X.limma <- t(cbind(lps, seb)[train, ]) # cbind the LPS and SEB nanostring data and t() -> dim: 830 67
X.limma <- t(cbind(lps_filtered, seb_filtered)[train, ]) # cbind the LPS and SEB nanostring data and t() -> dim: 830 67
Y.limma <- df.nano$response_ASDAS_R_NR[train] # response (what has to be predict) among the 67 trainers
# dat <- data.frame(Y, lps[train, ], seb[train, ]) # data frame
# cvr <- df.nano[train, c("Gender", "Age", "B27", "smoke", "CRP_M0", "BASDAI_M0", "ASDAS_M0")] # some of the clinical data in trainers into cvr
......@@ -577,9 +710,6 @@ if(any((analysis.kind == "longit" & slurm.loop.nb == 1) | (analysis.kind == "val
}
######## 4.2 Validate the model
#
fun_export_data(path = path.out, data = "################################ VALIDATION", output = log.file)
......@@ -589,15 +719,15 @@ fun_export_data(path = path.out, data = "################################ VALIDA
# ```{r train, warning=FALSE} #### 20190313 checked compared to ReproducibleCode_20190109.Rmd
if(random.seed == TRUE){
used.set.seed2 <- sample(x = 0:(2^31-1), size = 1)
used.set.seed3 <- sample(x = 0:(2^31-1), size = 1)
}else{
used.set.seed2 <- 1
used.set.seed3 <- 1
tempo.cat <- paste0("\n\n================\n\nBEWARE: NON RANDOM set.seed(1) FUNCTION ACTIVATED \n\n================\n\n")
cat(tempo.cat)
fun_export_data(path = path.out, data = tempo.cat, output = log.file)
}
set.seed(used.set.seed2)
backup.name <- c(backup.name, "used.set.seed2")
set.seed(used.set.seed3)
backup.name <- c(backup.name, "used.set.seed3")
dat.train.extended <- data.frame(dat.train, cvr[train,])
dat.train1.genes <- dat.train.extended[, c("Y", mod.gene.names)]
......@@ -943,15 +1073,15 @@ for(i0 in 1:pages.print.nb){
# ```{r learners_freq, results='asis', cache=TRUE}
if(random.seed == TRUE){
used.set.seed3 <- sample(x = 0:(2^31-1), size = 1)
used.set.seed4 <- sample(x = 0:(2^31-1), size = 1)
}else{
used.set.seed3 <- 1
used.set.seed4 <- 1
tempo.cat <- paste0("\n\n================\n\nBEWARE: NON RANDOM set.seed(1) FUNCTION ACTIVATED \n\n================\n\n")
cat(tempo.cat)
fun_export_data(path = path.out, data = tempo.cat, output = log.file)
}
set.seed(used.set.seed3)
backup.name <- c(backup.name, "used.set.seed3")
set.seed(used.set.seed4)
backup.name <- c(backup.name, "used.set.seed4")
lrn.rf.filter.25 <- mlr::makeFilterWrapper(learner = lrn.rf,
fw.method = "randomForest.importance",
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment