Commit f34fb699 authored by Gael  MILLOT's avatar Gael MILLOT
Browse files

tempo

parent df2d7a75
......@@ -354,23 +354,23 @@ fun_report(data = paste0("\n\n<br /><br />Beginning of the motif positions in th
fun_report(data = head(fw), output = report.rmd, path = "./", overwrite = FALSE)
fun_report(data = paste0("\n\n<br /><br />Beginning of the motif positions in the reverse strand:\n\n"), output = report.rmd, path = "./", overwrite = FALSE)
fun_report(data = head(rv), output = report.rmd, path = "./", overwrite = FALSE)
fun_report(data = paste0("\n\n<br /><br />NUMBER OF FORWARD 5\'", motif_fw, "3\' SITES:\n", format(nrow(fw), big.mark=","), output = report.rmd, path = "./", overwrite = FALSE)
fun_report(data = paste0("\n\n<br /><br />NUMBER OF REVERSE 5\'", motif_rv, "3\' SITES:\n", format(nrow(rv), big.mark=","), output = report.rmd, path = "./", overwrite = FALSE)
fun_report(data = paste0("\n\n<br /><br />NUMBER OF FORWARD 5\'", motif_fw, "3\' SITES:\n", format(nrow(fw), big.mark=",")), output = report.rmd, path = "./", overwrite = FALSE)
fun_report(data = paste0("\n\n<br /><br />NUMBER OF REVERSE 5\'", motif_rev, "3\' SITES:\n", format(nrow(rv), big.mark=",")), output = report.rmd, path = "./", overwrite = FALSE)
fw <- data.frame(fw, orient = 0) # forward strand
if(any( ! grepl(fw$seq, pattern = motif_fw)){stop("\n\nERROR\n\n")}
if(any( ! grepl(fw$seq, pattern = motif_fw))){stop("\n\nERROR\n\n")}
rv <- data.frame(rv, orient = 16) # reverse strand
if(any( ! grepl(rv$seq, pattern = motif_rv)){stop("\n\nERROR\n\n")}
if(any( ! grepl(rv$seq, pattern = motif_rev))){stop("\n\nERROR\n\n")}
a <- rbind(fw, rv, stringsAsFactors = FALSE)
a$pos <- a$pos + 2 # because pos START AT ZERO AND IS FOR THE FIRST LEFT NUC
a <- data.frame(a, fork = a$orient, stringsAsFactors = TRUE)
ori.center <- mean(ori.coord, na.rm = TRUE)
dif.center <- mean(dif.coord, na.rm = TRUE)
ori.center <- mean(ori_coord, na.rm = TRUE)
ter.center <- mean(ter_coord, na.rm = TRUE)
if(dif.center > ori.center){
a$fork[a$pos < ori.center | a$pos > dif.center] <- abs(a$fork[a$pos < ori.center | a$pos > dif.center] - 16) # left of the Ori and right of the dif (dif > ori) is switch 0 -> 16 and 16 -> 0 to have the leading and lagging
if(ter.center > ori.center){
a$fork[a$pos < ori.center | a$pos > ter.center] <- abs(a$fork[a$pos < ori.center | a$pos > ter.center] - 16) # left of the Ori and right of the dif (dif > ori) is switch 0 -> 16 and 16 -> 0 to have the leading and lagging
}else{
a$fork[a$pos <= ori.center & a$pos >= dif.center] <- abs(a$fork[a$pos <= ori.center & a$pos >= dif.center] - 16) # right of the Ori and left of the dif (dif < ori) is switch 0 -> 16 and 16 -> 0 to have the leading and lagging
a$fork[a$pos <= ori.center & a$pos >= ter.center] <- abs(a$fork[a$pos <= ori.center & a$pos >= ter.center] - 16) # right of the Ori and left of the dif (dif < ori) is switch 0 -> 16 and 16 -> 0 to have the leading and lagging
}
a$fork<- factor(a$fork, levels = c(0, 16), labels = c("LEADING", "LAGGING"))
......@@ -383,38 +383,38 @@ b$orient[b$orient == 16] <- "REVERSE"
# verifications right of Ori
fun_report(data = "\n\n<br /><br />\n\nON THE RIGHT OF THE ORI AND LEFT OF THE DIFF\nFORWARD INSERSION = LEADING\nREVERSE INSERSION = LAGGING", output = report.rmd, path = "./", overwrite = FALSE)
d <- b[b$pos > ori.center & b$p < dif.center,]
d <- b[b$pos > ori.center & b$p < ter.center,]
fun_report(data = addmargins(table(d[, c("fork", "orient")])), output = report.rmd, path = "./", overwrite = FALSE)
fun_report(data = addmargins(table(d$names)), output = report.rmd, path = "./", overwrite = FALSE)
# verifications left of Ori
fun_report(data = "\n\n<br /><br />\n\nON THE LEFT OF THE ORI OR RIGHT OF THE DIFF\nFORWARD INSERSION = LAGGING\nREVERSE INSERSION = LEADING", output = report.rmd, path = "./", overwrite = FALSE)
d <- b[b$pos < ori.center | b$p > dif.center,]
d <- b[b$pos < ori.center | b$p > ter.center,]
fun_report(data = addmargins(table(d[, c("fork", "orient")])), output = report.rmd, path = "./", overwrite = FALSE)
fun_report(data = addmargins(table(d$names)), output = report.rmd, path = "./", overwrite = FALSE)
# bed like file
options(scipen = 1000) # to avoid writing of scientific numbers in tables, see https://stackoverflow.com/questions/3978266/number-format-writing-1e-5-instead-of-0-00001
write.table(b, file = paste0("./motif_sites.freq"), row.names = FALSE, col.names = TRUE, append = FALSE, quote = FALSE, sep = "\t")
write.table(b, file = paste0("./motif_sites.pos"), row.names = FALSE, col.names = TRUE, append = FALSE, quote = FALSE, sep = "\t")
options(scipen = 0)
# main
ori.center <- mean(ori.coord, na.rm = TRUE)
dif.center <- mean(dif.coord, na.rm = TRUE)
p.fw.lead <- (dif.center - ori.center) / coli.size
ori.center <- mean(ori_coord, na.rm = TRUE)
ter.center <- mean(ter_coord, na.rm = TRUE)
p.fw.lead <- (ter.center - ori.center) / genome_size
p.fw.lag <- 1 - p.fw.lead
p.rv.lead <- p.fw.lag
p.rv.lag <- p.fw.lead
fun_report(data = paste0("\n\n<br /><br />\n\nPROPORTION OF THE FORWARD LEADING PART OF THE GENOME [(dif.center - ori.center) / coli.size]\n", round(p.fw.lead, 3), "\n\n"), output = report.rmd, path = "./", overwrite = FALSE)
fun_report(data = paste0("\n\n<br /><br />\n\nPROPORTION OF THE FORWARD LEADING PART OF THE GENOME [(ter.center - ori.center) / genome_size]\n", round(p.fw.lead, 3), "\n\n"), output = report.rmd, path = "./", overwrite = FALSE)
fun_report(data = paste0("\n\n<br /><br />\n\nPROPORTION OF THE FORWARD LAGGING PART OF THE GENOME (1 - p.fw.lead)\n", round(p.fw.lag, 3), "\n\n"), output = report.rmd, path = "./", overwrite = FALSE)
fun_report(data = paste0("\n\n<br /><br />\n\nPROPORTION OF THE REVERSE LEADING PART OF THE GENOME (= p.fw.lag)\n", round(p.rv.lead, 3), "\n\n"), output = report.rmd, path = "./", overwrite = FALSE)
fun_report(data = paste0("\n\n<br /><br />\n\nPROPORTION OF THE REVERSE LAGGING PART OF THE GENOME (= p.fw.lead)\n", round(p.rv.lag, 3), "\n\n"), output = report.rmd, path = "./", overwrite = FALSE)
# theo.p <- c(p.fw.lag = 0.503 / 2, p.rv.lag = 0.497 / 2, p.fw.lead = 0.497 / 2, p.rv.lead = 0.503 / 2) # from 20201209 section 43.2
theo.p <- c(p.fw.lag = p.fw.lag / 2, p.rv.lag = p.rv.lag / 2, p.fw.lead = p.fw.lead / 2, p.rv.lead = p.rv.lead / 2) # from 20201209 section 43.2
# main
final <- b
......
......@@ -72,10 +72,13 @@ if(interactive() == FALSE){ # if(grepl(x = commandArgs(trailingOnly = FALSE), pa
}
tempo.arg.names <- c(
"pos",
"freq",
"ori_coord",
"ter_coord",
"xlab",
"genome_size",
"window_size",
"step",
"file_name",
"cute",
"log"
......@@ -101,11 +104,21 @@ rm(tempo.cat)
################################ Test
# cat("\n\n!!!!!!!!!!!!!!!!!!! WARNING: test values are activated\n\n")
# stat <- "C:/Users/Gael/Documents/Git_projects/14985_loot/dataset/test.fastq_Nremove_trim_5pAttc_1-51.stat"
# attc_seq <- "CAATTCATTCAAGCCGACGCCGCTTCGCGGCGCGGCTTAATTCAAGCG"
# cute <- "https://gitlab.pasteur.fr/gmillot/cute_little_R_functions/-/raw/v10.9.0/cute_little_R_functions.R"
# log <- "report.txt"
# pos <- "C:\\Users\\gael\\Documents\\Git_projects\\14985_loot\\work\\ec\\0c01dd6e92fc4bfa0f504ab1a42412\\obs_rd_insertions.pos"
# freq <- "C:\\Users\\gael\\Documents\\Git_projects\\14985_loot\\work\\ec\\0c01dd6e92fc4bfa0f504ab1a42412\\obs_rd_insertions.freq"
# ori_coord <- "2320711 2320942"
# ter_coord <- "4627368 4627400"
# xlab <- "Ecoli Genome (bp)"
# genome_size <- "4641652"
# window_size <- "50000 200000"
# step <- "100"
# file_name <- "test.fastq2"
# cute <- "https://gitlab.pasteur.fr/gmillot/cute_little_R_functions/-/raw/v10.9.0/cute_little_R_functions.R"
# log <- "plot_insertion_report.txt"
################################ end Test
......@@ -119,10 +132,13 @@ param.list <- c(
"run.way",
if(run.way == "SCRIPT"){"command"},
"pos",
"freq",
"ori_coord",
"ter_coord",
"xlab",
"genome_size",
"window_size",
"step",
"file_name",
"cute",
"log"
......@@ -231,10 +247,13 @@ text.check <- NULL #
checked.arg.names <- NULL # for function debbuging: used by r_debugging_tools
ee <- expression(arg.check <- c(arg.check, tempo$problem) , text.check <- c(text.check, tempo$text) , checked.arg.names <- c(checked.arg.names, tempo$object.name))
tempo <- fun_check(data = pos, class = "vector", typeof = "character", length = 1) ; eval(ee)
tempo <- fun_check(data = freq, class = "vector", typeof = "character", length = 1) ; eval(ee)
tempo <- fun_check(data = ori_coord, class = "vector", typeof = "character", length = 1) ; eval(ee)
tempo <- fun_check(data = ter_coord, class = "vector", typeof = "character", length = 1) ; eval(ee)
tempo <- fun_check(data = xlab, class = "vector", typeof = "character", length = 1) ; eval(ee)
tempo <- fun_check(data = genome_size, class = "vector", typeof = "character", length = 1) ; eval(ee)
tempo <- fun_check(data = window_size, class = "vector", typeof = "character", length = 1) ; eval(ee)
tempo <- fun_check(data = step, class = "vector", typeof = "character", length = 1) ; eval(ee)
tempo <- fun_check(data = file_name, class = "vector", typeof = "character", length = 1) ; eval(ee)
tempo <- fun_check(data = cute, class = "vector", typeof = "character", length = 1) ; eval(ee)
tempo <- fun_check(data = log, class = "vector", typeof = "character", length = 1) ; eval(ee)
......@@ -247,11 +266,13 @@ if(any(arg.check) == TRUE){ # normally no NA
# end management of NA arguments
# management of NULL arguments
tempo.arg <-c(
"pos",
"freq",
"ori_coord",
"ter_coord",
"xlab",
"genome_size",
"window_size",
"step",
"file_name",
"cute",
"log"
......@@ -273,25 +294,38 @@ warn <- NULL
# other checkings
ori_coord <- strsplit(ori_coord, split = " ")[[1]]
if(length(ori_coord) != 2 & any(grepl(ori_coord, pattern = "\\D"))){# normally no NA with is.null()
if(length(ori_coord) != 2 & any(grepl(ori_coord, pattern = "\\D"))){ # "\\D" means at least something other than 0123456789 to have TRUE
tempo.cat <- paste0("ERROR IN plot_insertion.R:\nTHE ori_coord PARAMETER MUST BE TWO INTEGERS SEPARATED BY A SINGLE SPACE\nHERE IT IS: \n", paste0(ori_coord, collapse = " "))
stop(paste0("\n\n================\n\n", tempo.cat, "\n\n================\n\n"), call. = FALSE) # == in stop() to be able to add several messages between ==
}else{
ori_coord <- as.integer(ori_coord)
}
ter_coord <- strsplit(ter_coord, split = " ")[[1]]
if(length(ter_coord) != 2 & any(grepl(ter_coord, pattern = "\\D"))){# normally no NA with is.null()
if(length(ter_coord) != 2 & any(grepl(ter_coord, pattern = "\\D"))){ # "\\D" means at least something other than 0123456789 to have TRUE
tempo.cat <- paste0("ERROR IN plot_insertion.R:\nTHE ter_coord PARAMETER MUST BE TWO INTEGERS SEPARATED BY A SINGLE SPACE\nHERE IT IS: \n", paste0(ter_coord, collapse = " "))
stop(paste0("\n\n================\n\n", tempo.cat, "\n\n================\n\n"), call. = FALSE) # == in stop() to be able to add several messages between ==
}else{
ter_coord <- as.integer(ter_coord)
}
if(length(genome_size) != 1 & any(grepl(genome_size, pattern = "\\D"))){# normally no NA with is.null()
if(any(grepl(genome_size, pattern = "\\D"))){ # "\\D" means at least something other than 0123456789 to have TRUE, not need of length(step) != 1 because of the space between two numbers
tempo.cat <- paste0("ERROR IN plot_insertion.R:\nTHE genome_size PARAMETER MUST BE A SINGLE INTEGER\nHERE IT IS: \n", paste0(genome_size, collapse = " "))
stop(paste0("\n\n================\n\n", tempo.cat, "\n\n================\n\n"), call. = FALSE) # == in stop() to be able to add several messages between ==
}else{
genome_size <- as.integer(genome_size)
}
window_size <- strsplit(window_size, split = " ")[[1]]
if(any(grepl(window_size, pattern = "\\D"))){ # "\\D" means at least something other than 0123456789 to have TRUE
tempo.cat <- paste0("ERROR IN plot_insertion.R:\nTHE window_size PARAMETER MUST BE INTEGER\nHERE IT IS: \n", paste0(window_size, collapse = " "))
stop(paste0("\n\n================\n\n", tempo.cat, "\n\n================\n\n"), call. = FALSE) # == in stop() to be able to add several messages between ==
}else{
window_size <- as.integer(window_size)
}
if(any(grepl(step, pattern = "\\D"))){ # "\\D" means at least something other than 0123456789 to have TRUE, not need of length(step) != 1 because of the space between two numbers
tempo.cat <- paste0("ERROR IN plot_insertion.R:\nTHE step PARAMETER MUST BE A SINGLE INTEGER\nHERE IT IS: \n", paste0(step, collapse = " "))
stop(paste0("\n\n================\n\n", tempo.cat, "\n\n================\n\n"), call. = FALSE) # == in stop() to be able to add several messages between ==
}else{
step <- as.integer(step)
}
# end other checkings
......@@ -344,7 +378,8 @@ if(erase.graphs == TRUE){
################ Data import
obs <- read.table(pos, stringsAsFactors = FALSE) # does not take the header
obs.rd.freq <- read.table(freq, stringsAsFactors = FALSE, header = TRUE, sep = "\t")
obs.rd.pos <- read.table(pos, stringsAsFactors = FALSE, header = TRUE, sep = "\t")
################ end Data import
......@@ -353,11 +388,16 @@ obs <- read.table(pos, stringsAsFactors = FALSE) # does not take the header
############ modifications of imported tables
tempo <- max(res$freq) + 20
ori <- data.frame(x= c(ori_coord[1], ori_coord[1], ori_coord[2], ori_coord[2], ori_coord[1]), y = c(-tempo, tempo, tempo, -tempo ,-tempo), ZONE = "Ori", stringsAsFactors = TRUE)
dif <- data.frame(x= c(ter_coord[1], ter_coord[1], ter_coord[2], ter_coord[2], ter_coord[1]), y = c(-tempo, tempo, tempo, -tempo ,-tempo), ZONE = "Ter", stringsAsFactors = TRUE)
structure <- rbind(ori, dif, stringsAsFactors = TRUE)
names(obs.rd.freq)[names(obs.rd.freq) == "seq"] <- "KIND"
tempo <- max(obs.rd.freq$freq) + 20
ori.freq <- data.frame(x= c(ori_coord[1], ori_coord[1], ori_coord[2], ori_coord[2], ori_coord[1]), y = c(-tempo, tempo, tempo, -tempo ,-tempo), ZONE = "Ori", stringsAsFactors = TRUE)
dif.freq <- data.frame(x= c(ter_coord[1], ter_coord[1], ter_coord[2], ter_coord[2], ter_coord[1]), y = c(-tempo, tempo, tempo, -tempo ,-tempo), ZONE = "Ter", stringsAsFactors = TRUE)
structure.freq <- rbind(ori.freq, dif.freq, stringsAsFactors = TRUE)
obs.freq <- obs.rd.freq[obs.rd.freq$KIND == "obs", ]
names(obs.rd.pos)[names(obs.rd.pos) == "seq"] <- "KIND"
obs.pos <- obs.rd.pos[obs.rd.pos$KIND == "obs", ]
############ end modifications of imported tables
......@@ -365,46 +405,16 @@ structure <- rbind(ori, dif, stringsAsFactors = TRUE)
############ plotting
# insertion freq / genome raw distribution on both sides
png(filename = paste0("plot_", file_name, "_insertion_raw.png"), width = 5000, height = 1800, units = "px", res = 300)
if(ncol(obs) > 0){
tempo <- res
tempo$freq[tempo$orient == "REVERSE"] <- tempo$freq[tempo$orient == "REVERSE"] * -1
fun_gg_scatter(
data1 = list(structure, tempo), # res # res[res$KIND == "obs", ]
x = list("x", "pos"),
y = list("y", "freq"),
categ = list("ZONE", NULL),
geom = list("geom_step", "geom_stick"),
geom.stick.base = 0,
color = list(c("black", "brown"), fun_gg_palette(n = 2)[1]), # fun_gg_palette(n = 2) # fun_gg_palette(n = 2)[1]
legend.width = NULL,
title = "RAW",
x.lim = c(0, genome_size),
x.lab = xlab,
x.left.extra.margin = 0,
x.right.extra.margin = 0,
x.second.tick.nb = 4,
y.lab = "Insertion frequency\n<- reverse forward ->",
y.second.tick.nb = 4,
text.size = 24,
title.text.size = 16
)
}else{
fun_gg_empty_graph(text = "EMPTY .freq FILE: NO PLOT DRAWN")
}
# histogram: insertion_number / site_number
png(filename = paste0("plot_", file_name, "_insertion_hist_tot.png"), width = 5000, height = 1800, units = "px", res = 300)
tempo1 <- table(res$freq)
tempo1 <- table(obs.freq$freq)
tempo2 <- names(tempo1)
names(tempo1) <- NULL
tempo3 <- data.frame(insertion_number = as.integer(tempo2), site_number = as.vector(tempo1))
if(ncol(obs) > 0){
if(ncol(obs.freq) > 0){
fun_gg_scatter(
data1 = tempo3, # res # res[res$KIND == "obs", ]
data1 = tempo3, # res # res[res$KIND == "obs.freq", ]
x = "insertion_number",
y = "site_number",
categ = NULL,
......@@ -431,9 +441,9 @@ if(ncol(obs) > 0){
}
png(filename = paste0("plot_", file_name, "_insertion_hist_tot_zoom.png"), width = 5000, height = 1800, units = "px", res = 300)
if(ncol(obs) > 0){
if(ncol(obs.freq) > 0){
fun_gg_scatter(
data1 = tempo3, # res # res[res$KIND == "obs", ]
data1 = tempo3, # res # res[res$KIND == "obs.freq", ]
x = "insertion_number",
y = "site_number",
categ = NULL,
......@@ -462,13 +472,13 @@ if(ncol(obs) > 0){
png(filename = paste0("plot_", file_name, "_insertion_hist_forward.png"), width = 5000, height = 1800, units = "px", res = 300)
tempo1 <- table(res$freq[res$orient == "FORWARD"])
tempo1 <- table(obs.freq$freq[obs.freq$orient == "FORWARD"])
tempo2 <- names(tempo1)
names(tempo1) <- NULL
tempo3 <- data.frame(insertion_number = as.integer(tempo2), site_number = as.vector(tempo1))
if(ncol(obs) > 0){
if(ncol(obs.freq) > 0){
fun_gg_scatter(
data1 = tempo3, # res # res[res$KIND == "obs", ]
data1 = tempo3, # res # res[res$KIND == "obs.freq", ]
x = "insertion_number",
y = "site_number",
categ = NULL,
......@@ -495,13 +505,13 @@ if(ncol(obs) > 0){
}
png(filename = paste0("plot_", file_name, "_insertion_hist_reverse.png"), width = 5000, height = 1800, units = "px", res = 300)
tempo1 <- table(res$freq[res$orient == "REVERSE"])
tempo1 <- table(obs.freq$freq[obs.freq$orient == "REVERSE"])
tempo2 <- names(tempo1)
names(tempo1) <- NULL
tempo3 <- data.frame(insertion_number = as.integer(tempo2), site_number = as.vector(tempo1))
if(ncol(obs) > 0){
if(ncol(obs.freq) > 0){
fun_gg_scatter(
data1 = tempo3, # res # res[res$KIND == "obs", ]
data1 = tempo3, # res # res[res$KIND == "obs.freq", ]
x = "insertion_number",
y = "site_number",
categ = NULL,
......@@ -528,7 +538,108 @@ if(ncol(obs) > 0){
}
# insertion freq / genome binning (sliding windows)
# insertion freq / genome raw distribution on both sides
png(filename = paste0("plot_", file_name, "_insertion_raw.png"), width = 5000, height = 1800, units = "px", res = 300)
if(ncol(obs.rd.freq) > 0){
tempo <- obs.rd.freq
tempo$freq[tempo$orient == "REVERSE"] <- tempo$freq[tempo$orient == "REVERSE"] * -1
fun_gg_scatter(
data1 = list(structure.freq, tempo), # res # res[res$KIND == "obs", ]
x = list("x", "pos"),
y = list("y", "freq"),
categ = list("ZONE", "KIND"),
geom = list("geom_step", "geom_stick"),
geom.stick.base = 0,
color = list(c("black", "brown"), fun_gg_palette(n = 2)), # fun_gg_palette(n = 2) # fun_gg_palette(n = 2)[1]
legend.width = NULL,
title = "RAW",
x.lim = c(0, genome_size),
x.lab = xlab,
x.left.extra.margin = 0,
x.right.extra.margin = 0,
x.second.tick.nb = 4,
y.lab = "Insertion frequency\n<- reverse forward ->",
y.second.tick.nb = 4,
text.size = 24,
title.text.size = 16
)
}else{
fun_gg_empty_graph(text = "EMPTY .freq FILE: NO PLOT DRAWN")
}
for(i0 in window_size){
# leading and lagging
lead.obs <- fun_slide(data = obs.rd.pos$pos[obs.rd.pos$fork== "LEADING" & obs.rd.pos$KIND == "obs"], window.size = i0, step = step, fun = length, from = min(obs.rd.pos$pos), to = max(obs.rd.pos$pos), parall = FALSE, cute.path = cute)
lag.obs <- fun_slide(data = obs.rd.pos$pos[obs.rd.pos$fork== "LAGGING" & obs.rd.pos$KIND == "obs"], window.size = i0, step = step, fun = length, from = min(obs.rd.pos$pos), to = max(obs.rd.pos$pos), parall = FALSE, cute.path = cute)
lead.rd <- fun_slide(data = obs.rd.pos$pos[obs.rd.pos$fork== "LEADING" & obs.rd.pos$KIND == "random"], window.size = i0, step = step, fun = length, from = min(obs.rd.pos$pos), to = max(obs.rd.pos$pos), parall = FALSE, cute.path = cute)
lag.rd <- fun_slide(data = obs.rd.pos$pos[obs.rd.pos$fork== "LAGGING" & obs.rd.pos$KIND == "random"], window.size = i0, step = step, fun = length, from = min(obs.rd.pos$pos), to = max(obs.rd.pos$pos), parall = FALSE, cute.path = cute)
lead.obs <- data.frame(lead.obs, STRAND = "LEADING", KIND = "OBS", stringsAsFactors = TRUE)
lag.obs <- data.frame(lag.obs, STRAND = "LAGGING", KIND = "OBS", stringsAsFactors = TRUE)
total.obs <- lead.obs[, ! names(lead.obs) %in% "STRAND"]
total.obs$value <- lead.obs$value + lag.obs$value
lead.rd <- data.frame(lead.rd, STRAND = "LEADING", KIND = "RANDOM", stringsAsFactors = TRUE)
lag.rd <- data.frame(lag.rd, STRAND = "LAGGING", KIND = "RANDOM", stringsAsFactors = TRUE)
total.rd <- lead.rd[, ! names(lead.rd) %in% "STRAND"]
total.rd$value <- lead.rd$value + lag.rd$value
total <- rbind(total.obs, total.rd)
# check
if( ! isTRUE(all.equal(lead.obs$center, lag.obs$center))){cat("\n\nERROR: CENTERS OF LEAD AND LAG WINDOWS NOT THE SAME IN OBS\n\n")}
if( ! isTRUE(all.equal(lead.rd$center, lag.rd$center))){cat("\n\nERROR: CENTERS OF LEAD AND LAG WINDOWS NOT THE SAME IN RANDOM\n\n")}
# oriC data frame generation
tempo.max <- max(total$value) + 100
tempo.min <- - 100
ori <- data.frame(x= c(ori_coord[1], ori_coord[1], ori_coord[2], ori_coord[2], ori_coord[1]), y = c(tempo.min, tempo.max, tempo.max, tempo.min ,tempo.min), ZONE = "oriC", stringsAsFactors = TRUE)
dif <- data.frame(x= c(ter_coord[1], ter_coord[1], ter_coord[2], ter_coord[2], ter_coord[1]), y = c(tempo.min, tempo.max, tempo.max, tempo.min ,tempo.min), ZONE = "dif", stringsAsFactors = TRUE)
structure <- rbind(ori, dif, stringsAsFactors = TRUE)
if(ncol(total) > 0){
# graph of total frequency
res <- fun_gg_scatter(data1 = list(structure, total),
x = list("x", "center"),
y = list("y", "value"),
geom = list("geom_step", "geom_line"),
categ = list("ZONE", "KIND"),
alpha = 1,
color = list(c("black", "brown"), c(fun_gg_palette(n = 8)[7], fun_gg_palette(n = 2)[2])),
dot.size = 0.7,
line.size = 1,
x.lab = xlab,
x.lim = xlim,
x.tick.nb = 6,
x.second.tick.nb = 3,
x.left.extra.margin = 0,
x.right.extra.margin = 0,
y.lab = "Number of insertion sites",
y.lim = ylim,
y.tick.nb = 6,
y.second.tick.nb = 3,
y.top.extra.margin = 0,
y.bottom.extra.margin = 0,
grid = FALSE,
article = TRUE,
legend.width = 0.2,
text.size = 16,
title = paste0("INSERTION SITE FREQUENCY (WARNING: MAX ~ 100 INTEG PER POSITION, SEE 24.7.2) | WINDOW SIZE: ", i0, " | STEP: ", step, " | GRAPH "),
title.text.size = 7,
return = TRUE
)
ggplot2::ggsave(paste0("total_insertion_site_freq_", i0, ".png"), plot = res$gtable, device = "png", width = 5000, height = 1800, units = "px", res = 300)
}else{
png(filename = paste0("total_insertion_site_freq_", i0, ".png"), width = 5000, height = 1800, units = "px", res = 300)
fun_gg_empty_graph(text = "EMPTY pos FILE: NO PLOT DRAWN")
}
}
......
This diff is collapsed.
......@@ -658,7 +658,7 @@ process seq_around_insertion { // sections 24.9.1 of the labbook 20200707
output:
file "${file_name}_around_insertion.bed" into around_insertion_bed_ch1
file "seq_around_insertion_report.txt"
file "report.rmd" into log_ch18
file "report.rmd" into log_ch17
script:
"""
......@@ -759,7 +759,7 @@ process report2 {
val insertion_dist
output:
file "report.rmd" into log_ch19
file "report.rmd" into log_ch18
script:
"""
......@@ -793,7 +793,7 @@ process logo { // 24.9.3 of the labbook 20200707
val cute_path
output:
file "*.png" into fig_ch7 // warning: 4 files
file "*.png" into fig_ch6 // warning: 4 files
file "logo_report.txt"
script:
......@@ -817,9 +817,9 @@ process global_logo { // 24.9.3 of the labbook 20200707
val cute_path
output:
file "global_logo_${file_name}.png" into fig_ch8
file "global_logo_${file_name}.png" into fig_ch7
file "global_logo_report.txt"
file "report.rmd" into log_ch20
file "report.rmd" into log_ch19
script:
"""
......@@ -851,20 +851,22 @@ process final_insertion_files { // 44.1 of the labbook 20201210
file "${file_name}_annot.pos" into pos_ch1
file "${file_name}_annot_insertion.freq"
file "final_insertion_files_report.txt"
file "report.rmd" into log_ch21
file "report.rmd" into log_ch20
script:
"""
echo -e "\\n\\n<br /><br />\\n\\n### Final insertion site files\\n\\n" > report.rmd
echo -e "\\n\\n<br /><br />\\n\\nSee the [${file_name}_annot.pos](./files/${file_name}_annot.pos) and [${file_name}_annot.freq](./files/${file_name}_annot.freq) files\\n\\n" >> report.rmd
final_insertion_files.R "${pos}" "${ori_coord}" "${ter_coord}" "${file_name}" "${cute_path}" "final_insertion_files_report.txt"
pos_nb=\$(wc -l ${file_name}_annot.pos | cut -f1 -d' ')
echo -e "\\n\\nNumber of different positions: \$(printf "%'d" \${pos_nb})\\n" >> report.rmd
"""
}
process motif { // 43 of the labbook 20201209
label 'r_ext' // see the withLabel: bash in the nextflow config file
publishDir "${out_path}/files", mode: 'copy', pattern: "{*.pos}", overwrite: false
publishDir "${out_path}/files", mode: 'copy', pattern: "motif_sites.pos", overwrite: false
publishDir "${out_path}/reports", mode: 'copy', pattern: "motif_report.txt", overwrite: false
cache 'true'
......@@ -878,8 +880,7 @@ process motif { // 43 of the labbook 20201209
val cute_path
output:
file "motif_fw.pos"
file "motif_rev.pos"
file "motif_sites.pos" into motif_ch
file "motif_report.txt"
file "report.rmd" into log_ch21
......@@ -902,15 +903,9 @@ process motif { // 43 of the labbook 20201209
}
process random_insertion { // sections 44.1 and 44.2 of the labbook 20201210
process random_insertion { // sections 44 of the labbook 20201210
label 'r_ext' // see the withLabel: bash in the nextflow config file
publishDir "${out_path}/figures", mode: 'copy', pattern: "{*.png}", overwrite: false // https://docs.oracle.com/javase/tutorial/essential/io/fileOps.html#glob
publishDir "${out_path}/files", mode: 'copy', pattern: "{*.pos,*.freq}", overwrite: false
publishDir "${out_path}/reports", mode: 'copy', pattern: "random_insertion_report.txt", overwrite: false
cache 'true'
......@@ -918,68 +913,108 @@ process random_insertion { // sections 44.1 and 44.2 of the labbook 20201210
input:
val file_name
file pos from pos_ch1
file motif_pos from motif_ch
val ori_coord
val ter_coord
val motif
val insertion_dist
val motif_fw
val genome_size
val cute_path
output:
file "${file_name}_around_insertion.bed" into around_insertion_bed_ch1
file "*.png" into fig_ch8 // warning: 6 files
file "obs_rd_insertions.pos" into obs_rd_insertions_pos_ch1
file "obs_rd_insertions.freq" into obs_rd_insertions_freq_ch1
file "random_insertion_report.txt"
file "report.rmd" into log_ch22
script:
"""
echo -e "\\n\\n<br /><br />\\n\\n### Random insertion sites\\n\\n" > report.rmd
random_insertion.R "${pos}" "${ori_coord}" "${ter_coord}" "${motif}" "${insertion_dist}" "${file_name}" "${cute_path}" "random_insertion_report.txt"
echo -e "\\n\\n<br /><br />\\n\\nIn each sequence of length \$((${insertion_dist} * 2)) <br />position \$((${insertion_dist} + 1)) corresponds to the first nucleotide of the reference genome part of the read" >> report.rmd
echo -e "\\n\\n<br /><br />\\n\\n### Random insertion sites\\n\\n" > report.rmd
random_insertion.R "${pos}" "${motif_pos}" "${ori_coord}" "${ter_coord}" "${motif_fw}" "${genome_size}" "${file_name}" "${cute_path}" "random_insertion_report.txt" "report.rmd"
echo -e "\\n\\n<br /><br />\\n\\n#### Insertion site counts\\n\\n" > report.rmd
echo -e '
\\n\\n<br /><br />\\n\\n</center>\\n\\n
![Figure 13: Number of motifs insertion per fork](./figures/plot_motif_insertion_per_fork.png){width=600}
\\n\\n</center>\\n\\n
\\n\\n<br /><br />\\n\\n</center>\\n\\n
![Figure 14: Number of motifs insertion per strand](./figures/plot_motif_insertion_per_strand.png){width=600}
\\n\\n</center>\\n\\n
\\n\\n<br /><br />\\n\\n</cente