Commit e26f44e6 authored by Gael  MILLOT's avatar Gael MILLOT
Browse files

This version works without -resume. Remain to solve this and to add Goalign

parent c6752c1a
......@@ -72,13 +72,13 @@ if(interactive() == FALSE){ # if(grepl(x = commandArgs(trailingOnly = FALSE), pa
}
tempo.arg.names <- c(
"freq",
"selected_freq",
"ori_coord",
"ter_coord",
"genome_size",
"xlab",
"file_name",
"kind",
"nb_max_insertion_sites",
"insertion_dist",
"cute",
"log"
......@@ -113,7 +113,6 @@ rm(tempo.cat)
# genome_size <- "4641652"
# file_name <- "caca"
# kind <- "dup"
# nb_max_insertion_sites <- "6"
# insertion_dist <- "20"
# cute <- "https://gitlab.pasteur.fr/gmillot/cute_little_R_functions/-/raw/v11.2.0/cute_little_R_functions.R"
# log <- "report.txt"
......@@ -131,13 +130,13 @@ param.list <- c(
"run.way",
if(run.way == "SCRIPT"){"command"},
"freq",
"selected_freq",
"ori_coord",
"ter_coord",
"genome_size",
"xlab",
"file_name",
"kind",
"nb_max_insertion_sites",
"insertion_dist",
"cute",
"log"
......@@ -199,10 +198,8 @@ if(length(cute) != 1){
req.function <- c(
"fun_check",
"fun_pack",
"fun_df_remod",
"fun_gg_scatter",
"fun_gg_palette",
"fun_open",
"fun_gg_empty_graph",
"fun_report"
)
......@@ -246,13 +243,13 @@ text.check <- NULL #
checked.arg.names <- NULL # for function debbuging: used by r_debugging_tools
ee <- expression(arg.check <- c(arg.check, tempo$problem) , text.check <- c(text.check, tempo$text) , checked.arg.names <- c(checked.arg.names, tempo$object.name))
tempo <- fun_check(data = freq, class = "vector", typeof = "character", length = 1) ; eval(ee)
tempo <- fun_check(data = selected_freq, class = "vector", typeof = "character", length = 1) ; eval(ee)
tempo <- fun_check(data = ori_coord, class = "vector", typeof = "character", length = 1) ; eval(ee)
tempo <- fun_check(data = ter_coord, class = "vector", typeof = "character", length = 1) ; eval(ee)
tempo <- fun_check(data = genome_size, class = "vector", typeof = "character", length = 1) ; eval(ee)
tempo <- fun_check(data = xlab, class = "vector", typeof = "character", length = 1) ; eval(ee)
tempo <- fun_check(data = file_name, class = "vector", typeof = "character", length = 1) ; eval(ee)
tempo <- fun_check(data = kind, options = c("dup", "nodup"), length = 1) ; eval(ee)
tempo <- fun_check(data = nb_max_insertion_sites, class = "vector", typeof = "character", length = 1) ; eval(ee)
tempo <- fun_check(data = insertion_dist, class = "vector", typeof = "character", length = 1) ; eval(ee)
tempo <- fun_check(data = cute, class = "vector", typeof = "character", length = 1) ; eval(ee)
tempo <- fun_check(data = log, class = "vector", typeof = "character", length = 1) ; eval(ee)
......@@ -266,13 +263,13 @@ if(any(arg.check) == TRUE){ # normally no NA
# management of NULL arguments
tempo.arg <-c(
"freq",
"selected_freq",
"ori_coord",
"ter_coord",
"genome_size",
"xlab",
"file_name",
"kind",
"nb_max_insertion_sites",
"insertion_dist",
"cute",
"log"
......@@ -385,6 +382,8 @@ if(erase.graphs == TRUE){
freq <- read.table(freq, header = TRUE, stringsAsFactors = FALSE)
selected_freq <- read.table(selected_freq, header = TRUE, stringsAsFactors = FALSE)
################ end Data import
......@@ -419,7 +418,7 @@ if(ncol(freq) > 0){
categ = list("ZONE", NULL),
geom = list("geom_step", "geom_stick"),
geom.stick.base = 0,
color = list(c("blue", "brown"), fun_gg_palette(n = 2)[1]), # fun_gg_palette(n = 2) # fun_gg_palette(n = 2)[1]
color = list(c("blue", "green"), fun_gg_palette(n = 2)[1]), # fun_gg_palette(n = 2) # fun_gg_palette(n = 2)[1]
legend.width = NULL,
title = "RAW",
x.lim = c(0, genome_size),
......@@ -436,7 +435,33 @@ if(ncol(freq) > 0){
fun_gg_empty_graph(text = "EMPTY .freq FILE: NO PLOT DRAWN")
}
png(filename = paste0("plot_", file_name, "_insertion_", kind, "_selected.png"), width = 5000, height = 1800, units = "px", res = 300)
if(ncol(selected_freq) > 0){
tempo <- selected_freq
tempo$freq[tempo$orient == "Reverse"] <- tempo$freq[tempo$orient == "Reverse"] * -1
fun_gg_scatter(
data1 = list(structure.freq, tempo), # res # res[res$KIND == "obs", ]
x = list("x", "Position"),
y = list("y", "freq"),
categ = list("ZONE", NULL),
geom = list("geom_step", "geom_stick"),
geom.stick.base = 0,
color = list(c("blue", "green"), fun_gg_palette(n = 2)[1]), # fun_gg_palette(n = 2) # fun_gg_palette(n = 2)[1]
legend.width = NULL,
title = "SELECTED SITES",
x.lim = c(0, genome_size),
x.lab = xlab,
x.left.extra.margin = 0,
x.right.extra.margin = 0,
x.second.tick.nb = 4,
y.lab = "Insertion frequency\n<- reverse forward ->",
y.second.tick.nb = 4,
text.size = text.size,
title.text.size = title.text.size
)
}else{
fun_gg_empty_graph(text = "EMPTY selected.freq FILE: NO PLOT DRAWN")
}
############ end plotting
......
......@@ -188,10 +188,6 @@ req.function <- c(
"fun_check",
"fun_pack",
"fun_df_remod",
"fun_gg_scatter",
"fun_gg_palette",
"fun_open",
"fun_gg_empty_graph",
"fun_report"
)
tempo <- NULL
......@@ -383,30 +379,39 @@ fun_report(data = paste0("\nNUMBER OF OBS POSITIONS:\n", format(nrow(obs), big.m
res <- aggregate(x = obs$Position, by = list(Sequence = obs$Sequence, Position = obs$Position, names = obs$names, fork = obs$fork, orient = obs$orient), FUN = length)
names(res)[names(res) == "x"] <- "freq"
res.ini <- res
if( ! grepl(x = file_name, pattern = "^.*nodup.*$")){
if(nb_max_insertion_sites > nrow(res)){
res <- res[order(res$freq, decreasing = TRUE)[1:nrow(res)], ]
tempo.cat <- paste0("\nWARNING: nb_max_insertion_sites PARAMETER IS GREATER THAN THE NUMBER OF DIFFERENT SITES: \n", format(nrow(res), big.mark=","))
fun_report(data = tempo.cat, output = log, path = "./", overwrite = FALSE)
cat(tempo.cat)
}else{
res <- res[order(res$freq, decreasing = TRUE)[1:nb_max_insertion_sites], ]
}
}else{
tempo <- res[order(res$freq, decreasing = TRUE), ]
tempo.limit <- tempo$freq[nb_max_insertion_sites]
res <- tempo[tempo$freq >= tempo.limit, ] # did not take res[order(res$freq, decreasing = TRUE), ][1:nb_max_insertion_sites, ] in the case of freq equality
}
options(scipen = 1000)
write.table(res, file = paste0("./", file_name, "_annot_selected.freq"), row.names = FALSE, col.names = TRUE, append = FALSE, quote = FALSE, sep = "\t") # if duplicated -> selected data for highest usage # file for plotting selection insertion sites in duplicated reads
options(scipen = 0)
}
options(scipen = 1000) # to avoid writing of scientific numbers in tables, see https://stackoverflow.com/questions/3978266/number-format-writing-1e-5-instead-of-0-00001
write.table(res, file = paste0("./", file_name, "_annot.freq"), row.names = FALSE, col.names = TRUE, append = FALSE, quote = FALSE, sep = "\t")
write.table(res.ini, file = paste0("./", file_name, "_annot.freq"), row.names = FALSE, col.names = TRUE, append = FALSE, quote = FALSE, sep = "\t") # if duplicated -> non selected data for highest usage # file for plotting all the insertion sites in duplicated and non duplicated reads
options(scipen = 0)
print(unique(obs.ini))
print(res$Position)
print(file_name)
if( ! grepl(x = file_name, pattern = "^.*nodup.*$")){
obs <- obs[obs$pos %in% res$Position, ]
obs.ini <- obs.ini[obs.ini[ , 2] %in% res$Position, ] # obs.ini[ , 2] because no column names
}
options(scipen = 1000) # to avoid writing of scientific numbers in tables, see https://stackoverflow.com/questions/3978266/number-format-writing-1e-5-instead-of-0-00001
write.table(obs, file = paste0("./", file_name, "_annot.pos"), row.names = FALSE, col.names = TRUE, append = FALSE, quote = FALSE, sep = "\t")
write.table(obs, file = paste0("./", file_name, "_annot.pos"), row.names = FALSE, col.names = TRUE, append = FALSE, quote = FALSE, sep = "\t") # if duplicated -> non selected data for highest usage # annotated file, just for line number computation (i.e., nb of insertion events) and also of random events
options(scipen = 0)
options(scipen = 1000)
write.table(obs.ini, file = paste0("./", file_name, ".pos"), row.names = FALSE, col.names = FALSE, append = FALSE, quote = FALSE, sep = "\t")
write.table(obs.ini, file = paste0("./", file_name, ".pos"), row.names = FALSE, col.names = FALSE, append = FALSE, quote = FALSE, sep = "\t") # if duplicated -> selected data for highest usage # non annotated file, for sequence extractions
options(scipen = 0)
......
......@@ -178,10 +178,9 @@ if(length(cute) != 1){
req.function <- c(
"fun_check",
"fun_pack",
"fun_df_remod",
"fun_mat_op",
"fun_gg_scatter",
"fun_gg_palette",
"fun_open",
"fun_gg_empty_graph",
"fun_report"
)
......@@ -343,11 +342,10 @@ for(i0 in 1:length(freq)){
tempo <- tempo[, 1:(ncol(tempo) - abs(decal))]
# end equivalent to reverse - complement for contingency
}else{
print(tempo)
tempo <- tempo[order(row.names(tempo)), ]
tempo <- tempo[, (1 + abs(decal)):ncol(tempo)]
}
if(i0 == 1){
if(is.null(final)){
final <- tempo
}else{
final <- fun_mat_op(list(final, tempo))
......@@ -386,7 +384,7 @@ angle <- 90
tempo.just <- fun_gg_just(angle = angle, pos = "bottom")
if(ncol(final) > 0){
title <- "GLOBAL CONSENSUS (LEADING / LAGGING AND FORWARD / REVERSE OVERLAY USING POSITION 1"
title <- "GLOBAL CONSENSUS (LEADING / LAGGING AND FORWARD / REVERSE OVERLAY USING POSITION 1 AS ORIGIN"
gg1 = ggseqlogo::geom_logo(data = final, method = "bits", seq_type = "dna") # Derived from https://weblogo.berkeley.edu/logo.cgi because the website does not work
gg2 <- ggplot2::theme(
# axis.text.x = ggplot2::element_text(angle = tempo.just$angle, hjust = tempo.just$hjust, vjust = tempo.just$vjust, size = text.size), #not required anymore because I used annotate() finally
......
......@@ -176,11 +176,7 @@ if(length(cute) != 1){
req.function <- c(
"fun_check",
"fun_pack",
"fun_df_remod",
"fun_gg_scatter",
"fun_gg_palette",
"fun_open",
"fun_gg_empty_graph",
"fun_gg_just",
"fun_report"
)
tempo <- NULL
......
......@@ -195,11 +195,6 @@ if(length(cute) != 1){
req.function <- c(
"fun_check",
"fun_pack",
"fun_df_remod",
"fun_gg_scatter",
"fun_gg_palette",
"fun_open",
"fun_gg_empty_graph",
"fun_report"
)
tempo <- NULL
......
......@@ -213,7 +213,6 @@ if(length(cute) != 1){
req.function <- c(
"fun_check",
"fun_pack",
"fun_df_remod",
"fun_gg_scatter",
"fun_gg_palette",
"fun_gg_empty_graph",
......@@ -630,7 +629,7 @@ if(ncol(obs.rd.freq) > 0){
categ = list("ZONE", "KIND"),
geom = list("geom_step", "geom_stick"),
geom.stick.base = 0,
color = list(c("black", "brown"), fun_gg_palette(n = 2)), # fun_gg_palette(n = 2) # fun_gg_palette(n = 2)[1]
color = list(c("blue", "green"), fun_gg_palette(n = 2)), # fun_gg_palette(n = 2) # fun_gg_palette(n = 2)[1]
legend.width = NULL,
title = "RAW",
x.lim = c(0, genome_size),
......@@ -689,7 +688,7 @@ for(i0 in window_size){
geom = list("geom_step", "geom_line"),
categ = list("ZONE", "KIND"),
alpha = 1,
color = list(c("black", "brown"), c(fun_gg_palette(n = 8)[7], fun_gg_palette(n = 2)[2])),
color = list(c("blue", "green"), c(fun_gg_palette(n = 8)[7], fun_gg_palette(n = 2)[2])),
dot.size = 0.7,
line.size = 1,
x.lab = xlab,
......@@ -739,7 +738,7 @@ for(i0 in window_size){
geom = list("geom_step", "geom_line", "geom_line"),
categ = list("ZONE", "STRAND", "STRAND"),
alpha = 1,
color = list(c("black", "brown"), fun_gg_palette(n = 4)[1:2], fun_gg_palette(n = 4)[3:4]),
color = list(c("blue", "green"), fun_gg_palette(n = 4)[1:2], fun_gg_palette(n = 4)[3:4]),
dot.size = 0.7,
line.size = 1,
x.lab = xlab,
......@@ -786,7 +785,7 @@ if(ncol(obs.rd.freq) > 0){
categ = list("ZONE", "KIND"),
geom = list("geom_step", "geom_stick"),
geom.stick.base = 0,
color = list(c("black", "brown"), fun_gg_palette(n = 2)), # fun_gg_palette(n = 2) # fun_gg_palette(n = 2)[1]
color = list(c("blue", "green"), fun_gg_palette(n = 2)), # fun_gg_palette(n = 2) # fun_gg_palette(n = 2)[1]
legend.width = NULL,
title = "RAW",
x.lim = c(0, genome_size),
......
......@@ -79,8 +79,7 @@ if(interactive() == FALSE){ # if(grepl(x = commandArgs(trailingOnly = FALSE), pa
"genome_size",
"file_name",
"cute",
"log",
"report.rmd"
"log"
) # objects names exactly in the same order as in the bash code and recovered in args. Here only one, because only the path of the config file to indicate after the random_insertion.R script execution
if(length(args) != length(tempo.arg.names)){
stop(paste0("\n\n================\n\nERROR IN random_insertion.R\nTHE NUMBER OF ELEMENTS IN args (", length(args),") IS DIFFERENT FROM THE NUMBER OF ELEMENTS IN tempo.arg.names (", length(tempo.arg.names),")\nargs:", paste0(args, collapse = ","), "\ntempo.arg.names:", paste0(tempo.arg.names, collapse = ","), "\n\n================\n\n"), call. = FALSE)
......@@ -113,7 +112,6 @@ rm(tempo.cat)
# file_name <- "test.fastq2"
# cute <- "https://gitlab.pasteur.fr/gmillot/cute_little_R_functions/-/raw/v11.0.0/cute_little_R_functions.R"
# log <- "motif_report.txt"
# report.rmd <- "report.rmd"
......@@ -139,8 +137,7 @@ param.list <- c(
"genome_size",
"file_name",
"cute",
"log",
"report.rmd"
"log"
)
if(any(duplicated(param.list))){
stop(paste0("\n\n================\n\nINTERNAL CODE ERROR 1 IN random_insertion.R\nTHE param.list OBJECT CONTAINS DUPLICATED ELEMENTS:\n", paste(param.list[duplicated(param.list)], collapse = " "), "\n\n================\n\n"), call. = FALSE) # message for developers
......@@ -199,11 +196,6 @@ if(length(cute) != 1){
req.function <- c(
"fun_check",
"fun_pack",
"fun_df_remod",
"fun_gg_scatter",
"fun_gg_palette",
"fun_open",
"fun_gg_empty_graph",
"fun_report"
)
tempo <- NULL
......@@ -254,7 +246,6 @@ tempo <- fun_check(data = genome_size, class = "vector", typeof = "character", l
tempo <- fun_check(data = file_name, class = "vector", typeof = "character", length = 1) ; eval(ee)
tempo <- fun_check(data = cute, class = "vector", typeof = "character", length = 1) ; eval(ee)
tempo <- fun_check(data = log, class = "vector", typeof = "character", length = 1) ; eval(ee)
tempo <- fun_check(data = report.rmd, class = "vector", typeof = "character", length = 1) ; eval(ee)
if(any(arg.check) == TRUE){ # normally no NA
stop(paste0("\n\n================\n\n", paste(text.check[arg.check], collapse = "\n"), "\n\n================\n\n"), call. = FALSE) # == in stop() to be able to add several messages between == #
}
......@@ -272,8 +263,7 @@ tempo.arg <-c(
"genome_size",
"file_name",
"cute",
"log",
"report.rmd"
"log"
)
tempo.log <- sapply(lapply(tempo.arg, FUN = get, env = sys.nframe(), inherit = FALSE), FUN = is.null)
if(any(tempo.log) == TRUE){# normally no NA with is.null()
......@@ -393,17 +383,17 @@ set.seed(NULL)
rd <- rd[tempo1, ]
rd <- rd[order(rd$Position), ]
rd$Sequence <- "random"
fun_report(data = paste0("\n\n<br /><br />Head of the random file generated:\n\n", format(nrow(rd), big.mark=",")), output = report.rmd, path = "./", overwrite = FALSE)
fun_report(data = head(rd), output = report.rmd, path = "./", overwrite = FALSE)
fun_report(data = paste0("\n\n<br /><br />Number of random positions:\n\n", format(nrow(rd), big.mark=",")), output = report.rmd, path = "./", overwrite = FALSE)
fun_report(data = paste0("\n\n<br /><br />Head of the obs file:\n\n", format(nrow(rd), big.mark=",")), output = report.rmd, path = "./", overwrite = FALSE)
fun_report(data = head(obs), output = report.rmd, path = "./", overwrite = FALSE)
fun_report(data = paste0("\n\n<br /><br />Number of obs positions:\n\n", format(nrow(obs), big.mark=",")), output = report.rmd, path = "./", overwrite = FALSE)
fun_report(data = paste0("\n\nHead of the random file generated:\n\n"), output = log, path = "./", overwrite = FALSE)
fun_report(data = head(rd), output = log, path = "./", overwrite = FALSE)
fun_report(data = paste0("\n\nNumber of random positions:\n\n", format(nrow(rd), big.mark=",")), output = log, path = "./", overwrite = FALSE)
fun_report(data = paste0("\n\nHead of the obs file:\n\n"), output = log, path = "./", overwrite = FALSE)
fun_report(data = head(obs), output = log, path = "./", overwrite = FALSE)
fun_report(data = paste0("\n\nNumber of obs positions:\n\n", format(nrow(obs), big.mark=",")), output = log, path = "./", overwrite = FALSE)
final <- rbind(obs, rd)
fun_report(data = paste0("\n\n<br /><br />Head of the final random + obs file:\n\n", format(nrow(rd), big.mark=",")), output = report.rmd, path = "./", overwrite = FALSE)
fun_report(data = head(final), output = report.rmd, path = "./", overwrite = FALSE)
fun_report(data = paste0("\n\n<br /><br />Number of random + obs positions:\n\n", format(nrow(final), big.mark=",")), output = report.rmd, path = "./", overwrite = FALSE)
fun_report(data = paste0("\n\nHead of the final random + obs file:\n\n", format(nrow(rd), big.mark=",")), output = log, path = "./", overwrite = FALSE)
fun_report(data = head(final), output = log, path = "./", overwrite = FALSE)
fun_report(data = paste0("\n\nNumber of random + obs positions:\n\n", format(nrow(final), big.mark=",")), output = log, path = "./", overwrite = FALSE)
# saving position file
options(scipen = 1000) # to avoid writing of scientific numbers in tables, see https://stackoverflow.com/questions/3978266/number-format-writing-1e-5-instead-of-0-00001
......@@ -414,7 +404,7 @@ options(scipen = 0)
# freq file
freq <- aggregate(x = final$Position, by = list(Sequence = final$Sequence, Position = final$Position, names = final$names, fork = final$fork, orient = final$orient), FUN = length)
names(freq)[names(freq) == "x"] <- "freq"
fun_report(data = paste0("\n\n<br /><br />Number of random + obs unique positions:\n\n", format(nrow(freq), big.mark=",")), output = report.rmd, path = "./", overwrite = FALSE)
fun_report(data = paste0("\n\nNumber of random + obs unique positions:\n\n", format(nrow(freq), big.mark=",")), output = log, path = "./", overwrite = FALSE)
options(scipen = 1000) # to avoid writing of scientific numbers in tables, see https://stackoverflow.com/questions/3978266/number-format-writing-1e-5-instead-of-0-00001
write.table(freq, file = "./obs_rd_insertions.freq", row.names = FALSE, col.names = TRUE, append = FALSE, quote = FALSE, sep = "\t")
options(scipen = 0)
......@@ -451,9 +441,9 @@ orient.motif.theo.p <- c(p.fw = p.fw.lag.motif + p.fw.lead.motif, p.rv = p.rv.la
# nb of insertion must be the same
fun_report(data = paste0("\n\n<br /><br />nb of insertion must be the same between obs and random:\n\n"), output = report.rmd, path = "./", overwrite = FALSE)
fun_report(data = paste0("\n\n<br /><br />obs:\n\n", format(nrow(final[final$Sequence == "obs", ]), big.mark=",")), output = report.rmd, path = "./", overwrite = FALSE)
fun_report(data = paste0("\n\n<br /><br />random:\n\n", format(nrow(final[final$Sequence == "random", ]), big.mark=",")), output = report.rmd, path = "./", overwrite = FALSE)
fun_report(data = paste0("\n\nnb of insertion must be the same between obs and random:\n\n"), output = log, path = "./", overwrite = FALSE)
fun_report(data = paste0("\n\nobs:\n\n", format(nrow(final[final$Sequence == "obs", ]), big.mark=",")), output = log, path = "./", overwrite = FALSE)
fun_report(data = paste0("\n\nrandom:\n\n", format(nrow(final[final$Sequence == "random", ]), big.mark=",")), output = log, path = "./", overwrite = FALSE)
......
......@@ -182,11 +182,6 @@ if(length(cute) != 1){
req.function <- c(
"fun_check",
"fun_pack",
"fun_df_remod",
"fun_gg_scatter",
"fun_gg_palette",
"fun_open",
"fun_gg_empty_graph",
"fun_report"
)
tempo <- NULL
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment