Commit a632a752 authored by Gael  MILLOT's avatar Gael MILLOT
Browse files

tempo

parent eefe412a
#!/usr/bin/env bash
#########################################################################
## ##
......
#!/usr/bin/env bash
#########################################################################
## ##
......
#!/usr/bin/env bash
#########################################################################
## ##
......
#!/usr/bin/env bash
#########################################################################
## ##
......
#!/usr/bin/env bash
#########################################################################
## ##
......
#!/usr/bin/env Rscript
#########################################################################
## ##
......
#!/usr/bin/env Rscript
#########################################################################
## ##
......
#!/usr/bin/env Rscript
#########################################################################
## ##
## plot_insertion.R ##
## plot_insertion.R ##
## ##
## Gael A. Millot ##
## Bioinformatics and Biostatistics Hub ##
......@@ -401,10 +401,9 @@ structure <- rbind(ori, dif, stringsAsFactors = TRUE)
############ plotting
# insertion freq / genome raw distribution on both sides
# fun_open(width = 12, height = 4, pdf.name = "plot_fivep_filtering_stat") # must be systematically opened for main.nf
png(filename = paste0("plot_", file_name, "_insertion_raw.png"), width = 5000, height = 1800, units = "px", res = 300)
if(ncol(obs) > 0){
tempo <- res
tempo$freq[tempo$orient == "REVERSE"] <- tempo$freq[tempo$orient == "REVERSE"] * -1
......@@ -432,6 +431,144 @@ if(ncol(obs) > 0){
fun_gg_empty_graph(text = "EMPTY .freq FILE: NO PLOT DRAWN")
}
# histograms: number of sites (y-axis) showing a given number of insertions (x-axis)
# Each png() call below opens a new graphic device that is not closed in this section
# NOTE(review): presumably the devices are closed further down in the script — confirm

# total insertions (all the rows of res)
png(filename = paste0("plot_", file_name, "_insertion_hist_tot.png"), width = 5000, height = 1800, units = "px", res = 300)
# contingency table of res$freq: the names are the distinct freq values, the counts are the number of rows of res with that value
tempo1 <- table(res$freq)
tempo2 <- names(tempo1)
names(tempo1) <- NULL
# two-column data frame used by this plot and by the zoomed plot below
tempo3 <- data.frame(insertion_number = as.integer(tempo2), site_number = as.vector(tempo1))
if(ncol(obs) > 0){
    fun_gg_scatter(
        data1 = tempo3,
        x = "insertion_number",
        y = "site_number",
        categ = NULL,
        geom = "geom_stick",
        geom.stick.base = 0,
        line.size = 3,
        color = fun_gg_palette(n = 7, kind = "dark")[6],
        legend.width = 0,
        title = "TOTAL", # full-range plot (no y.lim): was wrongly titled "TOTAL ZOOMED"
        x.lab = "Insertion number",
        x.left.extra.margin = 0.05,
        y.top.extra.margin = 0.05,
        y.bottom.extra.margin = 0,
        x.right.extra.margin = 0.05,
        x.second.tick.nb = NULL,
        y.lab = "Site number",
        y.log = "no",
        y.second.tick.nb = 5,
        text.size = 24,
        title.text.size = 16
    )
}else{
    # placeholder graph written in the png file when obs has no column
    fun_gg_empty_graph(text = "EMPTY .freq FILE: NO PLOT DRAWN")
}

# same data (tempo3) with the y-axis restricted to [0, 10]
png(filename = paste0("plot_", file_name, "_insertion_hist_tot_zoom.png"), width = 5000, height = 1800, units = "px", res = 300)
if(ncol(obs) > 0){
    fun_gg_scatter(
        data1 = tempo3,
        x = "insertion_number",
        y = "site_number",
        categ = NULL,
        geom = "geom_stick",
        geom.stick.base = 0,
        line.size = 3,
        color = fun_gg_palette(n = 7, kind = "dark")[6],
        legend.width = 0,
        title = "TOTAL ZOOMED", # zoomed plot (y.lim set below): was wrongly titled "TOTAL"
        x.lab = "Insertion number",
        x.left.extra.margin = 0.05,
        x.right.extra.margin = 0.05,
        x.second.tick.nb = NULL,
        y.top.extra.margin = 0.05,
        y.bottom.extra.margin = 0,
        y.lim = c(0, 10), # the zoom: only site numbers between 0 and 10 are displayed
        y.lab = "Site number",
        y.log = "no",
        y.second.tick.nb = 5,
        text.size = 24,
        title.text.size = 16
    )
}else{
    # placeholder graph written in the png file when obs has no column
    fun_gg_empty_graph(text = "EMPTY .freq FILE: NO PLOT DRAWN")
}
# Histogram of site usage restricted to the rows of res with orient == "FORWARD":
# x-axis = distinct values of res$freq, y-axis = number of rows showing that value
png(
    filename = paste0("plot_", file_name, "_insertion_hist_forward.png"),
    width = 5000,
    height = 1800,
    units = "px",
    res = 300
)
# contingency table of the freq values, then split into names (values) and counts
tempo1 <- table(res$freq[res$orient == "FORWARD"])
tempo2 <- names(tempo1)
names(tempo1) <- NULL
tempo3 <- data.frame(
    insertion_number = as.integer(tempo2),
    site_number = as.vector(tempo1)
)
if (ncol(obs) > 0) {
    fun_gg_scatter(
        # data and mapping
        data1 = tempo3,
        x = "insertion_number",
        y = "site_number",
        categ = NULL,
        # stick drawing
        geom = "geom_stick",
        geom.stick.base = 0,
        line.size = 3,
        color = fun_gg_palette(n = 7, kind = "dark")[6],
        legend.width = 0,
        # x-axis
        x.lab = "Insertion number",
        x.left.extra.margin = 0.05,
        x.right.extra.margin = 0.05,
        x.second.tick.nb = NULL,
        # y-axis
        y.lab = "Site number",
        y.log = "no",
        y.top.extra.margin = 0.05,
        y.bottom.extra.margin = 0,
        y.second.tick.nb = 5,
        # texts
        title = "FORWARD",
        title.text.size = 16,
        text.size = 24
    )
} else {
    # placeholder graph written in the png file when obs has no column
    fun_gg_empty_graph(text = "EMPTY .freq FILE: NO PLOT DRAWN")
}
# Histogram of site usage restricted to the rows of res with orient == "REVERSE":
# x-axis = distinct values of res$freq, y-axis = number of rows showing that value
png(
    filename = paste0("plot_", file_name, "_insertion_hist_reverse.png"),
    width = 5000,
    height = 1800,
    units = "px",
    res = 300
)
# contingency table of the freq values, then split into names (values) and counts
tempo1 <- table(res$freq[res$orient == "REVERSE"])
tempo2 <- names(tempo1)
names(tempo1) <- NULL
tempo3 <- data.frame(
    insertion_number = as.integer(tempo2),
    site_number = as.vector(tempo1)
)
if (ncol(obs) > 0) {
    fun_gg_scatter(
        # data and mapping
        data1 = tempo3,
        x = "insertion_number",
        y = "site_number",
        categ = NULL,
        # stick drawing
        geom = "geom_stick",
        geom.stick.base = 0,
        line.size = 3,
        color = fun_gg_palette(n = 7, kind = "dark")[6],
        legend.width = 0,
        # x-axis
        x.lab = "Insertion number",
        x.left.extra.margin = 0.05,
        x.right.extra.margin = 0.05,
        x.second.tick.nb = NULL,
        # y-axis
        y.lab = "Site number",
        y.log = "no",
        y.top.extra.margin = 0.05,
        y.bottom.extra.margin = 0,
        y.second.tick.nb = 5,
        # texts
        title = "REVERSE",
        title.text.size = 16,
        text.size = 24
    )
} else {
    # placeholder graph written in the png file when obs has no column
    fun_gg_empty_graph(text = "EMPTY .freq FILE: NO PLOT DRAWN")
}
# insertion freq / genome binning (sliding windows)
############ end plotting
......
#!/usr/bin/env Rscript
#########################################################################
## ##
......
#!/usr/bin/env Rscript
#########################################################################
## ##
......
#!/usr/bin/env bash
#########################################################################
## ##
......
This diff is collapsed.
C A A T T C A T T C A A G C C G A C G C C G C T T C G C G G C G C G G C T T A A T T C A A G C G N N N
A 0.00 1.00 1.00 0.00 0.00 0.00 1.00 0.00 0.00 0.00 1.00 1.00 0.00 0.00 0.00 0.00 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 1.00 1.00 0.00 0.00 0.00 1.00 1.00 0.00 0.00 0.00 0.26 0.00 0.63
C 1.00 0.00 0.00 0.00 0.00 1.00 0.00 0.00 0.00 1.00 0.00 0.00 0.00 1.00 1.00 0.00 0.00 1.00 0.00 1.00 1.00 0.00 1.00 0.00 0.00 1.00 0.00 1.00 0.00 0.00 1.00 0.00 1.00 0.00 0.00 1.00 0.00 0.00 0.00 0.00 0.00 0.00 1.00 0.00 0.00 0.00 1.00 0.00 0.02 0.00 0.17
G 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 1.00 0.00 0.00 1.00 0.00 0.00 1.00 0.00 0.00 1.00 0.00 0.00 0.00 0.00 1.00 0.00 1.00 1.00 0.00 1.00 0.00 1.00 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 1.00 0.00 1.00 0.03 0.00 0.15
T 0.00 0.00 0.00 1.00 1.00 0.00 0.00 1.00 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 1.00 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 1.00 1.00 0.00 0.00 1.00 1.00 0.00 0.00 0.00 0.00 0.00 0.00 0.69 1.00 0.04
N 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
/*
#########################################################################
## ##
## nextflow.config ##
## ##
## Gael A. Millot ##
## Bioinformatics and Biostatistics Hub ##
## Computational Biology Department ##
## Institut Pasteur Paris ##
## ##
#########################################################################
*/
//////// variables that will be used only in the main.nf
// variables exported to the main.nf environment. See https://www.nextflow.io/docs/latest/config.html#scope-env
env {
// every variable assigned in this scope is exported to the main.nf environment
//// path and files
git_path="https://gitlab.pasteur.fr/gmillot/14985_loot/"
in_path="/mnt/c/Users/Gael/Documents/Git_projects/14985_loot/dataset" // where the initial fastq file is
//in_path="/mnt/share/14985_loot/dataset/B2699/00_Rawdata"
//in_path="/pasteur/zeus/projets/p01/BioIT/gmillot/14985_loot/dataset/B4985/3" // where initial fastq file is
//in_path="/pasteur/zeus/projets/p01/BioIT/gmillot/14985_loot/dataset/"
//in_path="/pasteur/zeus/projets/p01/BioIT/gmillot/14985_loot/dataset/B2699/00_Rawdata" // where initial fastq file is
fastq_file="test.fastq2.gz" // fastq file name
//fastq_file="4-4_S1_L001_R1_001.fastq.gz"
//fastq_file="3-4_S1_L001_R1_001.fastq.gz"
primer_fasta="/mnt/c/Users/Gael/Documents/Git_projects/14985_loot/dataset/20200520_adapters_TruSeq_B2699_14985_CL.fasta" // list of primers used for the library and used by AlienTrimmer to trim the raw reads
//primer_fasta="/pasteur/zeus/projets/p01/BioIT/gmillot/14985_loot/results/20200520_res_CL14985_newtrim_align/20200520_adapters_TruSeq_B2699_14985_CL.fasta" // list of primers used for the library and used by Alien trimmer to trim the raw reads
//primer_fasta="/mnt/share/14985_loot/results/20200520_res_CL14985_newtrim_align/20200520_adapters_TruSeq_B2699_14985_CL.fasta"
//// end path and files
//// alientrimmer
alientrimmer_l_param=30 // L parameter of alienTrimmer
//// end alientrimmer
//// fivep_filtering
attc_seq="CAATTCATTCAAGCCGACGCCGCTTCGCGGCGCGGCTTAATTCAAGCG" // sequence of attc, in red and purple in section 4 20200505 of the CL labbook (48 bases on the left of the cutting site). Required for plotting. Warning: never change this sequence
fivep_seq_filtering='^CAATTCATTCAAGCCGACGCCGCTTCGCGGCGCGGCTTAATTCAAGCG.+$' // regex indicating the 5' sequence of reads to select, then to trim from the selected reads. See the section 8.6 to 8.13 of the labbook 20200520, but instead of analysing and trimming in two steps (29 Nuc of AttC part of the primer then 19 Nuc between primer and Attc cutting site), perform all in a single step, and play with the regex, like Test also
// alternative regexes (the [XN] classes also accept N at the position):
// ^CAATTCATTCAAGCCGACGCCGCTTCGCG[GN][CN][GN][CN][GN][GN][CN][TN][TN][AN][AN][TN][TN][CN][AN][AN][GN][CN][GN].+$
// [CN][AN][AN][TN][TN][CN][AN][TN][TN][CN][AN][AN][GN][CN][CN][GN][AN][CN][GN][CN][CN][GN][CN][TN][TN][CN][GN][CN][GN][GN][CN][GN][CN][GN][GN][CN][TN][TN][AN][AN][TN][TN][CN][AN][AN][GN][CN][GN].+$
// ^CAATTCATTCAAGCCGACGCCGCTTCGCGGCGCGGCTTAATTCAAGCG.+$
// ^[CN][AN][AN][TN][TN][CN][AN][TN][TN][CN][AN][AN][GN][CN][CN][GN][AN][CN][GN][CN][CN][GN][CN][TN][TN][CN][GN][CN][GN]GCGCGGCTTAATTCAAGCG.+$
fivep_seq_nb=48 // must be the exact number of nuc positions indicated in fivep_seq_filtering
//// end fivep_filtering
cutoff_nb=25 // reads of length cutoff_nb after trimming are removed. NOTE(review): unclear whether this means reads shorter than cutoff_nb or of exactly this length — confirm against main.nf
ref_path="/mnt/c/Users/Gael/Documents/Git_projects/14985_loot/dataset/coli_K12_MG1655_NC_000913.3_ORI_CENTERED/" // path of the reference genome
//ref_path="/pasteur/zeus/projets/p01/BioIT/gmillot/reference_genomes/coli_K12_MG1655_NC_000913.3_ORI_CENTERED/" // path of the reference genome
ref_file="Ecoli-K12-MG1655_ORI_CENTERED.fasta" // fasta file of the reference genome
ori_coord="2320711 2320942" // [2320711, 2320942] // Ecoli centered coordinates
ter_coord="4627368 4627400" //[4627368, 4627400] // Ecoli centered coordinates
color_coverage="5" // three integers for the color of the three coverage plots[1, 2, 5]. NOTE(review): a single value is set here while this comment mentions three integers — confirm
xlab="Ecoli Genome (bp)" // name of the reference genome for graphics
genome_size="4641652" // in bp
cute_path="https://gitlab.pasteur.fr/gmillot/cute_little_R_functions/-/raw/v10.9.0/cute_little_R_functions.R" // single character string indicating the file (and absolute pathway) of the required cute_little_R_functions toolbox. With ethernet connection available, this can also be used: "https://gitlab.pasteur.fr/gmillot/cute_little_R_functions/raw/v5.1.0/cute_little_R_functions.R" or local "C:\\Users\\Gael\\Documents\\Git_projects\\cute_little_R_functions\\cute_little_R_functions.R"
}
//////// end variables that will be used only in the main.nf
//////// variables that will be used below (and potentially in the main.nf file)
//// must be also exported
system_exec = 'local' // the system that runs the workflow. Either 'local' or 'slurm'
out_path="/mnt/c/Users/Gael/Desktop" // folder in which the result folder (named with result_folder_name and a time stamp, see the Pre processing section) is created. Example: out_path = '.' for where the main.nf run is executed or out_path = '/mnt/c/Users/Gael/Desktop'
//out_path="/pasteur/zeus/projets/p01/BioIT/gmillot/14985_loot/results" // same as above, for a run on the cluster file system
//// end must be also exported
//// general variables
result_folder_name="20220120_res_CL14985_test" // prefix of the name of the result folder created in out_path
//// end general variables
//// slurm variables
fastqueue = 'common,dedicated' // fast for -p option of slurm. Example: fastqueue = 'common,dedicated'. Example: fastqueue = 'hubbioit'
fastqos= '--qos=fast' // fast for --qos option of slurm. Example: fastqos= '--qos=fast'
normalqueue = 'common,dedicated' // normal for -p option of slurm. Example: normalqueue = 'bioevo'
normalqos = '--qos=hubbioit' // normal for --qos option of slurm. Example: normalqos = '--qos=dedicated'
longqueue = 'common,dedicated' // slow for -p option of slurm. Example: longqueue = 'bioevo'
longqos = '--qos=hubbioit' // slow for --qos option of slurm. Example: longqos = '--qos=dedicated'
add_options = ' ' // additional option of slurm. Example: add_options = '--exclude=maestro-1101,maestro-1034' or add_options = ' '
//// end slurm variables
//////// end variables that will be used below
//////// Pre processing
// time stamp in seconds: Date.getTime() returns milliseconds, hence the division by 1000
int secs = (new Date().getTime())/1000
// out_path is redefined so that it ends with the result folder name followed by the time stamp of the run
out_path="${out_path}/${result_folder_name}_${secs}"
//////// end Pre processing
//////// variables used here and also in the main.nf file
env {
// export to the main.nf environment the two variables that are also used by the scopes of this config file
system_exec = "${system_exec}"
out_path = "${out_path}" // at this point, out_path already includes result_folder_name and the time stamp
}
//////// end variables used here and also in the main.nf file
//////// Scopes
// kind of execution. Either 'local' or 'slurm' (value of system_exec)
// those are closures. See https://www.nextflow.io/docs/latest/script.html#closures
executor {
name = "${system_exec}"
queueSize = 2000
}
// create a report folder and print a html report file. If no absolute path, will be where the run is executed
// see https://www.nextflow.io/docs/latest/config.html#config-report
report {
enabled = true
file = "${out_path}/reports/nf_report.html" // warning: here double quotes to get the nextflow variable interpretation
}
// txt file with all the processes and info
trace {
enabled = true
file = "${out_path}/reports/nf_trace.txt"
}
// html file with all the processes
timeline {
enabled = true
file = "${out_path}/reports/nf_timeline.html"
}
// picture of the workflow (here a .png file). Only one file allowed
dag {
enabled = true
file = "${out_path}/reports/nf_dag.png" // Warning: require graphviz installed in the system, see protocol 136
}
// define singularity parameters
singularity {
enabled = true
autoMounts = true // automatically mounts host paths in the executed container
//runOptions = '--home $HOME:/home/$USER --bind /pasteur' // provide any extra command line options supported by the singularity exec. Here, binds the whole /pasteur of the host into /pasteur of the container. Otherwise, no access
cacheDir = 'singularity' // name of the directory where remote Singularity images are stored. When rerun, the exec directly uses these without redownloading them. When using a computing cluster it must be a shared folder accessible to all computing nodes
}
//////// end Scopes
//////// directives
// provide the default directives for all the processes in the main.nf pipeline calling this config file
process {
// directives for all the processes
// executor='local' // no need because already defined above in the executor scope
if(system_exec == 'slurm'){
// slurm: fast queue and qos by default, and one retry in case of error
queue = "$fastqueue"
clusterOptions = "$fastqos $add_options"
scratch=false
maxRetries=1
errorStrategy='retry'
}else{
// otherwise: no retry, the run is terminated in case of error
maxRetries=0
errorStrategy='terminate'
}
// all the processes of the main.nf file with the label 'bash' will use these directives by default
withLabel: bash {
container='gmillot/bash-extended_v4.0:gitlab_v8.0'
cpus=1 // only used when name = "local" in the executor part above
memory='3G' // only used when name = "local" in the executor part above
}
// same for the label 'alien_trimmer'
withLabel: alien_trimmer {
container='gmillot/alien_trimmer_v0.4.0:gitlab_v8.1' // no more recent version at 20210930
cpus=1 // only used when name = "local" in the executor part above
memory='3G' // only used when name = "local" in the executor part above
}
// same for the label 'fastqc' (no memory directive set)
withLabel: fastqc {
container='evolbioinfo/fastqc:v0.11.8'
cpus=1 // only used when name = "local" in the executor part above
}
// same for the label 'r_ext'
withLabel: r_ext {
container='gmillot/r_v4.0.5_extended_v2.0:gitlab_v6.4'
cpus=1 // only used when name = "local" in the executor part above
memory='64G' // only used when name = "local" in the executor part above
}
// same for the label 'bowtie2'
withLabel: bowtie2 {
container='gmillot/bowtie2_v2.3.4.3_extended_v2.0:gitlab_v8.0'
cpus=12 // only used when name = "local" in the executor part above
memory='64G' // only used when name = "local" in the executor part above
}
// same for the label 'samtools'
withLabel: samtools {
container='gmillot/samtools_v1.14:gitlab_v8.0'
cpus=1
memory='1G'
}
// same for the label 'bedtools'
withLabel: bedtools {
container='gmillot/bedtools_v2.30.0:gitlab_v8.0'
cpus=12
memory='64G'
}
// all the processes of the main.nf file with the label 'gatk' will use these directives by default
withLabel: gatk {
//scratch=true
container='broadinstitute/gatk:4.1.9.0'
memory='60G'
if(system_exec == 'slurm'){
// from the second attempt (task.attempt > 1), the normal queue and qos replace the fast ones
queue = {task.attempt>1 ? "$normalqueue" : "$fastqueue" }
clusterOptions = {task.attempt > 1 ? "$normalqos $add_options" : "$fastqos $add_options" }
}
}
// same for the label 'bwa'
withLabel: bwa {
container="evolbioinfo/bwa:v0.7.17"
cpus=20
memory='30G'
}
// same for the label 'bcftools'
withLabel: bcftools {
container="evolbioinfo/bcftools:f27f849"
cpus=1
memory='10G'
}
// same for the label 'multiqc'. Here errors are ignored, whatever the general errorStrategy set above
withLabel: multiqc {
container='ewels/multiqc:1.10.1'
errorStrategy='ignore'
cpus=1
}
}
//////// end directives
\ No newline at end of file
......@@ -705,7 +705,13 @@ process plot_insertion { // sections 24.7.2, 44.1 and 45.1 of the labbook 202005
echo -e "\\n\\nSee the CL Labbook section 24.7.3 to explain the limitation around 100 bp\\n" >> report.rmd
echo -e '
<br /><br /><center>
![Figure 3: Frequency of insertion according to chromosome position.](./figures/plot_${file_name}_insertion_raw.png){width=600}
![Figure 3: Insertion site usage (total insertions).](./figures/plot_${file_name}_insertion_hist_tot.png){width=600}
</center><br /><br />
![Figure 3: Insertion site usage zoomed for sites with few insertions (total insertions).](./figures/plot_${file_name}_insertion_hist_tot_zoom.png){width=600}
</center><br /><br />
![Figure 3: Insertion site usage (forward strand).](./figures/plot_${file_name}_insertion_hist_forward.png){width=600}
</center><br /><br />
![Figure 3: Insertion site usage (reverse strand).](./figures/plot_${file_name}_insertion_hist_reverse.png){width=600}
</center><br /><br />
' >> report.rmd
"""
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment