write.table(res,file=paste0("./",file_name,"_annot_selected.freq"),row.names=FALSE,col.names=TRUE,append=FALSE,quote=FALSE,sep="\t")# if duplicated -> selected data for highest usage # file for plotting selection insertion sites in duplicated reads
options(scipen=1000)# to avoid writing of scientific numbers in tables, see https://stackoverflow.com/questions/3978266/number-format-writing-1e-5-instead-of-0-00001
echo-e"<br /><br />\n\n### Selection of reads with the attC in 5'\n\n">${log}
echo-e"<br /><br />\n\n### Selection of reads with the attC in 5' and trimming of this sequence\n\n">${log}
# fastq filtering
awk-vvar1=${fivep_seq_filtering}'
{lineKind=(NR-1)%4;}
...
...
@@ -105,7 +105,8 @@ gawk -v var1=$header 'BEGIN{
# printing the result
echo-e"<br /><br />\n\n### Preparation of the graph of base frequencies\n\n">>${log}
echo-e"\n\nLength of the expected attC sequence at the 5' part of reads: ${fivep_seq_nb}\n\n">>${log}
echo-e"Number of bases added after the expected attC sequence at the 5' part of reads, for graphical purpose: ${added_nb}\n\n">>${log}
echo-e"\n\nNumber of bases added after the expected attC sequence at the 5' part of reads, for graphical purpose: ${added_nb}\n\n">>${log}
echo-e"\n\nFrequencies of the graph are in the [${output_file}_5pAttc_1-${sum_of_2_nb}.stat](./files/${output_file}_5pAttc_1-${sum_of_2_nb}.stat) file\n\n">>${log}
)# objects names exactly in the same order as in the bash code and recovered in args. Here only one, because only the path of the config file to indicate after the plot_insertion.R script execution
...
...
@@ -153,6 +154,7 @@ param.list <- c(
"window_size",
"step",
"file_name",
"task.cpus",
"cute",
"log"
)
...
...
@@ -237,6 +239,7 @@ if( ! is.null(tempo)){
# R Packages required
req.package.list<-c(
"lubridate",
"parallel",
"ggplot2",
"lemon",
"pdftools"
...
...
@@ -272,6 +275,7 @@ tempo <- fun_check(data = prop_ess_coding_genome, class = "vector", typeof = "ch
@@ -639,24 +631,25 @@ process insertion { // section 24.7 of the labbook 20200707
output:
file "${bam.baseName}.pos" into orient_ch1 // warning: 2 files (no duplication and duplications)
file "insertion_report.txt" into insertion_report_ch // warning: several files
file "report.rmd" optional true into log_ch15 // single file
file "report.rmd" into tempo_log_ch // optional true into log_ch15 // single file
// optional true: report.rmd sent into a channel only if the process has generated a report.rmd. Of note, if("${bam}" == "${file_name}_q20_nodup.bam"){file "report.rmd" into log_ch15} does not work
script:
"""
if [[ ${bam} == "${file_name}_q20_nodup.bam" ]] ; then
echo -e "\\n\\nOne of the step is to correct insertion site read extremity for the reverse reads. It consist in the redefinition of POS according to FLAG in the bam file. See the [insertion_report.txt](./reports/insertion_report.txt) file in the reports folders for details\\n\\n" >> report.rmd
echo -e "\\n\\nOne of the step is to recover positions of reverse reads (16), that use the 3\' end of the read as insertion site and not the 5\' part as with forward reads (0).\\nIt consist in the redefinition of POS according to FLAG in the bam file. See the [insertion_report.txt](./reports/insertion_report.txt) file in the reports folders for details\\n\\n" >> report.rmd
fi
# extraction of bam column 2, 4 and 10, i.e., FLAG, POS and SEQ
# Of note, samtools fasta \$DIR/\$SAMPLE_NAME > \${OUTPUT}.fasta # convert bam into fasta
echo -e "\\n\\n######## ${bam} file\n\nExtraction of the FLAG (containing the read orientation) the POS and the SEQ of the bams\\nHeader is the 1) sens of insersion (0 or 16) and 2) insertion site position\\n\\n" >> insertion_report.txt
cat tempo | head -60 | tail -20 >> insertion_report.txt
echo -e "\\n\\nExtraction of the FLAG (containing the read orientation) the POS and the SEQ of the bams\\nHeader is the 1) sens of insersion (0 or 16) and 2) insertion site position\\n\\n" >> insertion_report.txt
echo -e "\\n\\nFinal fasta file\\n\\n" >> insertion_report.txt
echo -e "\\n\\nFinal fasta file\\n\\nPositions of reverse reads (16) use the 3\\' end of the read as insertion site and not the 5\\' part as with forward reads (0)\\n\\n" >> insertion_report.txt
cat ${file_name}_reorient.fasta | head -60 | tail -20 >> insertion_report.txt
@@ -670,17 +663,20 @@ process insertion { // section 24.7 of the labbook 20200707
echo -e "Ratio: " >> report.rmd
echo -e \$(printf "%.2f\n" \$(echo \$" \$read_nb_after / \$read_nb_before " | bc -l)) >> report.rmd # the number in '%.2f\\n' is the number of decimals
insertion_report_ch.collectFile(name: "insertion_report.txt").subscribe{it -> it.copyTo("${out_path}/reports")} // concatenate all the cov_report.txt files in channel cov_report_ch into a single file published into ${out_path}/reports
process final_insertion_files { // 44.1 of the labbook 20201210. Also select the nb_max_insertion_sites most frequent sites in the duplicated file
label 'r_ext' // see the withLabel: bash in the nextflow config file
@@ -693,7 +689,7 @@ process final_insertion_files { // 44.1 of the labbook 20201210. Also select the
file "${pos.baseName}.pos" into pos_ch1, pos_ch2 // warning: 2 files, selected sites if duplicated
file "${pos.baseName}_annot.pos" into pos_annot_ch // warning: 2 files, non selected sites if duplicated
file "${pos.baseName}_annot.freq" into freq_ch // warning: 2 files, non selected sites if duplicated
file "${pos.baseName}_annot_selected.freq" optional true into freq_dup_selected_ch1, freq_dup_selected_ch2 // warning: 1 files, optional true because only made with dup file
file "${pos.baseName}_annot_selected.freq" into freq_dup_selected_ch // optional true into freq_dup_selected_ch1, freq_dup_selected_ch2 // warning: 1 files, optional true because only made with dup file
file "final_insertion_files_report.txt" into final_insertion_files_report_ch // warning: 2 files
script:
...
...
@@ -719,9 +715,14 @@ freq_ch.branch{
nodup: it.getName() =~ /nodup/
dup: true
}.set{tempo_ch2}
// tempo_ch2.nodup.set{freq_nodup_ch1} // warning: 1 file now
tempo_ch2.dup.set{freq_dup_ch1} // warning: 1 file now
freq_dup_selected_ch.branch{
nodup: it.getName() =~ /nodup/
dup: true
}.set{tempo_ch2}
tempo_ch2.dup.into{freq_dup_selected_ch1 ; freq_dup_selected_ch2} // warning: 1 file now
@@ -1030,7 +1031,7 @@ process random_insertion { // sections 44 of the labbook 20201210
echo -e "\\n\\n<br /><br />\\n\\n### Random insertion sites\\n\\n" > report.rmd
echo -e "\\n\\n#### Insertion site counts\\n\\n" >> report.rmd
echo -e "\\n\\nSee the [random_insertion_report.txt](./files/random_insertion_report.txt) file for details\\n\\n" >> report.rmd
echo -e "\\n\\nSee the [random_insertion_report.txt](./reports/random_insertion_report.txt) file for details, notably the number of random sites (which should be the same as the number of observed sites)\\n\\n" >> report.rmd
echo -e '
\\n\\n<br /><br />\\n\\n</center>\\n\\n
{width=400}
...
...
@@ -1058,7 +1059,7 @@ process random_insertion { // sections 44 of the labbook 20201210
process plot_insertion { // sections 24.7.2 and 45 of the labbook 20200520, for TSS, section 47 20201211, for CDS section 48 20201211
label 'r_ext' // see the withLabel: bash in the nextflow config file
label 'r_ext_12cpu' // see the withLabel: bash in the nextflow config file
{width=600}
 to extand)](./figures/alignment.html){width=600}
\\n\\n</center>\\n\\n
' >> report.rmd
echo -e "\\n\\n<br /><br />\\n\\nWarning: the frequency of each position is taken into account in the logo plot\\n\\n" >> report.rmd