diff --git a/README.md b/README.md index e8e4a4c8c938b97127962881b6bf074aaee97227..2035c8f12db770ae3e2f90ba3f7cd16753abff97 100755 --- a/README.md +++ b/README.md @@ -192,6 +192,11 @@ Gitlab developers ## WHAT'S NEW IN +### v6.0.0 + +1) Ok up to q20 tested using the test file + + ### v5.1.0 1) html report improved a bit diff --git a/dev/test.config b/dev/test.config index ac72518b90213e4a9dab187020e9edca4930adf3..9749ea37252a8c8a14fdaa3acd8f180cb87baf09 100644 --- a/dev/test.config +++ b/dev/test.config @@ -44,6 +44,7 @@ env { fivep_seq_nb=48 // must be the exact number of nuc positions indicated in fivep_seq_filtering //// end fivep_filtering cutoff_nb=25 // reads of length cutoff_nb after trimming are removed + ref_genome="/mnt/c/Users/Gael/Documents/Git_projects/14985_loot/dataset/coli_K12_MG1655_NC_000913.3_ORI_CENTERED/Ecoli-K12-MG1655_ORI_CENTERED" cute_path="https://gitlab.pasteur.fr/gmillot/cute_little_R_functions/-/raw/v10.9.0/cute_little_R_functions.R" // single character string indicating the file (and absolute pathway) of the required cute_little_R_functions toolbox. 
With ethernet connection available, this can also be used: "https://gitlab.pasteur.fr/gmillot/cute_little_R_functions/raw/v5.1.0/cute_little_R_functions.R" or local "C:\\Users\\Gael\\Documents\\Git_projects\\cute_little_R_functions\\cute_little_R_functions.R" } diff --git a/main.nf b/main.nf index a240f5352d92d6fd462b2ab382d94f4faf1910fd..973d5ed7de916f0b67591d952f9afcac7693d1a6 100755 --- a/main.nf +++ b/main.nf @@ -32,6 +32,7 @@ modules = params.modules // remove the dot -> can be used in bash scripts fastq_ch_test = file("${in_path}/${fastq_file}") // to test if exist below primer_ch_test = file("${primer_fasta}") // to test if exist below +ref_genome_ch_test = file("${ref_genome}.fasta") // to test if exist below //////// end Variables from config.file that need to be checked @@ -48,6 +49,7 @@ added_nb_ch = Channel.from("${added_nb}") sum_ch = Channel.from("${fivep_seq_nb}", "${added_nb}").toInteger().sum() Channel.from("${cute_path}").into{cute_ch1 ; cute_ch2 ; cute_ch3 ; cute_ch4 ; cute_ch5} cutoff_nb_ch = Channel.from("${cutoff_nb}") +ref_genome_ch = Channel.from("${ref_genome}") //////// end Channels @@ -64,6 +66,10 @@ if(system_exec == 'local' || system_exec == 'slurm'){ if( ! file_exists2){ error "\n\n========\n\nERROR IN NEXTFLOW EXECUTION\n\nINVALID primer_fasta PARAMETER IN nextflow.config FILE: ${primer_fasta}\n\nIF POINTING TO A DISTANT SERVER, CHECK THAT IT IS MOUNTED\n\n========\n\n" } + def file_exists3 = ref_genome_ch_test.exists() + if( ! 
file_exists3){
+        error "\n\n========\n\nERROR IN NEXTFLOW EXECUTION\n\nINVALID ref_genome PARAMETER IN nextflow.config FILE: ${ref_genome}\n\nIF POINTING TO A DISTANT SERVER, CHECK THAT IT IS MOUNTED\n\nTHE REFERENCE GENOME MUST HAVE BEEN REFERENCED\n\n========\n\n"
+    }
 }else{
     error "\n\n========\n\nERROR IN NEXTFLOW EXECUTION\n\nINVALID system_exec PARAMETER IN nextflow.config FILE: ${system_exec}\n\n========\n\n"
 }
@@ -373,6 +379,84 @@ process fastqc2 { // section 8.14 of the labbook 20200520
 }
 
 
+process bowtie2 { // section 24.1 of the labbook 20200707. Aligns the reads of fq on the ref index with bowtie2, then converts the sam into a sorted and indexed bam
+    label 'bowtie2' // see the withLabel: bowtie2 in the nextflow config file
+    publishDir "${out_path}/reports", mode: 'copy', pattern: "bowtie2_report.txt", overwrite: false // only bowtie2_report.txt is published
+    cache 'true'
+
+    input:
+    file fq from cutoff_ch
+    val ref from ref_genome_ch
+    file "report.rmd" from log_ch10
+
+    output:
+    file "${fq.baseName}_bowtie2_sorted.bam" into bowtie2_ch
+    file "bowtie2_report.txt"
+    file "report.rmd" into log_ch11
+
+    script:
+    """
+    echo -e "\\n\\n## Bowtie2 alignment\\n\\n" >> report.rmd
+    bowtie2 --very-sensitive -x "${ref}" -U ${fq} -t -S ${fq.baseName}_bowtie2.sam |& tee -a report.rmd
+    # --very-sensitive: no soft clipping allowed and very sensitive seed alignment. The bowtie2 messages are appended to report.rmd, not to bowtie2_report.txt
+    # -t time displayed. NOTE(review): no -p given, so bowtie2 presumably runs on a single thread whatever the cpus directive of the bowtie2 label - confirm
+    samtools view -bh -o ${fq.baseName}_bowtie2.bam ${fq.baseName}_bowtie2.sam |& tee -a bowtie2_report.txt
+    samtools sort -o ${fq.baseName}_bowtie2_sorted.bam ${fq.baseName}_bowtie2.bam |& tee -a bowtie2_report.txt
+    samtools index ${fq.baseName}_bowtie2_sorted.bam |& tee -a bowtie2_report.txt
+    """
+}
+
+
+process Q20 { // section 24.2 of the labbook 20200707. Keeps the alignments with MAPQ >= 20 and indexes the resulting bam
+    label 'samtools' // see the withLabel: samtools in the nextflow config file
+    publishDir "${out_path}/reports", mode: 'copy', pattern: "q20_report.txt", overwrite: false // only q20_report.txt is published
+    cache 'true'
+
+    input:
+    file bam from bowtie2_ch
+    file "report.rmd" from log_ch11
+
+    output:
+    file "${bam.baseName}_q20.bam" into q20_ch
+    file "q20_report.txt"
+    file "report.rmd" into log_ch12
+
+    script:
+    """
+    samtools view -q 20 -b -o ${bam.baseName}_q20.bam ${bam} |& tee q20_report.txt # -o rather than a stdout redirection: the implicit 2>&1 of |& would also send stderr into the bam file and leave q20_report.txt empty
+    samtools index ${bam.baseName}_q20.bam
+    echo -e "\\n\\n## Q20 filtering\\n\\n" >> report.rmd
+    cat q20_report.txt >> report.rmd
+    """
+}
+
+
+process no_soft_clipping { // section 24.5 of the labbook 20200707. Appends to report.rmd the number of reads with S in the CIGAR field and the total number of reads of bam
+    label 'samtools' // see the withLabel: samtools in the nextflow config file
+    cache 'true'
+
+    input:
+    file bam from q20_ch
+    file "report.rmd" from log_ch12
+
+    output:
+    file "report.rmd" into log_ch13
+
+    script:
+    """
+    echo -e "\\n\\n## Control that no more soft clipping in reads\\n\\n" >> report.rmd
+    echo -e "nb of reads with soft clipping (S) in CIGAR: \$(printf "%'d" \$(samtools view ${bam} | awk '\$6 ~ /.*[S].*/{print \$0}' | wc -l))" >> report.rmd
+    echo -e "\\n\\ntotal nb of reads: \$(printf "%'d" \$(samtools view ${bam} | wc -l))" >> report.rmd
+    """
+}
+
+
+
+// process picard_duplicates { // section 24.3 of the labbook 20200707
+
+
+
+
 process backup {
     label 'bash' // see the withLabel: bash in the nextflow config file
     publishDir "${out_path}/reports", mode: 'copy', pattern: "{*.config,*.log}", overwrite: false // since I am in mode copy, all the output files will be copied into the publishDir. 
See \\wsl$\Ubuntu-20.04\home\gael\work\aa\a0e9a739acae026fb205bc3fc21f9b @@ -381,12 +465,12 @@ process backup { input: file config_file file log_file - file "report.rmd" from log_ch10 + file "report.rmd" from log_ch13 output: file "${config_file}" // warning message if we use file config_file file "${log_file}" // warning message if we use file log_file - file "report.rmd" into log_ch11 + file "report.rmd" into log_ch14 script: """ @@ -402,10 +486,10 @@ process workflowVersion { // create a file with the workflow version in out_path cache 'false' input: - file "report.rmd" from log_ch11 + file "report.rmd" from log_ch14 output: - file "report.rmd" into log_ch12 + file "report.rmd" into log_ch15 script: """ @@ -439,7 +523,7 @@ process print_report { // section 8.8 of the labbook 20200520 input: val cute from cute_ch5 - file "tempo_report" from log_ch12 + file "tempo_report" from log_ch15 tuple val ("stat_tempo_name"), file ("stat_tempo") from stat_fastq_5p_filter_ch2 file "plot_fivep_filtering_stat" from fig_ch1 file "plot_read_length_ini" from fig_ch2 diff --git a/nextflow.config b/nextflow.config index fc837a42d1ae03bd8698c2e5d050c7bd6e86d012..32498e3efcb7a8eea9da9a43c03ad8ff6b828189 100755 --- a/nextflow.config +++ b/nextflow.config @@ -46,6 +46,7 @@ env { added_nb=3 // number of nucleotids taken after fivep_seq_nb for graphic display, to see that the frequency of each base tends toward 0.25 after fivep_seq_nb on the graph //// end fivep_filtering cutoff_nb=25 // reads of length cutoff_nb after trimming are removed + ref_genome="/pasteur/homes/gmillot/reference_genomes/coli_K12_MG1655_NC_000913.3_ORI_CENTERED/Ecoli-K12-MG1655_ORI_CENTERED" cute_path="https://gitlab.pasteur.fr/gmillot/cute_little_R_functions/-/raw/v10.9.0/cute_little_R_functions.R" // single character string indicating the file (and absolute pathway) of the required cute_little_R_functions toolbox. 
With ethernet connection available, this can also be used: "https://gitlab.pasteur.fr/gmillot/cute_little_R_functions/raw/v5.1.0/cute_little_R_functions.R" or local "C:\\Users\\Gael\\Documents\\Git_projects\\cute_little_R_functions\\cute_little_R_functions.R" } @@ -169,28 +170,38 @@ process { withLabel: bash { container='gmillot/bash-extended_v3.0:gitlab_v4.0' - cpus=1 - memory='3G' + cpus=1 // only used when name = "local" in the executor part above + memory='3G' // only used when name = "local" in the executor part above } withLabel: alien_trimmer { container='gmillot/alien_trimmer_v0.4.0:gitlab_v5.1' // no most recent at 20210930 - cpus=1 - memory='3G' + cpus=1 // only used when name = "local" in the executor part above + memory='3G' // only used when name = "local" in the executor part above } withLabel: fastqc { container='evolbioinfo/fastqc:v0.11.8' - cpus=1 + cpus=1 // only used when name = "local" in the executor part above } withLabel: r_ext { container='gmillot/r_v4.0.5_extended_v2.0:gitlab_v6.4' - cpus=1 - memory='64G' + cpus=1 // only used when name = "local" in the executor part above + memory='64G' // only used when name = "local" in the executor part above } + withLabel: bowtie2 { + container='gmillot/bowtie2_v2.3.4.3_extended_v1.0:gitlab_v7.0' + cpus=12 // only used when name = "local" in the executor part above + memory='64G' // only used when name = "local" in the executor part above + } + withLabel: samtools { + container='gmillot/samtools_v1.14:gitlab_v7.0' + cpus=1 + memory='1G' + } // all the processes of the main.nf file with the label 'bedtools' will use this directives by default @@ -200,11 +211,7 @@ process { memory='3G' } - withLabel: samtools { - container='evolbioinfo/samtools:v1.11' - cpus=1 - memory='1G' - } + withLabel: coverage { container='evolbioinfo/samtools:v1.11'