diff --git a/Snakefile b/Snakefile index b47b16332a090701d09d27d4a00a19468c815aaf..b980edd9559fe6f103727734718c0e6e27251ce3 100755 --- a/Snakefile +++ b/Snakefile @@ -97,9 +97,7 @@ include: os.path.join(RULES, "fastqc.rules") #---------------------------------- if config["adapters"]["remove"] : - - ## TODO add AlienTrimmer - adapter_tool = "cutadapt" + adapter_tool = config["adapters"]["tool"] adapters_input_fastq = input_data adapters_wkdir = "01-Trimming" adapters_output = ["01-Trimming/{SAMPLE}_R1_trim.fastq.gz"] @@ -114,7 +112,13 @@ if config["adapters"]["remove"] : adapters_qual = config["adapters"]["quality"] adapters_log = "01-Trimming/logs/{SAMPLE}_trim.txt" final_output.extend(expand(adapters_output, SAMPLE=samples)) - include: os.path.join(RULES, "cutadapt.rules") + if adapter_tool == "cutadapt" : + include: os.path.join(RULES, "cutadapt.rules") + elif adapter_tool == "alienTrimmer" : + include: os.path.join(RULES, "alienTrimmer.rules") + else : + raise ValueError("Please provides an implemented removal adapter tool: must be cutadapt or alienTrimer") + else: adapters_output = input_data @@ -128,7 +132,7 @@ ref = [ config["genome"]["name"] ] if config["genome"]["host_mapping"]: ref += [ config["genome"]["host_name"] ] -elif config["genome"]["rRNA_mapping"]: +if config["genome"]["rRNA_mapping"]: ref += [ "Ribo" ] def mapping_index(wildcards): @@ -138,7 +142,16 @@ def mapping_index(wildcards): return {"fasta": config["genome"]["host_fasta_file"]} elif (wildcards.REF == "Ribo" ): return {"fasta": config["genome"]["ribo_fasta_file"]} - + +def mapping_prefix(wildcards): + if (wildcards.REF == config["genome"]["name"]): + return {"prefix": config["genome"]["name"]} + elif (wildcards.REF == config["genome"]["host_name"]): + return {"prefix": config["genome"]["host_name"]} + elif (wildcards.REF == "Ribo" ): + return {"prefix": "ribo"} + + def annot_index(wildcards): if (wildcards.REF == config["genome"]["name"]): return {"gff_file": config["genome"]["gff_file"]} @@ -148,8 +161,6 @@ def annot_index(wildcards): return {"gff_file": ""} - - #---------------------------------- # bowtie2 MAPPING #---------------------------------- @@ -158,27 +169,32 @@ mapper = [] if config["bowtie2_mapping"]["do"]: mapper += ["bowtie2"] - if config["genome"]["index"]: - # indexing for bowtie2 - bowtie2_index_fasta = unpack(mapping_index) - bowtie2_index_output_done = os.path.join(os.path.dirname(config["genome"]["fasta_file"]), "{REF}.1.bt2") - bowtie2_index_output_prefix = os.path.join(config["genome"]["genome_directory"],"{REF}") - bowtie2_index_log = "02-Mapping/bowtie2/logs/bowtie2_{REF}_indexing.log" - include: os.path.join(RULES, "bowtie2_index.rules") + # indexing for bowtie2 + bowtie2_index_fasta = unpack(mapping_index) + bowtie2_index_log = "02-Mapping/bowtie2/logs/bowtie2_{REF}_indexing.log" + if config["genome"]["host_mapping"] and config["genome"]["rRNA_mapping"] : + bowtie2_index_output_done = config["genome"]["genome_directory"]+"{}/bowtie2/{{REF}}.1.bt2".format(config["genome"]["host_name"]) + bowtie2_index_output_prefix = config["genome"]["genome_directory"]+"{}/bowtie2/{{REF}}".format(config["genome"]["host_name"]) + elif not config["genome"]["host_mapping"] and config["genome"]["rRNA_mapping"] : + bowtie2_index_output_done = config["genome"]["genome_directory"]+"{}/bowtie2/{{REF}}.1.bt2".format(config["genome"]["name"]) + bowtie2_index_output_prefix = config["genome"]["genome_directory"]+"{}/bowtie2/{{REF}}".format(config["genome"]["name"]) else: - bowtie2_index_output_done = os.path.join(config["genome"]["genome_directory"],"{REF}.1.bt2") - bowtie2_index_output_prefix = os.path.join(config["genome"]["genome_directory"],"{REF}") + bowtie2_index_output_done = config["genome"]["genome_directory"]+"{REF}/bowtie2/{REF}.1.bt2" + bowtie2_index_output_prefix = config["genome"]["genome_directory"]+"{REF}/bowtie2/{REF}" + + include: os.path.join(RULES, "bowtie2_index.rules") + # Mapping step bowtie2_mapping_input = adapters_output bowtie2_mapping_index_done = bowtie2_index_output_done - bowtie2_mapping_sort = "02-Mapping/bowtie2/{REF}/{SAMPLE}_{REF}_sort.bam" - bowtie2_mapping_bam = "02-Mapping/bowtie2/{REF}/{SAMPLE}_{REF}.bam" - bowtie2_mapping_sortprefix = "02-Mapping/bowtie2/{REF}/{SAMPLE}_{REF}_sort" - bowtie2_mapping_logs_err = "02-Mapping/bowtie2/{REF}/logs/{SAMPLE}_{REF}_mapping.err" - bowtie2_mapping_logs_out = "02-Mapping/bowtie2/{REF}/logs/{SAMPLE}_{REF}_mapping.out" + bowtie2_mapping_sort = "02-Mapping/{REF}/bowtie2/{SAMPLE}_{REF}_sort.bam" + bowtie2_mapping_bam = "02-Mapping/{REF}/bowtie2/{SAMPLE}_{REF}.bam" + bowtie2_mapping_sortprefix = "02-Mapping/{REF}/bowtie2/{SAMPLE}_{REF}_sort" + bowtie2_mapping_logs_err = "02-Mapping/{REF}/bowtie2/logs/{SAMPLE}_{REF}_mapping.err" + bowtie2_mapping_logs_out = "02-Mapping/{REF}/bowtie2/logs/{SAMPLE}_{REF}_mapping.out" bowtie2_mapping_prefix_index = bowtie2_index_output_prefix bowtie2_mapping_options = config["bowtie2_mapping"]["options"] final_output.extend(expand(bowtie2_mapping_sort, SAMPLE=samples, REF=ref)) @@ -191,17 +207,22 @@ if config["bowtie2_mapping"]["do"]: if config["star_mapping"]["do"]: mapper += ["STAR"] - if config["genome"]["index"]: - star_index_fasta = unpack(mapping_index) - star_index_output_done = os.path.join(config["genome"]["genome_directory"],"{REF}/SAindex") - star_index_output_dir = os.path.join(config["genome"]["genome_directory"],"{REF}") - star_mapping_splice_file = unpack(annot_index) - star_index_log = "02-Mapping/STAR/logs/STAR_{REF}_indexing.log" - include: os.path.join(RULES, "star_index.rules") - + star_index_fasta = unpack(mapping_index) + star_mapping_splice_file = unpack(annot_index) + star_index_log = "02-Mapping/STAR/logs/STAR_{REF}_indexing.log" + + if config["genome"]["host_mapping"] and config["genome"]["rRNA_mapping"] : + star_index_output_done = config["genome"]["genome_directory"]+"{}/STAR/{{REF}}.1.bt2".format(config["genome"]["host_name"]) + star_index_output_dir = config["genome"]["genome_directory"]+"{}/STAR/{{REF}}".format(config["genome"]["host_name"]) + elif not config["genome"]["host_mapping"] and config["genome"]["rRNA_mapping"] : + star_index_output_done = config["genome"]["genome_directory"]+"{}/STAR/{{REF}}.1.bt2".format(config["genome"]["name"]) + star_index_output_dir = config["genome"]["genome_directory"]+"{}/STAR/{{REF}}".format(config["genome"]["name"]) else: - star_index_output_dir = os.path.join(config["genome"]["genome_directory"],"{REF}") - star_index_output_done = os.path.join(config["genome"]["genome_directory"],"{REF}/SAindex" ) + star_index_output_done = config["genome"]["genome_directory"]+"{REF}/STAR/{REF}.1.bt2" + star_index_output_dir = config["genome"]["genome_directory"]+"{REF}/STAR/{REF}" + + include: os.path.join(RULES, "star_index.rules") + #first pass mapping star_mapping_pass1_input = adapters_output @@ -209,8 +230,8 @@ if config["star_mapping"]["do"]: star_mapping_pass1_index = star_index_output_dir star_mapping_pass1_logs = "02-Mapping/STAR/logs/{SAMPLE}_{REF}_init.out" star_mapping_pass1_output_prefix = "02-Mapping/STAR/{REF}/{SAMPLE}_{REF}_init_" - star_mapping_pass1_junctions = "02-Mapping/STAR/{REF}/{SAMPLE}_{REF}_init_SJ.out.tab" - star_mapping_pass1_bam = "02-Mapping/STAR/{REF}/{SAMPLE}_{REF}_init_Aligned.sortedByCoord.out.bam" + star_mapping_pass1_junctions = "02-Mapping/{REF}/STAR/{SAMPLE}_{REF}_init_SJ.out.tab" + star_mapping_pass1_bam = "02-Mapping/{REF}/STAR/{SAMPLE}_{REF}_init_Aligned.sortedByCoord.out.bam" star_mapping_pass1_read_groups = "" final_output.extend(expand(star_mapping_pass1_junctions, SAMPLE=samples, REF=ref)) include: os.path.join(RULES, "star_mapping_pass1.rules") @@ -220,11 +241,11 @@ if config["star_mapping"]["do"]: star_mapping_pass2_done = star_index_output_done star_mapping_pass2_index = star_index_output_dir star_mapping_pass2_logs = "02-Mapping/STAR/logs/{SAMPLE}_{REF}.out" - star_mapping_pass2_output_prefix = "02-Mapping/STAR/{REF}/{SAMPLE}_{REF}_" - star_mapping_pass2_junctions = expand("02-Mapping/STAR/{{REF}}/{SAMPLE}_{{REF}}_init_SJ.out.tab", SAMPLE=samples) + star_mapping_pass2_output_prefix = "02-Mapping/{REF}/STAR/{SAMPLE}_{REF}_" + star_mapping_pass2_junctions = expand("02-Mapping/{{REF}}/STAR/{SAMPLE}_{{REF}}_init_SJ.out.tab", SAMPLE=samples) star_mapping_pass2_bam = star_mapping_pass1_bam star_mapping_pass2_read_groups = "ID:{SAMPLE}" - star_mapping_pass2_sort = "02-Mapping/STAR/{REF}/{SAMPLE}_{REF}_Aligned.sortedByCoord.out.bam" + star_mapping_pass2_sort = "02-Mapping/{REF}/STAR/{SAMPLE}_{REF}_Aligned.sortedByCoord.out.bam" final_output.extend(expand(star_mapping_pass2_sort, SAMPLE=samples, REF=ref)) include: os.path.join(RULES, "star_mapping_pass2.rules") @@ -238,9 +259,9 @@ if config["star_mapping"]["do"]: def counting_index(wildcards): if (wildcards.MAP == "bowtie2"): - input = "02-Mapping/{MAP}/{REF}/{SAMPLE}_{REF}_sort.bam" + input = "02-Mapping/{REF}/{MAP}/{SAMPLE}_{REF}_sort.bam" elif (wildcards.MAP == "STAR"): - input = "02-Mapping/{MAP}/{REF}/{SAMPLE}_{REF}_Aligned.sortedByCoord.out.bam" + input = "02-Mapping/{REF}/{MAP}/{SAMPLE}_{REF}_Aligned.sortedByCoord.out.bam" return(input) def counting_gff(wildcards): diff --git a/config/config.yaml b/config/config.yaml index 41dc78196b513b83c2b170043c6ee914fcba91e5..934d39e0e58cc432531493c7e443c87413ac8a01 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -26,13 +26,13 @@ #========================================================= # directory where fastq are stored -input_dir: /path/to/fastq/data +input_dir: ../data # How mate paires are written in fastq input_mate: '_R[12]' # filemame extension input_extension: '.fastq.gz' # directory where you want -analysis_dir: /path/to/analysis/rnaflow +analysis_dir: . # tmpdir: write tempory file on this directory (default /tmp/, but could be "/pasteur/sonic/scratch/users/LOGIN/") tmpdir: "/pasteur/sonic/scratch/public/" @@ -54,17 +54,16 @@ tmpdir: "/pasteur/sonic/scratch/public/" #=============================================================================== genome: - index: true - #genome_directory: /path/to/genome/ + genome_directory: ../genome/ name: saccer3 - fasta_file: /path/to/genome/saccer3.fa - gff_file: /path/to/genome/saccer3.gff - host_mapping: false + fasta_file: ../genome/saccer3/saccer3.fa + gff_file: ../genome/saccer3/saccer3.gff + host_mapping: true host_name: hg38 - host_fasta_file: /path/to/genome/hg38.fa - host_gff_file: /path/to/genome/hg38.gff + host_fasta_file: ../genome/hg38/hg38.fa + host_gff_file: ../genome/hg38/hg38.gff rRNA_mapping: true - ribo_fasta_file: /path/to/genome/hg38_rRNA.fa + ribo_fasta_file: ../genome/hg38/hg38_rRNA.fa #=============================================================================== # FastQC section @@ -84,7 +83,9 @@ fastqc: # :Parameters: # # - remove: skip this step if data are clean -# - adapter_list: a string or file (prefixed with *file:*) +# - tool: could be "cutadapt" or "alienTrimmer" +# - adapter_list: a string or file (prefixed with *file:*) for cutadapt +# - alien_file: the alien file for alienTrimmer # - m: 30 means discard trimmed reads that are shorter than 30. # - mode: could be g for 5', a for 3', b for both 5'/3' # - options: See cutadapt documentation for details. For paired-end reads, @@ -99,10 +100,13 @@ fastqc: adapters: remove: yes + tool: alienTrimmer # could be alienTrimmer adapter_list: file:config/TruSeq_Stranded_RNA.fa + alien_file: config/TruSeq_Stranded_RNA.fa m: 25 mode: a - options: -O 6 --trim-n --max-n 1 + options: -O 6 --trim-n --max-n 1 # for cutadapt + #options: -k 10 -m 5 -p 80 # for alienTrimmer quality: 30 threads: 4 @@ -132,7 +136,7 @@ bowtie2_mapping: star_mapping: - do: yes + do: no # for small genome, STAR may abort with segmentation fault. Setting sjdb Over hang to 250 skips this issue options: "--outFilterMismatchNoverLmax 0.05 --outSAMunmapped Within --sjdbOverhang 250" threads: 4 diff --git a/workflow/rules/adapters.rules b/workflow/rules/adapters.rules deleted file mode 100755 index 76f05ceab17f45b584effbea38073632919909fe..0000000000000000000000000000000000000000 --- a/workflow/rules/adapters.rules +++ /dev/null @@ -1,69 +0,0 @@ -######################################################################### -# RNAflow: an automated pipeline to analyse transcriptomic data # -# # -# Authors: Rachel Legendre # -# Copyright (c) 2021-2022 Institut Pasteur (Paris). # -# # -# This file is part of RNAflow workflow. # -# # -# RNAflow is free software: you can redistribute it and/or modify # -# it under the terms of the GNU General Public License as published by # -# the Free Software Foundation, either version 3 of the License, or # -# (at your option) any later version. # -# # -# RNAflow is distributed in the hope that it will be useful, # -# but WITHOUT ANY WARRANTY; without even the implied warranty of # -# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # -# GNU General Public License for more details . # -# # -# You should have received a copy of the GNU General Public License # -# along with RNAflow (LICENSE). # -# If not, see <https://www.gnu.org/licenses/>. # -######################################################################### - - - - -rule adapters: - input: - fastq = adapters_input_fastq - output: - adapters_output - params: - wkdir = adapters_wkdir, - options = adapters_options, - adapters = adapters_adapt_list, - mode = adapters_mode, - min = adapters_min, - qual = adapters_qual - singularity: - "rnaflow.img" - threads: config['adapters']['threads'] - log: - adapters_log - envmodules: - "cutadapt", - "cutadapt-devel" - shell: - """ - set +o pipefail - - tmp="{input}" - infiles=($tmp) - - tmp="{output}" - outfiles=($tmp) - - # add mode and adapter sequences - cmd+=" cutadapt -{params.mode} {params.adapters} -m {params.min} -q {params.qual} {params.options} " - # paired end or single end - if [[ ${{#infiles[@]}} -eq 2 ]]; - then - cmd+=" -o ${{outfiles[0]}} -p ${{outfiles[1]}} ${{infiles[0]}} ${{infiles[1]}} " - else - cmd+=" -o ${{outfiles[0]}} ${{infiles[0]}}" - fi - #run command - eval "${{cmd}} > {log}" - - """ diff --git a/workflow/rules/alienTrimmer.rules b/workflow/rules/alienTrimmer.rules index ea142092494fb5cfc544e8c16b9ea9b08f99b4ff..7bad1ad2e2086e112332d6823ab24c3a23fba81e 100755 --- a/workflow/rules/alienTrimmer.rules +++ b/workflow/rules/alienTrimmer.rules @@ -1,7 +1,7 @@ ######################################################################### # RNAflow: an automated pipeline to analyse transcriptomic data # # # -# Authors: Rachel Legendre # +# Authors: Rachel Legendre, Thomas Bigot # # Copyright (c) 2021-2022 Institut Pasteur (Paris). # # # # This file is part of RNAflow workflow. # @@ -24,7 +24,7 @@ -rule adapters: +rule alienTrimmer: input: fastq = adapters_input_fastq output: @@ -32,12 +32,13 @@ rule adapters: params: wkdir = adapters_wkdir, options = adapters_options, - adapters = adapters_adapt_list, + adapters = config['adapters']['alien_file'], mode = adapters_mode, min = adapters_min, qual = adapters_qual singularity: "rnaflow.img" + shadow: "shallow" threads: config['adapters']['threads'] log: adapters_log @@ -46,23 +47,33 @@ rule adapters: shell: """ set +o pipefail - + tmp="{input}" infiles=($tmp) tmp="{output}" outfiles=($tmp) - # add mode and adapter sequences - cmd+="AlienTrimmer -c {params.adapters} -l {params.min} -q {params.qual} {params.options} -z" + # add options and adapter sequences + cmd+="AlienTrimmer -a {params.adapters} -l {params.min} -q {params.qual} {params.options}" # paired end or single end if [[ ${{#infiles[@]}} -eq 2 ]]; then - cmd+=" -o ${{outfiles[0]}} -p ${{outfiles[1]}} -1 ${{infiles[0]}} -2 ${{infiles[1]}} " + cmd+="-1 ${{infiles[0]}} -2 ${{infiles[1]}} -o out_paired" else - cmd+=" -o ${{outfiles[0]}} -i ${{infiles[0]}}" + cmd+=" -i ${{infiles[0]}} -o out_single" fi #run command eval "${{cmd}} > {log}" + + #rename output files + if [[ ${{#infiles[@]}} -eq 2 ]]; + then + mv out_paired.1.fastq ${{outfiles[0]}} + mv out_paired.2.fastq ${{outfiles[1]}} + else + mv out_single.fastq ${{outfiles[0]}} + fi """ + diff --git a/workflow/rules/cutadapt.rules b/workflow/rules/cutadapt.rules index 76f05ceab17f45b584effbea38073632919909fe..31afc066fef07ecaeb3882f26b13327b5531bb99 100755 --- a/workflow/rules/cutadapt.rules +++ b/workflow/rules/cutadapt.rules @@ -24,7 +24,7 @@ -rule adapters: +rule cutadapt: input: fastq = adapters_input_fastq output: