diff --git a/Snakefile b/Snakefile index f7e3619a1d7a60f66c6ed491c6e438c9a2f12f51..e3908962e61d52ed3246d64fb76c5a4ee63ade43 100755 --- a/Snakefile +++ b/Snakefile @@ -105,7 +105,6 @@ if config["adapters"]["remove"] : adapters_output += ["01-Trimming/{SAMPLE}_R2_trim.fastq.gz"] # Set parameters - adapters_adapt_list = config["adapters"]["adapter_list"] adapters_options = config["adapters"]["options"] adapters_mode = config["adapters"]["mode"] adapters_min = config["adapters"]["m"] @@ -123,7 +122,23 @@ if config["adapters"]["remove"] : else: adapters_output = input_data - + +#---------------------------------- +# bowtie2 MAPPING on Ribo only +#---------------------------------- + + +if config["genome"]["rRNA_mapping"] : + sortmerna_input = adapters_output + sortmerna_fasta = config["genome"]["ribo_fasta_file"] + sortmerna_outfile_rRNA = "02-Mapping/Ribo/{SAMPLE}_rRNA.fastq.gz" + sortmerna_outfile_no_rRNA = "02-Mapping/Ribo/{SAMPLE}_norRNA.fastq.gz" + sortmerna_logs_err = "02-Mapping/Ribo/logs/{SAMPLE}_mapping.err" + sortmerna_logs_out = "02-Mapping/Ribo/logs/{SAMPLE}_mapping.out" + final_output.extend(expand(sortmerna_outfile_no_rRNA, SAMPLE=samples)) + include: os.path.join(RULES, "sortmerna.rules") + + #---------------------------------- # genome gestion #---------------------------------- @@ -132,24 +147,20 @@ ref = [ config["genome"]["name"] ] if config["genome"]["host_mapping"]: ref += [ config["genome"]["host_name"] ] -if config["genome"]["rRNA_mapping"]: - ref += [ "Ribo" ] + def mapping_index(wildcards): if (wildcards.REF == config["genome"]["name"]): return {"fasta": config["genome"]["fasta_file"]} elif (wildcards.REF == config["genome"]["host_name"]): return {"fasta": config["genome"]["host_fasta_file"]} - elif (wildcards.REF == "Ribo" ): - return {"fasta": config["genome"]["ribo_fasta_file"]} + def mapping_prefix(wildcards): if (wildcards.REF == config["genome"]["name"]): return {"prefix": config["genome"]["name"]} elif (wildcards.REF == config["genome"]["host_name"]): return {"prefix": config["genome"]["host_name"]} - elif (wildcards.REF == "Ribo" ): - return {"prefix": "ribo"} def annot_index(wildcards): @@ -157,8 +168,6 @@ def annot_index(wildcards): return {"gff_file": config["genome"]["gff_file"]} elif (wildcards.REF == config["genome"]["host_name"]): return {"gff_file": config["genome"]["host_gff_file"]} - elif (wildcards.REF == "Ribo" ): - return {"gff_file": ""} #---------------------------------- @@ -173,15 +182,8 @@ if config["bowtie2_mapping"]["do"]: # indexing for bowtie2 bowtie2_index_fasta = unpack(mapping_index) bowtie2_index_log = "02-Mapping/bowtie2/logs/bowtie2_{REF}_indexing.log" - if config["genome"]["host_mapping"] and config["genome"]["rRNA_mapping"] : - bowtie2_index_output_done = os.path.join(config["genome"]["genome_directory"],"{}/bowtie2/{{REF}}.1.bt2".format(config["genome"]["host_name"])) - bowtie2_index_output_prefix = os.path.join(config["genome"]["genome_directory"]+"{}/bowtie2/{{REF}}".format(config["genome"]["host_name"])) - elif not config["genome"]["host_mapping"] and config["genome"]["rRNA_mapping"] : - bowtie2_index_output_done = os.path.join(config["genome"]["genome_directory"]+"{}/bowtie2/{{REF}}.1.bt2".format(config["genome"]["name"])) - bowtie2_index_output_prefix = os.path.join(config["genome"]["genome_directory"]+"{}/bowtie2/{{REF}}".format(config["genome"]["name"])) - else: - bowtie2_index_output_done = os.path.join(config["genome"]["genome_directory"]+"{REF}/bowtie2/{REF}.1.bt2") - bowtie2_index_output_prefix = os.path.join(config["genome"]["genome_directory"]+"{REF}/bowtie2/{REF}") + bowtie2_index_output_done = os.path.join(config["genome"]["genome_directory"]+"{REF}/bowtie2/{REF}.1.bt2") + bowtie2_index_output_prefix = os.path.join(config["genome"]["genome_directory"]+"{REF}/bowtie2/{REF}") include: os.path.join(RULES, "bowtie2_index.rules") @@ -210,16 +212,8 @@ if config["star_mapping"]["do"]: star_index_fasta = unpack(mapping_index) star_mapping_splice_file = unpack(annot_index) star_index_log = "02-Mapping/STAR/logs/STAR_{REF}_indexing.log" - - if config["genome"]["host_mapping"] and config["genome"]["rRNA_mapping"] : - star_index_output_done = config["genome"]["genome_directory"]+"{}/STAR/{{REF}}.1.bt2".format(config["genome"]["host_name"]) - star_index_output_dir = config["genome"]["genome_directory"]+"{}/STAR/{{REF}}".format(config["genome"]["host_name"]) - elif not config["genome"]["host_mapping"] and config["genome"]["rRNA_mapping"] : - star_index_output_done = config["genome"]["genome_directory"]+"{}/STAR/{{REF}}.1.bt2".format(config["genome"]["name"]) - star_index_output_dir = config["genome"]["genome_directory"]+"{}/STAR/{{REF}}".format(config["genome"]["name"]) - else: - star_index_output_done = config["genome"]["genome_directory"]+"{REF}/STAR/{REF}.1.bt2" - star_index_output_dir = config["genome"]["genome_directory"]+"{REF}/STAR/{REF}" + star_index_output_done = config["genome"]["genome_directory"]+"{REF}/STAR/{REF}.1.bt2" + star_index_output_dir = config["genome"]["genome_directory"]+"{REF}/STAR/{REF}" include: os.path.join(RULES, "star_index.rules") @@ -250,9 +244,7 @@ if config["star_mapping"]["do"]: include: os.path.join(RULES, "star_mapping_pass2.rules") - - - + #---------------------------------- # counting step #---------------------------------- @@ -285,7 +277,7 @@ if config["feature_counts"]["do"]: ref.pop() feature_counts_input = counting_index - feature_counts_output_count = "03-Counting/{MAP}/{REF}/{SAMPLE}_{REF}_feature.out" + feature_counts_output_count = "03-Counting/{REF}/{MAP}/{SAMPLE}_{REF}_feature.out" feature_counts_gff = counting_gff feature_counts_logs_err = "03-Counting/logs/{MAP}/{SAMPLE}_{REF}_feature.err" feature_counts_logs_out = "03-Counting/logs/{MAP}/{SAMPLE}_{REF}_feature.out" @@ -299,8 +291,8 @@ if config["feature_counts"]["do"]: #---------------------------------- flagstat_input = counting_index -flagstat_logs = "02-Mapping/Flagstats/{MAP}/{REF}/logs/{SAMPLE}_{REF}.out" -flagstat_output = "02-Mapping/Flagstats/{MAP}/{REF}/{SAMPLE}_{REF}_stats.out" +flagstat_logs = "02-Mapping/Flagstats/{REF}/{MAP}/logs/{SAMPLE}_{REF}.out" +flagstat_output = "02-Mapping/Flagstats/{REF}/{MAP}/{SAMPLE}_{REF}_stats.out" final_output.extend(expand(flagstat_output, SAMPLE=samples, REF=ref, MAP=mapper)) include: os.path.join(RULES, "flagstat.rules") @@ -310,13 +302,37 @@ include: os.path.join(RULES, "flagstat.rules") #---------------------------------- if config["bamCoverage"]["do"]: bamCoverage_input = counting_index - bamCoverage_logs = "04-Coverage/{MAP}/{REF}/logs/{SAMPLE}_{REF}.out" - bamCoverage_output = "04-Coverage/{MAP}/{REF}/{SAMPLE}_{REF}.bw" + bamCoverage_logs = "04-Coverage/{REF}/{MAP}/logs/{SAMPLE}_{REF}.out" + bamCoverage_output = "04-Coverage/{REF}/{MAP}/{SAMPLE}_{REF}.bw" bamCoverage_options = config['bamCoverage']['options'] final_output.extend(expand(bamCoverage_output, SAMPLE=samples, REF=ref, MAP=mapper)) include: os.path.join(RULES, "bamCoverage.rules") +#---------------------------------- +# PseudoMapping with Kallisto +#---------------------------------- + +if config["pseudomapping"]["do"]: + kallisto_index_fasta = config["pseudomapping"]["fasta"] + kallisto_index_output = (os.path.splitext(config["pseudomapping"]["fasta"])[0])+".idx" + kallisto_index_kmer = config["pseudomapping"]["kmer"] + kallisto_index_log = "02-Mapping/STAR/logs/Kallisto_indexing.log" + include: os.path.join(RULES, "kallisto_index.rules") + + + #pseudo mapping + kallisto_quant_fastq = adapters_output + kallisto_quant_index = (os.path.splitext(config["pseudomapping"]["fasta"])[0])+".idx" + kallisto_quant_output_dir = "02-Mapping/Kallisto/{SAMPLE}" + kallisto_quant_options = config["pseudomapping"]["options"] + kallisto_quant_gtf = config["pseudomapping"]["gtf"] + kallisto_pseudo_log = "02-Mapping/Kallisto/logs/{SAMPLE}_pseudomap.log" + final_output.extend(expand(kallisto_quant_output_dir, SAMPLE=samples)) + include: os.path.join(RULES, "kallisto_quant.rules") + + + #---------------------------------- # MultiQC report #---------------------------------- diff --git a/config/cluster_config.json b/config/cluster_config.json index 7cfece639c54fed284ef7d322c594fe81ab4d304..30defdfcf4dd9bf661e69d3c8513c7b41cc30d63 100644 --- a/config/cluster_config.json +++ b/config/cluster_config.json @@ -3,6 +3,10 @@ { "ram" : "10G" }, + "multiqc" : + { + "ram" : "20G" + }, "star_mapping_pass1" : { "ram" : "32G" diff --git a/config/config.yaml b/config/config.yaml index 934d39e0e58cc432531493c7e443c87413ac8a01..1bbad1d52461c8bc4e3f9b87449c82f3ee0bd44d 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -100,8 +100,8 @@ fastqc: adapters: remove: yes - tool: alienTrimmer # could be alienTrimmer - adapter_list: file:config/TruSeq_Stranded_RNA.fa + tool: alienTrimmer # could be alienTrimmer or cutadapt + cutadapt_file: file:config/TruSeq_Stranded_RNA.fa alien_file: config/TruSeq_Stranded_RNA.fa m: 25 mode: a @@ -136,11 +136,32 @@ bowtie2_mapping: star_mapping: - do: no + do: yes # for small genome, STAR may abort with segmentation fault. Setting sjdb Over hang to 250 skips this issue options: "--outFilterMismatchNoverLmax 0.05 --outSAMunmapped Within --sjdbOverhang 250" threads: 4 +#=============================================================================== +# pseudo mapping with kallisto +# +# :Parameters: +# +# - fasta: Fasta file for the kallisto index to be used for quantification +# - gtf: GTF file for transcriptome information (required for --genomebam) +# - kmer: k-mer (odd) length (default: 31, max value: 31) +# - options: any options recognised by bowtie2 tool +# - threads: number of threads to be used +#=============================================================================== + + +pseudomapping: + do: yes + fasta: ../genome/hg38/hg38.fa + gtf: ../genome/hg38/hg38.gff + options: "" + kmer: 31 + threads: 4 + ############################################################################# # feature_counts used to count reads against features # diff --git a/workflow/rules/cutadapt.rules b/workflow/rules/cutadapt.rules index 31afc066fef07ecaeb3882f26b13327b5531bb99..cc6ebec87575525630beab4bc3b23877b975b705 100755 --- a/workflow/rules/cutadapt.rules +++ b/workflow/rules/cutadapt.rules @@ -32,7 +32,7 @@ rule cutadapt: params: wkdir = adapters_wkdir, options = adapters_options, - adapters = adapters_adapt_list, + adapters = config['adapters']['cutadapt_file'], mode = adapters_mode, min = adapters_min, qual = adapters_qual diff --git a/workflow/rules/kallisto_index.rules b/workflow/rules/kallisto_index.rules index 879418d56953836e660239488e48c948a7e9d80c..738aeda637bd8d47245bc8df463ad0765477b24d 100755 --- a/workflow/rules/kallisto_index.rules +++ b/workflow/rules/kallisto_index.rules @@ -35,7 +35,7 @@ rule kallisto_index: log: kallisto_index_log threads: - config['kallisto']['threads'] + config['pseudomapping']['threads'] envmodules: "kallisto/0.46.2" shell: diff --git a/workflow/rules/kallisto_quant.rules b/workflow/rules/kallisto_quant.rules index 41275412c0ae955a896ff27a153fb39ee47934da..56f2b296972bf05b59ce22ebff6e489d44aa9ffb 100755 --- a/workflow/rules/kallisto_quant.rules +++ b/workflow/rules/kallisto_quant.rules @@ -30,18 +30,22 @@ rule kallisto_quant: output: directory(kallisto_quant_output_dir) params: - options = kallisto_quant_options + options = kallisto_quant_options, + gtf = kallisto_quant_gtf singularity: "rnaflow.img" log: kallisto_pseudo_log threads: - config['kallisto']['threads'] + config['pseudomapping']['threads'] envmodules: "kallisto/0.46.2" shell: """ - - kallisto quant -t {threads} {params.options} -o {output} -i {input.index} {input.fastq} - + if [[ -s {params.gtf} ]] + then + kallisto quant -t {threads} {params.options} -o {output} -i {input.index} {input.fastq} --gtf {params.gtf} + else + kallisto quant -t {threads} {params.options} -o {output} -i {input.index} {input.fastq} + fi """ diff --git a/workflow/rules/sortmerna.rules b/workflow/rules/sortmerna.rules new file mode 100755 index 0000000000000000000000000000000000000000..c176c464aef02d69c9257aa4d1193558d09fb03f --- /dev/null +++ b/workflow/rules/sortmerna.rules @@ -0,0 +1,63 @@ +######################################################################### +# RNAflow: an automated pipeline to analyse transcriptomic data # +# # +# Authors: Rachel Legendre # +# Copyright (c) 2021-2022 Institut Pasteur (Paris). # +# # +# This file is part of RNAflow workflow. # +# # +# RNAflow is free software: you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation, either version 3 of the License, or # +# (at your option) any later version. # +# # +# RNAflow is distributed in the hope that it will be useful, # +# but WITHOUT ANY WARRANTY; without even the implied warranty of # +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # +# GNU General Public License for more details . # +# # +# You should have received a copy of the GNU General Public License # +# along with RNAflow (LICENSE). # +# If not, see <https://www.gnu.org/licenses/>. # +######################################################################### + + + +rule sortmerna: + input: + fastq = sortmerna_input, + fasta = sortmerna_fasta + output: + rRNA = sortmerna_outfile_rRNA, + no_rRNA = sortmerna_outfile_no_rRNA + singularity: + "rnaflow.img" + log: + err = sortmerna_logs_err, + out = sortmerna_logs_out + shadow: "shallow" + threads: 6 + envmodules: + "sortmerna/2.1b" + shell: + """ + set +o pipefail + #tmp="{input.fastq}" + #infiles=($tmp) + fasta={input.fasta} + index=${{fasta%%.fa}} + + if [[ ! -s ${{index}}.stats ]] + then + indexdb_rna --ref ${{fasta}},${{index}} + fi + + + sortmerna --ref $${{fasta}},${{index}} -a {threads} --reads {input.fastq} --aligned outfile_rRNA --fastx --sam --num_alignments 1 --other outfile_noRNA --log -v > {log.out} 2> {log.err} + + + pigz -fc outfile_rRNA > {output.rRNA} + pigz -fc outfile_noRNA {output.no_rRNA} + + + """