diff --git a/ Snakefile b/ Snakefile new file mode 100644 index 0000000000000000000000000000000000000000..801140ac6695227b7e43e0210a2df388261336c0 --- /dev/null +++ b/ Snakefile @@ -0,0 +1,215 @@ +######################################################################### +# RNAflow: an automated pipeline to analyse transcriptomic data # +# # +# Authors: Rachel Legendre # +# Copyright (c) 2021-2022 Institut Pasteur (Paris). # +# # +# This file is part of RNAflow workflow. # +# # +# RNAflow is free software: you can redistribute it and/or modify # +# it under the terms of the GNU General Public License as published by # +# the Free Software Foundation, either version 3 of the License, or # +# (at your option) any later version. # +# # +# RNAflow is distributed in the hope that it will be useful, # +# but WITHOUT ANY WARRANTY; without even the implied warranty of # +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the # +# GNU General Public License for more details . # +# # +# You should have received a copy of the GNU General Public License # +# along with RNAflow (LICENSE). # +# If not, see <https://www.gnu.org/licenses/>. # +######################################################################### + + +import pandas as pd +from fnmatch import fnmatch +from re import sub, match +from itertools import repeat, chain +import os + + +#------------------------------------------------------- +# read config files + +configfile: "config/config.yaml" +RULES = os.path.join("workflow", "rules") + + +#------------------------------------------------------- +# list of all files in the directory 'input_dir' + +filenames = [f for f in os.listdir(config["input_dir"]) if match(r'.*'+config["input_readtag"]+config["input_extension"]+'', f)] +samples = [sub(config["input_readtag"]+config["input_extension"], '', file) for file in filenames] +receptor = [ x.strip() for x in (config["design"]["receptor"]).split(",")] +conds = [ x.strip() for x in (config["design"]["condition"]).split(",")] + + +#------------------------------------------------------- +# paired-end data gestion +read_tag = config["input_readtag"] +rt1 = read_tag.replace("[12]", "1") +rt2 = read_tag.replace("[12]", "2") + +R1 = [1 for this in filenames if rt1 in this] +R2 = [1 for this in filenames if rt2 in this] + +if len(R2) == 0: + paired = False +else: + if R1 == R2: + paired = True + else: + raise ValueError("Mix of paired and single-end data: please provide a homogenenous dataset") + + +# ----------------------------------------------- +#initialize global variables +__sample_dir = config["input_dir"] +__analysis_dir = config["analysis_dir"] + +__input_data = [__sample_dir + "/{{SAMPLE}}{}.fastq.gz".format(rt1)] +if paired: + __input_data += [__sample_dir + "/{{SAMPLE}}{}.fastq.gz".format(rt2)] +# global variable for all output files +expected_output = [] + + + +# ----------------------------------------------- +#begin of the rules + +## quality control +__fastqc__input_fastq = __input_data +__fastqc__output_done = "00-Fastqc/{SAMPLE}_R1_fastqc.fastq.gz" +__fastqc__wkdir = "00-Fastqc" +__fastqc__log = "00-Fastqc/logs/{SAMPLE}_R1_fastqc_raw.log" +expected_output.extend(expand(__fastqc__output_done, SAMPLE=samples)) + +include: os.path.join(RULES, "fastqc.rules") + + +## remove adapters +if config["adapters"]["remove"] : + + adapter_tool = config["adapters"]["tool_choice"] +## TODO add AlienTrimmer + if adapter_tool in ["cutadapt", "atropos"]: + adapter_tool = "adapters" + __adapters__input_fastq = __input_data + __adapters__wkdir = "01-Trimming" + __adapters__output = ["01-Trimming/{SAMPLE}_R1_trim.fastq.gz"] + if paired: + __adapters__output += ["01-Trimming/{SAMPLE}_R2_trim.fastq.gz"] + + # Set parameters + __adapters__adapt_list = config["adapters"]["adapter_list"] + __adapters__options = config["adapters"]["options"] + __adapters__mode = config["adapters"]["mode"] + __adapters_min = config["adapters"]["m"] + __adapters_qual = config["adapters"]["quality"] + __adapters__log = "01-Trimming/logs/{SAMPLE}_trim.txt" + expected_output.extend(expand(__adapters__output, SAMPLE=samples)) + include: os.path.join(RULES, "adapters.rules") + + else: + raise ValueError("Invalid choice of tool using for trimming in config file. Use either cutadapt or atropos or AlienTrimmer") +else: + __adapters__output = __input_data + + + + + +## Indexing genome +ref = [ config["genome"]["name"] ] + +if config["genome"]["host_mapping"]: + ref += [ config["genome"]["host_name"] ] + + + +def mapping_index(wildcards): + if (wildcards.REF == config["genome"]["name"]): + input = config["genome"]["fasta_file"] + elif (wildcards.REF == config["genome"]["host_name"]): + input = config["genome"]["host_fasta_file"] + return(input) + + +if config["genome"]["index"]: + # indexing for bowtie2 + __bowtie2_index__fasta = mapping_index + __bowtie2_index__output_done = os.path.join(config["genome"]["genome_directory"],"{REF}.1.bt2") + __bowtie2_index__output_prefix = os.path.join(config["genome"]["genome_directory"],"{REF}") + __bowtie2_index__log = "02-Mapping/logs/bowtie2_{REF}_indexing.log" + include: os.path.join(RULES, "bowtie2_index.rules") + +else: + __bowtie2_index__output_done = os.path.join(config["genome"]["genome_directory"],"{REF}.1.bt2") + __bowtie2_index__output_prefix = os.path.join(config["genome"]["genome_directory"],"{REF}") + + +# Mapping step + +__bowtie2_mapping__input = __adapters__output +__bowtie2_mapping__index_done = __bowtie2_index__output_done +__bowtie2_mapping__sort = "02-Mapping/{SAMPLE}_{REF}_sort.bam" +__bowtie2_mapping__bam = "02-Mapping/{SAMPLE}_{REF}.bam" +__bowtie2_mapping__sortprefix = "02-Mapping/{SAMPLE}_{REF}_sort" +__bowtie2_mapping__logs_err = "02-Mapping/logs/{SAMPLE}_{REF}_mapping.err" +__bowtie2_mapping__logs_out = "02-Mapping/logs/{SAMPLE}_{REF}_mapping.out" +__bowtie2_mapping__prefix_index = __bowtie2_index__output_prefix +__bowtie2_mapping__options = config["bowtie2_mapping"]["options"] +expected_output.extend(expand(__bowtie2_mapping__sort, SAMPLE=samples, REF=ref)) +include: os.path.join(RULES, "bowtie2_mapping.rules") + + +# then we performed peak calling only in files against reference genome and not spike genome ! +ref = config["genome"]["name"] + +# Genome coverage for plus and minus strand +strand = ["minus", "plus"] +__genomecov__input = "02-Mapping/{{SAMPLE}}_{}_sort.bam".format(ref) +__genomecov__logs = "03-Coverage/logs/{SAMPLE}_{STRAND}_coverage.out" +__genomecov__output = "03-Coverage/{SAMPLE}_{STRAND}_coverage.csv" +__genomecov__genome = config["genome"]["fasta_file"] +__genomecov__options = "" + +expected_output.extend(expand(__genomecov__output, SAMPLE=samples, STRAND=strand)) +include: os.path.join(RULES, "genomecov.rules") + + + +# RNAsig R script +__rnasig__target = "config/design_strand.txt" +__rnasig__output_dir = "." +__rnasig__covdir = "03-Coverage/" +__rnasig__gfffile = config['genome']['gff_file'] +__rnasig__report = "04-RNAsig/finalplot.pdf" +__rnasig__logs_out = "04-RNAsig/logs/rnasig.out" +__rnasig__logs_err = "04-RNAsig/logs/rnasig.err" + +expected_output.extend([__rnasig__report]) +include: os.path.join(RULES, "rnasig.rules") + +# Multiqc rule +config['multiqc']['options'] += " -c config/multiqc_config.yaml" +__multiqc__input = expected_output +__multiqc__input_dir = "." +__multiqc__logs = "04-Multiqc/multiqc.log" +__multiqc__output = "04-Multiqc" +__multiqc__options = config['multiqc']['options'] +__multiqc__output_dir = "04-Multiqc" + +expected_output = [__multiqc__output] +include: os.path.join(RULES, "multiqc.rules") + + +rule rnasig: + input: expected_output + + +onsuccess: + # copy metrics json in the corresponding multiQC output when you are in exploratory mode + import shutil