Snakefile
#########################################################################
# RNAflow: an automated pipeline to analyse transcriptomic data
#
# Authors: Rachel Legendre
# Copyright (c) 2021-2022 Institut Pasteur (Paris).
#
# This file is part of the RNAflow workflow.
#
# RNAflow is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# RNAflow is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with RNAflow (LICENSE). If not, see <https://www.gnu.org/licenses/>.
#########################################################################
import pandas as pd
from fnmatch import fnmatch
from re import sub, match
from itertools import repeat, chain
import os
#-------------------------------------------------------
# read config files
configfile: "config/config.yaml"
RULES = os.path.join("workflow", "rules")
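# Optional sanity check (illustrative sketch, not part of the original workflow):
# fail early if a top-level config entry used later in this Snakefile is missing.
required_keys = ["input_dir", "input_readtag", "input_extension", "analysis_dir",
                 "design", "adapters", "genome", "bowtie2_mapping", "multiqc"]
missing = [key for key in required_keys if key not in config]
if missing:
    raise ValueError("Missing entries in config/config.yaml: {}".format(", ".join(missing)))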
#-------------------------------------------------------
# list all fastq files in 'input_dir' matching the read tag and extension,
# then derive sample names by stripping that suffix
filenames = [f for f in os.listdir(config["input_dir"]) if match(r'.*' + config["input_readtag"] + config["input_extension"], f)]
samples = [sub(config["input_readtag"] + config["input_extension"], '', file) for file in filenames]
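# Example of the naming convention assumed above (hypothetical values): with
# input_readtag "_R[12]" and input_extension ".fastq.gz", the file
# "WT_rep1_R1.fastq.gz" is kept and yields the sample name "WT_rep1".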
receptor = [ x.strip() for x in (config["design"]["receptor"]).split(",")]
conds = [ x.strip() for x in (config["design"]["condition"]).split(",")]
#-------------------------------------------------------
# paired-end data handling
read_tag = config["input_readtag"]
rt1 = read_tag.replace("[12]", "1")
rt2 = read_tag.replace("[12]", "2")
# count files carrying the R1 tag and the R2 tag
R1 = [1 for this in filenames if rt1 in this]
R2 = [1 for this in filenames if rt2 in this]
if len(R2) == 0:
    paired = False
else:
    if R1 == R2:
        paired = True
    else:
        raise ValueError("Mix of paired-end and single-end data: please provide a homogeneous dataset")
# -----------------------------------------------
# initialize global variables
__sample_dir = config["input_dir"]
__analysis_dir = config["analysis_dir"]
__input_data = [__sample_dir + "/{{SAMPLE}}{}.fastq.gz".format(rt1)]
if paired:
    __input_data += [__sample_dir + "/{{SAMPLE}}{}.fastq.gz".format(rt2)]
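# __input_data now holds one fastq path pattern per mate: always the R1 file,
# plus the R2 file when the dataset is paired-end.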
# global variable for all output files
expected_output = []
# -----------------------------------------------
# beginning of the rules
## quality control
__fastqc__input_fastq = __input_data
__fastqc__output_done = "00-Fastqc/{SAMPLE}_R1_fastqc.fastq.gz"
__fastqc__wkdir = "00-Fastqc"
__fastqc__log = "00-Fastqc/logs/{SAMPLE}_R1_fastqc_raw.log"
expected_output.extend(expand(__fastqc__output_done, SAMPLE=samples))
include: os.path.join(RULES, "fastqc.rules")
## remove adapters
if config["adapters"]["remove"] :
adapter_tool = config["adapters"]["tool_choice"]
## TODO add AlienTrimmer
if adapter_tool in ["cutadapt", "atropos"]:
adapter_tool = "adapters"
__adapters__input_fastq = __input_data
__adapters__wkdir = "01-Trimming"
__adapters__output = ["01-Trimming/{SAMPLE}_R1_trim.fastq.gz"]
if paired:
__adapters__output += ["01-Trimming/{SAMPLE}_R2_trim.fastq.gz"]
# Set parameters
__adapters__adapt_list = config["adapters"]["adapter_list"]
__adapters__options = config["adapters"]["options"]
__adapters__mode = config["adapters"]["mode"]
__adapters_min = config["adapters"]["m"]
__adapters_qual = config["adapters"]["quality"]
__adapters__log = "01-Trimming/logs/{SAMPLE}_trim.txt"
expected_output.extend(expand(__adapters__output, SAMPLE=samples))
include: os.path.join(RULES, "adapters.rules")
else:
raise ValueError("Invalid choice of tool using for trimming in config file. Use either cutadapt or atropos or AlienTrimmer")
else:
__adapters__output = __input_data
## Indexing genome
ref = [ config["genome"]["name"] ]
if config["genome"]["host_mapping"]:
ref += [ config["genome"]["host_name"] ]
def mapping_index(wildcards):
    if wildcards.REF == config["genome"]["name"]:
        fasta = config["genome"]["fasta_file"]
    elif wildcards.REF == config["genome"]["host_name"]:
        fasta = config["genome"]["host_fasta_file"]
    return fasta
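# mapping_index is passed below (as __bowtie2_index__fasta) so that each value of
# the {REF} wildcard resolves to its own fasta file: the target genome and, when
# host_mapping is enabled, the host genome are indexed and mapped separately.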
if config["genome"]["index"]:
# indexing for bowtie2
__bowtie2_index__fasta = mapping_index
__bowtie2_index__output_done = os.path.join(config["genome"]["genome_directory"],"{REF}.1.bt2")
__bowtie2_index__output_prefix = os.path.join(config["genome"]["genome_directory"],"{REF}")
__bowtie2_index__log = "02-Mapping/logs/bowtie2_{REF}_indexing.log"
include: os.path.join(RULES, "bowtie2_index.rules")
else:
__bowtie2_index__output_done = os.path.join(config["genome"]["genome_directory"],"{REF}.1.bt2")
__bowtie2_index__output_prefix = os.path.join(config["genome"]["genome_directory"],"{REF}")
# Mapping step
__bowtie2_mapping__input = __adapters__output
__bowtie2_mapping__index_done = __bowtie2_index__output_done
__bowtie2_mapping__sort = "02-Mapping/{SAMPLE}_{REF}_sort.bam"
__bowtie2_mapping__bam = "02-Mapping/{SAMPLE}_{REF}.bam"
__bowtie2_mapping__sortprefix = "02-Mapping/{SAMPLE}_{REF}_sort"
__bowtie2_mapping__logs_err = "02-Mapping/logs/{SAMPLE}_{REF}_mapping.err"
__bowtie2_mapping__logs_out = "02-Mapping/logs/{SAMPLE}_{REF}_mapping.out"
__bowtie2_mapping__prefix_index = __bowtie2_index__output_prefix
__bowtie2_mapping__options = config["bowtie2_mapping"]["options"]
expected_output.extend(expand(__bowtie2_mapping__sort, SAMPLE=samples, REF=ref))
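# Illustrative example (hypothetical names): with samples ["WT_rep1"] and
# ref ["mygenome", "myhost"], the expand() above requests
# "02-Mapping/WT_rep1_mygenome_sort.bam" and "02-Mapping/WT_rep1_myhost_sort.bam",
# i.e. one sorted BAM per sample and per reference.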
include: os.path.join(RULES, "bowtie2_mapping.rules")
# downstream steps (coverage and RNAsig) are run only on alignments against the
# reference genome, not against the host/spike-in genome
ref = config["genome"]["name"]
# Genome coverage for plus and minus strand
strand = ["minus", "plus"]
__genomecov__input = "02-Mapping/{{SAMPLE}}_{}_sort.bam".format(ref)
__genomecov__logs = "03-Coverage/logs/{SAMPLE}_{STRAND}_coverage.out"
__genomecov__output = "03-Coverage/{SAMPLE}_{STRAND}_coverage.csv"
__genomecov__genome = config["genome"]["fasta_file"]
__genomecov__options = ""
expected_output.extend(expand(__genomecov__output, SAMPLE=samples, STRAND=strand))
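# Illustrative note (hypothetical sample name): for each sample the expand() above
# requests two coverage tables, e.g. "03-Coverage/WT_rep1_minus_coverage.csv" and
# "03-Coverage/WT_rep1_plus_coverage.csv".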
include: os.path.join(RULES, "genomecov.rules")
# RNAsig R script
__rnasig__target = "config/design_strand.txt"
__rnasig__output_dir = "."
__rnasig__covdir = "03-Coverage/"
__rnasig__gfffile = config['genome']['gff_file']
__rnasig__report = "04-RNAsig/finalplot.pdf"
__rnasig__logs_out = "04-RNAsig/logs/rnasig.out"
__rnasig__logs_err = "04-RNAsig/logs/rnasig.err"
expected_output.extend([__rnasig__report])
include: os.path.join(RULES, "rnasig.rules")
# Multiqc rule
config['multiqc']['options'] += " -c config/multiqc_config.yaml"
__multiqc__input = expected_output
__multiqc__input_dir = "."
__multiqc__logs = "04-Multiqc/multiqc.log"
__multiqc__output = "04-Multiqc"
__multiqc__options = config['multiqc']['options']
__multiqc__output_dir = "04-Multiqc"
expected_output = [__multiqc__output]
include: os.path.join(RULES, "multiqc.rules")
rule rnasig:
    input: expected_output


onsuccess:
    # copy the metrics JSON into the corresponding multiQC output when running in exploratory mode
    import shutil