Snakefile
#########################################################################
# RNAflow: an automated pipeline to analyse transcriptomic data
#
# Authors: Rachel Legendre
# Copyright (c) 2021-2022 Institut Pasteur (Paris).
#
# This file is part of the RNAflow workflow.
#
# RNAflow is free software: you can redistribute it and/or modify
# it under the terms of the GNU General Public License as published by
# the Free Software Foundation, either version 3 of the License, or
# (at your option) any later version.
#
# RNAflow is distributed in the hope that it will be useful,
# but WITHOUT ANY WARRANTY; without even the implied warranty of
# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
# GNU General Public License for more details.
#
# You should have received a copy of the GNU General Public License
# along with RNAflow (LICENSE). If not, see <https://www.gnu.org/licenses/>.
#########################################################################
import pandas as pd
from fnmatch import fnmatch
from re import sub, match
from itertools import repeat, chain
import os
#-------------------------------------------------------
# read config files
configfile: "config/config.yaml"
RULES = os.path.join("workflow", "rules")
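# Optional sanity check (illustrative sketch, not part of the original workflow):
# fail early if a top-level config entry used later in this Snakefile is missing.
required_keys = ["input_dir", "input_readtag", "input_extension", "analysis_dir",
                 "design", "adapters", "genome", "bowtie2_mapping", "multiqc"]
missing = [key for key in required_keys if key not in config]
if missing:
    raise ValueError("Missing entries in config/config.yaml: {}".format(", ".join(missing)))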
#-------------------------------------------------------
# list all fastq files in 'input_dir' matching the read tag and extension,
# then derive sample names by stripping that suffix
filenames = [f for f in os.listdir(config["input_dir"]) if match(r'.*' + config["input_readtag"] + config["input_extension"], f)]
samples = [sub(config["input_readtag"] + config["input_extension"], '', file) for file in filenames]
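# Example of the naming convention assumed above (hypothetical values): with
# input_readtag "_R[12]" and input_extension ".fastq.gz", the file
# "WT_rep1_R1.fastq.gz" is kept and yields the sample name "WT_rep1".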
receptor = [ x.strip() for x in (config["design"]["receptor"]).split(",")]
conds = [ x.strip() for x in (config["design"]["condition"]).split(",")]
#-------------------------------------------------------
# paired-end data handling
read_tag = config["input_readtag"]
rt1 = read_tag.replace("[12]", "1")
rt2 = read_tag.replace("[12]", "2")
# count files carrying the R1 tag and the R2 tag
R1 = [1 for this in filenames if rt1 in this]
R2 = [1 for this in filenames if rt2 in this]
if len(R2) == 0:
    paired = False
else:
    if R1 == R2:
        paired = True
    else:
        raise ValueError("Mix of paired-end and single-end data: please provide a homogeneous dataset")
# -----------------------------------------------
# initialize global variables
__sample_dir = config["input_dir"]
__analysis_dir = config["analysis_dir"]
__input_data = [__sample_dir + "/{{SAMPLE}}{}.fastq.gz".format(rt1)]
if paired:
    __input_data += [__sample_dir + "/{{SAMPLE}}{}.fastq.gz".format(rt2)]
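# __input_data now holds one fastq path pattern per mate: always the R1 file,
# plus the R2 file when the dataset is paired-end.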
# global variable for all output files
expected_output = []
# -----------------------------------------------
# beginning of the rules
## quality control
__fastqc__input_fastq = __input_data
__fastqc__output_done = "00-Fastqc/{SAMPLE}_R1_fastqc.fastq.gz"
__fastqc__wkdir = "00-Fastqc"
__fastqc__log = "00-Fastqc/logs/{SAMPLE}_R1_fastqc_raw.log"
expected_output.extend(expand(__fastqc__output_done, SAMPLE=samples))
include: os.path.join(RULES, "fastqc.rules")
## remove adapters
if config["adapters"]["remove"] :
adapter_tool = config["adapters"]["tool_choice"]
## TODO add AlienTrimmer
if adapter_tool in ["cutadapt", "atropos"]:
adapter_tool = "adapters"
__adapters__input_fastq = __input_data
__adapters__wkdir = "01-Trimming"
__adapters__output = ["01-Trimming/{SAMPLE}_R1_trim.fastq.gz"]
if paired:
__adapters__output += ["01-Trimming/{SAMPLE}_R2_trim.fastq.gz"]
# Set parameters
__adapters__adapt_list = config["adapters"]["adapter_list"]
__adapters__options = config["adapters"]["options"]
__adapters__mode = config["adapters"]["mode"]
__adapters_min = config["adapters"]["m"]
__adapters_qual = config["adapters"]["quality"]
__adapters__log = "01-Trimming/logs/{SAMPLE}_trim.txt"
expected_output.extend(expand(__adapters__output, SAMPLE=samples))
include: os.path.join(RULES, "adapters.rules")
else:
raise ValueError("Invalid choice of tool using for trimming in config file. Use either cutadapt or atropos or AlienTrimmer")
else:
__adapters__output = __input_data
## Indexing genome
ref = [ config["genome"]["name"] ]
if config["genome"]["host_mapping"]:
ref += [ config["genome"]["host_name"] ]
def mapping_index(wildcards):
    if wildcards.REF == config["genome"]["name"]:
        fasta = config["genome"]["fasta_file"]
    elif wildcards.REF == config["genome"]["host_name"]:
        fasta = config["genome"]["host_fasta_file"]
    return fasta
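# mapping_index is passed below (as __bowtie2_index__fasta) so that each value of
# the {REF} wildcard resolves to its own fasta file: the target genome and, when
# host_mapping is enabled, the host genome are indexed and mapped separately.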
if config["genome"]["index"]:
# indexing for bowtie2
__bowtie2_index__fasta = mapping_index
__bowtie2_index__output_done = os.path.join(config["genome"]["genome_directory"],"{REF}.1.bt2")
__bowtie2_index__output_prefix = os.path.join(config["genome"]["genome_directory"],"{REF}")
__bowtie2_index__log = "02-Mapping/logs/bowtie2_{REF}_indexing.log"
include: os.path.join(RULES, "bowtie2_index.rules")
else:
__bowtie2_index__output_done = os.path.join(config["genome"]["genome_directory"],"{REF}.1.bt2")
__bowtie2_index__output_prefix = os.path.join(config["genome"]["genome_directory"],"{REF}")
# Mapping step
__bowtie2_mapping__input = __adapters__output
__bowtie2_mapping__index_done = __bowtie2_index__output_done
__bowtie2_mapping__sort = "02-Mapping/{SAMPLE}_{REF}_sort.bam"
__bowtie2_mapping__bam = "02-Mapping/{SAMPLE}_{REF}.bam"
__bowtie2_mapping__sortprefix = "02-Mapping/{SAMPLE}_{REF}_sort"
__bowtie2_mapping__logs_err = "02-Mapping/logs/{SAMPLE}_{REF}_mapping.err"
__bowtie2_mapping__logs_out = "02-Mapping/logs/{SAMPLE}_{REF}_mapping.out"
__bowtie2_mapping__prefix_index = __bowtie2_index__output_prefix
__bowtie2_mapping__options = config["bowtie2_mapping"]["options"]
expected_output.extend(expand(__bowtie2_mapping__sort, SAMPLE=samples, REF=ref))
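# Illustrative example (hypothetical names): with samples ["WT_rep1"] and
# ref ["mygenome", "myhost"], the expand() above requests
# "02-Mapping/WT_rep1_mygenome_sort.bam" and "02-Mapping/WT_rep1_myhost_sort.bam",
# i.e. one sorted BAM per sample and per reference.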
include: os.path.join(RULES, "bowtie2_mapping.rules")
# downstream steps (coverage and RNAsig) are run only on alignments against the
# reference genome, not against the host/spike-in genome
ref = config["genome"]["name"]
# Genome coverage for plus and minus strand
strand = ["minus", "plus"]
__genomecov__input = "02-Mapping/{{SAMPLE}}_{}_sort.bam".format(ref)
__genomecov__logs = "03-Coverage/logs/{SAMPLE}_{STRAND}_coverage.out"
__genomecov__output = "03-Coverage/{SAMPLE}_{STRAND}_coverage.csv"
__genomecov__genome = config["genome"]["fasta_file"]
__genomecov__options = ""
expected_output.extend(expand(__genomecov__output, SAMPLE=samples, STRAND=strand))
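# Illustrative note (hypothetical sample name): for each sample the expand() above
# requests two coverage tables, e.g. "03-Coverage/WT_rep1_minus_coverage.csv" and
# "03-Coverage/WT_rep1_plus_coverage.csv".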
include: os.path.join(RULES, "genomecov.rules")
# RNAsig R script
__rnasig__target = "config/design_strand.txt"
__rnasig__output_dir = "."
__rnasig__covdir = "03-Coverage/"
__rnasig__gfffile = config['genome']['gff_file']
__rnasig__report = "04-RNAsig/finalplot.pdf"
__rnasig__logs_out = "04-RNAsig/logs/rnasig.out"
__rnasig__logs_err = "04-RNAsig/logs/rnasig.err"
expected_output.extend([__rnasig__report])
include: os.path.join(RULES, "rnasig.rules")
# Multiqc rule
config['multiqc']['options'] += " -c config/multiqc_config.yaml"
__multiqc__input = expected_output
__multiqc__input_dir = "."
__multiqc__logs = "04-Multiqc/multiqc.log"
__multiqc__output = "04-Multiqc"
__multiqc__options = config['multiqc']['options']
__multiqc__output_dir = "04-Multiqc"
expected_output = [__multiqc__output]
include: os.path.join(RULES, "multiqc.rules")
rule rnasig:
    input: expected_output


onsuccess:
    # copy the metrics JSON into the corresponding multiQC output when running in exploratory mode
    import shutil