Commit dbf90e52 authored by Blaise Li's avatar Blaise Li
Browse files

Homogenization between workflows.

Factorized linking to raw data.

It turns out that using `include:` can fail when there are symlinks in
the path to the snakefile.
parent 14c5b15d
......@@ -257,30 +257,22 @@ rule all:
expand(OPJ(output_dir, "{trimmer}", "figures", aligner, "{lib}_by_{norm_type}_mean", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d_meta_profile.{fig_format}" % (MIN_DIST, META_MIN_LEN)), trimmer=TRIMMERS, lib=LIBS, norm_type=NORM_TYPES, orientation=["all"], biotype=METAGENE_BIOTYPES, fig_format=FIG_FORMATS),
def lib2data(wildcards):
    """Build the shell command that links the raw data of one replicate.

    The path of the raw file is looked up in the global ``lib2raw`` mapping,
    first by the ``lib`` wildcard, then by the ``rep`` wildcard.

    The link name in the returned command is relative
    (``<lib>_<rep>.fastq.gz``), so the command has to be executed from the
    directory in which the link should be created.

    Raises ``KeyError`` if the library or the replicate is not in ``lib2raw``.
    """
    raw = lib2raw[wildcards.lib][wildcards.rep]
    # -f: replace a link that already exists under that name.
    return f"ln -sf {raw} {wildcards.lib}_{wildcards.rep}.fastq.gz"
# Produces OPJ(data_dir, "{lib}_{rep}.fastq.gz") by running, from inside
# data_dir, the command string returned by lib2data for the job's wildcards.
rule link_raw_data:
"""This rule installs the raw data in a local directory using symlinks.
The location of the original files is taken from the configuration."""
output:
OPJ(data_dir, "{lib}_{rep}.fastq.gz"),
params:
# Directory the shell block changes into before running the command.
directory = data_dir,
# lib2data is called with the wildcards and returns the command to run.
shell_command = lib2data,
message:
"Making link to raw data {output}."
# The parentheses run the cd and the command in a subshell.
shell:
"""
(
cd {params.directory}
{params.shell_command}
)
"""
# Pull in the snakefile holding the factorized raw-data linking code.
# NOTE(review): this is a relative path; confirm that it still resolves when
# the path to this Snakefile contains symlinks.
include: "../snakemake_wrappers/includes/link_raw_data.snakefile"
# def lib2data(wildcards):
# return lib2raw[wildcards.lib][wildcards.rep]
#
#
# rule link_raw_data:
# """This rule installs the raw data in a local directory using symlinks.
# The location of the original files is taken from the configuration."""
# input:
# raw = lib2data,
# output:
# link = OPJ(data_dir, "{lib}_{rep}.fastq.gz"),
# message:
# "Making link {output.link} to raw data {input.raw}."
# run:
# os.symlink(os.path.abspath(input.raw), output.link)
rule trim_and_dedup:
......@@ -378,7 +370,7 @@ rule map_on_genome:
output:
# sam files take a lot of space
sam = temp(OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_{type}_on_C_elegans.sam")),
nomap = OPJ(output_dir, "{trimmer}", aligner, "not_mapped_C_elegans", "{lib}_{rep}_{type}_unmapped_on_C_elegans.fastq.gz"),
nomap_fastq = OPJ(output_dir, "{trimmer}", aligner, "not_mapped_C_elegans", "{lib}_{rep}_{type}_unmapped_on_C_elegans.fastq.gz"),
params:
index = index,
settings = "",
......@@ -392,7 +384,7 @@ rule map_on_genome:
# genome_dir="${{HOME}}/Genomes"
# genome="C_elegans"
# bowtie2_genome_db="${{genome_dir}}/${{genome}}/Caenorhabditis_elegans/Ensembl/WBcel235/Sequence/Bowtie2Index/genome"
# cmd="bowtie2 --seed 123 -t --mm -x ${{bowtie2_genome_db}} -U {input.fastq} --no-unal --un-gz {output.nomap} -S {output.sam}"
# cmd="bowtie2 --seed 123 -t --mm -x ${{bowtie2_genome_db}} -U {input.fastq} --no-unal --un-gz {output.nomap_fastq} -S {output.sam}"
# echo ${{cmd}} 1> {log.log} 2> {log.err}
# eval ${{cmd}} 1>> {log.log} 2>> {log.err}
# """
......
"""
Snakefile to analyse RNA-seq data.
"""
import sys
# Abort early, with an explicit message, on interpreters older than 3.6.
major, minor = sys.version_info[:2]
if (major, minor) < (3, 6):
    sys.exit("Need at least python 3.6\n")
#TODO: for each contrast, scatterplots with highlight of differentially expressed genes (with p-value colour code)
# and same scale for different biotypes
......@@ -180,28 +184,25 @@ rule all:
counts_files,
def lib2data(wildcards):
    """Build the shell command that links the raw data of one replicate.

    The path of the raw file is looked up in the global ``lib2raw`` mapping,
    first by the ``lib`` wildcard, then by the ``rep`` wildcard.

    The link name in the returned command is relative
    (``<lib>_<rep>.fastq.gz``), so the command has to be executed from the
    directory in which the link should be created.

    Raises ``KeyError`` if the library or the replicate is not in ``lib2raw``.
    """
    raw = lib2raw[wildcards.lib][wildcards.rep]
    return f"ln -s {raw} {wildcards.lib}_{wildcards.rep}.fastq.gz"
# Produces OPJ(data_dir, "{lib}_{rep}.fastq.gz") by running, from inside
# data_dir, the command string returned by lib2data for the job's wildcards.
rule link_raw_data:
output:
OPJ(data_dir, "{lib}_{rep}.fastq.gz"),
params:
# Directory the shell block changes into before running the command.
directory = data_dir,
# lib2data is called with the wildcards and returns the command to run.
shell_command = lib2data,
message:
"Making link to raw data {output}."
# The parentheses run the cd and the command in a subshell.
shell:
"""
(
cd {params.directory}
{params.shell_command}
)
"""
# Pull in the snakefile holding the factorized raw-data linking code.
# NOTE(review): this is a relative path; confirm that it still resolves when
# the path to this Snakefile contains symlinks.
include: "../snakemake_wrappers/includes/link_raw_data.snakefile"
# def lib2data(wildcards):
# return lib2raw[wildcards.lib][wildcards.rep]
#
#
# rule link_raw_data:
# """This rule installs the raw data in a local directory using symlinks.
# The location of the original files is taken from the configuration."""
# input:
# raw = lib2data,
# output:
# link = OPJ(data_dir, "{lib}_{rep}.fastq.gz"),
# params:
# directory = data_dir,
# shell_command = lib2data,
# message:
# "Making link {output.link} to raw data {input.raw}."
# run:
# os.symlink(os.path.abspath(input.raw), output.link)
def mapping_command(aligner):
......
......@@ -3,6 +3,10 @@ Snakefile to analyse small_RNA-seq data.
TODO: Some figures and summaries may be overridden when changing the mapper. The mapper name should be added to their path.
"""
import sys
# Abort early, with an explicit message, on interpreters older than 3.6.
major, minor = sys.version_info[:2]
if (major, minor) < (3, 6):
    sys.exit("Need at least python 3.6\n")
# TODO: meta-profiles with IP and input on the same meta profile for Ortiz_oogenic and Ortiz_spermatogenic
# Make external script to generate meta-profiles on a custom list of libraries, and a given list of genes.
......@@ -265,6 +269,8 @@ convert_dir = config["convert_dir"]
output_dir = config["output_dir"]
log_dir = config["log_dir"]
data_dir = config["data_dir"]
#log_dir = OPJ(output_dir, "logs")
#data_dir = OPJ(output_dir, "data")
# To put the results of small_RNA_seq_annotate
reads_dir = OPJ(output_dir, aligner, "mapped_C_elegans", "reads")
counts_dir = OPJ(output_dir, aligner, "mapped_C_elegans", "annotation")
......@@ -574,29 +580,25 @@ rule all:
OPJ(output_dir, "figures", "{small_type}_norm_counts_distrib.{fig_format}"),
small_type=["mi", "prot_si", "te_si", "pseu_si", "satel_si", "simrep_si", "prot_siu", "te_siu", "pseu_siu", "pisimi"], fig_format=FIG_FORMATS),
def lib2data(wildcards):
    """Return the path of the raw data file of one library replicate.

    The path is looked up in the global ``lib2raw`` mapping, first by the
    ``lib`` wildcard, then by the ``rep`` wildcard.

    Raises ``KeyError`` if the library or the replicate is not in ``lib2raw``.
    """
    return lib2raw[wildcards.lib][wildcards.rep]
# Produces OPJ(data_dir, "{lib}_{rep}.fastq.gz") as a symlink to the file
# returned by lib2data for the job's wildcards.
rule link_raw_data:
"""This rule installs the raw data in a local directory using symlinks.
The location of the original files is taken from the configuration."""
input:
# lib2data is called with the wildcards and returns the raw file path.
raw = lib2data,
output:
link = OPJ(data_dir, "{lib}_{rep}.fastq.gz"),
# Leftover from the shell-based version of this rule:
#params:
# directory = data_dir,
# shell_command = lib2data,
message:
"Making link {output.link} to raw data {input.raw}."
run:
# The target is made absolute so that the link does not depend on the
# directory it is created in.
os.symlink(os.path.abspath(input.raw), output.link)
# Check that the shared snakefile can be found from the workflow base
# directory before including it, so that a wrong path is reported together
# with the location that was tried instead of as a bare AssertionError.
relative_include_path = "../snakemake_wrappers/includes/link_raw_data.snakefile"
absolute_include_path = os.path.join(workflow.basedir, relative_include_path)
if not os.path.exists(absolute_include_path):
    raise FileNotFoundError(
        f"Cannot find included snakefile: {absolute_include_path}")
include: relative_include_path
# def lib2data(wildcards):
# return lib2raw[wildcards.lib][wildcards.rep]
#
#
# rule link_raw_data:
# """This rule installs the raw data in a local directory using symlinks.
# The location of the original files is taken from the configuration."""
# input:
# raw = lib2data,
# output:
# link = OPJ(data_dir, "{lib}_{rep}.fastq.gz"),
# message:
# "Making link {output.link} to raw data {input.raw}."
# run:
# os.symlink(os.path.abspath(input.raw), output.link)
rule trim_and_dedup:
......@@ -668,7 +670,7 @@ def source_fastq(wildcards):
elif read_type == size_selected:
return rules.select_size_range.output.selected
elif read_type == "nomap":
return rules.map_on_genome.output.nomap
return rules.map_on_genome.output.nomap_fastq
elif read_type == "nomap_siRNA":
return rules.extract_nomap_siRNAs.output.nomap_si
elif read_type == "prot_siRNA":
......@@ -712,7 +714,7 @@ rule map_on_genome:
fastq = rules.select_size_range.output.selected,
output:
sam = temp(OPJ(output_dir, aligner, "mapped_C_elegans", "{lib}_{rep}", "%s_on_C_elegans.sam" % size_selected)),
nomap = OPJ(output_dir, aligner, "not_mapped_C_elegans", "{lib}_{rep}_%s_unmapped_on_C_elegans.fastq.gz" % size_selected),
nomap_fastq = OPJ(output_dir, aligner, "not_mapped_C_elegans", "{lib}_{rep}_%s_unmapped_on_C_elegans.fastq.gz" % size_selected),
params:
index = index,
settings = "-L 6 -i S,1,0.8 -N 0",
......@@ -731,7 +733,7 @@ rule extract_nomap_siRNAs:
"""Extract from the non-mappers those that end in poly-T, and could be mappable after T-tail trimming."""
input:
OPJ(output_dir, aligner, "not_mapped_C_elegans", "{lib}_{rep}_%s_unmapped_on_C_elegans.fastq.gz" % size_selected),
#rules.map_on_genome.output.nomap,
#rules.map_on_genome.output.nomap_fastq,
output:
nomap_si = OPJ(output_dir, aligner, "not_mapped_C_elegans", "{lib}_{rep}_%s_unmapped_siRNA.fastq.gz" % size_selected),
nb_nomap_si = OPJ(output_dir, aligner, "not_mapped_C_elegans", "{lib}_{rep}_%s_unmapped_siRNA.txt" % size_selected),
......@@ -756,7 +758,7 @@ rule remap_on_genome:
fastq = source_fastq,
output:
sam = temp(OPJ(output_dir, aligner, "mapped_C_elegans", "{lib}_{rep}", "{read_type}_on_C_elegans.sam")),
nomap = OPJ(output_dir, aligner, "not_mapped_C_elegans", "{lib}_{rep}_{read_type}_unmapped_on_C_elegans.fastq.gz"),
nomap_fastq = OPJ(output_dir, aligner, "not_mapped_C_elegans", "{lib}_{rep}_{read_type}_unmapped_on_C_elegans.fastq.gz"),
# Here we don't map size_selected
wildcard_constraints:
read_type = "|".join([
......
import os
# Not necessary, because variables defined at include time seem to be
# available here (same for data_dir).
#lib2raw = config["lib2raw"]
def lib2data(wildcards):
    """Return the path of the raw data file of one library replicate.

    The path is looked up in ``lib2raw``, first by the ``lib`` wildcard, then
    by the ``rep`` wildcard. ``lib2raw`` is not defined in this file: it is
    expected to be provided by the snakefile that includes this one.

    Raises ``KeyError`` if the library or the replicate is not in ``lib2raw``.
    """
    return lib2raw[wildcards.lib][wildcards.rep]
# Produces os.path.join(data_dir, "{lib}_{rep}.fastq.gz") as a symlink to the
# file returned by lib2data for the job's wildcards.
rule link_raw_data:
"""This rule installs the raw data in a local directory using symlinks.
The location of the original files is taken from the configuration."""
input:
# lib2data is called with the wildcards and returns the raw file path.
raw = lib2data,
output:
link = os.path.join(data_dir, "{lib}_{rep}.fastq.gz"),
message:
"Making link {output.link} to raw data {input.raw}."
run:
# The target is made absolute so that the link does not depend on the
# directory it is created in.
os.symlink(os.path.abspath(input.raw), output.link)
......@@ -3,7 +3,7 @@ from snakemake.shell import shell
cmd = """
genome_dir="${{HOME}}/Genomes"
genome="C_elegans"
cmd="bowtie2 --seed 123 -t {snakemake.params.settings} --mm -x {snakemake.params.index} -U {snakemake.input.fastq} --no-unal --un-gz {snakemake.output.nomap} -S {snakemake.output.sam}"
cmd="bowtie2 --seed 123 -t {snakemake.params.settings} --mm -x {snakemake.params.index} -U {snakemake.input.fastq} --no-unal --un-gz {snakemake.output.nomap_fastq} -S {snakemake.output.sam}"
echo ${{cmd}} > {snakemake.log.log}
eval ${{cmd}} 1>> {snakemake.log.log} 2> {snakemake.log.err}
"""
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment