Skip to content
Snippets Groups Projects
Commit f6700ce5 authored by Blaise Li's avatar Blaise Li
Browse files

Started auto-methods in Ribo-seq pipeline.

The idea is to associate a methods file to certain rules in order to
make it easier to write methods when publishing results. Rules can copy
the methods files of upstream rules before appending their part.
parent 366ed973
No related branches found
No related tags found
No related merge requests found
......@@ -76,6 +76,7 @@ from gzip import open as gzopen
from re import sub
from pickle import load
from fileinput import input as finput
from shutil import copyfile
from sys import stderr
from subprocess import Popen, PIPE, CalledProcessError
# Useful data structures
......@@ -85,6 +86,7 @@ from collections import defaultdict, Counter
# Useful for functional style
from itertools import chain, combinations, product, repeat, starmap
from functools import reduce
from operator import attrgetter
from operator import or_ as union
from cytoolz import concat, merge_with, take_nth, valmap
......@@ -315,6 +317,7 @@ aligner = config["aligner"]
########################
genome_dict = config["genome_dict"]
genome = genome_dict["name"]
genome_version = genome_dict.get("version", "WBcel235")
chrom_sizes = get_chrom_sizes(genome_dict["size"])
chrom_sizes.update(valmap(int, genome_dict.get("extra_chromosomes", {})))
genomelen = sum(chrom_sizes.values())
......@@ -650,6 +653,7 @@ rule trim:
trimmed = OPJ(data_dir, "trimmed", "{lib}_{rep}_trimmed.fastq.gz"),
nb_raw = OPJ(data_dir, "trimmed", "{lib}_{rep}_nb_raw.txt"),
nb_trimmed = OPJ(data_dir, "trimmed", "{lib}_{rep}_nb_trimmed.txt"),
methods = OPJ(data_dir, "trimmed", "methods", "{lib}_{rep}_trimmed_methods.txt"),
#threads: 2
message:
"Trimming adaptor from raw data, removing random 5' {trim5}-mers and 3' {trim3}-mers for {wildcards.lib}_{wildcards.rep}."
......@@ -667,6 +671,9 @@ rule trim:
| trim_random_nt {params.trim5} {params.trim3} 2>> {log.cutadapt} \\
| gzip > {output.trimmed} \\
2> {log.trim}
v=$(cutadapt --version)
echo "The 3' adaptor ({params.adapter}) was trimmed from the raw reads using cutadapt (version ${{v}})" > {output.methods}
echo "The 5' and 3' 4 nt UMIs were removed from the trimmed reads using cutadapt (version ${{v}}) with options -u {params.trim5} and -u -{params.trim3}" >> {output.methods}
"""
......@@ -679,20 +686,26 @@ rule select_size_range:
"""Select (and count) reads in the correct size range."""
input:
# rules.trim_and_dedup.output.trimmed
rules.trim.output.trimmed
trimmed = rules.trim.output.trimmed,
rules.trim.output.methods
output:
selected = OPJ(data_dir, "trimmed", "{lib}_{rep}_%s.fastq.gz" % size_selected),
nb_selected = OPJ(data_dir, "trimmed", "{lib}_{rep}_nb_%s.txt" % size_selected),
methods = OPJ(data_dir, "trimmed", "methods", "{lib}_{rep}_%s_methods.txt" % size_selected),
params:
awk_filter = awk_size_filter,
message:
"Selecting reads size %s for {wildcards.lib}_{wildcards.rep}." % size_selected
shell:
"""
bioawk -c fastx '{params.awk_filter} {{print "@"$name" "$4"\\n"$seq"\\n+\\n"$qual}}' {input} \\
bioawk -c fastx '{params.awk_filter} {{print "@"$name" "$4"\\n"$seq"\\n+\\n"$qual}}' {input.trimmed} \\
| tee >(count_fastq_reads {output.nb_selected}) \\
| gzip > {output.selected}
"""
cat {input.methods} > {output.methods}
v=$(bioawk --version | bioawk '{{print $3}}')
gitv=$(grep -A 1 bioawk /usr/local/share/doc/program_versions.txt | tail -1)
echo "After removing UMIs, the reads from %s to %s nt were selected using bioawk version ${{v}} (git commit ${{gitv}})" >> {output.methods}
""" % (MIN_LEN, MAX_LEN)
@wc_applied
......@@ -718,19 +731,49 @@ def source_fastq(wildcards):
raise NotImplementedError("Unknown read type: %s" % read_type)
@wc_applied
def source_fastq_methods(wildcards):
"""Determine the fastq file corresponding to a given read type."""
read_type = wildcards.read_type
if read_type == "raw":
return []
# return rules.link_raw_data.output.methods
elif read_type == "trimmed":
return rules.trim.output.methods
elif read_type == size_selected:
return rules.select_size_range.output.methods
elif read_type == "nomap":
return rules.map_on_genome.output.methods
elif read_type == "RPF":
return []
# return rules.extract_RPF.output.methods
elif read_type.startswith("RPF29"):
return []
# AttributeError: 'Wildcards' object has no attribute 'rpf_subtype'
# return rules.extract_RPF_subtype.output.sub_rpf
# return OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s_fwd_on_rpf_biotypes.fastq.gz" % genome)
else:
raise NotImplementedError("Unknown read type: %s" % read_type)
rule map_on_genome:
input:
# fastq = rules.select_size_range.output.selected,
fastq = source_fastq,
methods = source_fastq_methods,
output:
sam = temp(OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}", "{read_type}_on_%s.sam" % genome)),
methods = OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}", "methods", "{read_type}_on_%s_methods.txt" % genome)
nomap_fastq = OPJ(aligner, f"not_mapped_{genome}", "{lib}_{rep}_{read_type}_unmapped_on_%s.fastq.gz" % genome),
params:
aligner = aligner,
index = index,
settings = alignment_settings[aligner],
# for methods
genome_name = f"{genome} genome ({genome_version})",
read_name = attrgetter("read_type"),
message:
"Mapping {wildcards.lib}_{wildcards.rep}_{wildcards.read_type} on C. elegans genome."
"Mapping {wildcards.lib}_{wildcards.rep}_{wildcards.read_type} on {params.genome_name}."
benchmark:
OPJ(log_dir, "map_on_genome", "{lib}_{rep}_{read_type}_benchmark.txt")
log:
......@@ -841,8 +884,12 @@ rule extract_RPF:
aligner, f"mapped_{genome}", "{lib}_{rep}",
f"{size_selected}_on_{genome}_sorted.bam"),
gtfs = [OPJ(annot_dir, f"{biotype}.gtf") for biotype in RPF_BIOTYPES],
methods = OPJ(
aligner, f"mapped_{genome}", "{lib}_{rep}", "methods",
f"{size_selected}_on_{genome}_methods.txt"),
output:
rpf = OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}", f"{size_selected}_on_{genome}_fwd_on_rpf_biotypes.fastq.gz")
methods = OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}", "methods", f"{size_selected}_on_{genome}_fwd_on_rpf_biotypes_methods.txt")
params:
tmp_filtered_bam = OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}", f"{size_selected}_on_{genome}_fwd_on_rpf_biotypes.bam"),
shell:
......@@ -853,6 +900,10 @@ rule extract_RPF:
samtools fastq {params.tmp_filtered_bam} \\
| gzip > {output.rpf}
rm -f {params.tmp_filtered_bam}
cp {input.methods} {output.methods}
sv=$(samtools --version | head -1 | awk '{{print $2}}')
b==$(bedtools --version | awk '{{print $2}}')
echo "Reads mapping on sense orientation on annotated protein coding genes were considered as Ribosome-protected fragments (RPF). Such reads were extracted from mapping results using samtools ${{sv}} and bedtools ${{bv}} and re-mapped on the genome" >> {output.methods}
"""
# rule extract_RPF:
# """Version using intermediate bam file (untested)"""
......@@ -874,7 +925,10 @@ rule extract_RPF:
# Use bam or RPF ? Maybe simpler to use RPF
rule extract_RPF_subtype:
input:
rpf = OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}", f"{size_selected}_on_{genome}_fwd_on_rpf_biotypes.fastq.gz")
#rpf = OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}", f"{size_selected}_on_{genome}_fwd_on_rpf_biotypes.fastq.gz")
#methods = OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}", "methods", f"{size_selected}_on_{genome}_fwd_on_rpf_biotypes_methods.txt")
rpf = rules.extract_RPF.rpf,
methods = rules.extract_RPF.methods,
# protein_bam = OPJ(
# aligner, f"mapped_{genome}", "{lib}_{rep}",
# f"{size_selected}_on_{genome}_fwd_on_protein_coding.bam"),
......@@ -883,6 +937,7 @@ rule extract_RPF_subtype:
# f"{size_selected}_on_{genome}_fwd_on_protein_coding.bam.bai"),
output:
sub_rpf = OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}", "{rpf_subtype}_on_%s_fwd_on_rpf_biotypes.fastq.gz" % genome)
methods = OPJ(aligner, f"mapped_{genome}", "{lib}_{rep}", "methods", "{rpf_subtype}_on_%s_fwd_on_rpf_biotypes_methods.txt" % genome)
log:
log = OPJ(log_dir, "extract_RPF_subtype","{lib}_{rep}", "{rpf_subtype}_on_%s_fwd_on_rpf_biotypes.log" % genome),
warnings = OPJ(log_dir, "extract_RPF_subtype","{lib}_{rep}", "{rpf_subtype}_on_%s_fwd_on_rpf_biotypes.warnings" % genome),
......@@ -912,6 +967,9 @@ rule extract_RPF_subtype:
nb_written += 1
# fastq_out.write(f"@{name}\n{sequence}\n+\n{qualities}\n")
logfile.write(f"{nb_written} reads written in {output.sub_rpf}\n")
copyfile(input.methods, output.methods)
with open(output.methods, "a") as methods:
methods.write(f"{filter_fun.__doc__}\n")
def feature_orientation2stranded(wildcards):
......
......@@ -37,7 +37,7 @@ git+https://bioinfo_utils:izoZHUvhFJU8689hqM5u@gitlab.pasteur.fr/bli/libcelegans
git+https://bioinfo_utils:uqiFbzWyBtzxYn7br82P@gitlab.pasteur.fr/bli/libdeseq.git@196ee3d15a125fad0d821256cc69f96c9129bf61#egg=libdeseq==0.2
git+https://bioinfo_utils:DP-sqNot8AdsMek_KJiC@gitlab.pasteur.fr/bli/libhts.git@3e835227ade102141c51449ddb7e71cc9459e96e#egg=libhts==0.3
git+https://bioinfo_utils:aCyAV57zB-zzvux9Zhox@gitlab.pasteur.fr/bli/libreads.git@aed0b24fc888752a5f75832ad33ccc7c9251a4c6#egg=libreads==0.2
git+https://bioinfo_utils:putYXvKZD8_vFkYkhgTw@gitlab.pasteur.fr/bli/libriboseq.git@95a0837ea703054a98f2f3b098818000828c6f8f#egg=libriboseq==0.1
git+https://bioinfo_utils:putYXvKZD8_vFkYkhgTw@gitlab.pasteur.fr/bli/libriboseq.git@2d4a9a6578b578ef492265cfaf3168fb6cdf67c8#egg=libriboseq==0.1
git+https://bioinfo_utils:FjgTxcUGPsGVQ-yZZVyS@gitlab.pasteur.fr/bli/libsmallrna.git@b44e5256fd5d42a63c8f0a6765e0d6af829d753d#egg=libsmallrna==0.2
git+https://bioinfo_utils:tfuTQsSZWMtC5xXJNFJh@gitlab.pasteur.fr/bli/libworkflows.git@1b3bec78cbc5cc15f307b3d761c534d35a1faf00#egg=libworkflows==0.3
mappy==2.17
......@@ -84,7 +84,7 @@ simplegeneric==0.8.1
six==1.14.0
smmap2==2.0.5
snakemake==5.19.3
git+https://bioinfo_utils:Ues9rP1tyfzs1zs5BzpY@gitlab.pasteur.fr/bli/snakemake_wrappers.git@ab4bdff451297b847369265e8a95b09de14e224b#egg=snakemake-wrappers==0.4
git+https://bioinfo_utils:Ues9rP1tyfzs1zs5BzpY@gitlab.pasteur.fr/bli/snakemake_wrappers.git@1ee7e494f42870122c79942b3a4c50ebf1dbea75#egg=snakemake-wrappers==0.4
snowballstemmer==2.0.0
soupsieve==1.9.5
Sphinx==2.3.1
......
snakemake_wrappers @ 1ee7e494
Subproject commit ab4bdff451297b847369265e8a95b09de14e224b
Subproject commit 1ee7e494f42870122c79942b3a4c50ebf1dbea75
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment