Commit d6404f28 authored by Blaise Li's avatar Blaise Li
Browse files

Preliminary degradome-seq pipeline.

The pipeline only maps and counts aligned reads in biotypes.
parent 4af07397
"""
Snakefile to analyse Degradome-seq data.
"""
import sys
major, minor = sys.version_info[:2]
if major < 3 or (major == 3 and minor < 6):
sys.exit("Need at least python 3.6\n")
import os
OPJ = os.path.join
from glob import glob
import pandas as pd
from smincludes import rules as irules
from libworkflows import (
cleanup_and_backup,
ensure_relative,
feature_orientation2stranded,
get_chrom_sizes,
read_feature_counts,
sum_feature_counts)
# http://sailfish.readthedocs.io/en/master/library_type.html
LIB_TYPE = config["lib_type"]
# key: library name
# value: dictionary
# key: replicate number
# value: path to raw data
lib2raw = config["lib2raw"]
LIBS = list(lib2raw.keys())
REPS = config["replicates"]
CONDITIONS = [{
"lib" : lib,
"rep" : rep} for rep in REPS for lib in LIBS]
# We use this for various things in order to have always the same library order:
COND_NAMES = ["_".join((
cond["lib"],
cond["rep"])) for cond in CONDITIONS]
# COND_COLUMNS = pd.DataFrame(CONDITIONS).assign(
# cond_name=pd.Series(COND_NAMES).values).set_index("cond_name")
BIOTYPES = config["biotypes"]
ORIENTATIONS = ["all"]
aligner = config["aligner"]
alignment_settings = {
"bowtie2": "",
"hisat2": "",
"crac": "-k 20 --stranded --use-x-in-cigar"}
########################
# Genome configuration #
########################
genome_dict = config["genome_dict"]
genome = genome_dict["name"]
chrom_sizes = get_chrom_sizes(genome_dict["size"])
genomelen = sum(chrom_sizes.values())
genome_db = genome_dict["db"][aligner]
# bed file binning the genome in 10nt bins
genome_binned = genome_dict["binned"]
annot_dir = genome_dict["annot_dir"]
# TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]
convert_dir = genome_dict["convert_dir"]
gene_lists_dir = genome_dict["gene_lists_dir"]
avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt")))
output_dir = config["output_dir"]
log_dir = OPJ(output_dir, f"logs_{genome}")
data_dir = OPJ(output_dir, "data")
# Limit risks of ambiguity by imposing replicates to be numbers
# and restricting possible forms of some other wildcards
wildcard_constraints:
lib="|".join(LIBS),
rep="\d+",
rule all:
input:
#expand(
# OPJ(data_dir, "trimmed", "{lib}_{rep}.fastq.gz"),
# lib=LIBS, rep=REPS),
expand(
OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s_sorted.bam" % genome),
lib=LIBS, rep=REPS),
expand(
OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count",
f"all_on_{genome}", "{biotype}_{orientation}_counts.txt"),
biotype=BIOTYPES, orientation=ORIENTATIONS),
expand(
OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "summaries",
"all_on_%s_{orientation}_counts.txt" % genome),
orientation=ORIENTATIONS),
include: ensure_relative(irules["link_raw_data"], workflow.basedir)
rule trim_reads:
input:
raw = rules.link_raw_data.output.link,
output:
trimmed = OPJ(data_dir, "trimmed", "{lib}_{rep}.fastq.gz"),
params:
trim3 = 4,
shell:
"""
seqtk trimfq -b {params.trim3} {input.raw} | gzip > {output.trimmed}
"""
rule map_on_genome:
input:
fastq = rules.trim_reads.output.trimmed,
output:
# temp because it uses a lot of space
sam = temp(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s.sam" % genome)),
nomap_fastq = OPJ(output_dir, aligner, f"not_mapped_{genome}", "{lib}_{rep}_unmapped_on_%s.fastq.gz" % genome),
params:
aligner = aligner,
index = genome_db,
settings = alignment_settings[aligner],
message:
"Mapping {wildcards.lib}_{wildcards.rep} on %s." % genome
log:
log = OPJ(log_dir, "map_on_genome_{lib}_{rep}.log"),
err = OPJ(log_dir, "map_on_genome_{lib}_{rep}.err")
resources:
io=5
threads:
4
wrapper:
"file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/map_on_genome"
rule sam2indexedbam:
input:
sam = rules.map_on_genome.output.sam,
output:
sorted_bam = protected(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s_sorted.bam" % genome)),
index = protected(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s_sorted.bam.bai" % genome)),
message:
"Sorting and indexing sam file for {wildcards.lib}_{wildcards.rep}."
log:
log = OPJ(log_dir, "sam2indexedbam_{lib}_{rep}.log"),
err = OPJ(log_dir, "sam2indexedbam_{lib}_{rep}.err"),
threads:
8
wrapper:
"file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/sam2indexedbam"
def biotype2annot(wildcards):
if wildcards.biotype.endswith("_rmsk_families"):
biotype = wildcards.biotype[:-9]
else:
biotype = wildcards.biotype
return OPJ(annot_dir, f"{biotype}.gtf")
rule feature_count_reads:
input:
sorted_bam = rules.sam2indexedbam.output.sorted_bam,
index = rules.sam2indexedbam.output.index,
output:
counts = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "{lib}_{rep}_on_%s" % genome, "{lib}_{rep}_{biotype}_{orientation}_counts.txt"),
counts_converted = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "{lib}_{rep}_on_%s" % genome, "{lib}_{rep}_{biotype}_{orientation}_counts_gene_names.txt"),
params:
stranded = feature_orientation2stranded(LIB_TYPE),
annot = biotype2annot,
# pickled dictionary that associates gene ids to gene names
converter = genome_dict["converter"]
message:
"Counting {wildcards.orientation} {wildcards.biotype} reads for {wildcards.lib}_{wildcards.rep} with featureCounts."
benchmark:
OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{biotype}_{orientation}_benchmark.txt")
log:
log = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{biotype}_{orientation}.log"),
err = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{biotype}_{orientation}.err"),
shell:
"""
tmpdir=$(mktemp -dt "feature_{wildcards.lib}_{wildcards.rep}_{wildcards.biotype}_{wildcards.orientation}.XXXXXXXXXX")
cmd="featureCounts -a {params.annot} -o {output.counts} -t transcript -g "gene_id" -O -M --primary -s {params.stranded} --fracOverlap 0 --tmpDir ${{tmpdir}} {input.sorted_bam}"
featureCounts -v 2> {log.log}
echo ${{cmd}} 1>> {log.log}
eval ${{cmd}} 1>> {log.log} 2> {log.err} || error_exit "featureCounts failed"
rm -rf ${{tmpdir}}
cat {output.counts} | id2name.py {params.converter} > {output.counts_converted}
"""
rule summarize_counts:
"""For a given library, write a summary of the read counts for the various biotypes."""
input:
biotype_counts_files = expand(OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "{{lib}}_{{rep}}_on_%s" % genome, "{{lib}}_{{rep}}_{biotype}_{{orientation}}_counts.txt"), biotype=BIOTYPES),
output:
summary = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "summaries", "{lib}_{rep}_on_%s_{orientation}_counts.txt" % genome)
run:
sum_counter = sum_feature_counts
with open(output.summary, "w") as summary_file:
header = "\t".join(BIOTYPES)
summary_file.write("%s\n" % header)
sums = "\t".join((str(sum_counter(counts_file)) for counts_file in input.biotype_counts_files))
summary_file.write("%s\n" % sums)
rule gather_read_counts_summaries:
"""Gather read count summaries across libraries: lib_rep -> all."""
input:
summary_tables = expand(OPJ(
output_dir, aligner, f"mapped_{genome}", "feature_count", "summaries",
"{name}_on_%s_{{orientation}}_counts.txt" % genome), name=COND_NAMES),
output:
summary_table = OPJ(
output_dir, aligner, f"mapped_{genome}", "feature_count", "summaries",
"all_on_%s_{orientation}_counts.txt" % genome),
run:
#summary_files = (OPJ(
summary_files = [OPJ(
output_dir, aligner, f"mapped_{genome}", "feature_count", "summaries",
f"{cond_name}_on_{genome}_{wildcards.orientation}_counts.txt") for cond_name in COND_NAMES]
try:
summaries = pd.concat((pd.read_table(summary_file).T.astype(int) for summary_file in summary_files), axis=1)
except ValueError:
warnings.warn("Some data may be missing. Using floats instead of ints.\n")
summaries = pd.concat((pd.read_table(summary_file).T.astype(float) for summary_file in summary_files), axis=1)
summaries.columns = COND_NAMES
summaries.to_csv(output.summary_table, na_rep="NA", sep="\t")
rule gather_counts:
"""For a given biotype, gather counts from all libraries in one table."""
input:
counts_tables = expand(OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "{lib}_{rep}_on_%s" % genome, "{lib}_{rep}_{{biotype}}_{{orientation}}_counts.txt"), lib=LIBS, rep=REPS),
output:
counts_table = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", f"all_on_{genome}", "{biotype}_{orientation}_counts.txt"),
wildcard_constraints:
# Avoid ambiguity with join_all_counts
biotype = "|".join(BIOTYPES)
run:
# Gathering the counts data
############################
#counts_files = (OPJ(
counts_files = [OPJ(
output_dir,
aligner,
f"mapped_{genome}",
"feature_count",
f"{cond_name}_on_{genome}",
f"{cond_name}_{wildcards.biotype}_{wildcards.orientation}_counts.txt") for cond_name in COND_NAMES]
#"{biotype}_{orientation}_counts.txt".format(biotype=wildcards.biotype, orientation=wildcards.orientation)) for cond_name in COND_NAMES)
counts_data = pd.concat(
map(read_feature_counts, counts_files),
axis=1).fillna(0).astype(int)
counts_data.columns = COND_NAMES
# Simple_repeat|Simple_repeat|(TTTTTTG)n:1
# Simple_repeat|Simple_repeat|(TTTTTTG)n:2
# Simple_repeat|Simple_repeat|(TTTTTTG)n:3
# Simple_repeat|Simple_repeat|(TTTTTTG)n:4
# -> Simple_repeat|Simple_repeat|(TTTTTTG)n
if wildcards.biotype.endswith("_rmsk_families"):
repeat_families = [":".join(name.split(":")[:-1]) for name in counts_data.index]
# Sum the counts for a given repeat family
counts_data = counts_data.assign(family=repeat_families).groupby("family").sum()
counts_data.index.names = ["gene"]
counts_data.to_csv(output.counts_table, na_rep="NA", sep="\t")
onsuccess:
print("Degradome-seq analysis finished.")
cleanup_and_backup(output_dir, config)
onerror:
shell(f"rm -rf {output_dir}_err")
shell(f"cp -rp {output_dir} {output_dir}_err")
cleanup_and_backup(output_dir + "_err", config)
print("Degradome-seq analysis failed.")
run_pipeline.sh
\ No newline at end of file
......@@ -53,6 +53,9 @@ case "${PROGNAME}" in
"run_Ribo-seq_pipeline")
snakefile="${BASEDIR}/Ribo-seq/Ribo-seq.snakefile"
;;
"run_Degradome-seq_pipeline")
snakefile="${BASEDIR}/Degradome-seq/Degradome-seq.snakefile"
;;
*)
snakefile="${1}"
shift
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment