Skip to content
Snippets Groups Projects
Commit d6404f28 authored by Blaise Li's avatar Blaise Li
Browse files

Preliminary degradome-seq pipeline.

The pipeline only maps and counts aligned reads in biotypes.
parent 4af07397
No related branches found
No related tags found
No related merge requests found
"""
Snakefile to analyse Degradome-seq data.
"""
import sys
major, minor = sys.version_info[:2]
if major < 3 or (major == 3 and minor < 6):
sys.exit("Need at least python 3.6\n")
import os
OPJ = os.path.join
from glob import glob
import pandas as pd
from smincludes import rules as irules
from libworkflows import (
cleanup_and_backup,
ensure_relative,
feature_orientation2stranded,
get_chrom_sizes,
read_feature_counts,
sum_feature_counts)
# http://sailfish.readthedocs.io/en/master/library_type.html
LIB_TYPE = config["lib_type"]
# key: library name
# value: dictionary
# key: replicate number
# value: path to raw data
lib2raw = config["lib2raw"]
LIBS = list(lib2raw.keys())
REPS = config["replicates"]
CONDITIONS = [{
"lib" : lib,
"rep" : rep} for rep in REPS for lib in LIBS]
# We use this for various things in order to have always the same library order:
COND_NAMES = ["_".join((
cond["lib"],
cond["rep"])) for cond in CONDITIONS]
# COND_COLUMNS = pd.DataFrame(CONDITIONS).assign(
# cond_name=pd.Series(COND_NAMES).values).set_index("cond_name")
BIOTYPES = config["biotypes"]
ORIENTATIONS = ["all"]
aligner = config["aligner"]
alignment_settings = {
"bowtie2": "",
"hisat2": "",
"crac": "-k 20 --stranded --use-x-in-cigar"}
########################
# Genome configuration #
########################
genome_dict = config["genome_dict"]
genome = genome_dict["name"]
chrom_sizes = get_chrom_sizes(genome_dict["size"])
genomelen = sum(chrom_sizes.values())
genome_db = genome_dict["db"][aligner]
# bed file binning the genome in 10nt bins
genome_binned = genome_dict["binned"]
annot_dir = genome_dict["annot_dir"]
# TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]
convert_dir = genome_dict["convert_dir"]
gene_lists_dir = genome_dict["gene_lists_dir"]
avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt")))
output_dir = config["output_dir"]
log_dir = OPJ(output_dir, f"logs_{genome}")
data_dir = OPJ(output_dir, "data")
# Limit risks of ambiguity by imposing replicates to be numbers
# and restricting possible forms of some other wildcards
wildcard_constraints:
lib="|".join(LIBS),
rep="\d+",
rule all:
input:
#expand(
# OPJ(data_dir, "trimmed", "{lib}_{rep}.fastq.gz"),
# lib=LIBS, rep=REPS),
expand(
OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s_sorted.bam" % genome),
lib=LIBS, rep=REPS),
expand(
OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count",
f"all_on_{genome}", "{biotype}_{orientation}_counts.txt"),
biotype=BIOTYPES, orientation=ORIENTATIONS),
expand(
OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "summaries",
"all_on_%s_{orientation}_counts.txt" % genome),
orientation=ORIENTATIONS),
include: ensure_relative(irules["link_raw_data"], workflow.basedir)
rule trim_reads:
input:
raw = rules.link_raw_data.output.link,
output:
trimmed = OPJ(data_dir, "trimmed", "{lib}_{rep}.fastq.gz"),
params:
trim3 = 4,
shell:
"""
seqtk trimfq -b {params.trim3} {input.raw} | gzip > {output.trimmed}
"""
rule map_on_genome:
input:
fastq = rules.trim_reads.output.trimmed,
output:
# temp because it uses a lot of space
sam = temp(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s.sam" % genome)),
nomap_fastq = OPJ(output_dir, aligner, f"not_mapped_{genome}", "{lib}_{rep}_unmapped_on_%s.fastq.gz" % genome),
params:
aligner = aligner,
index = genome_db,
settings = alignment_settings[aligner],
message:
"Mapping {wildcards.lib}_{wildcards.rep} on %s." % genome
log:
log = OPJ(log_dir, "map_on_genome_{lib}_{rep}.log"),
err = OPJ(log_dir, "map_on_genome_{lib}_{rep}.err")
resources:
io=5
threads:
4
wrapper:
"file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/map_on_genome"
rule sam2indexedbam:
input:
sam = rules.map_on_genome.output.sam,
output:
sorted_bam = protected(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s_sorted.bam" % genome)),
index = protected(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s_sorted.bam.bai" % genome)),
message:
"Sorting and indexing sam file for {wildcards.lib}_{wildcards.rep}."
log:
log = OPJ(log_dir, "sam2indexedbam_{lib}_{rep}.log"),
err = OPJ(log_dir, "sam2indexedbam_{lib}_{rep}.err"),
threads:
8
wrapper:
"file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/sam2indexedbam"
def biotype2annot(wildcards):
if wildcards.biotype.endswith("_rmsk_families"):
biotype = wildcards.biotype[:-9]
else:
biotype = wildcards.biotype
return OPJ(annot_dir, f"{biotype}.gtf")
rule feature_count_reads:
input:
sorted_bam = rules.sam2indexedbam.output.sorted_bam,
index = rules.sam2indexedbam.output.index,
output:
counts = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "{lib}_{rep}_on_%s" % genome, "{lib}_{rep}_{biotype}_{orientation}_counts.txt"),
counts_converted = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "{lib}_{rep}_on_%s" % genome, "{lib}_{rep}_{biotype}_{orientation}_counts_gene_names.txt"),
params:
stranded = feature_orientation2stranded(LIB_TYPE),
annot = biotype2annot,
# pickled dictionary that associates gene ids to gene names
converter = genome_dict["converter"]
message:
"Counting {wildcards.orientation} {wildcards.biotype} reads for {wildcards.lib}_{wildcards.rep} with featureCounts."
benchmark:
OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{biotype}_{orientation}_benchmark.txt")
log:
log = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{biotype}_{orientation}.log"),
err = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{biotype}_{orientation}.err"),
shell:
"""
tmpdir=$(mktemp -dt "feature_{wildcards.lib}_{wildcards.rep}_{wildcards.biotype}_{wildcards.orientation}.XXXXXXXXXX")
cmd="featureCounts -a {params.annot} -o {output.counts} -t transcript -g "gene_id" -O -M --primary -s {params.stranded} --fracOverlap 0 --tmpDir ${{tmpdir}} {input.sorted_bam}"
featureCounts -v 2> {log.log}
echo ${{cmd}} 1>> {log.log}
eval ${{cmd}} 1>> {log.log} 2> {log.err} || error_exit "featureCounts failed"
rm -rf ${{tmpdir}}
cat {output.counts} | id2name.py {params.converter} > {output.counts_converted}
"""
rule summarize_counts:
"""For a given library, write a summary of the read counts for the various biotypes."""
input:
biotype_counts_files = expand(OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "{{lib}}_{{rep}}_on_%s" % genome, "{{lib}}_{{rep}}_{biotype}_{{orientation}}_counts.txt"), biotype=BIOTYPES),
output:
summary = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "summaries", "{lib}_{rep}_on_%s_{orientation}_counts.txt" % genome)
run:
sum_counter = sum_feature_counts
with open(output.summary, "w") as summary_file:
header = "\t".join(BIOTYPES)
summary_file.write("%s\n" % header)
sums = "\t".join((str(sum_counter(counts_file)) for counts_file in input.biotype_counts_files))
summary_file.write("%s\n" % sums)
rule gather_read_counts_summaries:
"""Gather read count summaries across libraries: lib_rep -> all."""
input:
summary_tables = expand(OPJ(
output_dir, aligner, f"mapped_{genome}", "feature_count", "summaries",
"{name}_on_%s_{{orientation}}_counts.txt" % genome), name=COND_NAMES),
output:
summary_table = OPJ(
output_dir, aligner, f"mapped_{genome}", "feature_count", "summaries",
"all_on_%s_{orientation}_counts.txt" % genome),
run:
#summary_files = (OPJ(
summary_files = [OPJ(
output_dir, aligner, f"mapped_{genome}", "feature_count", "summaries",
f"{cond_name}_on_{genome}_{wildcards.orientation}_counts.txt") for cond_name in COND_NAMES]
try:
summaries = pd.concat((pd.read_table(summary_file).T.astype(int) for summary_file in summary_files), axis=1)
except ValueError:
warnings.warn("Some data may be missing. Using floats instead of ints.\n")
summaries = pd.concat((pd.read_table(summary_file).T.astype(float) for summary_file in summary_files), axis=1)
summaries.columns = COND_NAMES
summaries.to_csv(output.summary_table, na_rep="NA", sep="\t")
rule gather_counts:
"""For a given biotype, gather counts from all libraries in one table."""
input:
counts_tables = expand(OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "{lib}_{rep}_on_%s" % genome, "{lib}_{rep}_{{biotype}}_{{orientation}}_counts.txt"), lib=LIBS, rep=REPS),
output:
counts_table = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", f"all_on_{genome}", "{biotype}_{orientation}_counts.txt"),
wildcard_constraints:
# Avoid ambiguity with join_all_counts
biotype = "|".join(BIOTYPES)
run:
# Gathering the counts data
############################
#counts_files = (OPJ(
counts_files = [OPJ(
output_dir,
aligner,
f"mapped_{genome}",
"feature_count",
f"{cond_name}_on_{genome}",
f"{cond_name}_{wildcards.biotype}_{wildcards.orientation}_counts.txt") for cond_name in COND_NAMES]
#"{biotype}_{orientation}_counts.txt".format(biotype=wildcards.biotype, orientation=wildcards.orientation)) for cond_name in COND_NAMES)
counts_data = pd.concat(
map(read_feature_counts, counts_files),
axis=1).fillna(0).astype(int)
counts_data.columns = COND_NAMES
# Simple_repeat|Simple_repeat|(TTTTTTG)n:1
# Simple_repeat|Simple_repeat|(TTTTTTG)n:2
# Simple_repeat|Simple_repeat|(TTTTTTG)n:3
# Simple_repeat|Simple_repeat|(TTTTTTG)n:4
# -> Simple_repeat|Simple_repeat|(TTTTTTG)n
if wildcards.biotype.endswith("_rmsk_families"):
repeat_families = [":".join(name.split(":")[:-1]) for name in counts_data.index]
# Sum the counts for a given repeat family
counts_data = counts_data.assign(family=repeat_families).groupby("family").sum()
counts_data.index.names = ["gene"]
counts_data.to_csv(output.counts_table, na_rep="NA", sep="\t")
onsuccess:
print("Degradome-seq analysis finished.")
cleanup_and_backup(output_dir, config)
onerror:
shell(f"rm -rf {output_dir}_err")
shell(f"cp -rp {output_dir} {output_dir}_err")
cleanup_and_backup(output_dir + "_err", config)
print("Degradome-seq analysis failed.")
run_pipeline.sh
\ No newline at end of file
......@@ -53,6 +53,9 @@ case "${PROGNAME}" in
"run_Ribo-seq_pipeline")
snakefile="${BASEDIR}/Ribo-seq/Ribo-seq.snakefile"
;;
"run_Degradome-seq_pipeline")
snakefile="${BASEDIR}/Degradome-seq/Degradome-seq.snakefile"
;;
*)
snakefile="${1}"
shift
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment