Preliminary degradome-seq pipeline.

The pipeline only maps and counts aligned reads in biotypes.

Preliminary degradome-seq pipeline.
d6404f28 · Blaise Li · 4af07397 · d6404f28 · d6404f28 · d6404f28
Commit d6404f28 authored 7 years ago by Blaise Li
--- a/Degradome-seq/Degradome-seq.snakefile
+++ b/Degradome-seq/Degradome-seq.snakefile
+"""
+Snakefile to analyse Degradome-seq data.
+"""
+import sys
+major, minor = sys.version_info[:2]
+if major < 3 or (major == 3 and minor < 6):
+    sys.exit("Need at least python 3.6\n")
+
+import os
+OPJ = os.path.join
+from glob import glob
+import pandas as pd
+
+from smincludes import rules as irules
+from libworkflows import (
+    cleanup_and_backup,
+    ensure_relative,
+    feature_orientation2stranded,
+    get_chrom_sizes,
+    read_feature_counts,
+    sum_feature_counts)
+# http://sailfish.readthedocs.io/en/master/library_type.html
+LIB_TYPE = config["lib_type"]
+# key: library name
+# value: dictionary
+#   key: replicate number
+#   value: path to raw data
+lib2raw = config["lib2raw"]
+LIBS = list(lib2raw.keys())
+REPS = config["replicates"]
+CONDITIONS = [{
+    "lib" : lib,
+    "rep" : rep} for rep in REPS for lib in LIBS]
+# We use this for various things in order to have always the same library order:
+COND_NAMES = ["_".join((
+    cond["lib"],
+    cond["rep"])) for cond in CONDITIONS]
+# COND_COLUMNS = pd.DataFrame(CONDITIONS).assign(
+#     cond_name=pd.Series(COND_NAMES).values).set_index("cond_name")
+
+
+BIOTYPES = config["biotypes"]
+ORIENTATIONS = ["all"]
+
+aligner = config["aligner"]
+alignment_settings = {
+    "bowtie2": "",
+    "hisat2": "",
+    "crac": "-k 20 --stranded --use-x-in-cigar"}
+
+########################
+# Genome configuration #
+########################
+genome_dict = config["genome_dict"]
+genome = genome_dict["name"]
+chrom_sizes = get_chrom_sizes(genome_dict["size"])
+genomelen = sum(chrom_sizes.values())
+genome_db = genome_dict["db"][aligner]
+# bed file binning the genome in 10nt bins
+genome_binned = genome_dict["binned"]
+annot_dir = genome_dict["annot_dir"]
+# TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]
+convert_dir = genome_dict["convert_dir"]
+gene_lists_dir = genome_dict["gene_lists_dir"]
+avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt")))
+output_dir = config["output_dir"]
+log_dir = OPJ(output_dir, f"logs_{genome}")
+data_dir = OPJ(output_dir, "data")
+
+# Limit risks of ambiguity by imposing replicates to be numbers
+# and restricting possible forms of some other wildcards
+wildcard_constraints:
+    lib="|".join(LIBS),
+    rep="\d+",
+
+
+rule all:
+    input:
+        #expand(
+        #    OPJ(data_dir, "trimmed", "{lib}_{rep}.fastq.gz"),
+        #    lib=LIBS, rep=REPS),
+        expand(
+            OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s_sorted.bam" % genome),
+            lib=LIBS, rep=REPS),
+        expand(
+            OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count",
+                f"all_on_{genome}", "{biotype}_{orientation}_counts.txt"),
+            biotype=BIOTYPES, orientation=ORIENTATIONS),
+        expand(
+            OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "summaries",
+                "all_on_%s_{orientation}_counts.txt" % genome),
+            orientation=ORIENTATIONS),
+
+
+include: ensure_relative(irules["link_raw_data"], workflow.basedir)
+
+
+rule trim_reads:
+    input:
+        raw = rules.link_raw_data.output.link,
+    output:
+        trimmed = OPJ(data_dir, "trimmed", "{lib}_{rep}.fastq.gz"),
+    params:
+        trim3 = 4,
+    shell:
+        """
+        seqtk trimfq -b {params.trim3} {input.raw} | gzip > {output.trimmed} 
+        """
+
+
+rule map_on_genome:
+    input:
+        fastq = rules.trim_reads.output.trimmed,
+    output:
+        # temp because it uses a lot of space
+        sam = temp(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s.sam" % genome)),
+        nomap_fastq = OPJ(output_dir, aligner, f"not_mapped_{genome}", "{lib}_{rep}_unmapped_on_%s.fastq.gz" % genome),
+    params:
+        aligner = aligner,
+        index = genome_db,
+        settings = alignment_settings[aligner],
+    message:
+        "Mapping {wildcards.lib}_{wildcards.rep} on %s." % genome
+    log:
+        log = OPJ(log_dir, "map_on_genome_{lib}_{rep}.log"),
+        err = OPJ(log_dir, "map_on_genome_{lib}_{rep}.err")
+    resources:
+        io=5
+    threads:
+        4
+    wrapper:
+        "file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/map_on_genome"
+
+
+rule sam2indexedbam:
+    input:
+        sam = rules.map_on_genome.output.sam,
+    output:
+        sorted_bam = protected(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s_sorted.bam" % genome)),
+        index = protected(OPJ(output_dir, aligner, f"mapped_{genome}", "{lib}_{rep}_on_%s_sorted.bam.bai" % genome)),
+    message:
+        "Sorting and indexing sam file for {wildcards.lib}_{wildcards.rep}."
+    log:
+        log = OPJ(log_dir, "sam2indexedbam_{lib}_{rep}.log"),
+        err = OPJ(log_dir, "sam2indexedbam_{lib}_{rep}.err"),
+    threads:
+        8
+    wrapper:
+        "file:///pasteur/homes/bli/src/bioinfo_utils/snakemake_wrappers/sam2indexedbam"
+
+
+def biotype2annot(wildcards):
+    if wildcards.biotype.endswith("_rmsk_families"):
+        biotype = wildcards.biotype[:-9]
+    else:
+        biotype = wildcards.biotype
+    return OPJ(annot_dir, f"{biotype}.gtf")
+
+
+rule feature_count_reads:
+    input:
+        sorted_bam = rules.sam2indexedbam.output.sorted_bam,
+        index = rules.sam2indexedbam.output.index,
+    output:
+        counts = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "{lib}_{rep}_on_%s" % genome, "{lib}_{rep}_{biotype}_{orientation}_counts.txt"),
+        counts_converted = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "{lib}_{rep}_on_%s" % genome, "{lib}_{rep}_{biotype}_{orientation}_counts_gene_names.txt"),
+    params:
+        stranded = feature_orientation2stranded(LIB_TYPE),
+        annot = biotype2annot,
+        # pickled dictionary that associates gene ids to gene names
+        converter = genome_dict["converter"]
+    message:
+        "Counting {wildcards.orientation} {wildcards.biotype} reads for {wildcards.lib}_{wildcards.rep} with featureCounts."
+    benchmark:
+        OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{biotype}_{orientation}_benchmark.txt")
+    log:
+        log = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{biotype}_{orientation}.log"),
+        err = OPJ(log_dir, "feature_count_reads", "{lib}_{rep}_{biotype}_{orientation}.err"),
+    shell:
+        """
+        tmpdir=$(mktemp -dt "feature_{wildcards.lib}_{wildcards.rep}_{wildcards.biotype}_{wildcards.orientation}.XXXXXXXXXX")
+        cmd="featureCounts -a {params.annot} -o {output.counts} -t transcript -g "gene_id" -O -M --primary -s {params.stranded} --fracOverlap 0 --tmpDir ${{tmpdir}} {input.sorted_bam}"
+        featureCounts -v 2> {log.log}
+        echo ${{cmd}} 1>> {log.log}
+        eval ${{cmd}} 1>> {log.log} 2> {log.err} || error_exit "featureCounts failed"
+        rm -rf ${{tmpdir}}
+        cat {output.counts} | id2name.py {params.converter} > {output.counts_converted}
+        """
+
+
+rule summarize_counts:
+    """For a given library, write a summary of the read counts for the various biotypes."""
+    input:
+        biotype_counts_files = expand(OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "{{lib}}_{{rep}}_on_%s" % genome, "{{lib}}_{{rep}}_{biotype}_{{orientation}}_counts.txt"), biotype=BIOTYPES),
+    output:
+        summary = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "summaries", "{lib}_{rep}_on_%s_{orientation}_counts.txt" % genome)
+    run:
+        sum_counter = sum_feature_counts
+        with open(output.summary, "w") as summary_file:
+            header = "\t".join(BIOTYPES)
+            summary_file.write("%s\n" % header)
+            sums = "\t".join((str(sum_counter(counts_file)) for counts_file in input.biotype_counts_files))
+            summary_file.write("%s\n" % sums)
+
+
+rule gather_read_counts_summaries:
+    """Gather read count summaries across libraries: lib_rep -> all."""
+    input:
+        summary_tables = expand(OPJ(
+            output_dir, aligner, f"mapped_{genome}", "feature_count", "summaries",
+            "{name}_on_%s_{{orientation}}_counts.txt" % genome), name=COND_NAMES),
+    output:
+        summary_table = OPJ(
+            output_dir, aligner, f"mapped_{genome}", "feature_count", "summaries",
+            "all_on_%s_{orientation}_counts.txt" % genome),
+    run:
+        #summary_files = (OPJ(
+        summary_files = [OPJ(
+            output_dir, aligner, f"mapped_{genome}", "feature_count", "summaries",
+            f"{cond_name}_on_{genome}_{wildcards.orientation}_counts.txt") for cond_name in COND_NAMES]
+        try:
+            summaries = pd.concat((pd.read_table(summary_file).T.astype(int) for summary_file in summary_files), axis=1)
+        except ValueError:
+            warnings.warn("Some data may be missing. Using floats instead of ints.\n")
+            summaries = pd.concat((pd.read_table(summary_file).T.astype(float) for summary_file in summary_files), axis=1)
+        summaries.columns = COND_NAMES
+        summaries.to_csv(output.summary_table, na_rep="NA", sep="\t")
+
+
+rule gather_counts:
+    """For a given biotype, gather counts from all libraries in one table."""
+    input:
+        counts_tables = expand(OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", "{lib}_{rep}_on_%s" % genome, "{lib}_{rep}_{{biotype}}_{{orientation}}_counts.txt"), lib=LIBS, rep=REPS),
+    output:
+        counts_table = OPJ(output_dir, aligner, f"mapped_{genome}", "feature_count", f"all_on_{genome}", "{biotype}_{orientation}_counts.txt"),
+    wildcard_constraints:
+        # Avoid ambiguity with join_all_counts
+        biotype = "|".join(BIOTYPES)
+    run:
+        # Gathering the counts data
+        ############################
+        #counts_files = (OPJ(
+        counts_files = [OPJ(
+            output_dir,
+            aligner,
+            f"mapped_{genome}",
+            "feature_count",
+            f"{cond_name}_on_{genome}",
+            f"{cond_name}_{wildcards.biotype}_{wildcards.orientation}_counts.txt") for cond_name in COND_NAMES]
+            #"{biotype}_{orientation}_counts.txt".format(biotype=wildcards.biotype, orientation=wildcards.orientation)) for cond_name in COND_NAMES)
+        counts_data = pd.concat(
+            map(read_feature_counts, counts_files),
+            axis=1).fillna(0).astype(int)
+        counts_data.columns = COND_NAMES
+        # Simple_repeat|Simple_repeat|(TTTTTTG)n:1
+        # Simple_repeat|Simple_repeat|(TTTTTTG)n:2
+        # Simple_repeat|Simple_repeat|(TTTTTTG)n:3
+        # Simple_repeat|Simple_repeat|(TTTTTTG)n:4
+        # -> Simple_repeat|Simple_repeat|(TTTTTTG)n
+        if wildcards.biotype.endswith("_rmsk_families"):
+            repeat_families = [":".join(name.split(":")[:-1]) for name in counts_data.index]
+            # Sum the counts for a given repeat family
+            counts_data = counts_data.assign(family=repeat_families).groupby("family").sum()
+        counts_data.index.names = ["gene"]
+        counts_data.to_csv(output.counts_table, na_rep="NA", sep="\t")
+
+
+onsuccess:
+    print("Degradome-seq analysis finished.")
+    cleanup_and_backup(output_dir, config)
+
+onerror:
+    shell(f"rm -rf {output_dir}_err")
+    shell(f"cp -rp {output_dir} {output_dir}_err")
+    cleanup_and_backup(output_dir + "_err", config)
+    print("Degradome-seq analysis failed.")
--- a/run_Degradome-seq_pipeline
+++ b/run_Degradome-seq_pipeline
+run_pipeline.sh
\ No newline at end of file
--- a/run_pipeline.sh
+++ b/run_pipeline.sh
@@ -53,6 +53,9 @@ case "${PROGNAME}" in
    "run_Ribo-seq_pipeline")
        snakefile="${BASEDIR}/Ribo-seq/Ribo-seq.snakefile"
        ;;
+    "run_Degradome-seq_pipeline")
+        snakefile="${BASEDIR}/Degradome-seq/Degradome-seq.snakefile"
+        ;;
    *)
        snakefile="${1}"
        shift