Adding more biotypes, fixing various bug.

cff1d2d4 · Blaise Li · 01da51a2 · cff1d2d4
Commit cff1d2d4 authored 6 years ago by Blaise Li
--- a/PRO-seq/PRO-seq.snakefile
+++ b/PRO-seq/PRO-seq.snakefile
@@ -59,11 +59,11 @@ import matplotlib.pyplot as plt
 import seaborn as sns
 import husl
-from libdeseq import do_deseq2,
+from libdeseq import do_deseq2
 from libhts import median_ratio_to_pseudo_ref_size_factors, status_setter, plot_lfc_distribution, plot_MA
-from libworkflows import ensure_relative, cleanup_and_backup
+from libworkflows import ensure_relative, cleanup_and_backup, texscape
 from libworkflows import get_chrom_sizes, column_converter
-from libworkflows import strip_split, last_lines, save_plot, test_na_file
+from libworkflows import strip_split, file_len, last_lines, save_plot, test_na_file
 from libworkflows import make_id_list_getter, filter_combinator, SHELL_FUNCTIONS, warn_context
 from libworkflows import feature_orientation2stranded
 from libworkflows import read_htseq_counts, sum_htseq_counts
@@ -140,8 +140,15 @@ COND_COLUMNS = pd.DataFrame(CONDITIONS).assign(
 COUNT_BIOTYPES = config["count_biotypes"]
 ANNOT_BIOTYPES = config["annot_biotypes"]
 #METAGENE_BIOTYPES=["protein_coding", "DNA_transposons_rmsk", "RNA_transposons_rmsk"]
-METAGENE_BIOTYPES=["protein_coding"]
+#METAGENE_BIOTYPES=["protein_coding"]
-ID_LISTS = ["lfc_statuses", "germline_specific", "histone", "spermatogenic_Ortiz_2014", "oogenic_Ortiz_2014"]
+METAGENE_BIOTYPES=["protein_coding", "protein_coding_5UTR", "protein_coding_CDS", "protein_coding_3UTR"]
+ID_LISTS = [
+    "lfc_statuses",
+    "germline_specific",
+    "histone",
+    "spermatogenic_Ortiz_2014", "oogenic_Ortiz_2014",
+    "piRNA_dependent_prot_si_down4_WR_RT_top200", "piRNA_dependent_prot_si_down4",
+    "csr1_prot_si_supertargets_common"]
 annot_dir = config["annot_dir"]
 gene_lists_dir = "/pasteur/entites/Mhe/bli/Gene_lists"
 avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt")))
@@ -336,7 +343,7 @@ rule trim_and_dedup:
        err = OPJ(log_dir, "{trimmer}", "trim_and_dedup", "{lib}_{rep}.err"),
    run:
        shell_commands = """
-THREADS="{threads}" {params.process_type}_trim_and_dedup.sh {trimmer} {input} \\
+THREADS="{threads}" {params.process_type}_trim_and_dedup.sh {wildcards.trimmer} {input} \\
    {params.adapter} {params.trim5} {params.trim3} \\
    {output.adapt} {output.noadapt} {log.trim} \\
    {output.nb_raw} {output.nb_adapt} {output.nb_adapt_deduped} \\
@@ -345,6 +352,7 @@ THREADS="{threads}" {params.process_type}_trim_and_dedup.sh {trimmer} {input} \\
        shell(shell_commands)
+# TODO: Do not deduplicate, or at least do not use the noadapt_deduped: The 3' UMI is not present.
 rule map_on_genome:
    input:
        fastq = OPJ(data_dir, "trimmed_{trimmer}", "{lib}_{rep}_{type}_deduped.fastq.gz"),
@@ -465,6 +473,7 @@ rule check_last_base:
                print(length, nb_reads_this_length, *[str(counter[letter] / nb_reads_this_length) for letter in "ACGTN"], sep="\t", file=output_file)
+# TODO: use Python to make the plot
 rule plot_last_base:
    input:
        OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{lib}_{rep}_adapt_on_C_elegans_last_bases.txt")
@@ -629,6 +638,7 @@ rule do_PCA:
        OPJ(output_dir, "{trimmer}", "figures", aligner, "{counter}", "{biotype}_{orientation}_PCA.{fig_format}"),
    message:
        "Summarizing counts for {wildcards.biotype}_{wildcards.orientation}"
+    threads: 12  # trying to avoid TimeoutError and "LOCKERROR: matplotlib is trying to acquire the lock [...]"
    log:
        OPJ(log_dir, "{trimmer}", "do_PCA_{biotype}_{orientation}.log")
    run:
@@ -1032,7 +1042,12 @@ rule join_all_counts:
    """concat counts for all biotypes into all"""
    input:
        #counts_tables = expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "all_on_C_elegans", "{biotype}_{{orientation}}_counts.txt"), biotype=[bty for bty in COUNT_BIOTYPES if bty[-9:] != "_families"]),
-        counts_tables = expand(OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans", "{{counter}}", "all_on_C_elegans", "{biotype}_{{orientation}}_counts.txt"), biotype=COUNT_BIOTYPES),
+        counts_tables = expand(
+            OPJ(output_dir, "{{trimmer}}", aligner, "mapped_C_elegans",
+                "{{counter}}", "all_on_C_elegans",
+                "{biotype}_{{orientation}}_counts.txt"),
+            # We only count "protein_coding", not "protein_codin_{5UTR,CDS,3UTR}"
+            biotype=[b for b in COUNT_BIOTYPES if not b.startswith("protein_coding_")]),
    output:
        counts_table = OPJ(output_dir, "{trimmer}", aligner, "mapped_C_elegans", "{counter}", "all_on_C_elegans", "alltypes_{orientation}_counts.txt"),
    run:
@@ -1196,8 +1211,11 @@ rule make_MA_plot:
                group2colour=group2colour,
                lfc_range = params.lfc_range,
                fold_type = "log2FoldChange",
-                title="MA-plot for %s, %s_%s" % (
+                title=texscape(
-                    wildcards.contrast, wildcards.orientation, wildcards.biotype))
+                    "MA-plot for %s, %s_%s" % (
+                        wildcards.contrast,
+                        wildcards.orientation,
+                        wildcards.biotype)))
 ##################
@@ -1522,6 +1540,13 @@ def meta_params(wildcards):
            "-m %d" % META_SCALE,
            "--unscaled3prime %d" % UNSCALED_INSIDE,
            "-a %d" % META_MARGIN])
+    if biotype.startswith("protein_coding_"):
+        return " ".join([
+            "-b %d" % 0,
+            "--unscaled5prime %d" % UNSCALED_INSIDE,
+            "-m %d" % META_SCALE,
+            "--unscaled3prime %d" % UNSCALED_INSIDE,
+            "-a %d" % 0])
    else:
        raise NotImplementedError("Metagene analyses for %s not implemented." % biotype)
@@ -1543,25 +1568,27 @@ rule plot_meta_profile_mean:
        plot_TSS_log = OPJ(log_dir, "{trimmer}", "plot_meta_profile", "{lib}", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d.log" % (MIN_DIST, META_MIN_LEN)),
        plot_TSS_err = OPJ(log_dir, "{trimmer}", "plot_meta_profile", "{lib}", "{orientation}_on_merged_isolated_%d_{biotype}_min_%d.err" % (MIN_DIST, META_MIN_LEN)),
    threads: 12  # to limit memory usage, actually
-    shell:
+    run:
-        """
+        if file_len(input.bed):
-        tmpdir=$(mktemp -dt "plot_meta_profile_{wildcards.trimmer}_{wildcards.lib}_{wildcards.orientation}_{wildcards.biotype}_%d.XXXXXXXXXX")
+            shell("""tmpdir=$(mktemp -dt "plot_meta_profile_{wildcards.trimmer}_{wildcards.lib}_{wildcards.orientation}_{wildcards.biotype}_%d.XXXXXXXXXX")
-        computeMatrix scale-regions -S {input.bigwig} \\
+computeMatrix scale-regions -S {input.bigwig} \\
-            -R {input.bed} \\
+    -R {input.bed} \\
-            {params.meta_params} \\
+    {params.meta_params} \\
-            -p 1 \\
+    -p 1 \\
-            -out ${{tmpdir}}/meta_profile.gz \\
+    -out ${{tmpdir}}/meta_profile.gz \\
-            --skipZeros \\
+    --skipZeros \\
-            1> {log.plot_TSS_log} \\
+    1> {log.plot_TSS_log} \\
-            2> {log.plot_TSS_err} \\
+    2> {log.plot_TSS_err} \\
-            || error_exit "computeMatrix failed"
+    || error_exit "computeMatrix failed"
-        plotProfile -m ${{tmpdir}}/meta_profile.gz -out {output} \\
+plotProfile -m ${{tmpdir}}/meta_profile.gz -out {output} \\
-            1>> {log.plot_TSS_log} \\
+    1>> {log.plot_TSS_log} \\
-            2>> {log.plot_TSS_err} \\
+    2>> {log.plot_TSS_err} \\
-            || error_exit "plotProfile failed"
+    || error_exit "plotProfile failed"
-        rm -rf ${{tmpdir}}
+rm -rf ${{tmpdir}}
-        """ % META_MIN_LEN
+""" % META_MIN_LEN)
+        else:
+            warnings.warn("No regions selected in {input.bed}. Generating empty figure.\n")
+            shell("""> {output}""")
 onsuccess:
    print("PRO-seq analysis finished.")