Adapting TPM and RPM counting from RNA-seq.

28d6e34e · Blaise Li · 38c641d1 · 28d6e34e
Commit 28d6e34e authored 4 years ago by Blaise Li
--- a/CLIP/iCLIP.snakefile
+++ b/CLIP/iCLIP.snakefile
@@ -208,9 +208,16 @@ counting = [
    ## Will be pulled in as dependencies of other needed results:
    # expand(OPJ("{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "{lib}_{rep}_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED, biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS),
    ##
-    expand(OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count", "summaries", "all_{read_type}_on_%s_{orientation}_counts.txt" % genome), trimmer=TRIMMERS, read_type=POST_TRIMMING + SIZE_SELECTED + ["deduped"], orientation=ORIENTATIONS),
-    expand(OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count", "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"), trimmer=TRIMMERS, read_type=POST_TRIMMING + SIZE_SELECTED + ["deduped"], biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS),
-    expand(OPJ("{trimmer}", aligner, f"mapped_{genome}", "{lib}_{rep}_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome), trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED + ["deduped"], norm=NORM_TYPES, orientation=["all"]),
+    expand(OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count", "summaries", "all_{read_type}_on_%s_{orientation}_counts.txt" % genome),
+        trimmer=TRIMMERS, read_type=POST_TRIMMING + SIZE_SELECTED + ["deduped"], orientation=ORIENTATIONS),
+    expand(OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count", "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"),
+        trimmer=TRIMMERS, read_type=POST_TRIMMING + SIZE_SELECTED + ["deduped"], biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS),
+    expand(OPJ("{trimmer}", aligner, f"mapped_{genome}", "{lib}_{rep}_{read_type}_on_%s_by_{norm}_{orientation}.bw" % genome),
+        trimmer=TRIMMERS, lib=LIBS, rep=REPS, read_type=POST_TRIMMING + SIZE_SELECTED + ["deduped"], norm=NORM_TYPES, orientation=["all"]),
+    expand(OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count", "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_RPM.txt"),
+        trimmer=TRIMMERS, read_type=["deduped"], biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS),
+    expand(OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count", "{lib}_mean_{read_type}_on_%s" % genome, "{lib}_mean_{biotype}_{orientation}_TPM.txt"),
+        trimmer=TRIMMERS, lib=LIBS, read_type=["deduped"], biotype=["protein_coding"], orientation=ORIENTATIONS),
 ]

 #TODO:
@@ -647,7 +654,6 @@ rule feature_count_reads:
        eval ${{cmd}} 1>> {log.log} 2> {log.err} || error_exit "featureCounts failed"
        rm -rf ${{tmpdir}}
        cat {output.counts} | wormid2name > {output.counts_converted}
-        # cat {output.counts} | id2name.py ${{converter}} > {output.counts_converted}
        """


@@ -667,10 +673,13 @@ rule summarize_feature_counts:


 rule gather_read_counts_summaries:
+    """Gather read count summaries across libraries: lib_rep -> all."""
    input:
-        summary_tables = expand(OPJ("{{trimmer}}", aligner, "mapped_%s" % genome, "feature_count", "summaries", "{lib}_{rep}_{{read_type}}_on_%s_{{orientation}}_counts.txt" % genome), lib=LIBS, rep=REPS),
+        summary_tables = expand(OPJ("{{trimmer}}", aligner, "mapped_%s" % genome, "feature_count", "summaries",
+            "{lib}_{rep}_{{read_type}}_on_%s_{{orientation}}_counts.txt" % genome), lib=LIBS, rep=REPS),
    output:
-        summary_table = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "summaries", "all_{read_type}_on_%s_{orientation}_counts.txt" % genome),
+        summary_table = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "summaries",
+            "all_{read_type}_on_%s_{orientation}_counts.txt" % genome),
    run:
        summary_files = (OPJ(
            wildcards.trimmer,
@@ -689,9 +698,11 @@ rule gather_read_counts_summaries:
 rule gather_counts:
    """For a given biotype, gather counts from all libraries in one table."""
    input:
-        counts_tables = expand(OPJ("{{trimmer}}", aligner, "mapped_%s" % genome, "feature_count", "{lib}_{rep}_{{read_type}}_on_%s" % genome, "{{biotype}}_{{orientation}}_counts.txt"), lib=LIBS, rep=REPS),
+        counts_tables = expand(OPJ("{{trimmer}}", aligner, "mapped_%s" % genome, "feature_count",
+            "{lib}_{rep}_{{read_type}}_on_%s" % genome, "{{biotype}}_{{orientation}}_counts.txt"), lib=LIBS, rep=REPS),
    output:
-        counts_table = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "feature_count", "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"),
+        counts_table = OPJ("{trimmer}", aligner, "mapped_%s" % genome, "feature_count",
+            "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"),
    # wildcard_constraints:
    #     # Avoid ambiguity with join_all_counts
    #     biotype = "|".join(COUNT_BIOTYPES)
@@ -734,6 +745,100 @@ rule gather_counts:
        counts_data.to_csv(output.counts_table, sep="\t")


+rule compute_RPK:
+    """For a given biotype, compute the corresponding RPK value (reads per kilobase)."""
+    input:
+        # TODO: Why wildcards seems to be None?
+        #counts_data = rules.gather_counts.output.counts_table,
+        counts_data = OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count",
+            "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"),
+    output:
+        rpk_file = OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count",
+            "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_RPK.txt"),
+    params:
+        feature_lengths_file = OPJ(annot_dir, "union_exon_lengths.txt"),
+    wrapper:
+        f"file://{wrappers_dir}/compute_RPK"
+
+
+rule compute_sum_million_RPK:
+    input:
+        rpk_file = rules.compute_RPK.output.rpk_file,
+    output:
+        sum_rpk_file = OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count",
+            "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_sum_million_RPK.txt"),
+    run:
+        sum_rpk = pd.read_table(
+            input.rpk_file,
+            index_col=0).sum()
+        (sum_rpk / 1000000).to_csv(output.sum_rpk_file, sep="\t")
+
+
+# Compute TPM using total number of mappers divided by genome length
+# (not sum of RPK across biotypes: some mappers may not be counted)
+# No, doesn't work: mappers / genome length not really comparable
+# Needs to be done on all_types
+rule compute_TPM:
+    """For a given biotype, compute the corresponding TPM value (reads per kilobase per million mappers)."""
+    input:
+        rpk_file = rules.compute_RPK.output.rpk_file
+    output:
+        tpm_file = OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count",
+            "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_TPM.txt"),
+    # The sum must be done over all counted features
+    wildcard_constraints:
+        biotype = "|".join(["protein_coding"])
+    # run:
+    #     rpk = pd.read_table(input.rpk_file, index_col="gene")
+    #     tpm = 1000000 * rpk / rpk.sum()
+    #     tpm.to_csv(output.tpm_file, sep="\t")
+    wrapper:
+        f"file://{wrappers_dir}/compute_TPM"
+
+
+# Useful to compute translation efficiency in the Ribo-seq pipeline
+rule compute_mean_TPM:
+    input:
+        all_tmp_file = rules.compute_TPM.output.tpm_file
+    output:
+        tpm_file = OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count",
+            "{lib}_mean_{read_type}_on_%s" % genome, "{lib}_mean_{biotype}_{orientation}_TPM.txt"),
+    wildcard_constraints:
+        biotype = "|".join(["protein_coding"])
+    run:
+        tpm = pd.read_table(
+            input.all_tmp_file, index_col="gene",
+            usecols=["gene", *[f"{wildcards.lib}_{rep}" for rep in REPS]])
+        tpm_mean = tpm.mean(axis=1)
+        # This is a Series
+        tpm_mean.name = wildcards.lib
+        tpm_mean.to_csv(output.tpm_file, sep="\t", header=True)
+
+
+rule compute_RPM:
+    input:
+        counts_data = OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count",
+            "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_counts.txt"),
+        #summary_table = rules.gather_read_counts_summaries.output.summary_table,
+        summary_table = OPJ(
+            "{trimmer}", aligner, f"mapped_{genome}", "feature_count", "summaries",
+            "all_{read_type}_on_%s_fwd_counts.txt" % genome),
+    output:
+        rpm_file = OPJ("{trimmer}", aligner, f"mapped_{genome}", "feature_count",
+            "all_{read_type}_on_%s" % genome, "{biotype}_{orientation}_RPM.txt"),
+    run:
+        # Reading column counts from {input.counts_table}
+        counts_data = pd.read_table(
+            input.counts_data,
+            index_col="gene")
+        # Reading number of protein_coding fwd mappers from {input.summary_table}
+        norm = pd.read_table(input.summary_table, index_col=0).loc["protein_coding"]
+        # Computing counts per million protein_coding fwd mappers
+        RPM = 1000000 * counts_data / norm
+        RPM.to_csv(output.rpm_file, sep="\t")
+
+# TODO: add other steps found in RNA-seq pipeline ?
+
 rule compute_median_ratio_to_pseudo_ref_size_factors:
    input:
        counts_table = rules.gather_counts.output.counts_table,