Folds of RPM for counts of remapped reads.

485dd97f · Blaise Li · e69418a4 · 485dd97f
Commit 485dd97f authored 6 years ago by Blaise Li
--- a/small_RNA-seq/small_RNA-seq.snakefile
+++ b/small_RNA-seq/small_RNA-seq.snakefile
@@ -359,7 +359,7 @@ mapping_dir = OPJ(output_dir, aligner, f"mapped_{genome}")
 reads_dir = OPJ(mapping_dir, "reads")
 annot_counts_dir = OPJ(mapping_dir, "annotation")
 feature_counts_dir = OPJ(mapping_dir, "feature_count")
-READ_TYPES_FOR_COUNTING = [size_selected, "all_sisiuRNA", "sisiuRNA"] + list(map(lambda x: "%s%s" % (x, "RNA"), ["prot_sisiu"]))
+READ_TYPES_FOR_COUNTING = ["all_sisiuRNA", "sisiuRNA"] + list(map(lambda x: "%s%s" % (x, "RNA"), ["prot_sisiu"]))
 # Actually, we don't need to enforce this: snakemake will map whatever read type is needed for counting
 assert set(READ_TYPES_FOR_COUNTING) < set(READ_TYPES_FOR_MAPPING), "Cannot count reads that were not mapped."
 # Reads remapped and counted using featureCounts
@@ -480,7 +480,8 @@ wildcard_constraints:
    biotype="|".join(set(COUNT_BIOTYPES + ANNOT_BIOTYPES)),
    id_list="|".join(GENE_LISTS),
    type_set="|".join(["all", "protein_coding", "protein_coding_TE"]),
-    small_type="[spm]i|pisimi|siu|prot_si|te_si|pseu_si|satel_si|simrep_si|prot_siu|te_siu|pseu_siu|satel_siu|simrep_siu|sisiu|all_si|all_siu|all_sisiu|" + "|".join(REMAPPED_COUNTED),
+    small_type="[spm]i|pisimi|siu|prot_si|te_si|pseu_si|satel_si|simrep_si|prot_siu|te_siu|pseu_siu|satel_siu|simrep_siu|sisiu|all_si|all_siu|all_sisiu",
+    # read_type = "|".join(REMAPPED_COUNTED),
    # read_type="|".join([
    #     "raw", "trimmed", "deduped", f"{size_selected}",
    #     "nomap", "nomap_siRNA", "mapped",
@@ -607,7 +608,22 @@ read_graphs = [
 ip_fold_heatmaps = []
 de_fold_heatmaps = []
 ip_fold_boxplots = []
+# Temporary, until used for boxplots:
+remapped_folds = []
 if contrasts_dict["ip"]:
+    remapped_folds = expand(
+        OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all", "{counted_type}_mean_log2_RPM_fold.txt"),
+        counted_type=REMAPPED_COUNTED),
+    # snakemake -n OK
+    # expand(
+    #     OPJ(mapping_dir, f"RPM_folds_{size_selected}", "all",
+    #         "{read_type}_on_%s_{biotype}_{orientation}_transcript_mean_log2_RPM_fold.txt" % genome),
+    #     read_type=READ_TYPES_FOR_COUNTING, biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS),
+    # snakemake -n OK
+    # expand(
+    #     OPJ(mapping_dir, f"RPM_folds_{size_selected}", "{contrast}",
+    #         "{contrast}_{small_type}_on_%s_{biotype}_{orientation}_transcript_RPM_folds.txt" % genome),
+    #     contrast=IP_CONTRASTS, small_type=READ_TYPES_FOR_COUNTING, biotype=COUNT_BIOTYPES, orientation=ORIENTATIONS),
    ip_fold_heatmaps = expand(
        OPJ(output_dir, "figures", "fold_heatmaps", "{small_type}_{fold_type}_heatmap.{fig_format}"),
        small_type=["pisimi", "prot_si"], fold_type=["mean_log2_RPM_fold"], fig_format=FIG_FORMATS)
@@ -714,12 +730,9 @@ rule all:
        expand(OPJ(mapping_dir, f"deseq2_{size_selected}", "all", "pisimi_{fold_type}.txt"), fold_type=["log2FoldChange"]),
        #expand(OPJ(mapping_dir, "RPM_folds_%s" % size_selected, "{contrast}", "{contrast}_{small_type}_RPM_folds.txt"), contrast=IP_CONTRASTS, small_type=DE_TYPES),
        exploratory_graphs,
+        remapped_folds,
        fold_boxplots,
        #expand(OPJ(output_dir, "figures", "{small_type}_unit_clustermap.{fig_format}"), small_type=SMALL_TYPES, fig_format=FIG_FORMATS),
-        # snakemake -n OK
-        expand(OPJ(
-            feature_counts_dir,
-            "all_{small_type}_counts.txt"), small_type=REMAPPED_COUNTED),
        #expand(OPJ(
        #    feature_counts_dir,
        #    "all_{read_type}_on_%s_{biotype}_{orientation}_transcript_counts.txt" % genome), read_type=READ_TYPES_FOR_COUNTING, biotype=ANNOT_BIOTYPES, orientation=ORIENTATIONS),
@@ -727,11 +740,6 @@ rule all:
        #     feature_counts_dir,
        #     "all_{small_type}_on_%s_{biotype}_{orientation}_transcript_counts.txt" % genome),
        #     small_type=READ_TYPES_FOR_MAPPING, biotype=set(COUNT_BIOTYPES + ANNOT_BIOTYPES), orientation=ORIENTATIONS),
-        # snakemake -n not OK (Missing input files for rule gather_small_RNA_counts)
-        #expand(
-        #    OPJ(annot_counts_dir, f"all_{size_selected}_on_{genome}", "{small_type}_RPM.txt"),
-        #    small_type=REMAPPED_COUNTED),
-        ##
        expand(
            OPJ(annot_counts_dir, f"all_{size_selected}_on_{genome}", "{small_type}_RPM.txt"),
            small_type=["mi", "prot_si", "te_si", "pseu_si", "satel_si", "simrep_si", "prot_siu", "te_siu", "pseu_siu",  "pisimi"]),
@@ -1075,8 +1083,8 @@ rule gather_feature_counts:
    input:
        counts_tables = expand(
            OPJ(feature_counts_dir,
-            "{lib}_{rep}_{{read_type}}_on_%s_{{biotype}}_{{orientation}}_{{feature_type}}_counts.txt" % genome),
+                "{cond_name}_{{read_type}}_on_%s_{{biotype}}_{{orientation}}_{{feature_type}}_counts.txt" % genome),
-            filtered_product, lib=LIBS, rep=REPS),
+            cond_name=COND_NAMES),
    output:
        counts_table = OPJ(
            feature_counts_dir,
@@ -1334,8 +1342,6 @@ rule join_si_reads:
 #     df = pd.pd.DataFrame(speeds).set_index(0)
-# Reminder:
-# annot_counts_dir = OPJ(mapping_dir, "annotation")
 rule gather_small_RNA_counts:
    """For a given small_type, gather counts from all libraries in one table."""
    input:
@@ -1803,6 +1809,8 @@ rule gather_RPM_folds:
    output:
        all_folds = OPJ(
            mapping_dir, f"RPM_folds_{size_selected}", "all", "{small_type}_mean_log2_RPM_fold.txt"),
+    wildcard_constraints:
+        small_type="[spm]i|pisimi|siu|prot_si|te_si|pseu_si|satel_si|simrep_si|prot_siu|te_siu|pseu_siu|satel_siu|simrep_siu|sisiu|all_si|all_siu|all_sisiu"
    log:
        log = OPJ(log_dir, "gather_RPM_folds", "{small_type}.log"),
    run:
@@ -1814,6 +1822,88 @@ rule gather_RPM_folds:
                output.all_folds, sep="\t")
+# TODO: replace counted_type with all subwildcards. Why does it work with cond_name?
+# Wildcards in input files cannot be determined from output files:
+# 'read_type'
+rule compute_remapped_RPM_folds:
+    input:
+        counts_table = rules.gather_feature_counts.output.counts_table,
+        #exon_lengths = rules.compute_genes_exon_lengths.output.exon_lengths,
+        summary_table = rules.gather_read_counts_summaries.output.summary_table,
+        #tags_table = rules.associate_small_type.output.tags_table,
+    output:
+        fold_results = OPJ(
+            mapping_dir, f"RPM_folds_{size_selected}",
+            "{contrast}", "{contrast}_{read_type}_on_%s_{biotype}_{orientation}_{feature_type}_RPM_folds.txt" % genome),
+    log:
+        log = OPJ(log_dir, "compute_RPM_folds", "{contrast}_{read_type}_on_%s_{biotype}_{orientation}_transcript.log" % genome),
+    run:
+        with open(log.log, "w") as logfile:
+            (cond, ref) = CONTRAST2PAIR[wildcards.contrast]
+            logfile.write(f"cond: {cond}, ref: {ref}\n")
+            column_list = expand("{lib}_{rep}", lib=[ref, cond], rep=REPS)
+            logfile.write(f"Reading columns {column_list} from {input.counts_table}\n")
+            counts_data = pd.read_table(
+                input.counts_table,
+                usecols=["gene", *column_list],
+                index_col="gene")
+            logfile.write(f"Found {len(counts_data.index)} genes\n")
+            #logfile.write(f"Reading exon lengths from {exon_lengths_file}\n")
+            #exon_lengths = pd.read_table(
+            #    input.exon_lengths_file,
+            #    index_col="gene")
+            #logfile.write(f"Found {len(exon_lengths.index)} genes\n")
+            #logfile.write("Determining set of common genes\n")
+            #common = counts_data.index.intersection(exon_lengths.index)
+            #logfile.write(f"Found {len(common)} common genes\n")
+            #logfile.write("Computing counts per kilobase\n")
+            #RPK = 1000 * counts_data.loc[common].div(exon_lengths.loc[common]["union_exon_len"], axis="index")
+            logfile.write(f"Reading number of non-structural mappers from {input.summary_table}\n")
+            norm = pd.read_table(input.summary_table, index_col=0).loc["non_structural"][column_list]
+            logfile.write(str(norm))
+            logfile.write("Computing counts per million non-structural mappers\n")
+            RPM = 1000000 * counts_data / norm
+            logfile.write("Computing RPM log2 fold changes\n")
+            # Add 0.25 as RPM "pseudo-count" (edgeR does this ? https://www.biostars.org/p/102568/#102604)
+            lfc = np.log2(pd.DataFrame(
+                {f"log2({cond}_{rep}/{ref}_{rep})" : (RPM[f"{cond}_{rep}"] + 0.25) / (RPM[f"{ref}_{rep}"] + 0.25) for rep in REPS}))
+            # Compute the mean
+            lfc = lfc.assign(mean_log2_RPM_fold=lfc.mean(axis=1))
+            # Converting gene IDs
+            ######################
+            with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
+                lfc = lfc.assign(cosmid=lfc.apply(column_converter(load(dict_file)), axis=1))
+            with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
+                lfc = lfc.assign(name=lfc.apply(column_converter(load(dict_file)), axis=1))
+            #logfile.write("Adding small read type info\n")
+            ##pd.concat((counts_data.loc[common], RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t")
+            #pd.concat((RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t")
+            pd.concat((RPM, lfc), axis=1).to_csv(output.fold_results, sep="\t")
+rule gather_remapped_RPM_folds:
+    """Gathers RPM folds across contrasts."""
+    input:
+        fold_results = expand(OPJ(
+            mapping_dir, f"RPM_folds_{size_selected}",
+            "{contrast}", "{contrast}_{{read_type}}_on_%s_{{biotype}}_{{orientation}}_transcript_RPM_folds.txt" % genome), contrast=IP_CONTRASTS),
+    output:
+        all_folds = OPJ(
+            mapping_dir, f"RPM_folds_{size_selected}", "all", "{read_type}_on_%s_{biotype}_{orientation}_transcript_mean_log2_RPM_fold.txt" % genome),
+    log:
+        log = OPJ(log_dir, "gather_RPM_folds", "{read_type}_on_%s_{biotype}_{orientation}_transcript.log" % genome),
+    run:
+        pd.DataFrame({contrast : pd.read_table(
+            # OPJ(mapping_dir, f"RPM_folds_{size_selected}",
+            #     f"{contrast}", f"{contrast}_{wildcards.counted_type}_RPM_folds.txt"),
+            OPJ(mapping_dir, f"RPM_folds_{size_selected}",
+                f"{contrast}", f"{contrast}_{wildcards.read_type}_on_{genome}_{wildcards.biotype}_{wildcards.orientation}_transcript_RPM_folds.txt"),
+            index_col=["gene", "cosmid", "name"],
+            usecols=["gene", "cosmid", "name", "mean_log2_RPM_fold"])["mean_log2_RPM_fold"] for contrast in IP_CONTRASTS}).to_csv(
+                output.all_folds, sep="\t")
+# TODO add remapped_RPM_folds
 def source_gathered_folds(wildcards):
    if wildcards.fold_type == "mean_log2_RPM_fold":
        return rules.gather_RPM_folds.output.all_folds