Skip to content
Snippets Groups Projects
Commit bd7d9415 authored by Blaise Li's avatar Blaise Li
Browse files

Fixing all_si_22G crashes.

parent 313d0ce7
No related branches found
No related tags found
No related merge requests found
......@@ -1705,57 +1705,66 @@ rule gather_small_RNA_counts:
counts_table = OPJ(
annot_counts_dir,
"all_{mapped_type}_on_%s" % genome, "{small_type}_counts.txt"),
log:
log = OPJ(log_dir, "gather_small_RNA_counts", "{mapped_type}_{small_type}.log"),
run:
# Gathering the counts data
############################
counts_files = (OPJ(
annot_counts_dir,
f"{cond_name}_{wildcards.mapped_type}_on_{genome}",
f"{wildcards.small_type}_counts.txt") for cond_name in COND_NAMES)
if wildcards.small_type == "pi":
drop = ["piRNA"]
elif wildcards.small_type == "mi":
drop = ["miRNA"]
elif wildcards.small_type in {
*{f"prot_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"prot_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = ["protein_coding_CDS", "protein_coding_UTR", "protein_coding_exon_junction", "protein_coding_pure_intron"]
elif wildcards.small_type in {
*{f"te_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"te_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = ["DNA_transposons_rmsk", "RNA_transposons_rmsk"]
elif wildcards.small_type in {
*{f"pseu_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"pseu_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = ["pseudogene"]
elif wildcards.small_type in {
*{f"satel_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"satel_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = ["satellites_rmsk"]
elif wildcards.small_type in {
*{f"simrep_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"simrep_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = ["simple_repeats_rmsk"]
elif wildcards.small_type in {"all_si", "all_siu"}:
drop = ["all_siRNA"]
else:
raise NotImplementedError("Unknown read type: %s" % wildcards.small_type)
counts_data = pd.concat(
(pd.read_table(
counts_file,
index_col=0,
na_filter=False).drop(drop, errors="ignore") for counts_file in counts_files),
axis=1).fillna(0).astype(int)
counts_data.columns = COND_NAMES
# Simple_repeat|Simple_repeat|(TTTTTTG)n:1
# Simple_repeat|Simple_repeat|(TTTTTTG)n:2
# Simple_repeat|Simple_repeat|(TTTTTTG)n:3
# Simple_repeat|Simple_repeat|(TTTTTTG)n:4
# -> Simple_repeat|Simple_repeat|(TTTTTTG)n
if wildcards.small_type in RMSK_SISIU_TYPES:
counts_data = sum_by_family(counts_data)
counts_data.index.names = ["gene"]
counts_data.to_csv(output.counts_table, sep="\t")
with open(log.log, "w") as logfile:
# Gathering the counts data
############################
counts_files = (OPJ(
annot_counts_dir,
f"{cond_name}_{wildcards.mapped_type}_on_{genome}",
f"{wildcards.small_type}_counts.txt") for cond_name in COND_NAMES)
if wildcards.small_type == "pi":
drop = ["piRNA"]
elif wildcards.small_type == "mi":
drop = ["miRNA"]
elif wildcards.small_type in {
*{f"prot_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"prot_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = ["protein_coding_CDS", "protein_coding_UTR", "protein_coding_exon_junction", "protein_coding_pure_intron"]
elif wildcards.small_type in {
*{f"te_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"te_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = ["DNA_transposons_rmsk", "RNA_transposons_rmsk"]
elif wildcards.small_type in {
*{f"pseu_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"pseu_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = ["pseudogene"]
elif wildcards.small_type in {
*{f"satel_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"satel_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = ["satellites_rmsk"]
elif wildcards.small_type in {
*{f"simrep_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"simrep_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = ["simple_repeats_rmsk"]
elif wildcards.small_type in {
*{f"all_si_{suffix}" for suffix in SI_SUFFIXES},
*{f"all_siu_{suffix}" for suffix in SI_SUFFIXES}}:
drop = [f"all_si_{suffix}RNA" for suffix in SI_SUFFIXES]
elif wildcards.small_type in {"all_si", "all_siu"}:
drop = ["all_siRNA"]
else:
# Unknown read type: all_si_22G
raise NotImplementedError("Unknown read type: %s" % wildcards.small_type)
logfile.write(f"Will drop {drop} for {wildcards.small_type}.\n")
counts_data = pd.concat(
(pd.read_table(
counts_file,
index_col=0,
na_filter=False).drop(drop, errors="ignore") for counts_file in counts_files),
axis=1).fillna(0).astype(int)
counts_data.columns = COND_NAMES
# Simple_repeat|Simple_repeat|(TTTTTTG)n:1
# Simple_repeat|Simple_repeat|(TTTTTTG)n:2
# Simple_repeat|Simple_repeat|(TTTTTTG)n:3
# Simple_repeat|Simple_repeat|(TTTTTTG)n:4
# -> Simple_repeat|Simple_repeat|(TTTTTTG)n
if wildcards.small_type in RMSK_SISIU_TYPES:
counts_data = sum_by_family(counts_data)
counts_data.index.names = ["gene"]
counts_data.to_csv(output.counts_table, sep="\t")
def counts_to_join(wildcards):
......@@ -1960,11 +1969,15 @@ rule associate_small_type:
tags_table.to_csv(output.tags_table, sep="\t")
def add_tags_column(data, tags_table, tag_name):
# TODO: check what this does with index column name (can it loose "gene"?)
def add_tags_column(data, tags_table, tag_name, logfile=None):
"""Adds a column *tag_name* to *data* based on the DataFrame *tag_table*
associating tags to row names."""
# Why "inner" ?
df = pd.concat((data, pd.read_table(tags_table, index_col=0)), join="inner", axis=1)
if logfile is not None:
logfile.write(f"Index in data is: {data.index.name}\n")
logfile.write(f"Index in df is: {df.index.name}\n")
df.columns = (*data.columns, tag_name)
return df
......@@ -2180,6 +2193,9 @@ rule compute_RPM_folds:
input.counts_table,
usecols=["gene", *column_list],
index_col="gene")
assert counts_data.index.name == "gene", f"Wrong index: {counts_data.index.name}"
# logfile.write(f"Columns in counts_data are: {counts_data.columns}")
# logfile.write(f"Index in counts_data is: {counts_data.index.name}")
logfile.write(f"Found {len(counts_data.index)} genes\n")
#logfile.write(f"Reading exon lengths from {exon_lengths_file}\n")
#exon_lengths = pd.read_table(
......@@ -2196,6 +2212,9 @@ rule compute_RPM_folds:
logfile.write(str(norm))
logfile.write("Computing counts per million non-structural mappers\n")
RPM = 1000000 * counts_data / norm
# logfile.write(f"Columns in RPM are: {RPM.columns}\n")
# logfile.write(f"Index in RPM is: {RPM.index.name}\n")
assert RPM.index.name == "gene", f"Wrong index: {RPM.index.name}"
logfile.write("Computing RPM log2 fold changes\n")
# Maybe not a very sensible way to compute the mean folds: it pairs replicates, but the pairing is arbitrary
# TODO: Should we compute the fold of the means instead?
......@@ -2212,9 +2231,28 @@ rule compute_RPM_folds:
lfc = lfc.assign(cosmid=lfc.apply(column_converter(load(dict_file)), axis=1))
with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
lfc = lfc.assign(name=lfc.apply(column_converter(load(dict_file)), axis=1))
logfile.write("Adding small read type info\n")
# logfile.write(f"Columns in lfc are: {lfc.columns}\n")
# logfile.write(f"Index in lfc is: {lfc.index.name}\n")
assert lfc.index.name == "gene", f"Wrong index: {lfc.index.name}"
logfile.write(f"Adding small read type info from {input.tags_table}\n")
#pd.concat((counts_data.loc[common], RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t")
pd.concat((RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t")
tags_table = add_tags_column(lfc, input.tags_table, "small_type", logfile)
logfile.write(f"Columns in tags_table are: {tags_table.columns}\n")
logfile.write(f"Index in tags_table is: {tags_table.index.name}\n")
lfc_idx = lfc.index
RPM_idx = RPM.index
tags_table_idx = tags_table.index
lfc_xor_RPM = lfc_idx.symmetric_difference(RPM_idx)
lfc_xor_tags_table = lfc_idx.symmetric_difference(tags_table_idx)
RPM_xor_tags_table = RPM_idx.symmetric_difference(tags_table_idx)
logfile.write(f"Index difference:\nlfc_xor_RPM: {lfc_xor_RPM}\nlfc_xor_tags_table: {lfc_xor_tags_table}\nRPM_xor_tags_table: {RPM_xor_tags_table}\n")
# with_tags = pd.concat((RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1)
with_tags = pd.concat((RPM, tags_table), axis=1)
logfile.write(f"Columns in with_tags are: {with_tags.columns}\n")
logfile.write(f"Index in with_tags is: {with_tags.index.name}\n")
# pd.concat((RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t")
logfile.write(f"Then writing to {output.fold_results}\n")
with_tags.to_csv(output.fold_results, sep="\t")
rule gather_RPM_folds:
......@@ -2233,12 +2271,17 @@ rule gather_RPM_folds:
benchmark:
OPJ(log_dir, "gather_RPM_folds", "{small_type}_benchmark.txt"),
run:
pd.DataFrame({contrast : pd.read_table(
OPJ(mapping_dir, f"RPM_folds_{size_selected}",
f"{contrast}", f"{contrast}_{wildcards.small_type}_RPM_folds.txt"),
index_col=["gene", "cosmid", "name", "small_type"],
usecols=["gene", "cosmid", "name", "small_type", "mean_log2_RPM_fold"])["mean_log2_RPM_fold"] for contrast in IP_CONTRASTS}).to_csv(
output.all_folds, sep="\t")
with open(log.log, "w") as logfile:
actual_input = [
OPJ(mapping_dir, f"RPM_folds_{size_selected}",
f"{contrast}", f"{contrast}_{wildcards.small_type}_RPM_folds.txt") for contrast in IP_CONTRASTS]
logfile.write(f"Gathering RPM folds from:\n{actual_input}\nShould be from:\n{input.fold_results}\n")
pd.DataFrame({contrast : pd.read_table(
OPJ(mapping_dir, f"RPM_folds_{size_selected}",
f"{contrast}", f"{contrast}_{wildcards.small_type}_RPM_folds.txt"),
index_col=["gene", "cosmid", "name", "small_type"],
usecols=["gene", "cosmid", "name", "small_type", "mean_log2_RPM_fold"])["mean_log2_RPM_fold"] for contrast in IP_CONTRASTS}).to_csv(
output.all_folds, sep="\t")
# TODO: update consumers with mapping_type
......
Loading…
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment