Tryng to fix gene id conversion for miRNAs.

8919f01f · Blaise Li · 34dcef40 · 8919f01f · 8919f01f
Commit 8919f01f authored 2 years ago by Blaise Li
--- a/Genome_preparation/extract_info_from_gtf.py
+++ b/Genome_preparation/extract_info_from_gtf.py
@@ -134,6 +134,9 @@ def main():
                    ]) + ";"
                    dest_files[biotype].write("\t".join(fields) + "\n")
+    # Note that "convert_dir" is not set in the the genome config.
+    # This is just one converter, hopefully accessible via the "converter" config element.
+    # The other converters will be obtained from the libcelegans package.
    with open(f"{base}_id2name.pickle", "wb") as pickle_file:
        print(f"Storing id2name in {pickle_file.name}")
        dump(id2name, pickle_file, HIGHEST_PROTOCOL)

--- a/small_RNA-seq/small_RNA-seq.snakefile
+++ b/small_RNA-seq/small_RNA-seq.snakefile
@@ -435,8 +435,21 @@ genome_db = genome_dict["db"][aligner]
 genome_binned = genome_dict["binned"]
 annot_dir = genome_dict["annot_dir"]
 exon_lengths_file = OPJ(annot_dir, "union_exon_lengths.txt"),
-# TODO: figure out the difference between OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]
+# What are the difference between
+# OPJ(convert_dir, "wormid2name.pickle") and genome_dict["converter"]?
+# /!\ gene_ids_data_dir contains more conversion dicts,
+# but is not influenced by genome preparation customization,
+# like splitting of miRNAs into 3p and 5p.
 convert_dir = genome_dict.get("convert_dir", gene_ids_data_dir)
+# For wormid2name, load in priority the one
+# that might contain custom gene names, like for splitted miRNAs
+with open(
+        genome_dict.get(
+            "converter",
+            OPJ(convert_dir, "wormid2name.pickle")),
+        "rb") as dict_file:
+    wormid2name = load(dict_file)
 gene_lists_dir = genome_dict["gene_lists_dir"]
 avail_id_lists = set(glob(OPJ(gene_lists_dir, "*_ids.txt")))
 index = genome_db
@@ -2373,8 +2386,9 @@ rule compute_RPM_folds:
            ######################
            with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
                lfc = lfc.assign(cosmid=lfc.apply(column_converter(load(dict_file)), axis=1))
-            with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
+            #with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
-                lfc = lfc.assign(name=lfc.apply(column_converter(load(dict_file)), axis=1))
+            #    lfc = lfc.assign(name=lfc.apply(column_converter(load(dict_file)), axis=1))
+            lfc = lfc.assign(name=lfc.apply(column_converter(wormid2name), axis=1))
            # logfile.write(f"Columns in lfc are: {lfc.columns}\n")
            # logfile.write(f"Index in lfc is: {lfc.index.name}\n")
            assert lfc.index.name == "gene", f"Wrong index: {lfc.index.name}"
@@ -2483,8 +2497,9 @@ rule compute_remapped_RPM_folds:
            ######################
            with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
                lfc = lfc.assign(cosmid=lfc.apply(column_converter(load(dict_file)), axis=1))
-            with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
+            #with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
-                lfc = lfc.assign(name=lfc.apply(column_converter(load(dict_file)), axis=1))
+            #    lfc = lfc.assign(name=lfc.apply(column_converter(load(dict_file)), axis=1))
+            lfc = lfc.assign(name=lfc.apply(column_converter(wormid2name), axis=1))
            #logfile.write("Adding small read type info\n")
            ##pd.concat((counts_data.loc[common], RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t")
            #pd.concat((RPM, add_tags_column(lfc, input.tags_table, "small_type")), axis=1).to_csv(output.fold_results, sep="\t")
@@ -3891,8 +3906,9 @@ rule small_RNA_differential_expression:
                    ######################
                    with open(OPJ(convert_dir, "wormid2cosmid.pickle"), "rb") as dict_file:
                        res = res.assign(cosmid=res.apply(column_converter(load(dict_file)), axis=1))
-                    with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
+                    #with open(OPJ(convert_dir, "wormid2name.pickle"), "rb") as dict_file:
-                        res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1))
+                    #    res = res.assign(name=res.apply(column_converter(load(dict_file)), axis=1))
+                    res = res.assign(name=res.apply(column_converter(wormid2name), axis=1))
                    ##########################################
                    # res.to_csv(output.deseq_results, sep="\t", na_rep="NA", decimal=",")
                    res.to_csv(output.deseq_results, sep="\t", na_rep="NA")