Use transcript_id as gene_id for miRNA.

The goal is to have distinct counts for -5p and -3p miRNAs.

Use transcript_id as gene_id for miRNA.
4b4fb422 · Blaise Li · 9154ba5c · 4b4fb422
Commit 4b4fb422 authored 2 years ago by Blaise Li
--- a/Genome_preparation/extract_info_from_gtf.py
+++ b/Genome_preparation/extract_info_from_gtf.py
@@ -30,9 +30,8 @@ base, ext = splitext(in_gtf)
 biotype_counter = defaultdict(int)
 biotypes2id = {
    "rRNA": "gene_id",
-    # TODO: use "transcript_id" to have distinct 3p and 5p entries
+    # Use "transcript_id" to have distinct 3p and 5p entries
-    # "miRNA": "transcript_id",
+    "miRNA": "transcript_id",
-    "miRNA": "gene_id",
    "piRNA": "gene_id",
    "snRNA": "gene_id",
    "lincRNA": "gene_id",
@@ -43,8 +42,7 @@ biotypes2id = {
    "ncRNA": "gene_id",
    "pseudogene": "gene_id"}
 gene_name_getters = {
-    # TODO: use this
+    "miRNA": mir_name_getter
-    # "miRNA": mir_name_getter
 }
 # Mapping gene_id to gene_name
@@ -73,10 +71,9 @@ with ExitStack() as stack, open(in_gtf, "r") as gtf:
        if fields[2] == "transcript":
            annots = dict([
                (k, v.strip('"')) for (k, v)
-                in [f.split(" ") for f in fields[8].split("; ")[:-1]]])
+                in [f.split(" ") for f in fields[8].rstrip(";").split("; ")]])
            if "exon_number" not in annots:
                biotype = annots["gene_biotype"]
-                dest_files[biotype].write(line)
                biotype_counter[biotype] += 1
                end = int(fields[4])
                length = end + 1 - start
@@ -88,6 +85,15 @@ with ExitStack() as stack, open(in_gtf, "r") as gtf:
                    post_process_name = gene_name_getters.get(
                        biotype, default_gene_name_getter)
                    id2name[gene_id] = post_process_name(annots, gene_id)
+                # We want miRNA gene_id and gene_name modified
+                # dest_files[biotype].write(line)
+                annots["gene_id"] = gene_id
+                annots["gene_name"] = id2name[gene_id]
+                fields[8] = "; ".join([
+                    f"{k} \"{v}\""
+                    for (k, v) in annots.items()
+                ]) + ";"
+                dest_files[biotype].write("\t".join(fields) + "\n")
 with open(f"{base}_id2name.pickle", "wb") as pickle_file:
    print(f"Storing id2name in {pickle_file.name}")