Make miRNA splitting optionnal.

4f12e074 · Blaise Li · 4b4fb422 · 4f12e074
Commit 4f12e074 authored 2 years ago by Blaise Li
--- a/Genome_preparation/extract_info_from_gtf.py
+++ b/Genome_preparation/extract_info_from_gtf.py
@@ -5,6 +5,7 @@ It generates separate gtf files for the different transcript types.
 It pickles a python dictionary converting from gene_id to gene_name.
 """

+import argparse
 from os.path import splitext
 import sys
 from collections import defaultdict
@@ -15,92 +16,130 @@ from pickle import dump, HIGHEST_PROTOCOL


 def default_gene_name_getter(annots, *_):
+    """
+    Simply use the "gene_name" element of *annots* as gene name.
+
+    The extra arguments are there for signature compatibility and are ignored.
+    """
    return annots["gene_name"]


 def mir_name_getter(annots, gene_id):
+    """
+    Generate miRNA name with either "-5p" or "-3p" suffix.
+
+    The name is based on the "gene_name" element of *annots*.
+
+    *gene_id* should actually be the transcript_id,
+    ending with either "a" or "b".
+    """
    if gene_id[-1] == "a":
        return annots["gene_name"] + "-5p"
    if gene_id[-1] == "b":
        return annots["gene_name"] + "-3p"
    raise ValueError("miRNA transcript id should end with either 'a' or 'b'")

-in_gtf = sys.argv[1]
-base, ext = splitext(in_gtf)
-biotype_counter = defaultdict(int)
-biotypes2id = {
-    "rRNA": "gene_id",
-    # Use "transcript_id" to have distinct 3p and 5p entries
-    "miRNA": "transcript_id",
-    "piRNA": "gene_id",
-    "snRNA": "gene_id",
-    "lincRNA": "gene_id",
-    "tRNA": "gene_id",
-    "snoRNA": "gene_id",
-    "protein_coding": "gene_id",
-    "antisense": "gene_id",
-    "ncRNA": "gene_id",
-    "pseudogene": "gene_id"}
-gene_name_getters = {
-    "miRNA": mir_name_getter
-}
-
-# Mapping gene_id to gene_name
-id2name = {}
-
-max_lengths = defaultdict(int)
-# To check that it is sorted
-chromosomes = [None]
-start = 0
-with ExitStack() as stack, open(in_gtf, "r") as gtf:
-    dest_files = {
-        biotype: stack.enter_context(open(f"{base}_{biotype}{ext}", "w"))
-        for biotype in biotypes2id.keys()
+
+def main():
+    """
+    Main function of the script.
+    """
+    parser = argparse.ArgumentParser(
+        description=__doc__,
+        formatter_class=argparse.ArgumentDefaultsHelpFormatter)
+    parser.add_argument(
+        "-g", "--in_gtf",
+        required=True,
+        help="Path to the main gtf file.")
+    parser.add_argument(
+        "-s", "--split_mi",
+        action="store_true",
+        default=False,
+        help="To have the -5p and -3p distinguished "
+        "in the resulting annotations.")
+    args = parser.parse_args()
+    in_gtf = args.in_gtf
+    base, ext = splitext(in_gtf)
+    biotype_counter = defaultdict(int)
+    biotypes2id = {
+        "rRNA": "gene_id",
+        "miRNA": "gene_id",
+        # Use "transcript_id" to have distinct 3p and 5p entries
+        # "miRNA": "transcript_id",
+        "piRNA": "gene_id",
+        "snRNA": "gene_id",
+        "lincRNA": "gene_id",
+        "tRNA": "gene_id",
+        "snoRNA": "gene_id",
+        "protein_coding": "gene_id",
+        "antisense": "gene_id",
+        "ncRNA": "gene_id",
+        "pseudogene": "gene_id"}
+    gene_name_getters = {
+        #"miRNA": mir_name_getter
    }
-    for line in gtf:
-        fields = line.strip().split("\t")
-        chrom = fields[0]
-        if chrom != chromosomes[-1]:
-            assert chrom not in chromosomes, "Chromosomes do not seem to be grouped."
-            chromosomes.append(chrom)
-            start = 0
-        else:
-            assert int(fields[3]) >= start, "Positions do not seem to be sorted."
-        start = int(fields[3])
-        # We only use "transcript" annotation.
-        if fields[2] == "transcript":
-            annots = dict([
-                (k, v.strip('"')) for (k, v)
-                in [f.split(" ") for f in fields[8].rstrip(";").split("; ")]])
-            if "exon_number" not in annots:
-                biotype = annots["gene_biotype"]
-                biotype_counter[biotype] += 1
-                end = int(fields[4])
-                length = end + 1 - start
-                max_lengths[biotype] = max(length, max_lengths[biotype])
-                gene_id = annots[biotypes2id[biotype]]
-                if gene_id in id2name:
-                    assert annots["gene_name"] == id2name[gene_id], "Gene %s already registered with another name." % gene_id
-                else:
-                    post_process_name = gene_name_getters.get(
-                        biotype, default_gene_name_getter)
-                    id2name[gene_id] = post_process_name(annots, gene_id)
-                # We want miRNA gene_id and gene_name modified
-                # dest_files[biotype].write(line)
-                annots["gene_id"] = gene_id
-                annots["gene_name"] = id2name[gene_id]
-                fields[8] = "; ".join([
-                    f"{k} \"{v}\""
-                    for (k, v) in annots.items()
-                ]) + ";"
-                dest_files[biotype].write("\t".join(fields) + "\n")
-
-with open(f"{base}_id2name.pickle", "wb") as pickle_file:
-    print(f"Storing id2name in {pickle_file.name}")
-    dump(id2name, pickle_file, HIGHEST_PROTOCOL)
-
-# Inform user about number of entries.
-for (biotype, count) in biotype_counter.items():
-    print(biotype, count, f"(max length = {max_lengths[biotype]})", sep="\t")
-
-sys.exit(0)
+
+    if args.split_mi:
+        biotypes2id["miRNA"] = "transcript_id"
+        gene_name_getters["miRNA"] = mir_name_getter
+
+    # Mapping gene_id to gene_name
+    id2name = {}
+
+    max_lengths = defaultdict(int)
+    # To check that it is sorted
+    chromosomes = [None]
+    start = 0
+    with ExitStack() as stack, open(in_gtf, "r") as gtf:
+        dest_files = {
+            biotype: stack.enter_context(open(f"{base}_{biotype}{ext}", "w"))
+            for biotype in biotypes2id
+        }
+        for line in gtf:
+            fields = line.strip().split("\t")
+            chrom = fields[0]
+            if chrom != chromosomes[-1]:
+                assert chrom not in chromosomes, "Chromosomes do not seem to be grouped."
+                chromosomes.append(chrom)
+                start = 0
+            else:
+                assert int(fields[3]) >= start, "Positions do not seem to be sorted."
+            start = int(fields[3])
+            # We only use "transcript" annotation.
+            if fields[2] == "transcript":
+                annots = {
+                    k: v.strip('"') for (k, v)
+                    in [f.split(" ") for f in fields[8].rstrip(";").split("; ")]}
+                if "exon_number" not in annots:
+                    biotype = annots["gene_biotype"]
+                    biotype_counter[biotype] += 1
+                    end = int(fields[4])
+                    length = end + 1 - start
+                    max_lengths[biotype] = max(length, max_lengths[biotype])
+                    gene_id = annots[biotypes2id[biotype]]
+                    if gene_id in id2name:
+                        msg = f"Gene {gene_id} already registered with another name."
+                        assert annots["gene_name"] == id2name[gene_id], msg
+                    else:
+                        post_process_name = gene_name_getters.get(
+                            biotype, default_gene_name_getter)
+                        id2name[gene_id] = post_process_name(annots, gene_id)
+                    # We want miRNA gene_id and gene_name modified
+                    # dest_files[biotype].write(line)
+                    annots["gene_id"] = gene_id
+                    annots["gene_name"] = id2name[gene_id]
+                    fields[8] = "; ".join([
+                        f"{k} \"{v}\""
+                        for (k, v) in annots.items()
+                    ]) + ";"
+                    dest_files[biotype].write("\t".join(fields) + "\n")
+
+    with open(f"{base}_id2name.pickle", "wb") as pickle_file:
+        print(f"Storing id2name in {pickle_file.name}")
+        dump(id2name, pickle_file, HIGHEST_PROTOCOL)
+
+    # Inform user about number of entries.
+    for (biotype, count) in biotype_counter.items():
+        print(biotype, count, f"(max length = {max_lengths[biotype]})", sep="\t")
+
+sys.exit(main())