Script extracting gtf info switched to python3.

9154ba5c · Blaise Li · cc85b753 · 9154ba5c
Commit 9154ba5c authored 2 years ago by Blaise Li
--- a/Genome_preparation/extract_info_from_gtf.py
+++ b/Genome_preparation/extract_info_from_gtf.py
-#!/usr/bin/env python2
-# vim: set fileencoding=<utf-8> :
-"""This script extracts various information from an Ensembl gtf annotation file.
+#!/usr/bin/env python3
+"""
+This script extracts various information from an Ensembl gtf annotation file.
 It generates separate gtf files for the different transcript types.
-It pickles a python dictionary converting from gene_id to gene_name."""
-
-# TODO: convert to python3
-from __future__ import print_function
+It pickles a python dictionary converting from gene_id to gene_name.
+"""

 from os.path import splitext
 import sys
 from collections import defaultdict
+from contextlib import ExitStack

 # To store dictionaries
-from cPickle import dump, HIGHEST_PROTOCOL
+from pickle import dump, HIGHEST_PROTOCOL
+
+
+def default_gene_name_getter(annots, *_):
+    return annots["gene_name"]
+

+def mir_name_getter(annots, gene_id):
+    if gene_id[-1] == "a":
+        return annots["gene_name"] + "-5p"
+    if gene_id[-1] == "b":
+        return annots["gene_name"] + "-3p"
+    raise ValueError("miRNA transcript id should end with either 'a' or 'b'")

 in_gtf = sys.argv[1]
 base, ext = splitext(in_gtf)
 biotype_counter = defaultdict(int)
-# TODO: Use ExitStack to have a context manager.
-# Open gtf files
-biotypes = {"rRNA" : open("%s_rRNA%s" % (base, ext), "w"),
-            "miRNA" : open("%s_miRNA%s" % (base, ext), "w"),
-            "piRNA" : open("%s_piRNA%s" % (base, ext), "w"),
-            "snRNA" : open("%s_snRNA%s" % (base, ext), "w"),
-            "lincRNA" : open("%s_lincRNA%s" % (base, ext), "w"),
-            "tRNA" : open("%s_tRNA%s" % (base, ext), "w"),
-            "snoRNA" : open("%s_snoRNA%s" % (base, ext), "w"),
-            "protein_coding" : open("%s_protein_coding%s" % (base, ext), "w"),
-            "antisense" : open("%s_antisense%s" % (base, ext), "w"),
-            "ncRNA" : open("%s_ncRNA%s" % (base, ext), "w"),
-            "pseudogene" : open("%s_pseudogene%s" % (base, ext), "w")}
+biotypes2id = {
+    "rRNA": "gene_id",
+    # TODO: use "transcript_id" to have distinct 3p and 5p entries
+    # "miRNA": "transcript_id",
+    "miRNA": "gene_id",
+    "piRNA": "gene_id",
+    "snRNA": "gene_id",
+    "lincRNA": "gene_id",
+    "tRNA": "gene_id",
+    "snoRNA": "gene_id",
+    "protein_coding": "gene_id",
+    "antisense": "gene_id",
+    "ncRNA": "gene_id",
+    "pseudogene": "gene_id"}
+gene_name_getters = {
+    # TODO: use this
+    # "miRNA": mir_name_getter
+}

 # Mapping gene_id to gene_name
 id2name = {}
@@ -39,7 +54,11 @@ max_lengths = defaultdict(int)
 # To check that it is sorted
 chromosomes = [None]
 start = 0
-with open(in_gtf, "r") as gtf:
+with ExitStack() as stack, open(in_gtf, "r") as gtf:
+    dest_files = {
+        biotype: stack.enter_context(open(f"{base}_{biotype}{ext}", "w"))
+        for biotype in biotypes2id.keys()
+    }
    for line in gtf:
        fields = line.strip().split("\t")
        chrom = fields[0]
@@ -52,27 +71,30 @@ with open(in_gtf, "r") as gtf:
        start = int(fields[3])
        # We only use "transcript" annotation.
        if fields[2] == "transcript":
-            annots = dict([(k, v.strip('"')) for (k, v) in [f.split(" ") for f in fields[8].split("; ")[:-1]]])
+            annots = dict([
+                (k, v.strip('"')) for (k, v)
+                in [f.split(" ") for f in fields[8].split("; ")[:-1]]])
            if "exon_number" not in annots:
                biotype = annots["gene_biotype"]
-                biotypes[biotype].write(line)
+                dest_files[biotype].write(line)
                biotype_counter[biotype] += 1
                end = int(fields[4])
                length = end + 1 - start
                max_lengths[biotype] = max(length, max_lengths[biotype])
-                gene_id = annots["gene_id"]
+                gene_id = annots[biotypes2id[biotype]]
                if gene_id in id2name:
                    assert annots["gene_name"] == id2name[gene_id], "Gene %s already registered with another name." % gene_id
                else:
-                    id2name[gene_id] = annots["gene_name"]
+                    post_process_name = gene_name_getters.get(
+                        biotype, default_gene_name_getter)
+                    id2name[gene_id] = post_process_name(annots, gene_id)

-with open("%s_id2name.pickle" % base, "w") as pickle_file:
-    print("Storing id2name in %s" % pickle_file.name)
+with open(f"{base}_id2name.pickle", "wb") as pickle_file:
+    print(f"Storing id2name in {pickle_file.name}")
    dump(id2name, pickle_file, HIGHEST_PROTOCOL)

-# Close gtf files and inform user about number of entries.
+# Inform user about number of entries.
 for (biotype, count) in biotype_counter.items():
-    print(biotype, count, "(max length = %d)" % max_lengths[biotype], sep="\t")
-    biotypes[biotype].close()
+    print(biotype, count, f"(max length = {max_lengths[biotype]})", sep="\t")

 sys.exit(0)