Skip to content
Snippets Groups Projects
Commit 9154ba5c authored by Blaise Li's avatar Blaise Li
Browse files

Script extracting gtf info switched to python3.

parent cc85b753
No related branches found
No related tags found
No related merge requests found
#!/usr/bin/env python2 #!/usr/bin/env python3
# vim: set fileencoding=<utf-8> : """
"""This script extracts various information from an Ensembl gtf annotation file. This script extracts various information from an Ensembl gtf annotation file.
It generates separate gtf files for the different transcript types. It generates separate gtf files for the different transcript types.
It pickles a python dictionary converting from gene_id to gene_name.""" It pickles a python dictionary converting from gene_id to gene_name.
"""
# TODO: convert to python3
from __future__ import print_function
from os.path import splitext from os.path import splitext
import sys import sys
from collections import defaultdict from collections import defaultdict
from contextlib import ExitStack
# To store dictionaries # To store dictionaries
from cPickle import dump, HIGHEST_PROTOCOL from pickle import dump, HIGHEST_PROTOCOL
def default_gene_name_getter(annots, *_):
return annots["gene_name"]
def mir_name_getter(annots, gene_id):
if gene_id[-1] == "a":
return annots["gene_name"] + "-5p"
if gene_id[-1] == "b":
return annots["gene_name"] + "-3p"
raise ValueError("miRNA transcript id should end with either 'a' or 'b'")
in_gtf = sys.argv[1] in_gtf = sys.argv[1]
base, ext = splitext(in_gtf) base, ext = splitext(in_gtf)
biotype_counter = defaultdict(int) biotype_counter = defaultdict(int)
# TODO: Use ExitStack to have a context manager. biotypes2id = {
# Open gtf files "rRNA": "gene_id",
biotypes = {"rRNA" : open("%s_rRNA%s" % (base, ext), "w"), # TODO: use "transcript_id" to have distinct 3p and 5p entries
"miRNA" : open("%s_miRNA%s" % (base, ext), "w"), # "miRNA": "transcript_id",
"piRNA" : open("%s_piRNA%s" % (base, ext), "w"), "miRNA": "gene_id",
"snRNA" : open("%s_snRNA%s" % (base, ext), "w"), "piRNA": "gene_id",
"lincRNA" : open("%s_lincRNA%s" % (base, ext), "w"), "snRNA": "gene_id",
"tRNA" : open("%s_tRNA%s" % (base, ext), "w"), "lincRNA": "gene_id",
"snoRNA" : open("%s_snoRNA%s" % (base, ext), "w"), "tRNA": "gene_id",
"protein_coding" : open("%s_protein_coding%s" % (base, ext), "w"), "snoRNA": "gene_id",
"antisense" : open("%s_antisense%s" % (base, ext), "w"), "protein_coding": "gene_id",
"ncRNA" : open("%s_ncRNA%s" % (base, ext), "w"), "antisense": "gene_id",
"pseudogene" : open("%s_pseudogene%s" % (base, ext), "w")} "ncRNA": "gene_id",
"pseudogene": "gene_id"}
gene_name_getters = {
# TODO: use this
# "miRNA": mir_name_getter
}
# Mapping gene_id to gene_name # Mapping gene_id to gene_name
id2name = {} id2name = {}
...@@ -39,7 +54,11 @@ max_lengths = defaultdict(int) ...@@ -39,7 +54,11 @@ max_lengths = defaultdict(int)
# To check that it is sorted # To check that it is sorted
chromosomes = [None] chromosomes = [None]
start = 0 start = 0
with open(in_gtf, "r") as gtf: with ExitStack() as stack, open(in_gtf, "r") as gtf:
dest_files = {
biotype: stack.enter_context(open(f"{base}_{biotype}{ext}", "w"))
for biotype in biotypes2id.keys()
}
for line in gtf: for line in gtf:
fields = line.strip().split("\t") fields = line.strip().split("\t")
chrom = fields[0] chrom = fields[0]
...@@ -52,27 +71,30 @@ with open(in_gtf, "r") as gtf: ...@@ -52,27 +71,30 @@ with open(in_gtf, "r") as gtf:
start = int(fields[3]) start = int(fields[3])
# We only use "transcript" annotation. # We only use "transcript" annotation.
if fields[2] == "transcript": if fields[2] == "transcript":
annots = dict([(k, v.strip('"')) for (k, v) in [f.split(" ") for f in fields[8].split("; ")[:-1]]]) annots = dict([
(k, v.strip('"')) for (k, v)
in [f.split(" ") for f in fields[8].split("; ")[:-1]]])
if "exon_number" not in annots: if "exon_number" not in annots:
biotype = annots["gene_biotype"] biotype = annots["gene_biotype"]
biotypes[biotype].write(line) dest_files[biotype].write(line)
biotype_counter[biotype] += 1 biotype_counter[biotype] += 1
end = int(fields[4]) end = int(fields[4])
length = end + 1 - start length = end + 1 - start
max_lengths[biotype] = max(length, max_lengths[biotype]) max_lengths[biotype] = max(length, max_lengths[biotype])
gene_id = annots["gene_id"] gene_id = annots[biotypes2id[biotype]]
if gene_id in id2name: if gene_id in id2name:
assert annots["gene_name"] == id2name[gene_id], "Gene %s already registered with another name." % gene_id assert annots["gene_name"] == id2name[gene_id], "Gene %s already registered with another name." % gene_id
else: else:
id2name[gene_id] = annots["gene_name"] post_process_name = gene_name_getters.get(
biotype, default_gene_name_getter)
id2name[gene_id] = post_process_name(annots, gene_id)
with open("%s_id2name.pickle" % base, "w") as pickle_file: with open(f"{base}_id2name.pickle", "wb") as pickle_file:
print("Storing id2name in %s" % pickle_file.name) print(f"Storing id2name in {pickle_file.name}")
dump(id2name, pickle_file, HIGHEST_PROTOCOL) dump(id2name, pickle_file, HIGHEST_PROTOCOL)
# Close gtf files and inform user about number of entries. # Inform user about number of entries.
for (biotype, count) in biotype_counter.items(): for (biotype, count) in biotype_counter.items():
print(biotype, count, "(max length = %d)" % max_lengths[biotype], sep="\t") print(biotype, count, f"(max length = {max_lengths[biotype]})", sep="\t")
biotypes[biotype].close()
sys.exit(0) sys.exit(0)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment