Skip to content
Snippets Groups Projects
Commit 9154ba5c authored by Blaise Li's avatar Blaise Li
Browse files

Script extracting gtf info switched to python3.

parent cc85b753
Branches
No related tags found
No related merge requests found
#!/usr/bin/env python2
# vim: set fileencoding=<utf-8> :
"""This script extracts various information from an Ensembl gtf annotation file.
#!/usr/bin/env python3
"""
This script extracts various information from an Ensembl gtf annotation file.
It generates separate gtf files for the different transcript types.
It pickles a python dictionary converting from gene_id to gene_name."""
# TODO: convert to python3
from __future__ import print_function
It pickles a python dictionary converting from gene_id to gene_name.
"""
from os.path import splitext
import sys
from collections import defaultdict
from contextlib import ExitStack
# To store dictionaries
from cPickle import dump, HIGHEST_PROTOCOL
from pickle import dump, HIGHEST_PROTOCOL
def default_gene_name_getter(annots, *_):
return annots["gene_name"]
def mir_name_getter(annots, gene_id):
if gene_id[-1] == "a":
return annots["gene_name"] + "-5p"
if gene_id[-1] == "b":
return annots["gene_name"] + "-3p"
raise ValueError("miRNA transcript id should end with either 'a' or 'b'")
in_gtf = sys.argv[1]
base, ext = splitext(in_gtf)
biotype_counter = defaultdict(int)
# TODO: Use ExitStack to have a context manager.
# Open gtf files
biotypes = {"rRNA" : open("%s_rRNA%s" % (base, ext), "w"),
"miRNA" : open("%s_miRNA%s" % (base, ext), "w"),
"piRNA" : open("%s_piRNA%s" % (base, ext), "w"),
"snRNA" : open("%s_snRNA%s" % (base, ext), "w"),
"lincRNA" : open("%s_lincRNA%s" % (base, ext), "w"),
"tRNA" : open("%s_tRNA%s" % (base, ext), "w"),
"snoRNA" : open("%s_snoRNA%s" % (base, ext), "w"),
"protein_coding" : open("%s_protein_coding%s" % (base, ext), "w"),
"antisense" : open("%s_antisense%s" % (base, ext), "w"),
"ncRNA" : open("%s_ncRNA%s" % (base, ext), "w"),
"pseudogene" : open("%s_pseudogene%s" % (base, ext), "w")}
biotypes2id = {
"rRNA": "gene_id",
# TODO: use "transcript_id" to have distinct 3p and 5p entries
# "miRNA": "transcript_id",
"miRNA": "gene_id",
"piRNA": "gene_id",
"snRNA": "gene_id",
"lincRNA": "gene_id",
"tRNA": "gene_id",
"snoRNA": "gene_id",
"protein_coding": "gene_id",
"antisense": "gene_id",
"ncRNA": "gene_id",
"pseudogene": "gene_id"}
gene_name_getters = {
# TODO: use this
# "miRNA": mir_name_getter
}
# Mapping gene_id to gene_name
id2name = {}
......@@ -39,7 +54,11 @@ max_lengths = defaultdict(int)
# To check that it is sorted
chromosomes = [None]
start = 0
with open(in_gtf, "r") as gtf:
with ExitStack() as stack, open(in_gtf, "r") as gtf:
dest_files = {
biotype: stack.enter_context(open(f"{base}_{biotype}{ext}", "w"))
for biotype in biotypes2id.keys()
}
for line in gtf:
fields = line.strip().split("\t")
chrom = fields[0]
......@@ -52,27 +71,30 @@ with open(in_gtf, "r") as gtf:
start = int(fields[3])
# We only use "transcript" annotation.
if fields[2] == "transcript":
annots = dict([(k, v.strip('"')) for (k, v) in [f.split(" ") for f in fields[8].split("; ")[:-1]]])
annots = dict([
(k, v.strip('"')) for (k, v)
in [f.split(" ") for f in fields[8].split("; ")[:-1]]])
if "exon_number" not in annots:
biotype = annots["gene_biotype"]
biotypes[biotype].write(line)
dest_files[biotype].write(line)
biotype_counter[biotype] += 1
end = int(fields[4])
length = end + 1 - start
max_lengths[biotype] = max(length, max_lengths[biotype])
gene_id = annots["gene_id"]
gene_id = annots[biotypes2id[biotype]]
if gene_id in id2name:
assert annots["gene_name"] == id2name[gene_id], "Gene %s already registered with another name." % gene_id
else:
id2name[gene_id] = annots["gene_name"]
post_process_name = gene_name_getters.get(
biotype, default_gene_name_getter)
id2name[gene_id] = post_process_name(annots, gene_id)
with open("%s_id2name.pickle" % base, "w") as pickle_file:
print("Storing id2name in %s" % pickle_file.name)
with open(f"{base}_id2name.pickle", "wb") as pickle_file:
print(f"Storing id2name in {pickle_file.name}")
dump(id2name, pickle_file, HIGHEST_PROTOCOL)
# Close gtf files and inform user about number of entries.
# Inform user about number of entries.
for (biotype, count) in biotype_counter.items():
print(biotype, count, "(max length = %d)" % max_lengths[biotype], sep="\t")
biotypes[biotype].close()
print(biotype, count, f"(max length = {max_lengths[biotype]})", sep="\t")
sys.exit(0)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment