diff --git a/Genome_preparation/extract_info_from_gtf.py b/Genome_preparation/extract_info_from_gtf.py index 99549d0b692ded35c15028f4c0d90717d6954c05..724749ec1d426c617fdcbd4010516be6a1918414 100755 --- a/Genome_preparation/extract_info_from_gtf.py +++ b/Genome_preparation/extract_info_from_gtf.py @@ -1,36 +1,51 @@ -#!/usr/bin/env python2 -# vim: set fileencoding=<utf-8> : -"""This script extracts various information from an Ensembl gtf annotation file. +#!/usr/bin/env python3 +""" +This script extracts various information from an Ensembl gtf annotation file. It generates separate gtf files for the different transcript types. -It pickles a python dictionary converting from gene_id to gene_name.""" - -# TODO: convert to python3 -from __future__ import print_function +It pickles a python dictionary converting from gene_id to gene_name. +""" from os.path import splitext import sys from collections import defaultdict +from contextlib import ExitStack # To store dictionaries -from cPickle import dump, HIGHEST_PROTOCOL +from pickle import dump, HIGHEST_PROTOCOL + + +def default_gene_name_getter(annots, *_): + return annots["gene_name"] + +def mir_name_getter(annots, gene_id): + if gene_id[-1] == "a": + return annots["gene_name"] + "-5p" + if gene_id[-1] == "b": + return annots["gene_name"] + "-3p" + raise ValueError("miRNA transcript id should end with either 'a' or 'b'") in_gtf = sys.argv[1] base, ext = splitext(in_gtf) biotype_counter = defaultdict(int) -# TODO: Use ExitStack to have a context manager. -# Open gtf files -biotypes = {"rRNA" : open("%s_rRNA%s" % (base, ext), "w"), - "miRNA" : open("%s_miRNA%s" % (base, ext), "w"), - "piRNA" : open("%s_piRNA%s" % (base, ext), "w"), - "snRNA" : open("%s_snRNA%s" % (base, ext), "w"), - "lincRNA" : open("%s_lincRNA%s" % (base, ext), "w"), - "tRNA" : open("%s_tRNA%s" % (base, ext), "w"), - "snoRNA" : open("%s_snoRNA%s" % (base, ext), "w"), - "protein_coding" : open("%s_protein_coding%s" % (base, ext), "w"), - "antisense" : open("%s_antisense%s" % (base, ext), "w"), - "ncRNA" : open("%s_ncRNA%s" % (base, ext), "w"), - "pseudogene" : open("%s_pseudogene%s" % (base, ext), "w")} +biotypes2id = { + "rRNA": "gene_id", + # TODO: use "transcript_id" to have distinct 3p and 5p entries + # "miRNA": "transcript_id", + "miRNA": "gene_id", + "piRNA": "gene_id", + "snRNA": "gene_id", + "lincRNA": "gene_id", + "tRNA": "gene_id", + "snoRNA": "gene_id", + "protein_coding": "gene_id", + "antisense": "gene_id", + "ncRNA": "gene_id", + "pseudogene": "gene_id"} +gene_name_getters = { + # TODO: use this + # "miRNA": mir_name_getter +} # Mapping gene_id to gene_name id2name = {} @@ -39,7 +54,11 @@ max_lengths = defaultdict(int) # To check that it is sorted chromosomes = [None] start = 0 -with open(in_gtf, "r") as gtf: +with ExitStack() as stack, open(in_gtf, "r") as gtf: + dest_files = { + biotype: stack.enter_context(open(f"{base}_{biotype}{ext}", "w")) + for biotype in biotypes2id.keys() + } for line in gtf: fields = line.strip().split("\t") chrom = fields[0] @@ -52,27 +71,30 @@ with open(in_gtf, "r") as gtf: start = int(fields[3]) # We only use "transcript" annotation. if fields[2] == "transcript": - annots = dict([(k, v.strip('"')) for (k, v) in [f.split(" ") for f in fields[8].split("; ")[:-1]]]) + annots = dict([ + (k, v.strip('"')) for (k, v) + in [f.split(" ") for f in fields[8].split("; ")[:-1]]]) if "exon_number" not in annots: biotype = annots["gene_biotype"] - biotypes[biotype].write(line) + dest_files[biotype].write(line) biotype_counter[biotype] += 1 end = int(fields[4]) length = end + 1 - start max_lengths[biotype] = max(length, max_lengths[biotype]) - gene_id = annots["gene_id"] + gene_id = annots[biotypes2id[biotype]] if gene_id in id2name: assert annots["gene_name"] == id2name[gene_id], "Gene %s already registered with another name." % gene_id else: - id2name[gene_id] = annots["gene_name"] + post_process_name = gene_name_getters.get( + biotype, default_gene_name_getter) + id2name[gene_id] = post_process_name(annots, gene_id) -with open("%s_id2name.pickle" % base, "w") as pickle_file: - print("Storing id2name in %s" % pickle_file.name) +with open(f"{base}_id2name.pickle", "wb") as pickle_file: + print(f"Storing id2name in {pickle_file.name}") dump(id2name, pickle_file, HIGHEST_PROTOCOL) -# Close gtf files and inform user about number of entries. +# Inform user about number of entries. for (biotype, count) in biotype_counter.items(): - print(biotype, count, "(max length = %d)" % max_lengths[biotype], sep="\t") - biotypes[biotype].close() + print(biotype, count, f"(max length = {max_lengths[biotype]})", sep="\t") sys.exit(0)