diff --git a/Genome_preparation/extract_info_from_gtf.py b/Genome_preparation/extract_info_from_gtf.py index 859b2e2579dc41c7e7ca57342c1e1664b118c6b0..e61773a4bf452c8b1973f2c02b30240e5ce3d9a5 100755 --- a/Genome_preparation/extract_info_from_gtf.py +++ b/Genome_preparation/extract_info_from_gtf.py @@ -5,6 +5,7 @@ It generates separate gtf files for the different transcript types. It pickles a python dictionary converting from gene_id to gene_name. """ +import argparse from os.path import splitext import sys from collections import defaultdict @@ -15,92 +16,130 @@ from pickle import dump, HIGHEST_PROTOCOL def default_gene_name_getter(annots, *_): + """ + Simply use the "gene_name" element of *annots* as gene name. + + The extra arguments are there for signature compatibility and are ignored. + """ return annots["gene_name"] def mir_name_getter(annots, gene_id): + """ + Generate miRNA name with either "-5p" or "-3p" suffix. + + The name is based on the "gene_name" element of *annots*. + + *gene_id* should actually be the transcript_id, + ending with either "a" or "b". + """ if gene_id[-1] == "a": return annots["gene_name"] + "-5p" if gene_id[-1] == "b": return annots["gene_name"] + "-3p" raise ValueError("miRNA transcript id should end with either 'a' or 'b'") -in_gtf = sys.argv[1] -base, ext = splitext(in_gtf) -biotype_counter = defaultdict(int) -biotypes2id = { - "rRNA": "gene_id", - # Use "transcript_id" to have distinct 3p and 5p entries - "miRNA": "transcript_id", - "piRNA": "gene_id", - "snRNA": "gene_id", - "lincRNA": "gene_id", - "tRNA": "gene_id", - "snoRNA": "gene_id", - "protein_coding": "gene_id", - "antisense": "gene_id", - "ncRNA": "gene_id", - "pseudogene": "gene_id"} -gene_name_getters = { - "miRNA": mir_name_getter -} - -# Mapping gene_id to gene_name -id2name = {} - -max_lengths = defaultdict(int) -# To check that it is sorted -chromosomes = [None] -start = 0 -with ExitStack() as stack, open(in_gtf, "r") as gtf: - dest_files = { - biotype: stack.enter_context(open(f"{base}_{biotype}{ext}", "w")) - for biotype in biotypes2id.keys() + +def main(): + """ + Main function of the script. + """ + parser = argparse.ArgumentParser( + description=__doc__, + formatter_class=argparse.ArgumentDefaultsHelpFormatter) + parser.add_argument( + "-g", "--in_gtf", + required=True, + help="Path to the main gtf file.") + parser.add_argument( + "-s", "--split_mi", + action="store_true", + default=False, + help="To have the -5p and -3p distinguished " + "in the resulting annotations.") + args = parser.parse_args() + in_gtf = args.in_gtf + base, ext = splitext(in_gtf) + biotype_counter = defaultdict(int) + biotypes2id = { + "rRNA": "gene_id", + "miRNA": "gene_id", + # Use "transcript_id" to have distinct 3p and 5p entries + # "miRNA": "transcript_id", + "piRNA": "gene_id", + "snRNA": "gene_id", + "lincRNA": "gene_id", + "tRNA": "gene_id", + "snoRNA": "gene_id", + "protein_coding": "gene_id", + "antisense": "gene_id", + "ncRNA": "gene_id", + "pseudogene": "gene_id"} + gene_name_getters = { + #"miRNA": mir_name_getter } - for line in gtf: - fields = line.strip().split("\t") - chrom = fields[0] - if chrom != chromosomes[-1]: - assert chrom not in chromosomes, "Chromosomes do not seem to be grouped." - chromosomes.append(chrom) - start = 0 - else: - assert int(fields[3]) >= start, "Positions do not seem to be sorted." - start = int(fields[3]) - # We only use "transcript" annotation. - if fields[2] == "transcript": - annots = dict([ - (k, v.strip('"')) for (k, v) - in [f.split(" ") for f in fields[8].rstrip(";").split("; ")]]) - if "exon_number" not in annots: - biotype = annots["gene_biotype"] - biotype_counter[biotype] += 1 - end = int(fields[4]) - length = end + 1 - start - max_lengths[biotype] = max(length, max_lengths[biotype]) - gene_id = annots[biotypes2id[biotype]] - if gene_id in id2name: - assert annots["gene_name"] == id2name[gene_id], "Gene %s already registered with another name." % gene_id - else: - post_process_name = gene_name_getters.get( - biotype, default_gene_name_getter) - id2name[gene_id] = post_process_name(annots, gene_id) - # We want miRNA gene_id and gene_name modified - # dest_files[biotype].write(line) - annots["gene_id"] = gene_id - annots["gene_name"] = id2name[gene_id] - fields[8] = "; ".join([ - f"{k} \"{v}\"" - for (k, v) in annots.items() - ]) + ";" - dest_files[biotype].write("\t".join(fields) + "\n") - -with open(f"{base}_id2name.pickle", "wb") as pickle_file: - print(f"Storing id2name in {pickle_file.name}") - dump(id2name, pickle_file, HIGHEST_PROTOCOL) - -# Inform user about number of entries. -for (biotype, count) in biotype_counter.items(): - print(biotype, count, f"(max length = {max_lengths[biotype]})", sep="\t") - -sys.exit(0) + + if args.split_mi: + biotypes2id["miRNA"] = "transcript_id" + gene_name_getters["miRNA"] = mir_name_getter + + # Mapping gene_id to gene_name + id2name = {} + + max_lengths = defaultdict(int) + # To check that it is sorted + chromosomes = [None] + start = 0 + with ExitStack() as stack, open(in_gtf, "r") as gtf: + dest_files = { + biotype: stack.enter_context(open(f"{base}_{biotype}{ext}", "w")) + for biotype in biotypes2id + } + for line in gtf: + fields = line.strip().split("\t") + chrom = fields[0] + if chrom != chromosomes[-1]: + assert chrom not in chromosomes, "Chromosomes do not seem to be grouped." + chromosomes.append(chrom) + start = 0 + else: + assert int(fields[3]) >= start, "Positions do not seem to be sorted." + start = int(fields[3]) + # We only use "transcript" annotation. + if fields[2] == "transcript": + annots = { + k: v.strip('"') for (k, v) + in [f.split(" ") for f in fields[8].rstrip(";").split("; ")]} + if "exon_number" not in annots: + biotype = annots["gene_biotype"] + biotype_counter[biotype] += 1 + end = int(fields[4]) + length = end + 1 - start + max_lengths[biotype] = max(length, max_lengths[biotype]) + gene_id = annots[biotypes2id[biotype]] + if gene_id in id2name: + msg = f"Gene {gene_id} already registered with another name." + assert annots["gene_name"] == id2name[gene_id], msg + else: + post_process_name = gene_name_getters.get( + biotype, default_gene_name_getter) + id2name[gene_id] = post_process_name(annots, gene_id) + # We want miRNA gene_id and gene_name modified + # dest_files[biotype].write(line) + annots["gene_id"] = gene_id + annots["gene_name"] = id2name[gene_id] + fields[8] = "; ".join([ + f"{k} \"{v}\"" + for (k, v) in annots.items() + ]) + ";" + dest_files[biotype].write("\t".join(fields) + "\n") + + with open(f"{base}_id2name.pickle", "wb") as pickle_file: + print(f"Storing id2name in {pickle_file.name}") + dump(id2name, pickle_file, HIGHEST_PROTOCOL) + + # Inform user about number of entries. + for (biotype, count) in biotype_counter.items(): + print(biotype, count, f"(max length = {max_lengths[biotype]})", sep="\t") + +sys.exit(main())