diff --git a/Genome_preparation/extract_info_from_gtf.py b/Genome_preparation/extract_info_from_gtf.py index 724749ec1d426c617fdcbd4010516be6a1918414..859b2e2579dc41c7e7ca57342c1e1664b118c6b0 100755 --- a/Genome_preparation/extract_info_from_gtf.py +++ b/Genome_preparation/extract_info_from_gtf.py @@ -30,9 +30,8 @@ base, ext = splitext(in_gtf) biotype_counter = defaultdict(int) biotypes2id = { "rRNA": "gene_id", - # TODO: use "transcript_id" to have distinct 3p and 5p entries - # "miRNA": "transcript_id", - "miRNA": "gene_id", + # Use "transcript_id" to have distinct 3p and 5p entries + "miRNA": "transcript_id", "piRNA": "gene_id", "snRNA": "gene_id", "lincRNA": "gene_id", @@ -43,8 +42,7 @@ biotypes2id = { "ncRNA": "gene_id", "pseudogene": "gene_id"} gene_name_getters = { - # TODO: use this - # "miRNA": mir_name_getter + "miRNA": mir_name_getter } # Mapping gene_id to gene_name @@ -73,10 +71,9 @@ with ExitStack() as stack, open(in_gtf, "r") as gtf: if fields[2] == "transcript": annots = dict([ (k, v.strip('"')) for (k, v) - in [f.split(" ") for f in fields[8].split("; ")[:-1]]]) + in [f.split(" ") for f in fields[8].rstrip(";").split("; ")]]) if "exon_number" not in annots: biotype = annots["gene_biotype"] - dest_files[biotype].write(line) biotype_counter[biotype] += 1 end = int(fields[4]) length = end + 1 - start @@ -88,6 +85,15 @@ with ExitStack() as stack, open(in_gtf, "r") as gtf: post_process_name = gene_name_getters.get( biotype, default_gene_name_getter) id2name[gene_id] = post_process_name(annots, gene_id) + # We want miRNA gene_id and gene_name modified + # dest_files[biotype].write(line) + annots["gene_id"] = gene_id + annots["gene_name"] = id2name[gene_id] + fields[8] = "; ".join([ + f"{k} \"{v}\"" + for (k, v) in annots.items() + ]) + ";" + dest_files[biotype].write("\t".join(fields) + "\n") with open(f"{base}_id2name.pickle", "wb") as pickle_file: print(f"Storing id2name in {pickle_file.name}")