Skip to content
Snippets Groups Projects
Commit 4f12e074 authored by Blaise Li's avatar Blaise Li
Browse files

Make miRNA splitting optional.

parent 4b4fb422
No related branches found
No related tags found
No related merge requests found
...@@ -5,6 +5,7 @@ It generates separate gtf files for the different transcript types. ...@@ -5,6 +5,7 @@ It generates separate gtf files for the different transcript types.
It pickles a python dictionary converting from gene_id to gene_name. It pickles a python dictionary converting from gene_id to gene_name.
""" """
import argparse
from os.path import splitext from os.path import splitext
import sys import sys
from collections import defaultdict from collections import defaultdict
...@@ -15,23 +16,56 @@ from pickle import dump, HIGHEST_PROTOCOL ...@@ -15,23 +16,56 @@ from pickle import dump, HIGHEST_PROTOCOL
def default_gene_name_getter(annots, *_):
    """
    Simply use the "gene_name" element of *annots* as gene name.

    *annots* is a mapping of annotation keys to values; a KeyError is
    raised if it has no "gene_name" entry.
    The extra positional arguments are there for signature compatibility
    with the other name getters and are ignored.
    """
    return annots["gene_name"]
def mir_name_getter(annots, gene_id):
    """
    Generate miRNA name with either "-5p" or "-3p" suffix.

    The name is based on the "gene_name" element of *annots*.
    *gene_id* should actually be the transcript_id,
    ending with either "a" (giving "-5p") or "b" (giving "-3p").

    Raises ValueError if *gene_id* ends with neither "a" nor "b",
    including when it is an empty string.
    """
    # str.endswith is safe on an empty string, whereas gene_id[-1] would
    # raise IndexError instead of the ValueError announced below.
    if gene_id.endswith("a"):
        return annots["gene_name"] + "-5p"
    if gene_id.endswith("b"):
        return annots["gene_name"] + "-3p"
    raise ValueError("miRNA transcript id should end with either 'a' or 'b'")
in_gtf = sys.argv[1]
def main():
"""
Main function of the script.
"""
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
"-g", "--in_gtf",
required=True,
help="Path to the main gtf file.")
parser.add_argument(
"-s", "--split_mi",
action="store_true",
default=False,
help="To have the -5p and -3p distinguished "
"in the resulting annotations.")
args = parser.parse_args()
in_gtf = args.in_gtf
base, ext = splitext(in_gtf) base, ext = splitext(in_gtf)
biotype_counter = defaultdict(int) biotype_counter = defaultdict(int)
biotypes2id = { biotypes2id = {
"rRNA": "gene_id", "rRNA": "gene_id",
"miRNA": "gene_id",
# Use "transcript_id" to have distinct 3p and 5p entries # Use "transcript_id" to have distinct 3p and 5p entries
"miRNA": "transcript_id", # "miRNA": "transcript_id",
"piRNA": "gene_id", "piRNA": "gene_id",
"snRNA": "gene_id", "snRNA": "gene_id",
"lincRNA": "gene_id", "lincRNA": "gene_id",
...@@ -42,9 +76,13 @@ biotypes2id = { ...@@ -42,9 +76,13 @@ biotypes2id = {
"ncRNA": "gene_id", "ncRNA": "gene_id",
"pseudogene": "gene_id"} "pseudogene": "gene_id"}
gene_name_getters = { gene_name_getters = {
"miRNA": mir_name_getter #"miRNA": mir_name_getter
} }
if args.split_mi:
biotypes2id["miRNA"] = "transcript_id"
gene_name_getters["miRNA"] = mir_name_getter
# Mapping gene_id to gene_name # Mapping gene_id to gene_name
id2name = {} id2name = {}
...@@ -55,7 +93,7 @@ start = 0 ...@@ -55,7 +93,7 @@ start = 0
with ExitStack() as stack, open(in_gtf, "r") as gtf: with ExitStack() as stack, open(in_gtf, "r") as gtf:
dest_files = { dest_files = {
biotype: stack.enter_context(open(f"{base}_{biotype}{ext}", "w")) biotype: stack.enter_context(open(f"{base}_{biotype}{ext}", "w"))
for biotype in biotypes2id.keys() for biotype in biotypes2id
} }
for line in gtf: for line in gtf:
fields = line.strip().split("\t") fields = line.strip().split("\t")
...@@ -69,9 +107,9 @@ with ExitStack() as stack, open(in_gtf, "r") as gtf: ...@@ -69,9 +107,9 @@ with ExitStack() as stack, open(in_gtf, "r") as gtf:
start = int(fields[3]) start = int(fields[3])
# We only use "transcript" annotation. # We only use "transcript" annotation.
if fields[2] == "transcript": if fields[2] == "transcript":
annots = dict([ annots = {
(k, v.strip('"')) for (k, v) k: v.strip('"') for (k, v)
in [f.split(" ") for f in fields[8].rstrip(";").split("; ")]]) in [f.split(" ") for f in fields[8].rstrip(";").split("; ")]}
if "exon_number" not in annots: if "exon_number" not in annots:
biotype = annots["gene_biotype"] biotype = annots["gene_biotype"]
biotype_counter[biotype] += 1 biotype_counter[biotype] += 1
...@@ -80,7 +118,8 @@ with ExitStack() as stack, open(in_gtf, "r") as gtf: ...@@ -80,7 +118,8 @@ with ExitStack() as stack, open(in_gtf, "r") as gtf:
max_lengths[biotype] = max(length, max_lengths[biotype]) max_lengths[biotype] = max(length, max_lengths[biotype])
gene_id = annots[biotypes2id[biotype]] gene_id = annots[biotypes2id[biotype]]
if gene_id in id2name: if gene_id in id2name:
assert annots["gene_name"] == id2name[gene_id], "Gene %s already registered with another name." % gene_id msg = f"Gene {gene_id} already registered with another name."
assert annots["gene_name"] == id2name[gene_id], msg
else: else:
post_process_name = gene_name_getters.get( post_process_name = gene_name_getters.get(
biotype, default_gene_name_getter) biotype, default_gene_name_getter)
...@@ -103,4 +142,4 @@ with open(f"{base}_id2name.pickle", "wb") as pickle_file: ...@@ -103,4 +142,4 @@ with open(f"{base}_id2name.pickle", "wb") as pickle_file:
for (biotype, count) in biotype_counter.items(): for (biotype, count) in biotype_counter.items():
print(biotype, count, f"(max length = {max_lengths[biotype]})", sep="\t") print(biotype, count, f"(max length = {max_lengths[biotype]})", sep="\t")
sys.exit(0) sys.exit(main())
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or sign in to comment