Skip to content
Snippets Groups Projects
Commit 4f12e074 authored by Blaise Li's avatar Blaise Li
Browse files

Make miRNA splitting optionnal.

parent 4b4fb422
No related branches found
No related tags found
No related merge requests found
......@@ -5,6 +5,7 @@ It generates separate gtf files for the different transcript types.
It pickles a python dictionary converting from gene_id to gene_name.
"""
import argparse
from os.path import splitext
import sys
from collections import defaultdict
......@@ -15,92 +16,130 @@ from pickle import dump, HIGHEST_PROTOCOL
def default_gene_name_getter(annots, *_):
"""
Simply use the "gene_name" element of *annots* as gene name.
The extra arguments are there for signature compatibility and are ignored.
"""
return annots["gene_name"]
def mir_name_getter(annots, gene_id):
"""
Generate miRNA name with either "-5p" or "-3p" suffix.
The name is based on the "gene_name" element of *annots*.
*gene_id* should actually be the transcript_id,
ending with either "a" or "b".
"""
if gene_id[-1] == "a":
return annots["gene_name"] + "-5p"
if gene_id[-1] == "b":
return annots["gene_name"] + "-3p"
raise ValueError("miRNA transcript id should end with either 'a' or 'b'")
in_gtf = sys.argv[1]
base, ext = splitext(in_gtf)
biotype_counter = defaultdict(int)
biotypes2id = {
"rRNA": "gene_id",
# Use "transcript_id" to have distinct 3p and 5p entries
"miRNA": "transcript_id",
"piRNA": "gene_id",
"snRNA": "gene_id",
"lincRNA": "gene_id",
"tRNA": "gene_id",
"snoRNA": "gene_id",
"protein_coding": "gene_id",
"antisense": "gene_id",
"ncRNA": "gene_id",
"pseudogene": "gene_id"}
gene_name_getters = {
"miRNA": mir_name_getter
}
# Mapping gene_id to gene_name
id2name = {}
max_lengths = defaultdict(int)
# To check that it is sorted
chromosomes = [None]
start = 0
with ExitStack() as stack, open(in_gtf, "r") as gtf:
dest_files = {
biotype: stack.enter_context(open(f"{base}_{biotype}{ext}", "w"))
for biotype in biotypes2id.keys()
def main():
"""
Main function of the script.
"""
parser = argparse.ArgumentParser(
description=__doc__,
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument(
"-g", "--in_gtf",
required=True,
help="Path to the main gtf file.")
parser.add_argument(
"-s", "--split_mi",
action="store_true",
default=False,
help="To have the -5p and -3p distinguished "
"in the resulting annotations.")
args = parser.parse_args()
in_gtf = args.in_gtf
base, ext = splitext(in_gtf)
biotype_counter = defaultdict(int)
biotypes2id = {
"rRNA": "gene_id",
"miRNA": "gene_id",
# Use "transcript_id" to have distinct 3p and 5p entries
# "miRNA": "transcript_id",
"piRNA": "gene_id",
"snRNA": "gene_id",
"lincRNA": "gene_id",
"tRNA": "gene_id",
"snoRNA": "gene_id",
"protein_coding": "gene_id",
"antisense": "gene_id",
"ncRNA": "gene_id",
"pseudogene": "gene_id"}
gene_name_getters = {
#"miRNA": mir_name_getter
}
for line in gtf:
fields = line.strip().split("\t")
chrom = fields[0]
if chrom != chromosomes[-1]:
assert chrom not in chromosomes, "Chromosomes do not seem to be grouped."
chromosomes.append(chrom)
start = 0
else:
assert int(fields[3]) >= start, "Positions do not seem to be sorted."
start = int(fields[3])
# We only use "transcript" annotation.
if fields[2] == "transcript":
annots = dict([
(k, v.strip('"')) for (k, v)
in [f.split(" ") for f in fields[8].rstrip(";").split("; ")]])
if "exon_number" not in annots:
biotype = annots["gene_biotype"]
biotype_counter[biotype] += 1
end = int(fields[4])
length = end + 1 - start
max_lengths[biotype] = max(length, max_lengths[biotype])
gene_id = annots[biotypes2id[biotype]]
if gene_id in id2name:
assert annots["gene_name"] == id2name[gene_id], "Gene %s already registered with another name." % gene_id
else:
post_process_name = gene_name_getters.get(
biotype, default_gene_name_getter)
id2name[gene_id] = post_process_name(annots, gene_id)
# We want miRNA gene_id and gene_name modified
# dest_files[biotype].write(line)
annots["gene_id"] = gene_id
annots["gene_name"] = id2name[gene_id]
fields[8] = "; ".join([
f"{k} \"{v}\""
for (k, v) in annots.items()
]) + ";"
dest_files[biotype].write("\t".join(fields) + "\n")
with open(f"{base}_id2name.pickle", "wb") as pickle_file:
print(f"Storing id2name in {pickle_file.name}")
dump(id2name, pickle_file, HIGHEST_PROTOCOL)
# Inform user about number of entries.
for (biotype, count) in biotype_counter.items():
print(biotype, count, f"(max length = {max_lengths[biotype]})", sep="\t")
sys.exit(0)
if args.split_mi:
biotypes2id["miRNA"] = "transcript_id"
gene_name_getters["miRNA"] = mir_name_getter
# Mapping gene_id to gene_name
id2name = {}
max_lengths = defaultdict(int)
# To check that it is sorted
chromosomes = [None]
start = 0
with ExitStack() as stack, open(in_gtf, "r") as gtf:
dest_files = {
biotype: stack.enter_context(open(f"{base}_{biotype}{ext}", "w"))
for biotype in biotypes2id
}
for line in gtf:
fields = line.strip().split("\t")
chrom = fields[0]
if chrom != chromosomes[-1]:
assert chrom not in chromosomes, "Chromosomes do not seem to be grouped."
chromosomes.append(chrom)
start = 0
else:
assert int(fields[3]) >= start, "Positions do not seem to be sorted."
start = int(fields[3])
# We only use "transcript" annotation.
if fields[2] == "transcript":
annots = {
k: v.strip('"') for (k, v)
in [f.split(" ") for f in fields[8].rstrip(";").split("; ")]}
if "exon_number" not in annots:
biotype = annots["gene_biotype"]
biotype_counter[biotype] += 1
end = int(fields[4])
length = end + 1 - start
max_lengths[biotype] = max(length, max_lengths[biotype])
gene_id = annots[biotypes2id[biotype]]
if gene_id in id2name:
msg = f"Gene {gene_id} already registered with another name."
assert annots["gene_name"] == id2name[gene_id], msg
else:
post_process_name = gene_name_getters.get(
biotype, default_gene_name_getter)
id2name[gene_id] = post_process_name(annots, gene_id)
# We want miRNA gene_id and gene_name modified
# dest_files[biotype].write(line)
annots["gene_id"] = gene_id
annots["gene_name"] = id2name[gene_id]
fields[8] = "; ".join([
f"{k} \"{v}\""
for (k, v) in annots.items()
]) + ";"
dest_files[biotype].write("\t".join(fields) + "\n")
with open(f"{base}_id2name.pickle", "wb") as pickle_file:
print(f"Storing id2name in {pickle_file.name}")
dump(id2name, pickle_file, HIGHEST_PROTOCOL)
# Inform user about number of entries.
for (biotype, count) in biotype_counter.items():
print(biotype, count, f"(max length = {max_lengths[biotype]})", sep="\t")
sys.exit(main())
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment