Skip to content
Snippets Groups Projects
Commit 4b4fb422 authored by Blaise Li's avatar Blaise Li
Browse files

Use transcript_id as gene_id for miRNA.

The goal is to have distinct counts for -5p and -3p miRNAs.
parent 9154ba5c
No related branches found
No related tags found
No related merge requests found
...@@ -30,9 +30,8 @@ base, ext = splitext(in_gtf) ...@@ -30,9 +30,8 @@ base, ext = splitext(in_gtf)
biotype_counter = defaultdict(int) biotype_counter = defaultdict(int)
biotypes2id = { biotypes2id = {
"rRNA": "gene_id", "rRNA": "gene_id",
# TODO: use "transcript_id" to have distinct 3p and 5p entries # Use "transcript_id" to have distinct 3p and 5p entries
# "miRNA": "transcript_id", "miRNA": "transcript_id",
"miRNA": "gene_id",
"piRNA": "gene_id", "piRNA": "gene_id",
"snRNA": "gene_id", "snRNA": "gene_id",
"lincRNA": "gene_id", "lincRNA": "gene_id",
...@@ -43,8 +42,7 @@ biotypes2id = { ...@@ -43,8 +42,7 @@ biotypes2id = {
"ncRNA": "gene_id", "ncRNA": "gene_id",
"pseudogene": "gene_id"} "pseudogene": "gene_id"}
gene_name_getters = { gene_name_getters = {
# TODO: use this "miRNA": mir_name_getter
# "miRNA": mir_name_getter
} }
# Mapping gene_id to gene_name # Mapping gene_id to gene_name
...@@ -73,10 +71,9 @@ with ExitStack() as stack, open(in_gtf, "r") as gtf: ...@@ -73,10 +71,9 @@ with ExitStack() as stack, open(in_gtf, "r") as gtf:
if fields[2] == "transcript": if fields[2] == "transcript":
annots = dict([ annots = dict([
(k, v.strip('"')) for (k, v) (k, v.strip('"')) for (k, v)
in [f.split(" ") for f in fields[8].split("; ")[:-1]]]) in [f.split(" ") for f in fields[8].rstrip(";").split("; ")]])
if "exon_number" not in annots: if "exon_number" not in annots:
biotype = annots["gene_biotype"] biotype = annots["gene_biotype"]
dest_files[biotype].write(line)
biotype_counter[biotype] += 1 biotype_counter[biotype] += 1
end = int(fields[4]) end = int(fields[4])
length = end + 1 - start length = end + 1 - start
...@@ -88,6 +85,15 @@ with ExitStack() as stack, open(in_gtf, "r") as gtf: ...@@ -88,6 +85,15 @@ with ExitStack() as stack, open(in_gtf, "r") as gtf:
post_process_name = gene_name_getters.get( post_process_name = gene_name_getters.get(
biotype, default_gene_name_getter) biotype, default_gene_name_getter)
id2name[gene_id] = post_process_name(annots, gene_id) id2name[gene_id] = post_process_name(annots, gene_id)
# We want miRNA gene_id and gene_name modified
# dest_files[biotype].write(line)
annots["gene_id"] = gene_id
annots["gene_name"] = id2name[gene_id]
fields[8] = "; ".join([
f"{k} \"{v}\""
for (k, v) in annots.items()
]) + ";"
dest_files[biotype].write("\t".join(fields) + "\n")
with open(f"{base}_id2name.pickle", "wb") as pickle_file: with open(f"{base}_id2name.pickle", "wb") as pickle_file:
print(f"Storing id2name in {pickle_file.name}") print(f"Storing id2name in {pickle_file.name}")
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment