Skip to content
Snippets Groups Projects
Commit 36d33ac2 authored by Blaise Li's avatar Blaise Li
Browse files

Add an option to treat all features as exonic.

This is used for spike-ins, whose gtf file does not contain "exon"
features.
parent 5ac2b62e
No related branches found
No related tags found
No related merge requests found
from subprocess import check_output from subprocess import check_output
__copyright__ = "Copyright (C) 2020-2021 Blaise Li" __copyright__ = "Copyright (C) 2020-2022 Blaise Li"
__licence__ = "GNU GPLv3" __licence__ = "GNU GPLv3"
__version__ = 0.6 __version__ = 0.7
BEDTOOLS_VERSION = check_output( BEDTOOLS_VERSION = check_output(
["bedtools", "--version"]).decode("utf-8").strip().split()[-1] ["bedtools", "--version"]).decode("utf-8").strip().split()[-1]
......
...@@ -141,10 +141,20 @@ class Gene(): ...@@ -141,10 +141,20 @@ class Gene():
self.exons, OVERLAP).nodes())) self.exons, OVERLAP).nodes()))
def gtf_2_genes_exon_lengths(gtf_filename): def gtf_2_genes_exon_lengths(gtf_filename, direct_len=False):
"""Returns a pandas DataFrame where union exon lengths are associated to gene IDs.""" """
Return a pandas DataFrame where union exon lengths are associated to gene IDs.
If *direct_len* is set to `True`, features in the gtf file are assumed to be
"all exonic" and their length is taken directly without further controls.
This can be used for instance when dealing with spike-ins.
"""
gtf_file = open(gtf_filename, "r") gtf_file = open(gtf_filename, "r")
gtf = BedTool(gtf_file) gtf = BedTool(gtf_file)
if direct_len:
return pd.DataFrame(pd.Series(
{feature.attrs["gene_id"] : len(feature) for feature in gtf.features()},
name=("union_exon_len")).rename_axis("gene"))
genes = {} genes = {}
for feature in gtf.features(): for feature in gtf.features():
feat_type = feature[2] feat_type = feature[2]
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment