From 36d33ac2c366c0f0702ab50fdcab37261add7b9c Mon Sep 17 00:00:00 2001 From: Blaise Li <blaise.li__git@nsup.org> Date: Tue, 25 Oct 2022 09:44:46 +0200 Subject: [PATCH] Option to consider features all-exonic. This is used for spike-ins, whose gtf file does not contain "exon" features. --- libhts/__init__.py | 4 ++-- libhts/libhts.py | 14 ++++++++++++-- 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/libhts/__init__.py b/libhts/__init__.py index 3ad8378..1315de5 100644 --- a/libhts/__init__.py +++ b/libhts/__init__.py @@ -1,8 +1,8 @@ from subprocess import check_output -__copyright__ = "Copyright (C) 2020-2021 Blaise Li" +__copyright__ = "Copyright (C) 2020-2022 Blaise Li" __licence__ = "GNU GPLv3" -__version__ = 0.6 +__version__ = 0.7 BEDTOOLS_VERSION = check_output( ["bedtools", "--version"]).decode("utf-8").strip().split()[-1] diff --git a/libhts/libhts.py b/libhts/libhts.py index a7eff6e..8b9c621 100644 --- a/libhts/libhts.py +++ b/libhts/libhts.py @@ -141,10 +141,20 @@ class Gene(): self.exons, OVERLAP).nodes())) -def gtf_2_genes_exon_lengths(gtf_filename): - """Returns a pandas DataFrame where union exon lengths are associated to gene IDs.""" +def gtf_2_genes_exon_lengths(gtf_filename, direct_len=False): + """ + Return a pandas DataFrame where union exon lengths are associated to gene IDs. + + If *direct_len* is set to `True`, features in the gtf file are assumed to be + "all exonic" and their length is taken directly without further controls. + This can be used for instance when dealing with spike-ins. + """ gtf_file = open(gtf_filename, "r") gtf = BedTool(gtf_file) + if direct_len: + return pd.DataFrame(pd.Series( + {feature.attrs["gene_id"] : len(feature) for feature in gtf.features()}, + name=("union_exon_len")).rename_axis("gene")) genes = {} for feature in gtf.features(): feat_type = feature[2] -- GitLab