From 36d33ac2c366c0f0702ab50fdcab37261add7b9c Mon Sep 17 00:00:00 2001
From: Blaise Li <blaise.li__git@nsup.org>
Date: Tue, 25 Oct 2022 09:44:46 +0200
Subject: [PATCH] Option to consider features all-exonic.

This is used for spike-ins, whose GTF file does not contain "exon"
features: with this option, the length of each feature is used directly
as the exon length of its gene.
---
 libhts/__init__.py |  4 ++--
 libhts/libhts.py   | 14 ++++++++++++--
 2 files changed, 14 insertions(+), 4 deletions(-)

diff --git a/libhts/__init__.py b/libhts/__init__.py
index 3ad8378..1315de5 100644
--- a/libhts/__init__.py
+++ b/libhts/__init__.py
@@ -1,8 +1,8 @@
 from subprocess import check_output
 
-__copyright__ = "Copyright (C) 2020-2021 Blaise Li"
+__copyright__ = "Copyright (C) 2020-2022 Blaise Li"
 __licence__ = "GNU GPLv3"
-__version__ = 0.6
+__version__ = 0.7
 
 BEDTOOLS_VERSION = check_output(
     ["bedtools", "--version"]).decode("utf-8").strip().split()[-1]
diff --git a/libhts/libhts.py b/libhts/libhts.py
index a7eff6e..8b9c621 100644
--- a/libhts/libhts.py
+++ b/libhts/libhts.py
@@ -141,10 +141,20 @@ class Gene():
                     self.exons, OVERLAP).nodes()))
 
 
-def gtf_2_genes_exon_lengths(gtf_filename):
-    """Returns a pandas DataFrame where union exon lengths are associated to gene IDs."""
+def gtf_2_genes_exon_lengths(gtf_filename, direct_len=False):
+    """
+    Return a pandas DataFrame where union exon lengths are associated to gene IDs.
+
+    If *direct_len* is set to `True`, features in the gtf file are assumed to be
+    "all exonic" and their length is taken directly without further controls.
+    This can be used for instance when dealing with spike-ins.
+    """
     gtf_file = open(gtf_filename, "r")
     gtf = BedTool(gtf_file)
+    if direct_len:
+        return pd.DataFrame(pd.Series(
+            {feature.attrs["gene_id"] : len(feature) for feature in gtf.features()},
+            name=("union_exon_len")).rename_axis("gene"))
     genes = {}
     for feature in gtf.features():
         feat_type = feature[2]
-- 
GitLab