Add an option to treat all features as exonic.

This is used for spike-ins, whose GTF files do not contain "exon" features.

Add an option to treat all features as exonic.
36d33ac2 · Blaise Li · 5ac2b62e · 36d33ac2 · 36d33ac2
Commit 36d33ac2 authored 2 years ago by Blaise Li
--- a/libhts/__init__.py
+++ b/libhts/__init__.py
 from subprocess import check_output
-__copyright__ = "Copyright (C) 2020-2021 Blaise Li"
+__copyright__ = "Copyright (C) 2020-2022 Blaise Li"
 __licence__ = "GNU GPLv3"
-__version__ = 0.6
+__version__ = 0.7
 BEDTOOLS_VERSION = check_output(
    ["bedtools", "--version"]).decode("utf-8").strip().split()[-1]

--- a/libhts/libhts.py
+++ b/libhts/libhts.py
@@ -141,10 +141,20 @@ class Gene():
                    self.exons, OVERLAP).nodes()))
-def gtf_2_genes_exon_lengths(gtf_filename):
+def gtf_2_genes_exon_lengths(gtf_filename, direct_len=False):
-    """Returns a pandas DataFrame where union exon lengths are associated to gene IDs."""
+    """
+    Return a pandas DataFrame where union exon lengths are associated to gene IDs.
+    If *direct_len* is set to `True`, features in the gtf file are assumed to be
+    "all exonic" and their length is taken directly without further controls.
+    This can be used for instance when dealing with spike-ins.
+    """
    gtf_file = open(gtf_filename, "r")
    gtf = BedTool(gtf_file)
+    if direct_len:
+        return pd.DataFrame(pd.Series(
+            {feature.attrs["gene_id"] : len(feature) for feature in gtf.features()},
+            name=("union_exon_len")).rename_axis("gene"))
    genes = {}
    for feature in gtf.features():
        feat_type = feature[2]