diff --git a/libhts/__init__.py b/libhts/__init__.py index 3ad83780ed6192ee8c3a66dcb757095675fc89e1..1315de58d9640e19506406dea7b8bf44d502bbac 100644 --- a/libhts/__init__.py +++ b/libhts/__init__.py @@ -1,8 +1,8 @@ from subprocess import check_output -__copyright__ = "Copyright (C) 2020-2021 Blaise Li" +__copyright__ = "Copyright (C) 2020-2022 Blaise Li" __licence__ = "GNU GPLv3" -__version__ = 0.6 +__version__ = 0.7 BEDTOOLS_VERSION = check_output( ["bedtools", "--version"]).decode("utf-8").strip().split()[-1] diff --git a/libhts/libhts.py b/libhts/libhts.py index a7eff6e44c752efc6068c390509e0a2340d8106d..8b9c6212896816ecaaa7edd192fd25ffffb4c877 100644 --- a/libhts/libhts.py +++ b/libhts/libhts.py @@ -141,10 +141,20 @@ class Gene(): self.exons, OVERLAP).nodes())) -def gtf_2_genes_exon_lengths(gtf_filename): - """Returns a pandas DataFrame where union exon lengths are associated to gene IDs.""" +def gtf_2_genes_exon_lengths(gtf_filename, direct_len=False): + """ + Return a pandas DataFrame where union exon lengths are associated to gene IDs. + + If *direct_len* is set to `True`, features in the gtf file are assumed to be + "all exonic" and their length is taken directly without further controls. + This can be used for instance when dealing with spike-ins. + """ gtf_file = open(gtf_filename, "r") gtf = BedTool(gtf_file) + if direct_len: + return pd.DataFrame(pd.Series( + {feature.attrs["gene_id"] : len(feature) for feature in gtf.features()}, + name=("union_exon_len")).rename_axis("gene")) genes = {} for feature in gtf.features(): feat_type = feature[2]