Functions to compute union exon lengths.

f5991cb0 · Blaise Li · 88cea0dd · f5991cb0 · f5991cb0
Commit f5991cb0 authored 7 years ago by Blaise Li
--- a/libhts/__init__.py
+++ b/libhts/__init__.py
-from .libhts import do_deseq2, median_ratio_to_pseudo_ref_size_factors, plot_boxplots, plot_counts_distribution, plot_lfc_distribution, plot_MA, plot_norm_correlations, size_factor_correlations, status_setter
+from .libhts import do_deseq2, gtf_2_genes_exon_lengths, median_ratio_to_pseudo_ref_size_factors, plot_boxplots, plot_counts_distribution, plot_lfc_distribution, plot_MA, plot_norm_correlations, repeat_bed_2_lengths, size_factor_correlations, status_setter
--- a/libhts/libhts.py
+++ b/libhts/libhts.py
+from functools import reduce
 import warnings
@@ -20,6 +21,127 @@ as_df = r("as.data.frame")
 from rpy2.rinterface import RRuntimeError
 from rpy2.robjects.packages import importr
 deseq2 = importr("DESeq2")
+from pybedtools import BedTool
+import networkx as nx
+class Exon(object):
+    __slots__ = ("chrom", "start", "end")
+    def __init__(self, chrom, start, end):
+        self.chrom = chrom
+        self.start = start
+        self.end = end
+    def overlap(self, other):
+        if self.chrom != other.chrom:
+            return False
+        return (self.start <= other.start < self.end) or (other.start <= self.start < other.end)
+    def merge(self, other):
+        # Not necessary: can be indirectly linked
+        #assert overlap(self, other)
+        return Exon(self.chrom, min(self.start, other.start), max(self.end, other.end))
+    def __len__(self):
+        return self.end - self.start
+overlap = Exon.overlap
+merge = Exon.merge
+class Gene(object):
+    """This object contains information obtained from a gtf file."""
+    __slots__ = ("gene_id", "exons", "union_exon_length")
+    def __init__(self, gene_id):
+        self.gene_id = gene_id
+        #self.transcripts = {}
+        self.exons = nx.Graph()
+        self.union_exon_length = None
+    #def add_transcript(self, feature):
+    #    the_id = feature.attrs["transcript_id"]
+    #    assert the_id not in self.transcripts
+    #    self.transcripts[the_id] = feature
+    def add_exon(self, feature):
+        #the_id = feature.attrs["exon_id"]
+        #assert the_id not in self.exons
+        #self.exons[the_id] = feature
+        exon = Exon(feature.chrom, feature.start, feature.end)
+        if exon not in self.exons:
+            self.exons.add_node(exon)
+    # The merging cannot be done on the full BedTool because we dont want
+    # to merge together exons not belonging to the same gene.
+    def set_union_exon_length(self):
+        """The exons are used to make a BedTool, which enables convenient merging of
+        overlapping features. The sum of the lengths of the merged exons is returned."""
+        if len(self.exons) == 1:
+            # No need to merge when there is only one exon
+            self.union_exon_length = len(next(iter(self.exons.nodes())))
+        else:
+            # Too slow
+            #self.union_exon_length = sum(map(
+            #    len, BedTool(self.exons.values()).merge().features()))
+            #self.union_exon_length = 0
+            # We group nodes that overlap, and merge them
+            #overlapping_exons = nx.quotient_graph(self.exons, overlap)
+            #for node in overlapping_exons.nodes():
+            #    mex = reduce(merge, node)
+            #    self.union_exon_length += len(mex)
+            self.union_exon_length = sum((len(reduce(
+                merge, node)) for node in nx.quotient_graph(self.exons, overlap).nodes()))
+def gtf_2_genes_exon_lengths(gtf_filename):
+    """Returns a pandas DataFrame where union exon lengths are associated to gene IDs."""
+    gtf_file = open(gtf_filename, "r")
+    gtf = BedTool(gtf_file)
+    genes = {}
+    for feature in gtf.features():
+        feat_type = feature[2]
+        if feat_type != "exon":
+            continue
+        attrs = feature.attrs
+        gene_id = attrs["gene_id"]
+        if gene_id not in genes:
+            genes[gene_id] = Gene(gene_id)
+        gene = genes[gene_id]
+        try:
+            gene.add_exon(feature)
+        except AssertionError:
+            # A given exon may be registered for several transcripts, hence several gtf entries
+            already = gene.exons[feature.attrs["exon_id"]]
+            assert already.attrs["transcript_id"] != feature.attrs["transcript_id"]
+            assert (already.start, already.end) == (feature.start, feature.end)
+    for gene in genes.values():
+        gene.set_union_exon_length()
+    return pd.DataFrame(pd.Series(
+        {gene.gene_id : gene.union_exon_length for gene in genes.values()},
+        name=("union_exon_len",)).rename_axis("gene"))
+def repeat_bed_2_lengths(repeat_bed):
+    """Computes the lengths of repeatitive elements in a bed file, grouped by families.
+    This assumes that the elements have their names composed of the family name,
+    then a colon, then a number. For instance:
+    Simple_repeat|Simple_repeat|(TTTTTTG)n:1
+    Simple_repeat|Simple_repeat|(TTTTTTG)n:2
+    Simple_repeat|Simple_repeat|(TTTTTTG)n:3
+    Simple_repeat|Simple_repeat|(TTTTTTG)n:4
+    -> Simple_repeat|Simple_repeat|(TTTTTTG)n
+    Returns a DataFrame associating the summed lengths to the family names.
+    """
+    # usecols=[1, 2, 3]: start, end, id
+    # index_col=2: id (relative to the selected columns)
+    start_ends = pd.read_table(repeat_bed, usecols=[1, 2, 3], header=None, index_col=2)
+    # bed lengths
+    lens = start_ends[2] - start_ends[1]
+    lens.name = "union_exon_len"
+    repeat_families = [":".join(name.split(":")[:-1]) for name in start_ends.index]
+    # The reads assigned to a repeated element can come
+    # from the summed length of all the members of the family
+    # We call this "gene" for convenience and compatibility
+    return pd.DataFrame(lens).assign(gene=repeat_families).groupby("gene").sum()
 def do_deseq2(cond_names, conditions, counts_data,