Commit f5991cb0 authored by Blaise Li's avatar Blaise Li
Browse files

Functions to compute union exon lengths.

parent 88cea0dd
from .libhts import do_deseq2, median_ratio_to_pseudo_ref_size_factors, plot_boxplots, plot_counts_distribution, plot_lfc_distribution, plot_MA, plot_norm_correlations, size_factor_correlations, status_setter
from .libhts import do_deseq2, gtf_2_genes_exon_lengths, median_ratio_to_pseudo_ref_size_factors, plot_boxplots, plot_counts_distribution, plot_lfc_distribution, plot_MA, plot_norm_correlations, repeat_bed_2_lengths, size_factor_correlations, status_setter
from functools import reduce
import warnings
......@@ -20,6 +21,127 @@ as_df = r("")
from rpy2.rinterface import RRuntimeError
from rpy2.robjects.packages import importr
deseq2 = importr("DESeq2")
from pybedtools import BedTool
import networkx as nx
class Exon(object):
__slots__ = ("chrom", "start", "end")
def __init__(self, chrom, start, end):
self.chrom = chrom
self.start = start
self.end = end
def overlap(self, other):
if self.chrom != other.chrom:
return False
return (self.start <= other.start < self.end) or (other.start <= self.start < other.end)
def merge(self, other):
# Not necessary: can be indirectly linked
#assert overlap(self, other)
return Exon(self.chrom, min(self.start, other.start), max(self.end, other.end))
def __len__(self):
return self.end - self.start
overlap = Exon.overlap
merge = Exon.merge
class Gene(object):
"""This object contains information obtained from a gtf file."""
__slots__ = ("gene_id", "exons", "union_exon_length")
def __init__(self, gene_id):
self.gene_id = gene_id
#self.transcripts = {}
self.exons = nx.Graph()
self.union_exon_length = None
#def add_transcript(self, feature):
# the_id = feature.attrs["transcript_id"]
# assert the_id not in self.transcripts
# self.transcripts[the_id] = feature
def add_exon(self, feature):
#the_id = feature.attrs["exon_id"]
#assert the_id not in self.exons
#self.exons[the_id] = feature
exon = Exon(feature.chrom, feature.start, feature.end)
if exon not in self.exons:
# The merging cannot be done on the full BedTool because we dont want
# to merge together exons not belonging to the same gene.
def set_union_exon_length(self):
"""The exons are used to make a BedTool, which enables convenient merging of
overlapping features. The sum of the lengths of the merged exons is returned."""
if len(self.exons) == 1:
# No need to merge when there is only one exon
self.union_exon_length = len(next(iter(self.exons.nodes())))
# Too slow
#self.union_exon_length = sum(map(
# len, BedTool(self.exons.values()).merge().features()))
#self.union_exon_length = 0
# We group nodes that overlap, and merge them
#overlapping_exons = nx.quotient_graph(self.exons, overlap)
#for node in overlapping_exons.nodes():
# mex = reduce(merge, node)
# self.union_exon_length += len(mex)
self.union_exon_length = sum((len(reduce(
merge, node)) for node in nx.quotient_graph(self.exons, overlap).nodes()))
def gtf_2_genes_exon_lengths(gtf_filename):
"""Returns a pandas DataFrame where union exon lengths are associated to gene IDs."""
gtf_file = open(gtf_filename, "r")
gtf = BedTool(gtf_file)
genes = {}
for feature in gtf.features():
feat_type = feature[2]
if feat_type != "exon":
attrs = feature.attrs
gene_id = attrs["gene_id"]
if gene_id not in genes:
genes[gene_id] = Gene(gene_id)
gene = genes[gene_id]
except AssertionError:
# A given exon may be registered for several transcripts, hence several gtf entries
already = gene.exons[feature.attrs["exon_id"]]
assert already.attrs["transcript_id"] != feature.attrs["transcript_id"]
assert (already.start, already.end) == (feature.start, feature.end)
for gene in genes.values():
return pd.DataFrame(pd.Series(
{gene.gene_id : gene.union_exon_length for gene in genes.values()},
def repeat_bed_2_lengths(repeat_bed):
"""Computes the lengths of repeatitive elements in a bed file, grouped by families.
This assumes that the elements have their names composed of the family name,
then a colon, then a number. For instance:
-> Simple_repeat|Simple_repeat|(TTTTTTG)n
Returns a DataFrame associating the summed lengths to the family names.
# usecols=[1, 2, 3]: start, end, id
# index_col=2: id (relative to the selected columns)
start_ends = pd.read_table(repeat_bed, usecols=[1, 2, 3], header=None, index_col=2)
# bed lengths
lens = start_ends[2] - start_ends[1] = "union_exon_len"
repeat_families = [":".join(name.split(":")[:-1]) for name in start_ends.index]
# The reads assigned to a repeated element can come
# from the summed length of all the members of the family
# We call this "gene" for convenience and compatibility
return pd.DataFrame(lens).assign(gene=repeat_families).groupby("gene").sum()
def do_deseq2(cond_names, conditions, counts_data,
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment