Skip to content
Snippets Groups Projects
Commit f5991cb0 authored by Blaise Li's avatar Blaise Li
Browse files

Functions to compute union exon lengths.

parent 88cea0dd
No related branches found
No related tags found
No related merge requests found
from .libhts import do_deseq2, median_ratio_to_pseudo_ref_size_factors, plot_boxplots, plot_counts_distribution, plot_lfc_distribution, plot_MA, plot_norm_correlations, size_factor_correlations, status_setter from .libhts import do_deseq2, gtf_2_genes_exon_lengths, median_ratio_to_pseudo_ref_size_factors, plot_boxplots, plot_counts_distribution, plot_lfc_distribution, plot_MA, plot_norm_correlations, repeat_bed_2_lengths, size_factor_correlations, status_setter
from functools import reduce
import warnings import warnings
...@@ -20,6 +21,127 @@ as_df = r("as.data.frame") ...@@ -20,6 +21,127 @@ as_df = r("as.data.frame")
from rpy2.rinterface import RRuntimeError from rpy2.rinterface import RRuntimeError
from rpy2.robjects.packages import importr from rpy2.robjects.packages import importr
deseq2 = importr("DESeq2") deseq2 = importr("DESeq2")
from pybedtools import BedTool
import networkx as nx
class Exon(object):
__slots__ = ("chrom", "start", "end")
def __init__(self, chrom, start, end):
self.chrom = chrom
self.start = start
self.end = end
def overlap(self, other):
if self.chrom != other.chrom:
return False
return (self.start <= other.start < self.end) or (other.start <= self.start < other.end)
def merge(self, other):
# Not necessary: can be indirectly linked
#assert overlap(self, other)
return Exon(self.chrom, min(self.start, other.start), max(self.end, other.end))
def __len__(self):
return self.end - self.start
overlap = Exon.overlap
merge = Exon.merge
class Gene(object):
"""This object contains information obtained from a gtf file."""
__slots__ = ("gene_id", "exons", "union_exon_length")
def __init__(self, gene_id):
self.gene_id = gene_id
#self.transcripts = {}
self.exons = nx.Graph()
self.union_exon_length = None
#def add_transcript(self, feature):
# the_id = feature.attrs["transcript_id"]
# assert the_id not in self.transcripts
# self.transcripts[the_id] = feature
def add_exon(self, feature):
#the_id = feature.attrs["exon_id"]
#assert the_id not in self.exons
#self.exons[the_id] = feature
exon = Exon(feature.chrom, feature.start, feature.end)
if exon not in self.exons:
self.exons.add_node(exon)
# The merging cannot be done on the full BedTool because we dont want
# to merge together exons not belonging to the same gene.
def set_union_exon_length(self):
"""The exons are used to make a BedTool, which enables convenient merging of
overlapping features. The sum of the lengths of the merged exons is returned."""
if len(self.exons) == 1:
# No need to merge when there is only one exon
self.union_exon_length = len(next(iter(self.exons.nodes())))
else:
# Too slow
#self.union_exon_length = sum(map(
# len, BedTool(self.exons.values()).merge().features()))
#self.union_exon_length = 0
# We group nodes that overlap, and merge them
#overlapping_exons = nx.quotient_graph(self.exons, overlap)
#for node in overlapping_exons.nodes():
# mex = reduce(merge, node)
# self.union_exon_length += len(mex)
self.union_exon_length = sum((len(reduce(
merge, node)) for node in nx.quotient_graph(self.exons, overlap).nodes()))
def gtf_2_genes_exon_lengths(gtf_filename):
"""Returns a pandas DataFrame where union exon lengths are associated to gene IDs."""
gtf_file = open(gtf_filename, "r")
gtf = BedTool(gtf_file)
genes = {}
for feature in gtf.features():
feat_type = feature[2]
if feat_type != "exon":
continue
attrs = feature.attrs
gene_id = attrs["gene_id"]
if gene_id not in genes:
genes[gene_id] = Gene(gene_id)
gene = genes[gene_id]
try:
gene.add_exon(feature)
except AssertionError:
# A given exon may be registered for several transcripts, hence several gtf entries
already = gene.exons[feature.attrs["exon_id"]]
assert already.attrs["transcript_id"] != feature.attrs["transcript_id"]
assert (already.start, already.end) == (feature.start, feature.end)
for gene in genes.values():
gene.set_union_exon_length()
return pd.DataFrame(pd.Series(
{gene.gene_id : gene.union_exon_length for gene in genes.values()},
name=("union_exon_len",)).rename_axis("gene"))
def repeat_bed_2_lengths(repeat_bed):
"""Computes the lengths of repeatitive elements in a bed file, grouped by families.
This assumes that the elements have their names composed of the family name,
then a colon, then a number. For instance:
Simple_repeat|Simple_repeat|(TTTTTTG)n:1
Simple_repeat|Simple_repeat|(TTTTTTG)n:2
Simple_repeat|Simple_repeat|(TTTTTTG)n:3
Simple_repeat|Simple_repeat|(TTTTTTG)n:4
-> Simple_repeat|Simple_repeat|(TTTTTTG)n
Returns a DataFrame associating the summed lengths to the family names.
"""
# usecols=[1, 2, 3]: start, end, id
# index_col=2: id (relative to the selected columns)
start_ends = pd.read_table(repeat_bed, usecols=[1, 2, 3], header=None, index_col=2)
# bed lengths
lens = start_ends[2] - start_ends[1]
lens.name = "union_exon_len"
repeat_families = [":".join(name.split(":")[:-1]) for name in start_ends.index]
# The reads assigned to a repeated element can come
# from the summed length of all the members of the family
# We call this "gene" for convenience and compatibility
return pd.DataFrame(lens).assign(gene=repeat_families).groupby("gene").sum()
def do_deseq2(cond_names, conditions, counts_data, def do_deseq2(cond_names, conditions, counts_data,
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment