diff --git a/libhts/__init__.py b/libhts/__init__.py index 4fa70b752bc8b8defc90b9e3015c3e852bfd104e..3d089bba0c487d9853f2a98da83e4e10a62bb05a 100644 --- a/libhts/__init__.py +++ b/libhts/__init__.py @@ -1 +1 @@ -from .libhts import do_deseq2, median_ratio_to_pseudo_ref_size_factors, plot_boxplots, plot_counts_distribution, plot_lfc_distribution, plot_MA, plot_norm_correlations, size_factor_correlations, status_setter +from .libhts import do_deseq2, gtf_2_genes_exon_lengths, median_ratio_to_pseudo_ref_size_factors, plot_boxplots, plot_counts_distribution, plot_lfc_distribution, plot_MA, plot_norm_correlations, repeat_bed_2_lengths, size_factor_correlations, status_setter diff --git a/libhts/libhts.py b/libhts/libhts.py index 034da9528dfe02446ee8fa62c3166295d16a6f5c..638b76dd4c5cb40dd0c014aa3032480069c9bb12 100644 --- a/libhts/libhts.py +++ b/libhts/libhts.py @@ -1,3 +1,4 @@ +from functools import reduce import warnings @@ -20,6 +21,127 @@ as_df = r("as.data.frame") from rpy2.rinterface import RRuntimeError from rpy2.robjects.packages import importr deseq2 = importr("DESeq2") +from pybedtools import BedTool +import networkx as nx + + +class Exon(object): + __slots__ = ("chrom", "start", "end") + def __init__(self, chrom, start, end): + self.chrom = chrom + self.start = start + self.end = end + + def overlap(self, other): + if self.chrom != other.chrom: + return False + return (self.start <= other.start < self.end) or (other.start <= self.start < other.end) + + def merge(self, other): + # Not necessary: can be indirectly linked + #assert overlap(self, other) + return Exon(self.chrom, min(self.start, other.start), max(self.end, other.end)) + + def __len__(self): + return self.end - self.start + +overlap = Exon.overlap +merge = Exon.merge + +class Gene(object): + """This object contains information obtained from a gtf file.""" + __slots__ = ("gene_id", "exons", "union_exon_length") + def __init__(self, gene_id): + self.gene_id = gene_id + #self.transcripts = {} + self.exons = nx.Graph() + self.union_exon_length = None + + #def add_transcript(self, feature): + # the_id = feature.attrs["transcript_id"] + # assert the_id not in self.transcripts + # self.transcripts[the_id] = feature + + def add_exon(self, feature): + #the_id = feature.attrs["exon_id"] + #assert the_id not in self.exons + #self.exons[the_id] = feature + exon = Exon(feature.chrom, feature.start, feature.end) + if exon not in self.exons: + self.exons.add_node(exon) + + # The merging cannot be done on the full BedTool because we dont want + # to merge together exons not belonging to the same gene. + def set_union_exon_length(self): + """The exons are used to make a BedTool, which enables convenient merging of + overlapping features. The sum of the lengths of the merged exons is returned.""" + if len(self.exons) == 1: + # No need to merge when there is only one exon + self.union_exon_length = len(next(iter(self.exons.nodes()))) + else: + # Too slow + #self.union_exon_length = sum(map( + # len, BedTool(self.exons.values()).merge().features())) + #self.union_exon_length = 0 + # We group nodes that overlap, and merge them + #overlapping_exons = nx.quotient_graph(self.exons, overlap) + #for node in overlapping_exons.nodes(): + # mex = reduce(merge, node) + # self.union_exon_length += len(mex) + self.union_exon_length = sum((len(reduce( + merge, node)) for node in nx.quotient_graph(self.exons, overlap).nodes())) + + +def gtf_2_genes_exon_lengths(gtf_filename): + """Returns a pandas DataFrame where union exon lengths are associated to gene IDs.""" + gtf_file = open(gtf_filename, "r") + gtf = BedTool(gtf_file) + genes = {} + for feature in gtf.features(): + feat_type = feature[2] + if feat_type != "exon": + continue + attrs = feature.attrs + gene_id = attrs["gene_id"] + if gene_id not in genes: + genes[gene_id] = Gene(gene_id) + gene = genes[gene_id] + try: + gene.add_exon(feature) + except AssertionError: + # A given exon may be registered for several transcripts, hence several gtf entries + already = gene.exons[feature.attrs["exon_id"]] + assert already.attrs["transcript_id"] != feature.attrs["transcript_id"] + assert (already.start, already.end) == (feature.start, feature.end) + for gene in genes.values(): + gene.set_union_exon_length() + return pd.DataFrame(pd.Series( + {gene.gene_id : gene.union_exon_length for gene in genes.values()}, + name=("union_exon_len",)).rename_axis("gene")) + + +def repeat_bed_2_lengths(repeat_bed): + """Computes the lengths of repeatitive elements in a bed file, grouped by families. + This assumes that the elements have their names composed of the family name, + then a colon, then a number. For instance: + Simple_repeat|Simple_repeat|(TTTTTTG)n:1 + Simple_repeat|Simple_repeat|(TTTTTTG)n:2 + Simple_repeat|Simple_repeat|(TTTTTTG)n:3 + Simple_repeat|Simple_repeat|(TTTTTTG)n:4 + -> Simple_repeat|Simple_repeat|(TTTTTTG)n + Returns a DataFrame associating the summed lengths to the family names. + """ + # usecols=[1, 2, 3]: start, end, id + # index_col=2: id (relative to the selected columns) + start_ends = pd.read_table(repeat_bed, usecols=[1, 2, 3], header=None, index_col=2) + # bed lengths + lens = start_ends[2] - start_ends[1] + lens.name = "union_exon_len" + repeat_families = [":".join(name.split(":")[:-1]) for name in start_ends.index] + # The reads assigned to a repeated element can come + # from the summed length of all the members of the family + # We call this "gene" for convenience and compatibility + return pd.DataFrame(lens).assign(gene=repeat_families).groupby("gene").sum() def do_deseq2(cond_names, conditions, counts_data,