From 973a02ec0bcc88f0292c9a5ce1e086f165941663 Mon Sep 17 00:00:00 2001 From: Blaise Li <blaise.li__git@nsup.org> Date: Mon, 22 Jan 2018 10:36:27 +0100 Subject: [PATCH] Add spike-ins in "exon" length computation. Also noticed a problem involving MultiIndex (https://stackoverflow.com/q/48345640/1878788). --- libhts/__init__.py | 2 +- libhts/libhts.py | 17 ++++++++++++++++- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/libhts/__init__.py b/libhts/__init__.py index 2da3911..4b8022b 100644 --- a/libhts/__init__.py +++ b/libhts/__init__.py @@ -1 +1 @@ -from .libhts import do_deseq2, gtf_2_genes_exon_lengths, median_ratio_to_pseudo_ref_size_factors, plot_boxplots, plot_counts_distribution, plot_lfc_distribution, plot_MA, plot_norm_correlations, plot_scatter, repeat_bed_2_lengths, size_factor_correlations, status_setter +from .libhts import do_deseq2, gtf_2_genes_exon_lengths, median_ratio_to_pseudo_ref_size_factors, plot_boxplots, plot_counts_distribution, plot_lfc_distribution, plot_MA, plot_norm_correlations, plot_scatter, repeat_bed_2_lengths, size_factor_correlations, spikein_gtf_2_lengths, status_setter diff --git a/libhts/libhts.py b/libhts/libhts.py index 5617001..e708d35 100644 --- a/libhts/libhts.py +++ b/libhts/libhts.py @@ -117,7 +117,7 @@ def gtf_2_genes_exon_lengths(gtf_filename): gene.set_union_exon_length() return pd.DataFrame(pd.Series( {gene.gene_id : gene.union_exon_length for gene in genes.values()}, - name=("union_exon_len",)).rename_axis("gene")) + name=("union_exon_len")).rename_axis("gene")) def repeat_bed_2_lengths(repeat_bed): @@ -144,6 +144,20 @@ def repeat_bed_2_lengths(repeat_bed): return pd.DataFrame(lens).assign(gene=repeat_families).groupby("gene").sum() +def spikein_gtf_2_lengths(spikein_gtf): + """Computes the lengths of spike-ins, grouped by families. + Returns a DataFrame associating the summed lengths to the spike-in namse. + """ + spikein_ends = {} + with open(spikein_gtf) as gtf_file: + for line in gtf_file: + fields = line.strip().split("\t") + spikein_ends[fields[0]] = int(fields[4]) + return pd.DataFrame(pd.Series( + spikein_ends, + name=("union_exon_len")).rename_axis("gene")) + + def do_deseq2(cond_names, conditions, counts_data, formula=None, contrast=None, deseq2_args=None): """Runs a DESeq2 differential expression analysis.""" @@ -151,6 +165,7 @@ def do_deseq2(cond_names, conditions, counts_data, formula = Formula("~ lib") if contrast is None: # FIXME: MUT and REF are not defined + # Maybe just make (formula and) contrast mandatory contrast = StrVector(["lib", MUT, REF]) if deseq2_args is None: deseq2_args = {"betaPrior" : True, "addMLE" : True, "independentFiltering" : True} -- GitLab