From 973a02ec0bcc88f0292c9a5ce1e086f165941663 Mon Sep 17 00:00:00 2001
From: Blaise Li <blaise.li__git@nsup.org>
Date: Mon, 22 Jan 2018 10:36:27 +0100
Subject: [PATCH] Add spike-ins in "exon" length computation.

Also noticed a problem involving MultiIndex
(https://stackoverflow.com/q/48345640/1878788).
---
 libhts/__init__.py |  2 +-
 libhts/libhts.py   | 17 ++++++++++++++++-
 2 files changed, 17 insertions(+), 2 deletions(-)

diff --git a/libhts/__init__.py b/libhts/__init__.py
index 2da3911..4b8022b 100644
--- a/libhts/__init__.py
+++ b/libhts/__init__.py
@@ -1 +1 @@
-from .libhts import do_deseq2, gtf_2_genes_exon_lengths, median_ratio_to_pseudo_ref_size_factors, plot_boxplots, plot_counts_distribution, plot_lfc_distribution, plot_MA, plot_norm_correlations, plot_scatter, repeat_bed_2_lengths, size_factor_correlations, status_setter
+from .libhts import do_deseq2, gtf_2_genes_exon_lengths, median_ratio_to_pseudo_ref_size_factors, plot_boxplots, plot_counts_distribution, plot_lfc_distribution, plot_MA, plot_norm_correlations, plot_scatter, repeat_bed_2_lengths, size_factor_correlations, spikein_gtf_2_lengths, status_setter
diff --git a/libhts/libhts.py b/libhts/libhts.py
index 5617001..e708d35 100644
--- a/libhts/libhts.py
+++ b/libhts/libhts.py
@@ -117,7 +117,7 @@ def gtf_2_genes_exon_lengths(gtf_filename):
         gene.set_union_exon_length()
     return pd.DataFrame(pd.Series(
         {gene.gene_id : gene.union_exon_length for gene in genes.values()},
-        name=("union_exon_len",)).rename_axis("gene"))
+        name=("union_exon_len")).rename_axis("gene"))
 
 
 def repeat_bed_2_lengths(repeat_bed):
@@ -144,6 +144,20 @@ def repeat_bed_2_lengths(repeat_bed):
     return pd.DataFrame(lens).assign(gene=repeat_families).groupby("gene").sum()
 
 
+def spikein_gtf_2_lengths(spikein_gtf):
+    """Computes the lengths of spike-ins, grouped by families.
+    Returns a DataFrame associating the summed lengths to the spike-in namse.
+    """
+    spikein_ends = {}
+    with open(spikein_gtf) as gtf_file:
+        for line in gtf_file:
+            fields = line.strip().split("\t")
+            spikein_ends[fields[0]] = int(fields[4])
+    return pd.DataFrame(pd.Series(
+        spikein_ends,
+        name=("union_exon_len")).rename_axis("gene"))
+
+
 def do_deseq2(cond_names, conditions, counts_data,
               formula=None, contrast=None, deseq2_args=None):
     """Runs a DESeq2 differential expression analysis."""
@@ -151,6 +165,7 @@ def do_deseq2(cond_names, conditions, counts_data,
         formula = Formula("~ lib")
     if contrast is None:
         # FIXME: MUT and REF are not defined
+        # Maybe just make (formula and) contrast mandatory
         contrast = StrVector(["lib", MUT, REF])
     if deseq2_args is None:
         deseq2_args = {"betaPrior" : True, "addMLE" : True, "independentFiltering" : True}
-- 
GitLab