Add spike-ins in "exon" length computation.

Also noticed a problem involving MultiIndex (https://stackoverflow.com/q/48345640/1878788).

Add spike-ins in "exon" length computation.
4a9b4669 · Blaise Li · c9217c51 · 4a9b4669 · 4a9b4669 · 4a9b4669
Commit 4a9b4669 authored 7 years ago by Blaise Li
--- a/compute_genes_exon_lengths.py
+++ b/compute_genes_exon_lengths.py
@@ -6,7 +6,7 @@ import argparse
 import os
 import sys
 import pandas as pd
-from libhts import gtf_2_genes_exon_lengths, repeat_bed_2_lengths
+from libhts import gtf_2_genes_exon_lengths, repeat_bed_2_lengths, spikein_gtf_2_lengths


 OPJ = os.path.join
@@ -27,6 +27,7 @@ def main():
    annot_dir = args.annot_dir
    # input files
    genes_gtf = OPJ(annot_dir, "genes.gtf")
+    spikein_gtf = OPJ(annot_dir, "spike_ins.gtf")
    dte_bed = OPJ(annot_dir, "DNA_transposons_rmsk.bed")
    rte_bed = OPJ(annot_dir, "RNA_transposons_rmsk.bed")
    satel_bed = OPJ(annot_dir, "satellites_rmsk.bed")
@@ -35,6 +36,7 @@ def main():
    exon_lengths = OPJ(annot_dir, "union_exon_lengths.txt")
    pd.concat((
        gtf_2_genes_exon_lengths(genes_gtf),
+        spikein_gtf_2_lengths(spikein_gtf),
        repeat_bed_2_lengths(dte_bed),
        repeat_bed_2_lengths(rte_bed),
        repeat_bed_2_lengths(satel_bed),

--- a/libhts/libhts/__init__.py
+++ b/libhts/libhts/__init__.py
-from .libhts import do_deseq2, gtf_2_genes_exon_lengths, median_ratio_to_pseudo_ref_size_factors, plot_boxplots, plot_counts_distribution, plot_lfc_distribution, plot_MA, plot_norm_correlations, plot_scatter, repeat_bed_2_lengths, size_factor_correlations, status_setter
+from .libhts import do_deseq2, gtf_2_genes_exon_lengths, median_ratio_to_pseudo_ref_size_factors, plot_boxplots, plot_counts_distribution, plot_lfc_distribution, plot_MA, plot_norm_correlations, plot_scatter, repeat_bed_2_lengths, size_factor_correlations, spikein_gtf_2_lengths, status_setter
--- a/libhts/libhts/libhts.py
+++ b/libhts/libhts/libhts.py
@@ -117,7 +117,7 @@ def gtf_2_genes_exon_lengths(gtf_filename):
        gene.set_union_exon_length()
    return pd.DataFrame(pd.Series(
        {gene.gene_id : gene.union_exon_length for gene in genes.values()},
-        name=("union_exon_len",)).rename_axis("gene"))
+        name=("union_exon_len")).rename_axis("gene"))


 def repeat_bed_2_lengths(repeat_bed):
@@ -144,6 +144,20 @@ def repeat_bed_2_lengths(repeat_bed):
    return pd.DataFrame(lens).assign(gene=repeat_families).groupby("gene").sum()


+def spikein_gtf_2_lengths(spikein_gtf):
+    """Computes the lengths of spike-ins, grouped by families.
+    Returns a DataFrame associating the summed lengths to the spike-in namse.
+    """
+    spikein_ends = {}
+    with open(spikein_gtf) as gtf_file:
+        for line in gtf_file:
+            fields = line.strip().split("\t")
+            spikein_ends[fields[0]] = int(fields[4])
+    return pd.DataFrame(pd.Series(
+        spikein_ends,
+        name=("union_exon_len")).rename_axis("gene"))
+
+
 def do_deseq2(cond_names, conditions, counts_data,
              formula=None, contrast=None, deseq2_args=None):
    """Runs a DESeq2 differential expression analysis."""
@@ -151,6 +165,7 @@ def do_deseq2(cond_names, conditions, counts_data,
        formula = Formula("~ lib")
    if contrast is None:
        # FIXME: MUT and REF are not defined
+        # Maybe just make (formula and) contrast mandatory
        contrast = StrVector(["lib", MUT, REF])
    if deseq2_args is None:
        deseq2_args = {"betaPrior" : True, "addMLE" : True, "independentFiltering" : True}