Skip to content
Snippets Groups Projects
Commit 4a9b4669 authored by Blaise Li's avatar Blaise Li
Browse files

Add spike-ins in "exon" length computation.

Also noticed a problem involving MultiIndex
(https://stackoverflow.com/q/48345640/1878788).
parent c9217c51
Branches
No related tags found
No related merge requests found
......@@ -6,7 +6,7 @@ import argparse
import os
import sys
import pandas as pd
from libhts import gtf_2_genes_exon_lengths, repeat_bed_2_lengths
from libhts import gtf_2_genes_exon_lengths, repeat_bed_2_lengths, spikein_gtf_2_lengths
OPJ = os.path.join
......@@ -27,6 +27,7 @@ def main():
annot_dir = args.annot_dir
# input files
genes_gtf = OPJ(annot_dir, "genes.gtf")
spikein_gtf = OPJ(annot_dir, "spike_ins.gtf")
dte_bed = OPJ(annot_dir, "DNA_transposons_rmsk.bed")
rte_bed = OPJ(annot_dir, "RNA_transposons_rmsk.bed")
satel_bed = OPJ(annot_dir, "satellites_rmsk.bed")
......@@ -35,6 +36,7 @@ def main():
exon_lengths = OPJ(annot_dir, "union_exon_lengths.txt")
pd.concat((
gtf_2_genes_exon_lengths(genes_gtf),
spikein_gtf_2_lengths(spikein_gtf),
repeat_bed_2_lengths(dte_bed),
repeat_bed_2_lengths(rte_bed),
repeat_bed_2_lengths(satel_bed),
......
from .libhts import do_deseq2, gtf_2_genes_exon_lengths, median_ratio_to_pseudo_ref_size_factors, plot_boxplots, plot_counts_distribution, plot_lfc_distribution, plot_MA, plot_norm_correlations, plot_scatter, repeat_bed_2_lengths, size_factor_correlations, status_setter
from .libhts import do_deseq2, gtf_2_genes_exon_lengths, median_ratio_to_pseudo_ref_size_factors, plot_boxplots, plot_counts_distribution, plot_lfc_distribution, plot_MA, plot_norm_correlations, plot_scatter, repeat_bed_2_lengths, size_factor_correlations, spikein_gtf_2_lengths, status_setter
......@@ -117,7 +117,7 @@ def gtf_2_genes_exon_lengths(gtf_filename):
gene.set_union_exon_length()
return pd.DataFrame(pd.Series(
{gene.gene_id : gene.union_exon_length for gene in genes.values()},
name=("union_exon_len",)).rename_axis("gene"))
name=("union_exon_len")).rename_axis("gene"))
def repeat_bed_2_lengths(repeat_bed):
......@@ -144,6 +144,20 @@ def repeat_bed_2_lengths(repeat_bed):
return pd.DataFrame(lens).assign(gene=repeat_families).groupby("gene").sum()
def spikein_gtf_2_lengths(spikein_gtf):
"""Computes the lengths of spike-ins, grouped by families.
Returns a DataFrame associating the summed lengths to the spike-in namse.
"""
spikein_ends = {}
with open(spikein_gtf) as gtf_file:
for line in gtf_file:
fields = line.strip().split("\t")
spikein_ends[fields[0]] = int(fields[4])
return pd.DataFrame(pd.Series(
spikein_ends,
name=("union_exon_len")).rename_axis("gene"))
def do_deseq2(cond_names, conditions, counts_data,
formula=None, contrast=None, deseq2_args=None):
"""Runs a DESeq2 differential expression analysis."""
......@@ -151,6 +165,7 @@ def do_deseq2(cond_names, conditions, counts_data,
formula = Formula("~ lib")
if contrast is None:
# FIXME: MUT and REF are not defined
# Maybe just make (formula and) contrast mandatory
contrast = StrVector(["lib", MUT, REF])
if deseq2_args is None:
deseq2_args = {"betaPrior" : True, "addMLE" : True, "independentFiltering" : True}
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment