diff --git a/libhts/__init__.py b/libhts/__init__.py index 2da39117da59070c590205818dfdcb687e918369..4b8022bb4ca900ae802bc0b60fe9e3711a3c2de4 100644 --- a/libhts/__init__.py +++ b/libhts/__init__.py @@ -1 +1 @@ -from .libhts import do_deseq2, gtf_2_genes_exon_lengths, median_ratio_to_pseudo_ref_size_factors, plot_boxplots, plot_counts_distribution, plot_lfc_distribution, plot_MA, plot_norm_correlations, plot_scatter, repeat_bed_2_lengths, size_factor_correlations, status_setter +from .libhts import do_deseq2, gtf_2_genes_exon_lengths, median_ratio_to_pseudo_ref_size_factors, plot_boxplots, plot_counts_distribution, plot_lfc_distribution, plot_MA, plot_norm_correlations, plot_scatter, repeat_bed_2_lengths, size_factor_correlations, spikein_gtf_2_lengths, status_setter diff --git a/libhts/libhts.py b/libhts/libhts.py index 5617001e90615eb7a4838c8e1f1b172c08c1433c..e708d35e3a4492c783574467c078dd133d7cc422 100644 --- a/libhts/libhts.py +++ b/libhts/libhts.py @@ -117,7 +117,7 @@ def gtf_2_genes_exon_lengths(gtf_filename): gene.set_union_exon_length() return pd.DataFrame(pd.Series( {gene.gene_id : gene.union_exon_length for gene in genes.values()}, - name=("union_exon_len",)).rename_axis("gene")) + name=("union_exon_len")).rename_axis("gene")) def repeat_bed_2_lengths(repeat_bed): @@ -144,6 +144,20 @@ def repeat_bed_2_lengths(repeat_bed): return pd.DataFrame(lens).assign(gene=repeat_families).groupby("gene").sum() +def spikein_gtf_2_lengths(spikein_gtf): + """Computes the lengths of spike-ins, grouped by families. + Returns a DataFrame associating the summed lengths to the spike-in namse. + """ + spikein_ends = {} + with open(spikein_gtf) as gtf_file: + for line in gtf_file: + fields = line.strip().split("\t") + spikein_ends[fields[0]] = int(fields[4]) + return pd.DataFrame(pd.Series( + spikein_ends, + name=("union_exon_len")).rename_axis("gene")) + + def do_deseq2(cond_names, conditions, counts_data, formula=None, contrast=None, deseq2_args=None): """Runs a DESeq2 differential expression analysis.""" @@ -151,6 +165,7 @@ def do_deseq2(cond_names, conditions, counts_data, formula = Formula("~ lib") if contrast is None: # FIXME: MUT and REF are not defined + # Maybe just make (formula and) contrast mandatory contrast = StrVector(["lib", MUT, REF]) if deseq2_args is None: deseq2_args = {"betaPrior" : True, "addMLE" : True, "independentFiltering" : True}