diff --git a/libhts/__init__.py b/libhts/__init__.py index 7090e6c7621f30f8f4e09053e3a5d1f07a00ac01..620b3694012665cc2a1c57ade333bedde589be50 100644 --- a/libhts/__init__.py +++ b/libhts/__init__.py @@ -1 +1 @@ -from .libhts import do_deseq2, median_ratio_to_pseudo_ref_size_factors, status_setter +from .libhts import do_deseq2, median_ratio_to_pseudo_ref_size_factors, size_factor_correlations, status_setter diff --git a/libhts/libhts.py b/libhts/libhts.py index d100813016141edb5548c01b0ab30f45e09bb88e..767e520835cdf74fe12449dd3f93dedb08841182 100644 --- a/libhts/libhts.py +++ b/libhts/libhts.py @@ -9,6 +9,8 @@ def formatwarning(message, category, filename, lineno, line): warnings.formatwarning = formatwarning import pandas as pd +# To compute correlation coefficient +from scipy.stats.stats import pearsonr # To compute geometric mean from scipy.stats.mstats import gmean from rpy2.robjects import r, pandas2ri, Formula, StrVector @@ -97,6 +99,22 @@ def median_ratio_to_pseudo_ref_size_factors(counts_data): return counts_data[counts_data.prod(axis=1) > 0].apply( median_ratio_to_pseudo_ref, axis=0) + +def size_factor_correlations(counts_data, summaries, normalizer): + """Is there a correlation, across libraries, between normalized values and size factors? + The size factor type *normalizer* is either computed or taken from *summaries*. + The normalized data are computed by dividing *counts_data* by this size factor.""" + if normalizer == "median_ratio_to_pseudo_ref": + size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data) + else: + size_factors = summaries.loc[normalizer] + #by_norm = counts_data / size_factors + def compute_pearsonr_with_size_factor(row): + return pearsonr(row, size_factors)[0] + #return by_norm.apply(compute_pearsonr_with_size_factor, axis=1) + return (counts_data / size_factors).apply(compute_pearsonr_with_size_factor, axis=1) + + def status_setter(lfc_cutoffs=None): if lfc_cutoffs is None: lfc_cutoffs = [0.5, 1, 2] @@ -107,13 +125,15 @@ def status_setter(lfc_cutoffs=None): #if row["log2FoldChange"] > 0: lfc = row["lfcMLE"] if lfc > 0: + # Start from the highest cutoff, + # and decrease until below lfc for cutoff in sorted(lfc_cutoffs, reverse=True): if lfc > cutoff: return f"up{cutoff}" return "up" else: - for cutoff in sorted(lfc_cutoffs): - if lfc < cutoff: + for cutoff in sorted(lfc_cutoffs, reverse=True): + if lfc < -cutoff: return f"down{cutoff}" return "down" else: