Commit a872a0e1 authored by Blaise Li's avatar Blaise Li
Browse files

Switched to DESeq2-reported log2FoldChange

lfcMLE may bring problems with low-expressed genes.

Also:
- Boxplots with labels turned 90 degrees
- do_deseq2 returns size factors
parent c0da7521
from .libhts import do_deseq2, median_ratio_to_pseudo_ref_size_factors, plot_counts_distribution, plot_lfc_distribution, plot_MA, plot_norm_correlations, size_factor_correlations, status_setter
from .libhts import do_deseq2, median_ratio_to_pseudo_ref_size_factors, plot_boxplots, plot_counts_distribution, plot_lfc_distribution, plot_MA, plot_norm_correlations, size_factor_correlations, status_setter
......@@ -70,6 +70,11 @@ def do_deseq2(cond_names, conditions, counts_data,
#raise
else:
raise
size_factors = pandas2ri.ri2py(as_df(deseq2.sizeFactors_DESeqDataSet(dds)))
#for cond in cond_names:
# #s = size_factors.loc[cond][0]
# #(*_, s) = size_factors.loc[cond]
#pd.DataFrame({cond : size_factors.loc[cond][0] for cond in COND_NAMES}, index=('size_factor',))
try:
dds = deseq2.estimateDispersions_DESeqDataSet(dds, fitType="parametric")
except RRuntimeError as e:
......@@ -96,7 +101,7 @@ def do_deseq2(cond_names, conditions, counts_data,
addMLE=deseq2_args["addMLE"],
independentFiltering=deseq2_args["independentFiltering"])))
res.index = counts_data.index
return res
return res, {cond : size_factors.loc[cond][0] for cond in cond_names}
def median_ratio_to_pseudo_ref_size_factors(counts_data):
......@@ -152,13 +157,22 @@ def plot_counts_distribution(data, xlabel):
ax.set_xlabel(xlabel)
def plot_boxplots(data, ylabel):
ax = data.plot.box()
ax.set_ylabel(ylabel)
for label in ax.get_xticklabels():
label.set_rotation(90)
# Cutoffs in log fold change
LFC_CUTOFFS = [0.5, 1, 2]
UP_STATUSES = [f"up{cutoff}" for cutoff in LFC_CUTOFFS]
DOWN_STATUSES = [f"down{cutoff}" for cutoff in LFC_CUTOFFS]
def status_setter(lfc_cutoffs=None):
def status_setter(lfc_cutoffs=None, fold_type="log2FoldChange"):
"""*fold_type* can also be "lfcMLE", which is based on uncorrected values.
This may not be good for genes with low expression levels."""
if lfc_cutoffs is None:
lfc_cutoffs = LFC_CUTOFFS
def set_status(row):
......@@ -166,7 +180,7 @@ def status_setter(lfc_cutoffs=None):
row of a deseq2 results table."""
if row["padj"] < 0.05:
#if row["log2FoldChange"] > 0:
lfc = row["lfcMLE"]
lfc = row[fold_type]
if lfc > 0:
# Start from the highest cutoff,
# and decrease until below lfc
......@@ -199,11 +213,17 @@ DE2COLOUR = {
"NS" : "0.85"}
def plot_lfc_distribution(res, contrast):
lfc = res.lfcMLE.dropna()
def plot_lfc_distribution(res, contrast, fold_type=None):
"""*fold_type* is "log2FoldChange" by default.
It can also be "lfcMLE", which is based on uncorrected values.
This may not be good for genes with low expression levels."""
#lfc = res.lfcMLE.dropna()
if fold_type is None:
fold_type = "log2FoldChange"
lfc = getattr(res, fold_type).dropna()
lfc.name = contrast
ax = sns.kdeplot(lfc)
ax.set_xlabel("lfcMLE")
ax.set_xlabel(fold_type)
ax.set_ylabel("frequency")
......@@ -218,23 +238,29 @@ def plot_MA(res,
grouping=None,
group2colour=None,
mean_range=None,
lfc_range=None):
lfc_range=None,
fold_type=None):
"""*fold_type* is "log2FoldChange" by default.
It can also be "lfcMLE", which is based on uncorrected values.
This may not be good for genes with low expression levels."""
fig, ax = plt.subplots()
# Make a column indicating whether the gene is DE or NS
res = res.assign(is_DE=res.apply(set_de_status, axis=1))
if fold_type is None:
fold_type = "log2FoldChange"
# First plot the data in grey and black
for de_status, group in res.groupby("is_DE"):
group.plot.scatter(x="baseMean", y="lfcMLE", s=2, logx=True, c=DE2COLOUR[de_status], label=f"{de_status} ({len(group)})", ax=ax)
group.plot.scatter(x="baseMean", y=fold_type, s=2, logx=True, c=DE2COLOUR[de_status], label=f"{de_status} ({len(group)})", ax=ax)
if grouping is not None:
if isinstance(grouping, str):
# Overlay colours based on the "grouping" column
if group2colour is None:
group2colour = STATUS2COLOUR
for status, group in res.groupby(grouping):
group.plot.scatter(x="baseMean", y="lfcMLE", s=1, logx=True, c=group2colour[status], label=f"{status} ({len(group)})", ax=ax)
group.plot.scatter(x="baseMean", y=fold_type, s=1, logx=True, c=group2colour[status], label=f"{status} ({len(group)})", ax=ax)
else:
(status, colour) = group2colour
res.ix[grouping].plot.scatter(x="baseMean", y="lfcMLE", s=1, logx=True, c=colour, label=f"{status} ({len(grouping)})", ax=ax)
res.ix[grouping].plot.scatter(x="baseMean", y=fold_type, s=1, logx=True, c=colour, label=f"{status} ({len(grouping)})", ax=ax)
ax.axhline(y=1, linewidth=0.5, color="0.5", linestyle="dashed")
ax.axhline(y=-1, linewidth=0.5, color="0.5", linestyle="dashed")
# TODO: check data basemean range
......@@ -242,10 +268,10 @@ def plot_MA(res,
ax.set_xlim(mean_range)
if lfc_range is not None:
(lfc_min, lfc_max) = lfc_range
lfcMLE_min = res.lfcMLE.min()
lfcMLE_max = res.lfcMLE.max()
if (lfcMLE_min < lfc_min) or (lfcMLE_max > lfc_max):
warnings.warn(f"Cannot plot lfcMLE data ([{lfcMLE_min}, {lfcMLE_max}]) in requested range ([{lfc_min}, {lfc_max}])\n")
lfc_here_min = getattr(res, fold_type).min()
lfc_here_max = getattr(res, fold_type).max()
if (lfc_here_min < lfc_min) or (lfc_here_max > lfc_max):
warnings.warn(f"Cannot plot {fold_type} data ([{lfc_here_min}, {lfc_here_max}]) in requested range ([{lfc_min}, {lfc_max}])\n")
else:
ax.set_ylim(lfc_range)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment