Commit 0372a1b8 authored by Blaise Li's avatar Blaise Li
Browse files

Skipping size factor graph for small data.

parent 49776bd1
......@@ -1617,66 +1617,70 @@ rule test_size_factor:
# The filter amounts to counts_data.mean(axis=1) > 4
#np.log10(counts_data[counts_data.sum(axis=1) > 4 * len(counts_data.columns)] + 1).plot.kde()
#np.log10(counts_data[counts_data.prod(axis=1) > 0]).plot.kde()
assert len(counts_data) > 1, "Counts data with only one row cannot have its distribution estimated using KDE."
pp = PdfPages(output.norm_counts_distrib_plot)
for normalizer in params.size_factor_types:
if normalizer == "median_ratio_to_pseudo_ref":
size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data)
else:
size_factors = summaries.loc[normalizer]
by_norm = counts_data / size_factors
data = np.log10(by_norm[counts_data.prod(axis=1) > 0])
try:
xlabel = "log10(normalized counts)"
save_plot(pp, plot_counts_distribution, data, xlabel,
format="pdf",
title=params.counts_distrib_plot_title.format(normalizer))
except TypeError as e:
if str(e) in NO_DATA_ERRS:
warn("\n".join([
"Got TypeError:",
f"{str(e)}",
f"No data to plot for {normalizer}\n"]))
# assert len(counts_data) > 1, "Counts data with only one row cannot have its distribution estimated using KDE."
if len(counts_data) > 1:
pp = PdfPages(output.norm_counts_distrib_plot)
for normalizer in params.size_factor_types:
if normalizer == "median_ratio_to_pseudo_ref":
size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data)
else:
raise
except LinAlgError as e:
if str(e) == "singular matrix":
warn("\n".join([
"Got LinAlgError:", f"{str(e)}",
f"Data cannot be plotted for {normalizer}",
f"{data}\n"]))
else:
raise
except ValueError as e:
if str(e) == "`dataset` input should have multiple elements.":
warn("\n".join([
"Got ValueError:", f"{str(e)}",
f"Data cannot be plotted for {normalizer}",
f"{data}\n"]))
else:
raise
# xlabel = "log10(normalized counts)"
# if len(data) < 2:
# msg = "\n".join([
# "It seems that normalization led to data loss.",
# "Cannot use KDE to estimate distribution."])
# assert len(by_norm) > 1, msg
# msg = "".join([
# f"Only {len(by_norm[counts_data.prod(axis=1) > 0])} rows have no zeros",
# "and can be log-transformed."])
# warnings.warn(
# msg + "\nSkipping %s_%s" % (wildcards.orientation, wildcards.biotype))
# else:
# try:
# save_plot(pp, plot_counts_distribution, data, xlabel,
# format="pdf",
# title="Normalized %s_%s counts distributions\n(size factor: %s)" % (wildcards.orientation, wildcards.biotype, normalizer))
# except np.linalg.linalg.LinAlgError as e:
# msg = "".join([
# "There seems to be a problem with the data.\n",
# "The data matrix has %d lines and %d columns.\n" % (len(data), len(data.columns))])
# warnings.warn(msg + "\nSkipping %s_%s" % (wildcards.orientation, wildcards.biotype))
pp.close()
size_factors = summaries.loc[normalizer]
by_norm = counts_data / size_factors
data = np.log10(by_norm[counts_data.prod(axis=1) > 0])
try:
xlabel = "log10(normalized counts)"
save_plot(pp, plot_counts_distribution, data, xlabel,
format="pdf",
title=params.counts_distrib_plot_title.format(normalizer))
except TypeError as e:
if str(e) in NO_DATA_ERRS:
warn("\n".join([
"Got TypeError:",
f"{str(e)}",
f"No data to plot for {normalizer}\n"]))
else:
raise
except LinAlgError as e:
if str(e) == "singular matrix":
warn("\n".join([
"Got LinAlgError:", f"{str(e)}",
f"Data cannot be plotted for {normalizer}",
f"{data}\n"]))
else:
raise
except ValueError as e:
if str(e) == "`dataset` input should have multiple elements.":
warn("\n".join([
"Got ValueError:", f"{str(e)}",
f"Data cannot be plotted for {normalizer}",
f"{data}\n"]))
else:
raise
# xlabel = "log10(normalized counts)"
# if len(data) < 2:
# msg = "\n".join([
# "It seems that normalization led to data loss.",
# "Cannot use KDE to estimate distribution."])
# assert len(by_norm) > 1, msg
# msg = "".join([
# f"Only {len(by_norm[counts_data.prod(axis=1) > 0])} rows have no zeros",
# "and can be log-transformed."])
# warnings.warn(
# msg + "\nSkipping %s_%s" % (wildcards.orientation, wildcards.biotype))
# else:
# try:
# save_plot(pp, plot_counts_distribution, data, xlabel,
# format="pdf",
# title="Normalized %s_%s counts distributions\n(size factor: %s)" % (wildcards.orientation, wildcards.biotype, normalizer))
# except np.linalg.linalg.LinAlgError as e:
# msg = "".join([
# "There seems to be a problem with the data.\n",
# "The data matrix has %d lines and %d columns.\n" % (len(data), len(data.columns))])
# warnings.warn(msg + "\nSkipping %s_%s" % (wildcards.orientation, wildcards.biotype))
pp.close()
else:
# Make the file empty
open(output.norm_counts_distrib_plot, "w").close()
# TODO: Deal with 0-counts cases:
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment