Skip to content
Snippets Groups Projects
Commit 0372a1b8 authored by Blaise Li
Browse files

Skipping size factor graph for small data.

parent 49776bd1
No related branches found
No related tags found
No related merge requests found
...@@ -1617,66 +1617,70 @@ rule test_size_factor: ...@@ -1617,66 +1617,70 @@ rule test_size_factor:
# The filter amounts to counts_data.mean(axis=1) > 4 # The filter amounts to counts_data.mean(axis=1) > 4
#np.log10(counts_data[counts_data.sum(axis=1) > 4 * len(counts_data.columns)] + 1).plot.kde() #np.log10(counts_data[counts_data.sum(axis=1) > 4 * len(counts_data.columns)] + 1).plot.kde()
#np.log10(counts_data[counts_data.prod(axis=1) > 0]).plot.kde() #np.log10(counts_data[counts_data.prod(axis=1) > 0]).plot.kde()
assert len(counts_data) > 1, "Counts data with only one row cannot have its distribution estimated using KDE." # assert len(counts_data) > 1, "Counts data with only one row cannot have its distribution estimated using KDE."
pp = PdfPages(output.norm_counts_distrib_plot) if len(counts_data) > 1:
for normalizer in params.size_factor_types: pp = PdfPages(output.norm_counts_distrib_plot)
if normalizer == "median_ratio_to_pseudo_ref": for normalizer in params.size_factor_types:
size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data) if normalizer == "median_ratio_to_pseudo_ref":
else: size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data)
size_factors = summaries.loc[normalizer]
by_norm = counts_data / size_factors
data = np.log10(by_norm[counts_data.prod(axis=1) > 0])
try:
xlabel = "log10(normalized counts)"
save_plot(pp, plot_counts_distribution, data, xlabel,
format="pdf",
title=params.counts_distrib_plot_title.format(normalizer))
except TypeError as e:
if str(e) in NO_DATA_ERRS:
warn("\n".join([
"Got TypeError:",
f"{str(e)}",
f"No data to plot for {normalizer}\n"]))
else: else:
raise size_factors = summaries.loc[normalizer]
except LinAlgError as e: by_norm = counts_data / size_factors
if str(e) == "singular matrix": data = np.log10(by_norm[counts_data.prod(axis=1) > 0])
warn("\n".join([ try:
"Got LinAlgError:", f"{str(e)}", xlabel = "log10(normalized counts)"
f"Data cannot be plotted for {normalizer}", save_plot(pp, plot_counts_distribution, data, xlabel,
f"{data}\n"])) format="pdf",
else: title=params.counts_distrib_plot_title.format(normalizer))
raise except TypeError as e:
except ValueError as e: if str(e) in NO_DATA_ERRS:
if str(e) == "`dataset` input should have multiple elements.": warn("\n".join([
warn("\n".join([ "Got TypeError:",
"Got ValueError:", f"{str(e)}", f"{str(e)}",
f"Data cannot be plotted for {normalizer}", f"No data to plot for {normalizer}\n"]))
f"{data}\n"])) else:
else: raise
raise except LinAlgError as e:
# xlabel = "log10(normalized counts)" if str(e) == "singular matrix":
# if len(data) < 2: warn("\n".join([
# msg = "\n".join([ "Got LinAlgError:", f"{str(e)}",
# "It seems that normalization led to data loss.", f"Data cannot be plotted for {normalizer}",
# "Cannot use KDE to estimate distribution."]) f"{data}\n"]))
# assert len(by_norm) > 1, msg else:
# msg = "".join([ raise
# f"Only {len(by_norm[counts_data.prod(axis=1) > 0])} rows have no zeros", except ValueError as e:
# "and can be log-transformed."]) if str(e) == "`dataset` input should have multiple elements.":
# warnings.warn( warn("\n".join([
# msg + "\nSkipping %s_%s" % (wildcards.orientation, wildcards.biotype)) "Got ValueError:", f"{str(e)}",
# else: f"Data cannot be plotted for {normalizer}",
# try: f"{data}\n"]))
# save_plot(pp, plot_counts_distribution, data, xlabel, else:
# format="pdf", raise
# title="Normalized %s_%s counts distributions\n(size factor: %s)" % (wildcards.orientation, wildcards.biotype, normalizer)) # xlabel = "log10(normalized counts)"
# except np.linalg.linalg.LinAlgError as e: # if len(data) < 2:
# msg = "".join([ # msg = "\n".join([
# "There seems to be a problem with the data.\n", # "It seems that normalization led to data loss.",
# "The data matrix has %d lines and %d columns.\n" % (len(data), len(data.columns))]) # "Cannot use KDE to estimate distribution."])
# warnings.warn(msg + "\nSkipping %s_%s" % (wildcards.orientation, wildcards.biotype)) # assert len(by_norm) > 1, msg
pp.close() # msg = "".join([
# f"Only {len(by_norm[counts_data.prod(axis=1) > 0])} rows have no zeros",
# "and can be log-transformed."])
# warnings.warn(
# msg + "\nSkipping %s_%s" % (wildcards.orientation, wildcards.biotype))
# else:
# try:
# save_plot(pp, plot_counts_distribution, data, xlabel,
# format="pdf",
# title="Normalized %s_%s counts distributions\n(size factor: %s)" % (wildcards.orientation, wildcards.biotype, normalizer))
# except np.linalg.linalg.LinAlgError as e:
# msg = "".join([
# "There seems to be a problem with the data.\n",
# "The data matrix has %d lines and %d columns.\n" % (len(data), len(data.columns))])
# warnings.warn(msg + "\nSkipping %s_%s" % (wildcards.orientation, wildcards.biotype))
pp.close()
else:
# Make the file empty
open(output.norm_counts_distrib_plot, "w").close()
# TODO: Deal with 0-counts cases: # TODO: Deal with 0-counts cases:
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment