Commit e162df3d authored by Blaise Li's avatar Blaise Li
Browse files

Make valid pdfs instead of empty files.

parent 0372a1b8
......@@ -1680,7 +1680,8 @@ rule test_size_factor:
pp.close()
else:
# Make the file empty
open(output.norm_counts_distrib_plot, "w").close()
warn(f"Not enough data to make this file:\n{output.norm_counts_distrib_plot}\n")
plot_text(output.norm_counts_distrib_plot, "Not enough data.", params.counts_distrib_plot_title)
# TODO: Deal with 0-counts cases:
......
......@@ -96,7 +96,7 @@ import pysam
import pyBigWig
# To compute correlation coefficient
from scipy.stats.stats import pearsonr
# from scipy.stats.stats import pearsonr
# To catch errors when plotting KDE
from scipy.linalg import LinAlgError
# For data processing and displaying
......@@ -1649,54 +1649,59 @@ rule test_size_factor:
# The filter amounts to counts_data.mean(axis=1) > 4
#np.log10(counts_data[counts_data.sum(axis=1) > 4 * len(counts_data.columns)] + 1).plot.kde()
#np.log10(counts_data[counts_data.prod(axis=1) > 0]).plot.kde()
assert len(counts_data) > 1, "Counts data with only one row cannot have its distribution estimated using KDE."
pp = PdfPages(output.norm_counts_distrib_plot)
for normalizer in params.size_factor_types:
if normalizer == "median_ratio_to_pseudo_ref":
## Adapted from DESeq paper (doi:10.1186/gb-2010-11-10-r106) but
## add pseudo-count to compute the geometric mean, then remove it
##pseudo_ref = (counts_data + 1).apply(gmean, axis=1) - 1
## Ignore lines with zeroes instead:
#pseudo_ref = (counts_data[counts_data.prod(axis=1) > 0]).apply(gmean, axis=1)
#def median_ratio_to_pseudo_ref(col):
# return (col / pseudo_ref).median()
##size_factors = counts_data.apply(median_ratio_to_pseudo_ref, axis=0)
#size_factors = counts_data[counts_data.prod(axis=1) > 0].apply(median_ratio_to_pseudo_ref, axis=0)
size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data)
else:
size_factors = summaries.loc[normalizer]
by_norm = counts_data / size_factors
data = np.log10(by_norm[counts_data.prod(axis=1) > 0])
try:
xlabel = "log10(normalized counts)"
save_plot(pp, plot_counts_distribution, data, xlabel,
format="pdf",
title=params.counts_distrib_plot_title.format(normalizer))
except TypeError as e:
if str(e) in NO_DATA_ERRS:
warn("\n".join([
"Got TypeError:",
f"{str(e)}",
f"No data to plot for {normalizer}\n"]))
else:
raise
except LinAlgError as e:
if str(e) == "singular matrix":
warn("\n".join([
"Got LinAlgError:", f"{str(e)}",
f"Data cannot be plotted for {normalizer}",
f"{data}\n"]))
else:
raise
except ValueError as e:
if str(e) == "`dataset` input should have multiple elements.":
warn("\n".join([
"Got ValueError:", f"{str(e)}",
f"Data cannot be plotted for {normalizer}",
f"{data}\n"]))
# assert len(counts_data) > 1, "Counts data with only one row cannot have its distribution estimated using KDE."
if len(counts_data) > 1:
pp = PdfPages(output.norm_counts_distrib_plot)
for normalizer in params.size_factor_types:
if normalizer == "median_ratio_to_pseudo_ref":
## Adapted from DESeq paper (doi:10.1186/gb-2010-11-10-r106) but
## add pseudo-count to compute the geometric mean, then remove it
##pseudo_ref = (counts_data + 1).apply(gmean, axis=1) - 1
## Ignore lines with zeroes instead:
#pseudo_ref = (counts_data[counts_data.prod(axis=1) > 0]).apply(gmean, axis=1)
#def median_ratio_to_pseudo_ref(col):
# return (col / pseudo_ref).median()
##size_factors = counts_data.apply(median_ratio_to_pseudo_ref, axis=0)
#size_factors = counts_data[counts_data.prod(axis=1) > 0].apply(median_ratio_to_pseudo_ref, axis=0)
size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data)
else:
raise
pp.close()
size_factors = summaries.loc[normalizer]
by_norm = counts_data / size_factors
data = np.log10(by_norm[counts_data.prod(axis=1) > 0])
try:
xlabel = "log10(normalized counts)"
save_plot(pp, plot_counts_distribution, data, xlabel,
format="pdf",
title=params.counts_distrib_plot_title.format(normalizer))
except TypeError as e:
if str(e) in NO_DATA_ERRS:
warn("\n".join([
"Got TypeError:",
f"{str(e)}",
f"No data to plot for {normalizer}\n"]))
else:
raise
except LinAlgError as e:
if str(e) == "singular matrix":
warn("\n".join([
"Got LinAlgError:", f"{str(e)}",
f"Data cannot be plotted for {normalizer}",
f"{data}\n"]))
else:
raise
except ValueError as e:
if str(e) == "`dataset` input should have multiple elements.":
warn("\n".join([
"Got ValueError:", f"{str(e)}",
f"Data cannot be plotted for {normalizer}",
f"{data}\n"]))
else:
raise
pp.close()
else:
# Make the file empty
warn(f"Not enough data to make this file:\n{output.norm_counts_distrib_plot}\n")
plot_text(output.norm_counts_distrib_plot, "Not enough data.", params.counts_distrib_plot_title)
rule standardize_counts:
......
......@@ -141,7 +141,7 @@ from mappy import fastx_read
import pyBigWig
# To compute correlation coefficient
from scipy.stats.stats import pearsonr
# from scipy.stats.stats import pearsonr
# To catch errors when plotting KDE
from scipy.linalg import LinAlgError
# For data processing and displaying
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment