diff --git a/RNA_Seq_Cecere/RNA-seq.snakefile b/RNA_Seq_Cecere/RNA-seq.snakefile index 25342af7721a6d2915279193c7102f140dee1bdb..a89dff76d29155e145f40694b479387a1f770586 100644 --- a/RNA_Seq_Cecere/RNA-seq.snakefile +++ b/RNA_Seq_Cecere/RNA-seq.snakefile @@ -1617,66 +1617,70 @@ rule test_size_factor: # The filter amounts to counts_data.mean(axis=1) > 4 #np.log10(counts_data[counts_data.sum(axis=1) > 4 * len(counts_data.columns)] + 1).plot.kde() #np.log10(counts_data[counts_data.prod(axis=1) > 0]).plot.kde() - assert len(counts_data) > 1, "Counts data with only one row cannot have its distribution estimated using KDE." - pp = PdfPages(output.norm_counts_distrib_plot) - for normalizer in params.size_factor_types: - if normalizer == "median_ratio_to_pseudo_ref": - size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data) - else: - size_factors = summaries.loc[normalizer] - by_norm = counts_data / size_factors - data = np.log10(by_norm[counts_data.prod(axis=1) > 0]) - try: - xlabel = "log10(normalized counts)" - save_plot(pp, plot_counts_distribution, data, xlabel, - format="pdf", - title=params.counts_distrib_plot_title.format(normalizer)) - except TypeError as e: - if str(e) in NO_DATA_ERRS: - warn("\n".join([ - "Got TypeError:", - f"{str(e)}", - f"No data to plot for {normalizer}\n"])) + # assert len(counts_data) > 1, "Counts data with only one row cannot have its distribution estimated using KDE." + if len(counts_data) > 1: + pp = PdfPages(output.norm_counts_distrib_plot) + for normalizer in params.size_factor_types: + if normalizer == "median_ratio_to_pseudo_ref": + size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data) else: - raise - except LinAlgError as e: - if str(e) == "singular matrix": - warn("\n".join([ - "Got LinAlgError:", f"{str(e)}", - f"Data cannot be plotted for {normalizer}", - f"{data}\n"])) - else: - raise - except ValueError as e: - if str(e) == "`dataset` input should have multiple elements.": - warn("\n".join([ - "Got ValueError:", f"{str(e)}", - f"Data cannot be plotted for {normalizer}", - f"{data}\n"])) - else: - raise - # xlabel = "log10(normalized counts)" - # if len(data) < 2: - # msg = "\n".join([ - # "It seems that normalization led to data loss.", - # "Cannot use KDE to estimate distribution."]) - # assert len(by_norm) > 1, msg - # msg = "".join([ - # f"Only {len(by_norm[counts_data.prod(axis=1) > 0])} rows have no zeros", - # "and can be log-transformed."]) - # warnings.warn( - # msg + "\nSkipping %s_%s" % (wildcards.orientation, wildcards.biotype)) - # else: - # try: - # save_plot(pp, plot_counts_distribution, data, xlabel, - # format="pdf", - # title="Normalized %s_%s counts distributions\n(size factor: %s)" % (wildcards.orientation, wildcards.biotype, normalizer)) - # except np.linalg.linalg.LinAlgError as e: - # msg = "".join([ - # "There seems to be a problem with the data.\n", - # "The data matrix has %d lines and %d columns.\n" % (len(data), len(data.columns))]) - # warnings.warn(msg + "\nSkipping %s_%s" % (wildcards.orientation, wildcards.biotype)) - pp.close() + size_factors = summaries.loc[normalizer] + by_norm = counts_data / size_factors + data = np.log10(by_norm[counts_data.prod(axis=1) > 0]) + try: + xlabel = "log10(normalized counts)" + save_plot(pp, plot_counts_distribution, data, xlabel, + format="pdf", + title=params.counts_distrib_plot_title.format(normalizer)) + except TypeError as e: + if str(e) in NO_DATA_ERRS: + warn("\n".join([ + "Got TypeError:", + f"{str(e)}", + f"No data to plot for {normalizer}\n"])) + else: + raise + except LinAlgError as e: + if str(e) == "singular matrix": + warn("\n".join([ + "Got LinAlgError:", f"{str(e)}", + f"Data cannot be plotted for {normalizer}", + f"{data}\n"])) + else: + raise + except ValueError as e: + if str(e) == "`dataset` input should have multiple elements.": + warn("\n".join([ + "Got ValueError:", f"{str(e)}", + f"Data cannot be plotted for {normalizer}", + f"{data}\n"])) + else: + raise + # xlabel = "log10(normalized counts)" + # if len(data) < 2: + # msg = "\n".join([ + # "It seems that normalization led to data loss.", + # "Cannot use KDE to estimate distribution."]) + # assert len(by_norm) > 1, msg + # msg = "".join([ + # f"Only {len(by_norm[counts_data.prod(axis=1) > 0])} rows have no zeros", + # "and can be log-transformed."]) + # warnings.warn( + # msg + "\nSkipping %s_%s" % (wildcards.orientation, wildcards.biotype)) + # else: + # try: + # save_plot(pp, plot_counts_distribution, data, xlabel, + # format="pdf", + # title="Normalized %s_%s counts distributions\n(size factor: %s)" % (wildcards.orientation, wildcards.biotype, normalizer)) + # except np.linalg.linalg.LinAlgError as e: + # msg = "".join([ + # "There seems to be a problem with the data.\n", + # "The data matrix has %d lines and %d columns.\n" % (len(data), len(data.columns))]) + # warnings.warn(msg + "\nSkipping %s_%s" % (wildcards.orientation, wildcards.biotype)) + pp.close() + else: + # Make the file empty + open(output.norm_counts_distrib_plot, "w").close() # TODO: Deal with 0-counts cases: