Skipping size factor graph for small data (counts tables with at most one row); an empty output file is written instead.

0372a1b8 · Blaise Li · 49776bd1 · 0372a1b8
Commit 0372a1b8 authored 5 years ago by Blaise Li
--- a/RNA_Seq_Cecere/RNA-seq.snakefile
+++ b/RNA_Seq_Cecere/RNA-seq.snakefile
@@ -1617,66 +1617,70 @@ rule test_size_factor:
            # The filter amounts to counts_data.mean(axis=1) > 4
            #np.log10(counts_data[counts_data.sum(axis=1) > 4 * len(counts_data.columns)] + 1).plot.kde()
            #np.log10(counts_data[counts_data.prod(axis=1) > 0]).plot.kde()
-            assert len(counts_data) > 1, "Counts data with only one row cannot have its distribution estimated using KDE."
+            # assert len(counts_data) > 1, "Counts data with only one row cannot have its distribution estimated using KDE."
-            pp = PdfPages(output.norm_counts_distrib_plot)
+            if len(counts_data) > 1:
-            for normalizer in params.size_factor_types:
+                pp = PdfPages(output.norm_counts_distrib_plot)
-                if normalizer == "median_ratio_to_pseudo_ref":
+                for normalizer in params.size_factor_types:
-                    size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data)
+                    if normalizer == "median_ratio_to_pseudo_ref":
-                else:
+                        size_factors = median_ratio_to_pseudo_ref_size_factors(counts_data)
-                    size_factors = summaries.loc[normalizer]
-                by_norm = counts_data / size_factors
-                data = np.log10(by_norm[counts_data.prod(axis=1) > 0])
-                try:
-                    xlabel = "log10(normalized counts)"
-                    save_plot(pp, plot_counts_distribution, data, xlabel,
-                        format="pdf",
-                        title=params.counts_distrib_plot_title.format(normalizer))
-                except TypeError as e:
-                    if str(e) in NO_DATA_ERRS:
-                        warn("\n".join([
-                            "Got TypeError:",
-                            f"{str(e)}",
-                            f"No data to plot for {normalizer}\n"]))
                    else:
-                        raise
+                        size_factors = summaries.loc[normalizer]
-                except LinAlgError as e:
+                    by_norm = counts_data / size_factors
-                    if str(e) == "singular matrix":
+                    data = np.log10(by_norm[counts_data.prod(axis=1) > 0])
-                        warn("\n".join([
+                    try:
-                            "Got LinAlgError:", f"{str(e)}",
+                        xlabel = "log10(normalized counts)"
-                            f"Data cannot be plotted for {normalizer}",
+                        save_plot(pp, plot_counts_distribution, data, xlabel,
-                            f"{data}\n"]))
+                            format="pdf",
-                    else:
+                            title=params.counts_distrib_plot_title.format(normalizer))
-                        raise
+                    except TypeError as e:
-                except ValueError as e:
+                        if str(e) in NO_DATA_ERRS:
-                    if str(e) == "`dataset` input should have multiple elements.":
+                            warn("\n".join([
-                        warn("\n".join([
+                                "Got TypeError:",
-                            "Got ValueError:", f"{str(e)}",
+                                f"{str(e)}",
-                            f"Data cannot be plotted for {normalizer}",
+                                f"No data to plot for {normalizer}\n"]))
-                            f"{data}\n"]))
+                        else:
-                    else:
+                            raise
-                        raise
+                    except LinAlgError as e:
-                # xlabel = "log10(normalized counts)"
+                        if str(e) == "singular matrix":
-                # if len(data) < 2:
+                            warn("\n".join([
-                #     msg = "\n".join([
+                                "Got LinAlgError:", f"{str(e)}",
-                #         "It seems that normalization led to data loss.",
+                                f"Data cannot be plotted for {normalizer}",
-                #         "Cannot use KDE to estimate distribution."])
+                                f"{data}\n"]))
-                #     assert len(by_norm) > 1, msg
+                        else:
-                #     msg = "".join([
+                            raise
-                #         f"Only {len(by_norm[counts_data.prod(axis=1) > 0])} rows have no zeros",
+                    except ValueError as e:
-                #         "and can be log-transformed."])
+                        if str(e) == "`dataset` input should have multiple elements.":
-                #     warnings.warn(
+                            warn("\n".join([
-                #         msg + "\nSkipping %s_%s" % (wildcards.orientation, wildcards.biotype))
+                                "Got ValueError:", f"{str(e)}",
-                # else:
+                                f"Data cannot be plotted for {normalizer}",
-                #     try:
+                                f"{data}\n"]))
-                #         save_plot(pp, plot_counts_distribution, data, xlabel,
+                        else:
-                #             format="pdf",
+                            raise
-                #             title="Normalized %s_%s counts distributions\n(size factor: %s)" % (wildcards.orientation, wildcards.biotype, normalizer))
+                    # xlabel = "log10(normalized counts)"
-                #     except np.linalg.linalg.LinAlgError as e:
+                    # if len(data) < 2:
-                #         msg = "".join([
+                    #     msg = "\n".join([
-                #             "There seems to be a problem with the data.\n",
+                    #         "It seems that normalization led to data loss.",
-                #             "The data matrix has %d lines and %d columns.\n" % (len(data), len(data.columns))])
+                    #         "Cannot use KDE to estimate distribution."])
-                #         warnings.warn(msg + "\nSkipping %s_%s" % (wildcards.orientation, wildcards.biotype))
+                    #     assert len(by_norm) > 1, msg
-            pp.close()
+                    #     msg = "".join([
+                    #         f"Only {len(by_norm[counts_data.prod(axis=1) > 0])} rows have no zeros",
+                    #         "and can be log-transformed."])
+                    #     warnings.warn(
+                    #         msg + "\nSkipping %s_%s" % (wildcards.orientation, wildcards.biotype))
+                    # else:
+                    #     try:
+                    #         save_plot(pp, plot_counts_distribution, data, xlabel,
+                    #             format="pdf",
+                    #             title="Normalized %s_%s counts distributions\n(size factor: %s)" % (wildcards.orientation, wildcards.biotype, normalizer))
+                    #     except np.linalg.linalg.LinAlgError as e:
+                    #         msg = "".join([
+                    #             "There seems to be a problem with the data.\n",
+                    #             "The data matrix has %d lines and %d columns.\n" % (len(data), len(data.columns))])
+                    #         warnings.warn(msg + "\nSkipping %s_%s" % (wildcards.orientation, wildcards.biotype))
+                pp.close()
+            else:
+                # Make the file empty
+                open(output.norm_counts_distrib_plot, "w").close()
 # TODO: Deal with 0-counts cases: