From d5d6959f0a48f4ecf93ec1a9908812d8283cc982 Mon Sep 17 00:00:00 2001
From: Blaise Li <blaise.li__git@nsup.org>
Date: Fri, 1 Feb 2019 15:57:42 +0100
Subject: [PATCH] Better handle errors due to missing data.

---
 small_RNA-seq/small_RNA-seq.snakefile | 120 ++++++++++++++++----------
 1 file changed, 74 insertions(+), 46 deletions(-)

diff --git a/small_RNA-seq/small_RNA-seq.snakefile b/small_RNA-seq/small_RNA-seq.snakefile
index 1c38395..e532bf4 100644
--- a/small_RNA-seq/small_RNA-seq.snakefile
+++ b/small_RNA-seq/small_RNA-seq.snakefile
@@ -2400,39 +2400,51 @@ rule make_fold_heatmap:
         all_folds = source_gathered_folds,
     output:
         fold_heatmap = OPJ(output_dir, "figures", "fold_heatmaps", "{small_type}_{fold_type}_heatmap.pdf"),
+    log:
+        warnings = OPJ(log_dir, "make_fold_heatmap", "{small_type}_{fold_type}.warnings"),
     benchmark:
         OPJ(log_dir, "make_fold_heatmap", "{small_type}_{fold_type}_benchmark.txt"),
     threads: 16  # to limit memory usage, actually
     resources:
         mem_mb=8150
     run:
-        usetex = mpl.rcParams.get("text.usetex", False)
-        all_folds = pd.read_table(input.all_folds, index_col=["gene"])
-        #gene_colours = all_folds.small_type.map(small_type2colour)
-        # Faster
-        gene_colours = all_folds.apply(small_type_colour_setter, axis=1)
-        if usetex:
-            gene_colours.name = texscape(f"{wildcards.small_type}_{wildcards.fold_type}")
-        else:
-            gene_colours.name = f"{wildcards.small_type}_{wildcards.fold_type}"
-        #all_folds.drop(["cosmid", "name", "small_type"], axis=1)
-        #all_folds[all_folds.columns.difference(["cosmid", "name", "small_type"])]
-        # https://github.com/mwaskom/seaborn/issues/1262
-        #mpl.use("agg")
-        try:
-            #labels_dict = merge_with(tuple, small_type2colour, DefaultCounter(all_folds.small_type))
-            # https://stackoverflow.com/a/47396625/1878788
-            small_type_counts = Counter(small_type2colour.keys())
-            small_type_counts.update(all_folds.small_type)
-            labels_dict = {small_type : (small_type2colour[small_type], count - 1) for (small_type, count) in small_type_counts.items()}
-            save_plot(
-                output.fold_heatmap, plot_fold_heatmap,
-                all_folds.drop(["cosmid", "name", "small_type"], axis=1),
-                gene_colours, labels_dict, tight=False, rasterize=True)
-        except ValueError as e:
-            print(labels_dict)
-            raise
-        #mpl.use("PDF")
+        with warn_context(log.warnings) as warn:  # warn() presumably records into log.warnings -- TODO confirm
+            usetex = mpl.rcParams.get("text.usetex", False)
+            all_folds = pd.read_table(input.all_folds, index_col=["gene"])
+            #gene_colours = all_folds.small_type.map(small_type2colour)
+            # Faster than the commented-out Series.map variant above
+            gene_colours = all_folds.apply(small_type_colour_setter, axis=1)
+            if usetex:
+                gene_colours.name = texscape(f"{wildcards.small_type}_{wildcards.fold_type}")
+            else:
+                gene_colours.name = f"{wildcards.small_type}_{wildcards.fold_type}"
+            #all_folds.drop(["cosmid", "name", "small_type"], axis=1)
+            #all_folds[all_folds.columns.difference(["cosmid", "name", "small_type"])]
+            #labels_dict = merge_with(tuple, small_type2colour, DefaultCounter(all_folds.small_type))
+            # https://stackoverflow.com/a/47396625/1878788
+            small_type_counts = Counter(small_type2colour.keys())  # seed every known type with a count of 1
+            small_type_counts.update(all_folds.small_type)  # add real occurrences; the seed is subtracted below
+            labels_dict = {small_type : (small_type2colour[small_type], count - 1) for (small_type, count) in small_type_counts.items()}
+            # https://github.com/mwaskom/seaborn/issues/1262
+            #mpl.use("agg")
+            try:  # only the plotting call is guarded, so labels_dict is always bound in the handler
+                save_plot(
+                    output.fold_heatmap, plot_fold_heatmap,
+                    all_folds.drop(["cosmid", "name", "small_type"], axis=1),
+                    gene_colours, labels_dict, tight=False, rasterize=True)
+            except ValueError as err:
+                if str(err) == "The number of observations cannot be determined on an empty distance matrix.":
+                    warn("\n".join([
+                        "Got ValueError:",
+                        str(err),
+                        f"No data to plot in {output.fold_heatmap}\n"]))
+                    warn("Generating empty file.\n")
+                    # Create/truncate the declared output so it exists even though nothing was plotted
+                    open(output.fold_heatmap, "w").close()
+                else:
+                    print(labels_dict)
+                    raise
+            #mpl.use("PDF")
 
 
 #def plot_norm_counts(counts_data, summaries):
@@ -3822,30 +3834,46 @@ rule make_gene_list_lfc_boxplots:
     output:
         boxplots = OPJ(output_dir, "figures", "{contrast}",
             "{contrast}_{small_type}_{fold_type}_{gene_list}_boxplots.pdf")
+    log:
+        warnings = OPJ(log_dir, "make_gene_list_lfc_boxplots", "{contrast}_{small_type}_{fold_type}_{gene_list}.warnings"),
     params:
         id_lists = set_id_lists,
     run:
-        lfcs_dict = {}
-        lfc_data = pd.read_table(input.data, index_col="gene")
-        for (list_name, id_list) in params.id_lists.items():
+        with warn_context(log.warnings) as warn:  # warn() presumably records into log.warnings -- TODO confirm
+            lfcs_dict = {}
+            lfc_data = pd.read_table(input.data, index_col="gene")
+            for (list_name, id_list) in params.id_lists.items():
+                try:
+                    selected_rows = lfc_data.loc[lfc_data.index.intersection(id_list)]
+                except TypeError:  # debug aid: dump the offending inputs, then re-raise
+                    print(params.id_lists)
+                    print(type(id_list))
+                    print(lfc_data.index.intersection(id_list))
+                    raise
+                selected_data = selected_rows[wildcards.fold_type]
+                lfcs_dict[list_name] = selected_data
+            lfcs = pd.DataFrame(lfcs_dict)
+            # lfcs = pd.DataFrame(
+            #     {list_name : lfc_data.loc[set(id_list)][wildcards.fold_type] for (
+            #         list_name, id_list) in params.id_lists.items()})
+            title = f"{wildcards.small_type} folds for {wildcards.contrast}"
             try:
-                selected_rows = lfc_data.loc[lfc_data.index.intersection(id_list)] 
+                save_plot(
+                    output.boxplots,
+                    plot_boxplots,
+                    lfcs, wildcards.fold_type,
+                    title=title)
             except TypeError as err:
-                print(params.id_lists)
-                print(type(id_list))
-                print(lfc_data.index.intersection(id_list))
-                raise
-            selected_data = selected_rows[wildcards.fold_type]
-            lfcs_dict[list_name] = selected_data
-        lfcs = pd.DataFrame(lfcs_dict)
-        # lfcs = pd.DataFrame(
-        #     {list_name : lfc_data.loc[set(id_list)][wildcards.fold_type] for (
-        #         list_name, id_list) in params.id_lists.items()})
-        save_plot(
-            output.boxplots,
-            plot_boxplots,
-            lfcs, wildcards.fold_type,
-            title=f"{wildcards.small_type} folds for {wildcards.contrast}")
+                if str(err) == "Empty 'DataFrame': no numeric data to plot":
+                    warn("\n".join([
+                        "Got TypeError:",
+                        str(err),
+                        f"No data to plot for {title}\n"]))
+                    warn("Generating empty file.\n")
+                    # Create/truncate the declared output so it exists even though nothing was plotted
+                    open(output.boxplots, "w").close()
+                else:
+                    raise
 
 
 rule make_contrast_lfc_boxplots:
-- 
GitLab