From 49776bd16335f2205d2963852cb11f4b4a67aec2 Mon Sep 17 00:00:00 2001
From: Blaise Li <blaise.li__git@nsup.org>
Date: Wed, 19 Feb 2020 14:54:21 +0100
Subject: [PATCH] Catch errors when correlating small data.

This could be happening for a transgene, alone in its category
(impossible to do statistics on one gene only).
---
 RNA_Seq_Cecere/RNA-seq.snakefile | 36 +++++++++++++++++++++-----------
 1 file changed, 24 insertions(+), 12 deletions(-)

diff --git a/RNA_Seq_Cecere/RNA-seq.snakefile b/RNA_Seq_Cecere/RNA-seq.snakefile
index c8cf218..25342af 100644
--- a/RNA_Seq_Cecere/RNA-seq.snakefile
+++ b/RNA_Seq_Cecere/RNA-seq.snakefile
@@ -1243,11 +1243,17 @@ rule compare_replicates:
         log_data = np.log10(data)
         log_data[log_data == -np.inf] = np.nan
         with open(output.corr_file, "w") as corr_file:
-            # TODO: Add normality test and/or log-transform the data?
+            # Add normality test and/or log-transform the data
             normality = {}
             log_normality = {}
             for rep in data.columns:
-                _, normality[rep] = shapiro(data[rep])
+                try:
+                    _, normality[rep] = shapiro(data[rep])
+                except ValueError as err:
+                    if str(err) == "Data must be at least length 3.":
+                        normality[rep] = np.NAN
+                    else:
+                        raise
                 try:
                     _, log_normality[rep] = shapiro(log_data[rep].dropna())
                 except ValueError as err:
@@ -1259,22 +1265,28 @@ rule compare_replicates:
             for (rep_a, rep_b) in combinations(data.columns, 2):
                 # https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.pearsonr.html
                 # Strictly speaking, Pearson’s correlation requires that each dataset be normally distributed.
-                if normality[rep_a] < 0.05:
-                    if log_normality[rep_a]:
-                        warn_a = "(*)"
-                    else:
+                if np.isnan(normality[rep_a]) or normality[rep_a] < 0.05:
+                    if np.isnan(log_normality[rep_a]) or log_normality[rep_a] < 0.05:
                         warn_a = "(**)"
+                    else:
+                        warn_a = "(*)"
                 else:
                     warn_a = ""
-                if normality[rep_b] < 0.05:
-                    if log_normality[rep_b]:
-                        warn_b = "(*)"
-                    else:
+                if np.isnan(normality[rep_b]) or normality[rep_b] < 0.05:
+                    if np.isnan(log_normality[rep_b]) or log_normality[rep_b] < 0.05:
                         warn_b = "(**)"
+                    else:
+                        warn_b = "(*)"
                 else:
                     warn_b = ""
-                (r_val, p_val) = pearsonr(data[rep_a], data[rep_b])
-                corr_file.write(f"{rep_a}{warn_a}_vs_{rep_b}{warn_b}\t{r_val} ({p_val})\n")
+                try:
+                    (r_val, p_val) = pearsonr(data[rep_a], data[rep_b])
+                    corr_file.write(f"{rep_a}{warn_a}_vs_{rep_b}{warn_b}\t{r_val} ({p_val})\n")
+                except ValueError as err:
+                    if str(err) == "x and y must have length at least 2.":
+                        corr_file.write(f"Only {len(data[[rep_a, rep_b]])} points in data.\n")
+                    else:
+                        raise
                 filtered = log_data[[rep_a, rep_b]].dropna()
                 try:
                     (log_r_val, log_p_val) = pearsonr(filtered[rep_a], filtered[rep_b])
-- 
GitLab