From 49776bd16335f2205d2963852cb11f4b4a67aec2 Mon Sep 17 00:00:00 2001 From: Blaise Li <blaise.li__git@nsup.org> Date: Wed, 19 Feb 2020 14:54:21 +0100 Subject: [PATCH] Catch errors when correlating small data. This could be happening for a transgene, alone in its category (impossible to do statistics on one gene only). --- RNA_Seq_Cecere/RNA-seq.snakefile | 36 +++++++++++++++++++++----------- 1 file changed, 24 insertions(+), 12 deletions(-) diff --git a/RNA_Seq_Cecere/RNA-seq.snakefile b/RNA_Seq_Cecere/RNA-seq.snakefile index c8cf218..25342af 100644 --- a/RNA_Seq_Cecere/RNA-seq.snakefile +++ b/RNA_Seq_Cecere/RNA-seq.snakefile @@ -1243,11 +1243,17 @@ rule compare_replicates: log_data = np.log10(data) log_data[log_data == -np.inf] = np.nan with open(output.corr_file, "w") as corr_file: - # TODO: Add normality test and/or log-transform the data? + # Add normality test and/or log-transform the data normality = {} log_normality = {} for rep in data.columns: - _, normality[rep] = shapiro(data[rep]) + try: + _, normality[rep] = shapiro(data[rep]) + except ValueError as err: + if str(err) == "Data must be at least length 3.": + normality[rep] = np.NAN + else: + raise try: _, log_normality[rep] = shapiro(log_data[rep].dropna()) except ValueError as err: @@ -1259,22 +1265,28 @@ rule compare_replicates: for (rep_a, rep_b) in combinations(data.columns, 2): # https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.pearsonr.html # Strictly speaking, Pearson’s correlation requires that each dataset be normally distributed. - if normality[rep_a] < 0.05: - if log_normality[rep_a]: - warn_a = "(*)" - else: + if np.isnan(normality[rep_a]) or normality[rep_a] < 0.05: + if np.isnan(log_normality[rep_a]) or log_normality[rep_a] < 0.05: warn_a = "(**)" + else: + warn_a = "(*)" else: warn_a = "" - if normality[rep_b] < 0.05: - if log_normality[rep_b]: - warn_b = "(*)" - else: + if np.isnan(normality[rep_b]) or normality[rep_b] < 0.05: + if np.isnan(log_normality[rep_b]) or log_normality[rep_b] < 0.05: warn_b = "(**)" + else: + warn_b = "(*)" else: warn_b = "" - (r_val, p_val) = pearsonr(data[rep_a], data[rep_b]) - corr_file.write(f"{rep_a}{warn_a}_vs_{rep_b}{warn_b}\t{r_val} ({p_val})\n") + try: + (r_val, p_val) = pearsonr(data[rep_a], data[rep_b]) + corr_file.write(f"{rep_a}{warn_a}_vs_{rep_b}{warn_b}\t{r_val} ({p_val})\n") + except ValueError as err: + if str(err) == "x and y must have length at least 2.": + corr_file.write(f"Only {len(data[[rep_a, rep_b]])} points in data.\n") + else: + raise filtered = log_data[[rep_a, rep_b]].dropna() try: (log_r_val, log_p_val) = pearsonr(filtered[rep_a], filtered[rep_b]) -- GitLab