diff --git a/RNA_Seq_Cecere/RNA-seq.snakefile b/RNA_Seq_Cecere/RNA-seq.snakefile index c8cf2189efe509efbcaca40f6073a14e8ad4fd6f..25342af7721a6d2915279193c7102f140dee1bdb 100644 --- a/RNA_Seq_Cecere/RNA-seq.snakefile +++ b/RNA_Seq_Cecere/RNA-seq.snakefile @@ -1243,11 +1243,17 @@ rule compare_replicates: log_data = np.log10(data) log_data[log_data == -np.inf] = np.nan with open(output.corr_file, "w") as corr_file: - # TODO: Add normality test and/or log-transform the data? + # Add normality test and/or log-transform the data normality = {} log_normality = {} for rep in data.columns: - _, normality[rep] = shapiro(data[rep]) + try: + _, normality[rep] = shapiro(data[rep]) + except ValueError as err: + if str(err) == "Data must be at least length 3.": + normality[rep] = np.NAN + else: + raise try: _, log_normality[rep] = shapiro(log_data[rep].dropna()) except ValueError as err: @@ -1259,22 +1265,28 @@ rule compare_replicates: for (rep_a, rep_b) in combinations(data.columns, 2): # https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.pearsonr.html # Strictly speaking, Pearson’s correlation requires that each dataset be normally distributed. - if normality[rep_a] < 0.05: - if log_normality[rep_a]: - warn_a = "(*)" - else: + if np.isnan(normality[rep_a]) or normality[rep_a] < 0.05: + if np.isnan(log_normality[rep_a]) or log_normality[rep_a] < 0.05: warn_a = "(**)" + else: + warn_a = "(*)" else: warn_a = "" - if normality[rep_b] < 0.05: - if log_normality[rep_b]: - warn_b = "(*)" - else: + if np.isnan(normality[rep_b]) or normality[rep_b] < 0.05: + if np.isnan(log_normality[rep_b]) or log_normality[rep_b] < 0.05: warn_b = "(**)" + else: + warn_b = "(*)" else: warn_b = "" - (r_val, p_val) = pearsonr(data[rep_a], data[rep_b]) - corr_file.write(f"{rep_a}{warn_a}_vs_{rep_b}{warn_b}\t{r_val} ({p_val})\n") + try: + (r_val, p_val) = pearsonr(data[rep_a], data[rep_b]) + corr_file.write(f"{rep_a}{warn_a}_vs_{rep_b}{warn_b}\t{r_val} ({p_val})\n") + except ValueError as err: + if str(err) == "x and y must have length at least 2.": + corr_file.write(f"Only {len(data[[rep_a, rep_b]])} points in data.\n") + else: + raise filtered = log_data[[rep_a, rep_b]].dropna() try: (log_r_val, log_p_val) = pearsonr(filtered[rep_a], filtered[rep_b])