Commit 49776bd1 authored by Blaise Li's avatar Blaise Li
Browse files

Catch errors when correlating small data.

This could be happening for a transgene, alone in its category
(impossible to do statistics on one gene only).
parent 0160ee2f
......@@ -1243,11 +1243,17 @@ rule compare_replicates:
log_data = np.log10(data)
log_data[log_data == -np.inf] = np.nan
with open(output.corr_file, "w") as corr_file:
# TODO: Add normality test and/or log-transform the data?
# Add normality test and/or log-transform the data
normality = {}
log_normality = {}
for rep in data.columns:
_, normality[rep] = shapiro(data[rep])
try:
_, normality[rep] = shapiro(data[rep])
except ValueError as err:
if str(err) == "Data must be at least length 3.":
normality[rep] = np.NAN
else:
raise
try:
_, log_normality[rep] = shapiro(log_data[rep].dropna())
except ValueError as err:
......@@ -1259,22 +1265,28 @@ rule compare_replicates:
for (rep_a, rep_b) in combinations(data.columns, 2):
# https://docs.scipy.org/doc/scipy-0.14.0/reference/generated/scipy.stats.pearsonr.html
# Strictly speaking, Pearson’s correlation requires that each dataset be normally distributed.
if normality[rep_a] < 0.05:
if log_normality[rep_a]:
warn_a = "(*)"
else:
if np.isnan(normality[rep_a]) or normality[rep_a] < 0.05:
if np.isnan(log_normality[rep_a]) or log_normality[rep_a] < 0.05:
warn_a = "(**)"
else:
warn_a = "(*)"
else:
warn_a = ""
if normality[rep_b] < 0.05:
if log_normality[rep_b]:
warn_b = "(*)"
else:
if np.isnan(normality[rep_b]) or normality[rep_b] < 0.05:
if np.isnan(log_normality[rep_b]) or log_normality[rep_b] < 0.05:
warn_b = "(**)"
else:
warn_b = "(*)"
else:
warn_b = ""
(r_val, p_val) = pearsonr(data[rep_a], data[rep_b])
corr_file.write(f"{rep_a}{warn_a}_vs_{rep_b}{warn_b}\t{r_val} ({p_val})\n")
try:
(r_val, p_val) = pearsonr(data[rep_a], data[rep_b])
corr_file.write(f"{rep_a}{warn_a}_vs_{rep_b}{warn_b}\t{r_val} ({p_val})\n")
except ValueError as err:
if str(err) == "x and y must have length at least 2.":
corr_file.write(f"Only {len(data[[rep_a, rep_b]])} points in data.\n")
else:
raise
filtered = log_data[[rep_a, rep_b]].dropna()
try:
(log_r_val, log_p_val) = pearsonr(filtered[rep_a], filtered[rep_b])
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment