Skip to content
Snippets Groups Projects
Commit 2c3f1fa8 authored by Hanna  JULIENNE's avatar Hanna JULIENNE
Browse files

NA with pattern: Fix indexation in stats.py

parent 3c12cc82
No related branches found
No related tags found
No related merge requests found
......@@ -44,14 +44,19 @@ def make_stat_computer_nan(cov, stat_func):
invcov_bypattern = {}
def compute(z):
pattern_code = z.index[0]
z_na_bool = (1-z.iloc[0,].isnull())
pattern_code = np.dot(
z_na_bool, 10**np.arange((len(z_na_bool) - 1), -1, -1))
z_na_bool = z_na_bool.astype(bool)
if pattern_code in invcov_bypattern:
invcov = invcov_bypattern[pattern_code]
else:
z_na_bool = ~z.iloc[0,].isnull()
mini_cov = cov.loc[z_na_bool, z_na_bool]
invcov = np.linalg.inv(mini_cov)
invcov_bypattern[pattern_code] = invcov
z = z.loc[:,z_na_bool]
return stat_func(z, None, invcov)
return compute
......@@ -89,11 +94,16 @@ def omnibus_stat(z, cov, invcov):
:return: the joint statistics
:rtype: float
"""
p = z.shape[1]
try:
p = z.shape[1]
#stat = np.sum(np.multiply(z, z.dot(invcov)))
stat = np.einsum('ij,jk,ki->i', z, invcov, z.T)
stat = np.einsum('ij,jk,ki->i', z, invcov, z.T)
return spst.chi2.sf(stat, df=p)
except ValueError:
print(z.head())
print(invcov.shape)
print('Error in omnibus stat')
return spst.chi2.sf(stat, df=p)
omnibus_stat.can_use_pattern = True
......@@ -113,8 +123,10 @@ def sumz_stat(z, cov, invcov):
"""
p = z.shape[0]
loading = np.ones(p)
M_loadings = np.full(z.shape,loading)
M_loadings[np.isnan(z)] = 0
z= np.nan_to_num(z)
numi = np.square(loading.dot(z.transpose()))
deno = np.einsum('ij,jk,ki->i', M_loadings, cov, M_loadings.T)
......
......@@ -84,8 +84,11 @@ def create_worktable_file(phenotype_ids: List[str], init_file_path: str, project
sum_stat_jost_tab = read_hdf(init_file_path, 'SumStatTab', columns=[
'Region', 'CHR', 'position', 'snp_ids', 'MiddlePosition'] + phenotype_ids,
where='Region >= {0} and Region < {1}'.format(binf, bsup))
# Remake row index unique: IMPORTANT for assignation with .loc at line 98
sum_stat_jost_tab.dropna(axis=0, subset=phenotype_ids, how=how_dropna, inplace=True)
sum_stat_jost_tab.reset_index(drop=True, inplace=True)
if sum_stat_jost_tab.shape[0]==0:
continue # skip region if no data are available
......@@ -100,23 +103,25 @@ def create_worktable_file(phenotype_ids: List[str], init_file_path: str, project
patterns_missing = Series(np.dot((1- sum_stat_jost_tab[phenotype_ids].isnull()), 10**np.arange((N_pheno-1), -1, -1)))
pattern_frequency = patterns_missing.value_counts() / len(patterns_missing)
frequent_pattern = pattern_frequency.index[pattern_frequency > 0.01].tolist()
frequent_pattern = pattern_frequency.index[:10].tolist()
# index on missing patterns:
# Apply the statistic computation by missing patterns
for pattern in frequent_pattern:
# print('>>>>')
# print(pattern)
# print(bool_serie)
# print(sum_stat_jost_tab.loc[bool_serie, phenotype_ids])
# print(stat_compute(sum_stat_jost_tab.loc[bool_serie, phenotype_ids]))
# print(sum_stat_jost_tab.loc[bool_serie])
# print('<<<<')
bool_serie = (patterns_missing == pattern)
sum_stat_jost_tab.loc[bool_serie, "PVALJOST"] = stat_compute(sum_stat_jost_tab.loc[bool_serie, phenotype_ids])
try:
sum_stat_jost_tab.loc[bool_serie, "PVALJOST"] = stat_compute(sum_stat_jost_tab.loc[bool_serie, phenotype_ids])
except ValueError:
#print(pattern)
# print(bool_serie.iloc[:])
print("worktable")
#print(sum_stat_jost_tab.loc[bool_serie, phenotype_ids].head())
Nsnp_total = Nsnp_total + sum_stat_jost_tab.shape[0]
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment