diff --git a/jass/models/stats.py b/jass/models/stats.py index f069a6e6516724233c322469930a757340648f06..2429285c4db66ceb5ca753d5866490a37ed9e4f4 100755 --- a/jass/models/stats.py +++ b/jass/models/stats.py @@ -44,14 +44,19 @@ def make_stat_computer_nan(cov, stat_func): invcov_bypattern = {} def compute(z): - pattern_code = z.index[0] + z_na_bool = (1-z.iloc[0,].isnull()) + pattern_code = np.dot( + z_na_bool, 10**np.arange((len(z_na_bool) - 1), -1, -1)) + z_na_bool = z_na_bool.astype(bool) if pattern_code in invcov_bypattern: invcov = invcov_bypattern[pattern_code] else: - z_na_bool = ~z.iloc[0,].isnull() mini_cov = cov.loc[z_na_bool, z_na_bool] invcov = np.linalg.inv(mini_cov) invcov_bypattern[pattern_code] = invcov + z = z.loc[:,z_na_bool] + + return stat_func(z, None, invcov) return compute @@ -89,11 +94,16 @@ def omnibus_stat(z, cov, invcov): :return: the joint statistics :rtype: float """ - p = z.shape[1] + try: + p = z.shape[1] #stat = np.sum(np.multiply(z, z.dot(invcov))) - stat = np.einsum('ij,jk,ki->i', z, invcov, z.T) + stat = np.einsum('ij,jk,ki->i', z, invcov, z.T) + return spst.chi2.sf(stat, df=p) + except ValueError: + print(z.head()) + print(invcov.shape) + print('Error in omnibus stat') - return spst.chi2.sf(stat, df=p) omnibus_stat.can_use_pattern = True @@ -113,8 +123,10 @@ def sumz_stat(z, cov, invcov): """ p = z.shape[0] loading = np.ones(p) + M_loadings = np.full(z.shape,loading) M_loadings[np.isnan(z)] = 0 + z= np.nan_to_num(z) numi = np.square(loading.dot(z.transpose())) deno = np.einsum('ij,jk,ki->i', M_loadings, cov, M_loadings.T) diff --git a/jass/models/worktable.py b/jass/models/worktable.py index c3dbb68a991e6b137c722f97b26f1e1cb9aef563..bd487d7a08d95a033841a64de36c7768fc87cbff 100755 --- a/jass/models/worktable.py +++ b/jass/models/worktable.py @@ -84,8 +84,11 @@ def create_worktable_file(phenotype_ids: List[str], init_file_path: str, project sum_stat_jost_tab = read_hdf(init_file_path, 'SumStatTab', columns=[ 'Region', 'CHR', 'position', 'snp_ids', 'MiddlePosition'] + phenotype_ids, where='Region >= {0} and Region < {1}'.format(binf, bsup)) + + # Remake row index unique: IMPORTANT for assignation with .loc at line 98 sum_stat_jost_tab.dropna(axis=0, subset=phenotype_ids, how=how_dropna, inplace=True) sum_stat_jost_tab.reset_index(drop=True, inplace=True) + if sum_stat_jost_tab.shape[0]==0: continue # skip region if no data are available @@ -100,23 +103,25 @@ def create_worktable_file(phenotype_ids: List[str], init_file_path: str, project patterns_missing = Series(np.dot((1- sum_stat_jost_tab[phenotype_ids].isnull()), 10**np.arange((N_pheno-1), -1, -1))) pattern_frequency = patterns_missing.value_counts() / len(patterns_missing) - frequent_pattern = pattern_frequency.index[pattern_frequency > 0.01].tolist() + frequent_pattern = pattern_frequency.index[:10].tolist() # index on missing patterns: - - # Apply the statistic computation by missing patterns for pattern in frequent_pattern: # print('>>>>') - # print(pattern) - # print(bool_serie) - # print(sum_stat_jost_tab.loc[bool_serie, phenotype_ids]) # print(stat_compute(sum_stat_jost_tab.loc[bool_serie, phenotype_ids])) # print(sum_stat_jost_tab.loc[bool_serie]) # print('<<<<') bool_serie = (patterns_missing == pattern) - sum_stat_jost_tab.loc[bool_serie, "PVALJOST"] = stat_compute(sum_stat_jost_tab.loc[bool_serie, phenotype_ids]) + + try: + sum_stat_jost_tab.loc[bool_serie, "PVALJOST"] = stat_compute(sum_stat_jost_tab.loc[bool_serie, phenotype_ids]) + except ValueError: + #print(pattern) +# print(bool_serie.iloc[:]) + print("worktable") + #print(sum_stat_jost_tab.loc[bool_serie, phenotype_ids].head()) Nsnp_total = Nsnp_total + sum_stat_jost_tab.shape[0]