diff --git a/jass/models/worktable.py b/jass/models/worktable.py index a4a6c52358e53f6ef797f97c3f0dc735a21c5f17..c3dbb68a991e6b137c722f97b26f1e1cb9aef563 100755 --- a/jass/models/worktable.py +++ b/jass/models/worktable.py @@ -74,6 +74,10 @@ def create_worktable_file(phenotype_ids: List[str], init_file_path: str, project size_chunk = 50 Nchunk = len(regions) // size_chunk + 1 + + Nsnp_total = 0 + Nsnp_jassed = 0 + for chunk in range(Nchunk): binf = chunk * size_chunk bsup = (chunk+1) * size_chunk @@ -81,7 +85,7 @@ def create_worktable_file(phenotype_ids: List[str], init_file_path: str, project 'Region', 'CHR', 'position', 'snp_ids', 'MiddlePosition'] + phenotype_ids, where='Region >= {0} and Region < {1}'.format(binf, bsup)) sum_stat_jost_tab.dropna(axis=0, subset=phenotype_ids, how=how_dropna, inplace=True) - + sum_stat_jost_tab.reset_index(drop=True, inplace=True) if sum_stat_jost_tab.shape[0]==0: continue # skip region if no data are available @@ -89,27 +93,39 @@ def create_worktable_file(phenotype_ids: List[str], init_file_path: str, project if remove_nan: sum_stat_jost_tab['PVALJOST'] = stat_compute(sum_stat_jost_tab[phenotype_ids])#.apply(stat_compute, axis=1) + Nsnp_total = Nsnp_total + sum_stat_jost_tab.shape[0] + Nsnp_jassed = Nsnp_jassed + sum_stat_jost_tab.shape[0] else: # Sort SumStatTab by missing patterns patterns_missing = Series(np.dot((1- sum_stat_jost_tab[phenotype_ids].isnull()), 10**np.arange((N_pheno-1), -1, -1))) pattern_frequency = patterns_missing.value_counts() / len(patterns_missing) - print("Frequency of missing patterns :") - print(pattern_frequency) - frequent_pattern = pattern_frequency.index[pattern_frequency > 0.05].tolist() + frequent_pattern = pattern_frequency.index[pattern_frequency > 0.01].tolist() # index on missing patterns: - sum_stat_jost_tab.index = Index(patterns_missing) - # Keep_only frequent_pattern - sum_stat_jost_tab = sum_stat_jost_tab.loc[frequent_pattern] + + # Apply the statistic computation by missing patterns + for pattern in frequent_pattern: - print('>>>>') - print(sum_stat_jost_tab.loc[pattern, phenotype_ids].shape) - print(stat_compute(sum_stat_jost_tab.loc[pattern, phenotype_ids]).shape) - print('<<<<') - sum_stat_jost_tab.loc[pattern, "PVALJOST"] = np.array(range(6)) - #stat_compute(sum_stat_jost_tab.loc[pattern, phenotype_ids]) + # print('>>>>') + # print(pattern) + + # print(bool_serie) + # print(sum_stat_jost_tab.loc[bool_serie, phenotype_ids]) + # print(stat_compute(sum_stat_jost_tab.loc[bool_serie, phenotype_ids])) + # print(sum_stat_jost_tab.loc[bool_serie]) + # print('<<<<') + bool_serie = (patterns_missing == pattern) + sum_stat_jost_tab.loc[bool_serie, "PVALJOST"] = stat_compute(sum_stat_jost_tab.loc[bool_serie, phenotype_ids]) + + Nsnp_total = Nsnp_total + sum_stat_jost_tab.shape[0] + sum_stat_jost_tab.index = Index(patterns_missing) + #Keep_only frequent_pattern + sum_stat_jost_tab = sum_stat_jost_tab.loc[frequent_pattern] + Nsnp_jassed = Nsnp_jassed + sum_stat_jost_tab.shape[0] + # drop pattern index : + sum_stat_jost_tab.reset_index(drop=True, inplace=True) sum_stat_jost_tab.sort_values(by=["Region", "CHR"], inplace=True) @@ -138,6 +154,8 @@ def create_worktable_file(phenotype_ids: List[str], init_file_path: str, project hdf_work.append('RegionSubTable', region_sub_table, min_itemsize=region_sub_table_min_itemsizes) hdf_work.close() + print("{1} SNPs treated on {0} SNPs".format(Nsnp_jassed, Nsnp_total)) + RegionSubTable = read_hdf(project_hdf_path, 'RegionSubTable') thresh = 1e-8 pval_min = RegionSubTable['PVALmin']