Skip to content
Snippets Groups Projects
Commit 3c12cc82 authored by Hanna  JULIENNE's avatar Hanna JULIENNE
Browse files

Fix pattern indexation

parent 62f9844f
No related branches found
No related tags found
No related merge requests found
......@@ -74,6 +74,10 @@ def create_worktable_file(phenotype_ids: List[str], init_file_path: str, project
size_chunk = 50
Nchunk = len(regions) // size_chunk + 1
Nsnp_total = 0
Nsnp_jassed = 0
for chunk in range(Nchunk):
binf = chunk * size_chunk
bsup = (chunk+1) * size_chunk
......@@ -81,7 +85,7 @@ def create_worktable_file(phenotype_ids: List[str], init_file_path: str, project
'Region', 'CHR', 'position', 'snp_ids', 'MiddlePosition'] + phenotype_ids,
where='Region >= {0} and Region < {1}'.format(binf, bsup))
sum_stat_jost_tab.dropna(axis=0, subset=phenotype_ids, how=how_dropna, inplace=True)
sum_stat_jost_tab.reset_index(drop=True, inplace=True)
if sum_stat_jost_tab.shape[0]==0:
continue # skip region if no data are available
......@@ -89,27 +93,39 @@ def create_worktable_file(phenotype_ids: List[str], init_file_path: str, project
if remove_nan:
sum_stat_jost_tab['PVALJOST'] = stat_compute(sum_stat_jost_tab[phenotype_ids])#.apply(stat_compute, axis=1)
Nsnp_total = Nsnp_total + sum_stat_jost_tab.shape[0]
Nsnp_jassed = Nsnp_jassed + sum_stat_jost_tab.shape[0]
else:
# Sort SumStatTab by missing patterns
patterns_missing = Series(np.dot((1- sum_stat_jost_tab[phenotype_ids].isnull()), 10**np.arange((N_pheno-1), -1, -1)))
pattern_frequency = patterns_missing.value_counts() / len(patterns_missing)
print("Frequency of missing patterns :")
print(pattern_frequency)
frequent_pattern = pattern_frequency.index[pattern_frequency > 0.05].tolist()
frequent_pattern = pattern_frequency.index[pattern_frequency > 0.01].tolist()
# index on missing patterns:
sum_stat_jost_tab.index = Index(patterns_missing)
# Keep_only frequent_pattern
sum_stat_jost_tab = sum_stat_jost_tab.loc[frequent_pattern]
# Apply the statistic computation by missing patterns
for pattern in frequent_pattern:
print('>>>>')
print(sum_stat_jost_tab.loc[pattern, phenotype_ids].shape)
print(stat_compute(sum_stat_jost_tab.loc[pattern, phenotype_ids]).shape)
print('<<<<')
sum_stat_jost_tab.loc[pattern, "PVALJOST"] = np.array(range(6))
#stat_compute(sum_stat_jost_tab.loc[pattern, phenotype_ids])
# print('>>>>')
# print(pattern)
# print(bool_serie)
# print(sum_stat_jost_tab.loc[bool_serie, phenotype_ids])
# print(stat_compute(sum_stat_jost_tab.loc[bool_serie, phenotype_ids]))
# print(sum_stat_jost_tab.loc[bool_serie])
# print('<<<<')
bool_serie = (patterns_missing == pattern)
sum_stat_jost_tab.loc[bool_serie, "PVALJOST"] = stat_compute(sum_stat_jost_tab.loc[bool_serie, phenotype_ids])
Nsnp_total = Nsnp_total + sum_stat_jost_tab.shape[0]
sum_stat_jost_tab.index = Index(patterns_missing)
#Keep_only frequent_pattern
sum_stat_jost_tab = sum_stat_jost_tab.loc[frequent_pattern]
Nsnp_jassed = Nsnp_jassed + sum_stat_jost_tab.shape[0]
# drop pattern index :
sum_stat_jost_tab.reset_index(drop=True, inplace=True)
sum_stat_jost_tab.sort_values(by=["Region", "CHR"], inplace=True)
......@@ -138,6 +154,8 @@ def create_worktable_file(phenotype_ids: List[str], init_file_path: str, project
hdf_work.append('RegionSubTable', region_sub_table, min_itemsize=region_sub_table_min_itemsizes)
hdf_work.close()
print("{1} SNPs treated on {0} SNPs".format(Nsnp_jassed, Nsnp_total))
RegionSubTable = read_hdf(project_hdf_path, 'RegionSubTable')
thresh = 1e-8
pval_min = RegionSubTable['PVALmin']
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment