Commit 2ec760e8 authored by Hanna JULIENNE

add filter on SNP variance

parent b11049c0
......@@ -20,7 +20,6 @@ def parse_region_position(ld_file):
(chrom, startpos, endpos ) = ld_file.split("/")[-1].split(".")[0].split('_')
return (chrom, startpos, endpos)
def realigned_zfiles_on_panel(ref_panel, zscore):
"""
Check if the counted allele is the same in the reference panel and
......@@ -52,44 +51,6 @@ def prepare_zscore_for_imputation(ref_panel, zscore):
def in_region(pos_vector, start, end):
    """
    Tell which positions lie strictly between start and end.

    Both bounds are exclusive. The two comparisons are combined with `&`,
    so when pos_vector supports element-wise comparison (it is called with
    a `.pos` column elsewhere in this file) the result is a boolean mask
    with one entry per position.

    Arguments:
        pos_vector: position(s) to test
        start: lower bound, excluded
        end: upper bound, excluded

    Returns:
        the result of (start < pos_vector) & (pos_vector < end)
    """
    return ((start < pos_vector) & (pos_vector < end))
def ld_region_centered_window_imputation(ld_file, ref_panel, zscore, window_size, unknowns=None):
    """
    Impute each missing SNP from the known SNPs found in a window centered
    on the SNP to impute.

    Arguments:
        ld_file: path of the LD file; parse_region_position extracts the
            LD block boundaries from it and generate_sparse_matrix builds
            the LD matrix from it
        ref_panel: reference panel indexed by SNP id; its 'pos', 'Ref_all'
            and 'alt_all' entries are read for every SNP to impute
        zscore: table of known z-scores, first passed through
            prepare_zscore_for_imputation
        window_size: half-width of the window, expressed like 'pos'
        unknowns: ids of the SNPs to impute; when None or empty, every SNP
            of the LD matrix that is absent from zscore is imputed

    Returns:
        the z-score table completed with the imputed SNPs, sorted by 'pos'
    """
    (chrom, start_ld_block, end_ld_block) = parse_region_position(ld_file)
    # The block boundaries are converted once instead of at every iteration.
    block_start = float(start_ld_block)
    block_end = float(end_ld_block)

    LD_mat = generate_sparse_matrix(ld_file, ref_panel)
    zscore = prepare_zscore_for_imputation(ref_panel, zscore)

    # Find Snp to impute
    # None replaces the former pd.Series([]) default, which was built once at
    # definition time; an explicitly passed empty container still works.
    if unknowns is None or len(unknowns) == 0:
        unknowns = LD_mat.index.difference(zscore.index)

    N_snp = len(unknowns)
    print("### Imputation of {0} snps ###".format(N_snp))

    for i, snp_unknown in enumerate(unknowns):
        snp_pos = ref_panel.loc[snp_unknown, 'pos']

        # Boundary of the centered window, clipped to the LD block
        start_pos = max(snp_pos - window_size, block_start)
        end_pos = min(snp_pos + window_size, block_end)

        # NOTE(review): zscore gains a row for every imputed SNP below, so
        # SNPs imputed earlier in this loop can be selected as "known" here.
        # Confirm this is intended.
        in_LD_reg_n_window = in_region(zscore.pos, start_pos, end_pos)
        known = zscore.loc[in_LD_reg_n_window].index

        if len(known) > 0:
            # The LD matrix is only sliced when there is something to impute from.
            sig_t = LD_mat.loc[known, known]
            sig_i_t = LD_mat.loc[snp_unknown, known]
            zt = zscore.loc[known, 'Z']
            imp = impg_model(zt, sig_t, sig_i_t, batch=False)
            zscore.loc[snp_unknown] = [
                snp_pos,
                ref_panel.loc[snp_unknown, "Ref_all"],
                ref_panel.loc[snp_unknown, "alt_all"],
                imp['mu'],
                imp['var'],
                len(known),
            ]

        if i % 300 == 0:
            # Report a real percentage; the former "\%" printed a backslash
            # next to a 0-1 fraction.
            print("{0}%".format(np.round(100 * i / N_snp, 2)))

    return zscore.sort_values(by="pos")
def compute_window_and_size(start_ld_block, end_ld_block, window_size):
"""
......@@ -131,8 +92,6 @@ def print_progression(i, Nwindows):
if i%(np.ceil(Nwindows/10)) == 0:
print("{0}\%".format(np.round(i/Nwindows,3)))
def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, lamb, rcond, unknowns=pd.Series([])):
"""
Each missing Snp is imputed by known snp found in a window centered on the SNP to impute
......@@ -178,8 +137,50 @@ def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, lamb,
end_core_window = int(start_ld_block) + (i+1)*window_resize
in_core_window = in_region(batch_df.pos, start_core_window, end_core_window)
zscore_results = pd.concat([zscore_results, batch_df.loc[in_core_window, zscore_results.columns]])
# keep only SNP with non negligible explained variance
snp_well_predicted = batch_df.Var < 0.5
batch_df_filt = batch_df_filt.loc[in_core_window & snp_well_predicted, zscore_results.columns]
zscore_results = pd.concat([zscore_results, batch_df_filt])
i = i+1
print_progression(i, Nwindows)
return zscore_results.sort_values(by="pos")
def ld_region_centered_window_imputation(ld_file, ref_panel, zscore, window_size, unknowns=None):
    """
    Impute each missing SNP from the known SNPs found in a window centered
    on the SNP to impute.

    Arguments:
        ld_file: path of the LD file; parse_region_position extracts the
            LD block boundaries from it and generate_sparse_matrix builds
            the LD matrix from it
        ref_panel: reference panel indexed by SNP id; its 'pos', 'Ref_all'
            and 'alt_all' entries are read for every SNP to impute
        zscore: table of known z-scores, first passed through
            prepare_zscore_for_imputation
        window_size: half-width of the window, expressed like 'pos'
        unknowns: ids of the SNPs to impute; when None or empty, every SNP
            of the LD matrix that is absent from zscore is imputed

    Returns:
        the z-score table completed with the imputed SNPs, sorted by 'pos'
    """
    (chrom, start_ld_block, end_ld_block) = parse_region_position(ld_file)
    # The block boundaries are converted once instead of at every iteration.
    block_start = float(start_ld_block)
    block_end = float(end_ld_block)

    LD_mat = generate_sparse_matrix(ld_file, ref_panel)
    zscore = prepare_zscore_for_imputation(ref_panel, zscore)

    # Find Snp to impute
    # None replaces the former pd.Series([]) default, which was built once at
    # definition time; an explicitly passed empty container still works.
    if unknowns is None or len(unknowns) == 0:
        unknowns = LD_mat.index.difference(zscore.index)

    N_snp = len(unknowns)
    print("### Imputation of {0} snps ###".format(N_snp))

    for i, snp_unknown in enumerate(unknowns):
        snp_pos = ref_panel.loc[snp_unknown, 'pos']

        # Boundary of the centered window, clipped to the LD block
        start_pos = max(snp_pos - window_size, block_start)
        end_pos = min(snp_pos + window_size, block_end)

        # NOTE(review): zscore gains a row for every imputed SNP below, so
        # SNPs imputed earlier in this loop can be selected as "known" here.
        # Confirm this is intended.
        in_LD_reg_n_window = in_region(zscore.pos, start_pos, end_pos)
        known = zscore.loc[in_LD_reg_n_window].index

        if len(known) > 0:
            # The LD matrix is only sliced when there is something to impute from.
            sig_t = LD_mat.loc[known, known]
            sig_i_t = LD_mat.loc[snp_unknown, known]
            zt = zscore.loc[known, 'Z']
            imp = impg_model(zt, sig_t, sig_i_t, batch=False)
            zscore.loc[snp_unknown] = [
                snp_pos,
                ref_panel.loc[snp_unknown, "Ref_all"],
                ref_panel.loc[snp_unknown, "alt_all"],
                imp['mu'],
                imp['var'],
                len(known),
            ]

        if i % 300 == 0:
            # Report a real percentage; the former "\%" printed a backslash
            # next to a 0-1 fraction.
            print("{0}%".format(np.round(100 * i / N_snp, 2)))

    return zscore.sort_values(by="pos")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment