Commit bcbe25a2 authored by Hanna  JULIENNE's avatar Hanna JULIENNE

First implementation of batch imputation

parent d8d5e567
......@@ -20,7 +20,7 @@ class imputation_launcher:
#ref_panel = pd.read_csv(ref_panel_file, sep="\t", names=['chr', "nothing", 'pos', 'Ref_all', 'alt_all'], index_col = 1)
pattern = "{0}/{1}*.ld".format(ld_folder, chrom)
for LD_file in glob.glob(pattern):
for LD_file in glob.glob(pattern)[:2]:
print("processing Region: {0}".format(LD_file))
Zscores = Ld_region_centered_window_imputation(LD_file, ref_panel, Zscores, self.window_size)
......
......@@ -3,8 +3,8 @@ implement the imputation window is sliding along the genome:
- ImpG like: non-overlapping windows; the imputation is applied in batch to the unknown SNPs in the window
- centered_window: A sliding window centered on the Snp to impute
"""
from .stat_models import ImpG_model_batch, ImpG_model_snp
from .ld_matrix import generate_sparse_matrix
......@@ -61,7 +61,6 @@ def Ld_region_centered_window_imputation(LD_file, ref_panel, Zscores, window_siz
"""
(chrom, start_ld_block, end_ld_block) = parse_region_position(LD_file)
LD_mat = generate_sparse_matrix(LD_file, ref_panel)
#Zscores = pd.read_csv(Zfile, index_col=0, sep="\t")
......@@ -91,14 +90,77 @@ def Ld_region_centered_window_imputation(LD_file, ref_panel, Zscores, window_siz
if(len(known) > 0):
imp = ImpG_model_snp(Zt, Sig_t, Sig_i_t)
Zscores.loc[snp_unknown, "pos"] = ref_panel.loc[snp_unknown, 'pos']
Zscores.loc[snp_unknown, "A0"] = ref_panel.loc[snp_unknown, "Ref_all"]
Zscores.loc[snp_unknown, "A1"] = ref_panel.loc[snp_unknown, "alt_all"]
Zscores.loc[snp_unknown, "Z"] = imp['mu']
Zscores.loc[snp_unknown, "Var"] = imp['Var']
Zscores.loc[snp_unknown, 'Nsnp_to_impute'] = len(known)
Zscores.loc[snp_unknown] = [ref_panel.loc[snp_unknown, 'pos'], ref_panel.loc[snp_unknown, "Ref_all"], ref_panel.loc[snp_unknown, "alt_all"], imp['mu'], imp['Var'], len(known)]
# Zscores.loc[snp_unknown, "pos"] = ref_panel.loc[snp_unknown, 'pos']
# Zscores.loc[snp_unknown, "A0"] = ref_panel.loc[snp_unknown, "Ref_all"]
# Zscores.loc[snp_unknown, "A1"] = ref_panel.loc[snp_unknown, "alt_all"]
# Zscores.loc[snp_unknown, "Z"] = imp['mu']
# Zscores.loc[snp_unknown, "Var"] = imp['Var']
# Zscores.loc[snp_unknown, 'Nsnp_to_impute'] = len(known)
i = i+1
if i%100 == 0:
if i%300 == 0:
print("{0}\%".format(np.round(i/N_snp,4)))
print(Zscores.head(10))
return Zscores.sort_values(by="pos")
def ImpG_like_imputation(LD_file, ref_panel, Zscores, window_size, buffer, unknowns=None):
    """
    Impute the missing SNPs of one LD block in batches, window by window.

    The LD block described by ``LD_file`` is cut into consecutive windows of
    roughly ``window_size`` positions. Each window is extended by ``buffer``
    on both sides (clipped to the block boundaries); every reference-panel SNP
    of the extended window that is absent from ``Zscores`` is imputed in a
    single call to ``ImpG_model_batch`` from the typed SNPs found in the same
    extended window.

    Arguments
        LD_file: path of the LD file of the block; the block coordinates are
            extracted from it by ``parse_region_position``.
        ref_panel: reference panel DataFrame indexed by SNP id; the columns
            'pos', 'Ref_all' and 'alt_all' are read here.
        Zscores: DataFrame of the typed SNPs indexed by SNP id; the columns
            'pos' and 'Z' are read here after preparation.
        window_size: target width of a window, in the same unit as 'pos'.
        buffer: extension added on each side of a window.
        unknowns: kept for backward compatibility only. The SNPs to impute
            are always derived from ``ref_panel`` and ``Zscores``.

    Return
        The prepared ``Zscores`` completed with the imputed SNPs, sorted by
        position.
    """
    (chrom, start_ld_block, end_ld_block) = parse_region_position(LD_file)
    LD_mat = generate_sparse_matrix(LD_file, ref_panel)

    block_start = int(start_ld_block)
    block_end = int(end_ld_block)
    # A block shorter than window_size would give zero windows and then a
    # division by zero below: always use at least one window.
    Nwindows = max(1, (block_end - block_start) // window_size)
    # adapt window size to cover the LD block
    window_resize = np.ceil((block_end - block_start) / Nwindows)

    # SNPs of the panel without a Z-score: computed before the preparation
    # step, exactly as the set of SNPs that have to be imputed.
    all_unknowns = ref_panel.loc[ref_panel.index.difference(Zscores.index)]
    Zscores = prepare_Zscore_for_imputation(ref_panel, Zscores)
    print("### Imputation of {0} snps ###".format(all_unknowns.shape[0]))

    # Imputed batches are collected apart and appended once at the end, so
    # that values imputed in a previous window are never picked up as
    # "known" SNPs by the overlapping buffer of the next window.
    # NOTE(review): an unknown SNP lying in the overlap of two buffered
    # windows is still imputed by both of them and appears twice in the
    # result -- confirm whether only the core window should be kept.
    imputed_batches = []
    for i in range(Nwindows):
        # Boundaries of the buffered window, clipped to the LD block
        start_windows = block_start + i * window_resize - buffer
        end_windows = block_start + (i + 1) * window_resize + buffer
        start_pos = max(start_windows, float(start_ld_block))
        end_pos = min(end_windows, float(end_ld_block))

        in_LD_reg_n_window = in_region(Zscores.pos, start_pos, end_pos)
        unknown_in_LD_reg_n_window = in_region(all_unknowns.pos, start_pos, end_pos)
        known = Zscores.loc[in_LD_reg_n_window].index
        to_impute = all_unknowns.loc[unknown_in_LD_reg_n_window].index

        # Nothing to learn from, or nothing to impute: skip the window
        if len(known) > 0 and len(to_impute) > 0:
            Sig_t = LD_mat.loc[known, known]
            Sig_i_t = LD_mat.loc[to_impute, known]
            Zt = Zscores.loc[known, 'Z']

            imp = ImpG_model_batch(Zt, Sig_t, Sig_i_t)
            imputed_batches.append(pd.DataFrame({
                'pos': ref_panel.loc[to_impute, 'pos'],
                'A0': ref_panel.loc[to_impute, "Ref_all"],
                "A1": ref_panel.loc[to_impute, "alt_all"],
                "Z": imp['mu'],
                "Var": imp["Var"],
                "Nsnp_to_impute": len(known),
            }))

        # Progress report every 300 windows, relative to the window count
        done = i + 1
        if done % 300 == 0:
            print("{0}%".format(np.round(100 * done / Nwindows, 2)))

    return pd.concat([Zscores] + imputed_batches).sort_values(by="pos")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment