Commit c79adc67 authored by Hanna  JULIENNE's avatar Hanna JULIENNE
Browse files

fixed centered window imputation

parent 8b3491c6
......@@ -9,6 +9,8 @@ from .stat_models import ImpG_model_batch, ImpG_model_snp
from .ld_matrix import generate_sparse_matrix
import pandas as pd
import numpy as np
def parse_region_position(LD_file):
"""
......@@ -37,43 +39,66 @@ def realigned_zfiles_on_panel(ref_panel, Zscores):
return Zscores
def prepare_Zscore_for_imputation(ref_panel, Zscores):
"""
Prepare the known Z score by realigning them on the reference ref_panel
the snps that are not present in the ref panel are filtered
"""
Zscores = realigned_zfiles_on_panel(ref_panel, Zscores)
Zscores['Var'] = 1
Zscores['Nsnp_to_impute'] = -1
Zscores = Zscores.loc[Zscores.index.intersection(ref_panel.index)]
return Zscores
def in_region(pos_vector, start, end):
return ((start < pos_vector) & (pos_vector < end))
def centered_window_imputation(LD_file, ref_panel_folder, Zfile, window_size):
"""
Each missing Snp is imputed by known snp found in a window centered on the SNP to impute
Argument
"""
(chrom, startpos, endpos) = parse_region_position(LD_file)
(chrom, start_ld_block, end_ld_block) = parse_region_position(LD_file)
ref_panel_file = "/mnt/atlas/PCMA/1._DATA/ImpG_refpanel/{0}.eur.1pct.bim".format(chrom)
print(ref_panel_file)
ref_panel = pd.read_csv(ref_panel_file, sep="\t", names=['chr', "nothing", 'pos', 'Ref_all', 'alt_all'], index_col = 1)
LD_mat = generate_sparse_matrix(LD_file, ref_panel)
Zscores = pd.read_csv(Zfile, index_col=0, sep="\t")
Zscores = prepare_Zscore_for_imputation(ref_panel, Zscores)
Zscores = realigned_zfiles_on_panel(ref_panel, Zscores)
Zscores['Var'] = 1
# dispatch snp between typed and untyped
unknowns = LD_mat.index.difference(Df.index)
# Find Snp to impute
unknowns = LD_mat.index.difference(Zscores.index)
N_snp = len(unknowns)
print("### Imputation of {0} snps ###".format(len(unknowns)))
i = 0
for snp_unknown in unknowns:
# Boundary of the centered_window
start_ld_block = ref_panel.loc[snp_unknown,'pos'] - window
end_ld_block = ref_panel.loc[snp_unknown,'pos'] + window
known = Zscores.loc[(start_ld_block < Df.pos) & (Df.pos < end_ld_block)].index
start_pos = ref_panel.loc[snp_unknown,'pos'] - window_size
end_pos = ref_panel.loc[snp_unknown,'pos'] + window_size
#print(snp_unknown, start_pos, end_pos, start_ld_block, end_ld_block)
in_LD_reg_n_window = in_region(Zscores.pos, int(start_ld_block), int(end_ld_block)) & in_region(Zscores.pos, start_pos, end_pos)
known = Zscores.loc[in_LD_reg_n_window].index
Sig_t = LD_mat.loc[known, known]
Sig_i_t = LD_mat.loc[snp_unknown, known]
Zt = Zscores.loc[known,'Z']
imp = ImpG_model_snp(Zt, Sig_t, Sig_i_t)
Zt.loc[snp_unknown, "Z"] = imp['mu']
Zt.loc[snp_unknown, "Var"] = imp['Var']
if(len(known) > 0):
imp = ImpG_model_snp(Zt, Sig_t, Sig_i_t)
Zscores.loc[snp_unknown, "Z"] = imp['mu']
Zscores.loc[snp_unknown, "Var"] = imp['Var']
Zscores[snp_unknown, 'Nsnp_to_impute'] = len(known)
i = i+1
if i%10 == 0:
print("{0}\%".format(np.round(i/N_snp,4)))
return Zt.sort_values(by="pos")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment