Commit e47676db authored by Hanna  JULIENNE's avatar Hanna JULIENNE
Browse files

remove SNP by SNP imputation because it doesn't seem pertinent (way too slow...

remove SNP by SNP imputation because it doesn't seem pertinent (way too slow with no increased accuracy)
parent 1adef6b5
"""
Function set to launch imputation on a complete chromosome or
Function set to launch SNP imputation on a complete chromosome or
on the genome
"""
import glob
......@@ -13,17 +13,14 @@ class ImputationLauncher(object):
Class performing imputation of SNPs from summary statistics
"""
def __init__(self, window_size=10000, imputation_style="batch", buf=2500,
def __init__(self, window_size=10000, buf=2500,
lamb= 0.01, pinv_rcond = 0.01 ):
"""
Initialise the imputation object. Fix the windows size, the buffer size
and the kind of imputation employed
and the kind of imputation employed
Args:
window_size (int): size of the imputation window in bp
imputation_style (str): define if the windows will span the genome
in a non overlapping fashion ("batch") or
by being centered on each snp to impute
('online')
buffer (int): the size of the padding around the windows of
imputation (relevant only for batch imputation)
lamb (float): size of the increment added to snp correlation
......@@ -32,7 +29,7 @@ class ImputationLauncher(object):
The scipy.linalg.pinv is used to invert
the correlation matrices
"""
self.imputation_style = imputation_style
self.window_size = window_size
self.buffer = buf
self.lamb = lamb
......@@ -55,16 +52,11 @@ class ImputationLauncher(object):
pattern = "{0}/{1}_*.ld".format(ld_folder, chrom)
zscore = prepare_zscore_for_imputation(ref_panel, zscore)
zscore_results = zscore.copy(deep=True)
if self.imputation_style == "online":
def imputer(ld_file):
return ld_region_centered_window_imputation(ld_file, ref_panel,
zscore,
self.window_size)
elif self.imputation_style == "batch":
def imputer(ld_file):
return impg_like_imputation(ld_file, ref_panel, zscore,
self.window_size, self.buffer,
self.lamb, self.rcond)
def imputer(ld_file):
return impg_like_imputation(ld_file, ref_panel, zscore,
self.window_size, self.buffer,
self.lamb, self.rcond)
for ld_file in glob.glob(pattern):
print("processing Region: {0}".format(ld_file))
......
......@@ -127,7 +127,7 @@ def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, lamb,
Args:
ld_file (str): Linkage disequilibrium matrix files
ref_panel (pd.dataframe): the dataframe containing reference panel
snps
snps
"""
(chrom, start_ld_block, end_ld_block) = parse_region_position(ld_file)
LD_mat = generate_sparse_matrix(ld_file, ref_panel)
......@@ -174,42 +174,3 @@ def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, lamb,
print_progression(i, Nwindows)
return zscore_results.sort_values(by="pos")
def ld_region_centered_window_imputation(ld_file, ref_panel, zscore, window_size, unknowns=None):
    """
    Impute each missing SNP from the known SNPs found in a window centered
    on the SNP to impute.

    Args:
        ld_file (str): LD matrix file; the chromosome and the start/end of
            the LD block are extracted from it by parse_region_position
        ref_panel (pd.DataFrame): reference panel indexed by SNP id; the
            'pos', 'Ref_all' and 'alt_all' columns are read here
        zscore (pd.DataFrame): table of known Z-scores; it is first passed
            through prepare_zscore_for_imputation, then its 'pos' and 'Z'
            columns are read
        window_size (int): distance, in the same unit as 'pos', taken on
            each side of the SNP to impute (the window is clipped to the
            boundaries of the LD block)
        unknowns (sequence, optional): ids of the SNPs to impute. When None
            or empty, every SNP of the LD matrix index that is absent from
            the zscore index is imputed.

    Returns:
        The prepared zscore table completed with one row per imputed SNP,
        sorted by 'pos'.
    """
    (chrom, start_ld_block, end_ld_block) = parse_region_position(ld_file)
    LD_mat = generate_sparse_matrix(ld_file, ref_panel)
    zscore = prepare_zscore_for_imputation(ref_panel, zscore)

    # Find the SNPs to impute. None replaces the former pd.Series([]) default,
    # which was a mutable object built once at definition time.
    if unknowns is None or len(unknowns) == 0:
        unknowns = LD_mat.index.difference(zscore.index)

    # The window never extends past the LD block
    block_start = float(start_ld_block)
    block_end = float(end_ld_block)

    N_snp = len(unknowns)
    print("### Imputation of {0} snps ###".format(N_snp))

    for i, snp_unknown in enumerate(unknowns):
        snp_pos = ref_panel.loc[snp_unknown, 'pos']

        # Boundary of the centered window
        start_pos = max(snp_pos - window_size, block_start)
        end_pos = min(snp_pos + window_size, block_end)

        in_LD_reg_n_window = in_region(zscore.pos, start_pos, end_pos)
        known = zscore.loc[in_LD_reg_n_window].index

        # Only slice the LD matrix when there is something to impute from
        if len(known) > 0:
            sig_t = LD_mat.loc[known, known]
            sig_i_t = LD_mat.loc[snp_unknown, known]
            zt = zscore.loc[known, 'Z']

            imp = impg_model(zt, sig_t, sig_i_t, batch=False)
            # NOTE(review): the row layout assumes the prepared zscore table
            # has exactly these six columns in this order - confirm against
            # prepare_zscore_for_imputation.
            zscore.loc[snp_unknown] = [snp_pos,
                                       ref_panel.loc[snp_unknown, "Ref_all"],
                                       ref_panel.loc[snp_unknown, "alt_all"],
                                       imp['mu'], imp['var'], len(known)]

        if i % 300 == 0:
            # Report an actual percentage; the former "\%" was an invalid
            # escape sequence that printed a backslash next to a fraction.
            print("{0}%".format(np.round(100.0 * i / N_snp, 2)))

    return zscore.sort_values(by="pos")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment