Commit f07a787a authored by Hanna JULIENNE

whole genome imputation

parent defb0d7a
......@@ -7,30 +7,58 @@ from .windows import ld_region_centered_window_imputation, impg_like_imputation,
class ImputationLauncher(object):
# Store the imputation settings on the launcher.
#   window_size      -> self.window_size, forwarded to both imputation functions
#   imputation_style -> self.imputation_style; "online" and "batch" are the values
#                       chromosome_imputation tests for
#   buf              -> self.buffer, forwarded to impg_like_imputation in "batch" mode
#   lamb             -> self.lamb, forwarded to impg_like_imputation in "batch" mode
#   pinv_rcond       -> self.rcond, forwarded to impg_like_imputation in "batch" mode
# NOTE(review): two signatures and two self.buffer assignments appear below;
# presumably the first of each pair is the line removed by this commit - confirm.
def __init__(self, window_size=10000, imputation_style="online", buffer=2500):
def __init__(self, window_size=10000, imputation_style="online", buf=2500, lamb= 0.01, pinv_rcond = 0.01 ):
self.imputation_style = imputation_style
self.window_size = window_size
self.buffer = buffer
self.buffer = buf
self.lamb = lamb
self.rcond = pinv_rcond
def chromosome_imputation(self, chrom, zscore, ref_panel, ld_folder):
"""
Impute the panel Z-scores for one chromosome with the launcher's parameters.

Every file matching "<ld_folder>/<chrom>*.ld" is handed to the imputation
function selected by self.imputation_style, and the Z-scores it returns
replace the current ones.
Args:
chrom : str specifying the chromosome; also the prefix of the LD file names
zscore : known Z-scores
ref_panel : reference panel of the chromosome (genome_imputation passes the
DataFrame read from the .bim file, not a folder path)
ld_folder : location of the linkage disequilibrium matrices (*.ld files)
Returns
Imputed zscore dataframe
"""
# NOTE(review): ref_panel_file is assigned but never read in the visible code,
# and the path is hard-coded to one machine - confirm it can be dropped.
ref_panel_file = "/mnt/atlas/PCMA/1._DATA/ImpG_refpanel/{0}.eur.1pct.bim".format(chrom)
#ref_panel = pd.read_csv(ref_panel_file, sep="\t", names=['chr', "nothing", 'pos', 'Ref_all', 'alt_all'], index_col = 1)
# Glob pattern selecting the LD files of this chromosome.
pattern = "{0}/{1}*.ld".format(ld_folder, chrom)
# Pick the per-region imputation function.
# NOTE(review): no imputer is defined for a style other than "online" or "batch".
if self.imputation_style == "online":
def imputer(ld_file):
return ld_region_centered_window_imputation(ld_file, ref_panel, zscore, self.window_size)
elif self.imputation_style == "batch":
def imputer(ld_file):
# NOTE(review): two return lines appear here; presumably the first is the one
# removed by this commit and the second (with lamb and rcond) replaces it - confirm.
return impg_like_imputation(ld_file, ref_panel, zscore, self.window_size, self.buffer)
return impg_like_imputation(ld_file, ref_panel, zscore, self.window_size, self.buffer, self.lamb, self.rcond)
for ld_file in glob.glob(pattern):
print("processing Region: {0}".format(ld_file))
# The imputer closes over zscore, so each region sees the result of the previous one.
zscore = imputer(ld_file)
# The Z-scores are passed through realigned_zfiles_on_panel together with the panel.
zscore = realigned_zfiles_on_panel(ref_panel, zscore)
return zscore
def genome_imputation(self, gwas_tag, ref_panel_folder, ld_folder, zscore_folder, folder_output):
    """
    Launch imputation on chromosomes 1 to 22 and write one result file per chromosome

    Args:
        gwas_tag : str identifying the GWAS; part of the input and output file names
        ref_panel_folder : folder holding the "chr<N>.eur.1pct.bim" reference panel files
        ld_folder : location of the LD matrices, forwarded to chromosome_imputation
        zscore_folder : folder holding the known Z-scores as "z_<gwas_tag>chr<N>.txt"
        folder_output : folder receiving the imputed Z-scores as "z_<gwas_tag>chr<N>.txt"
    """
    for i in range(1, 23):
        chrom = "chr" + str(i)
        # Input and output share the same file name; only the folder differs.
        # The name is built from gwas_tag (the former code referenced an
        # undefined name `tag`, which raised NameError on the first chromosome).
        zscore_file_name = "/z_" + gwas_tag + chrom + ".txt"

        ref_panel_file = ref_panel_folder + "/" + chrom + ".eur.1pct.bim"
        # The .bim file has no header; the second column (SNP id) becomes the index.
        ref_panel = pd.read_csv(ref_panel_file, sep="\t", names=['chr', "nothing", 'pos', 'Ref_all', 'alt_all'], index_col=1)

        known_zscore_file = zscore_folder + zscore_file_name
        known_zscore = pd.read_csv(known_zscore_file, index_col=0, sep="\t")

        z_imp = self.chromosome_imputation(chrom, known_zscore, ref_panel, ld_folder)

        imputed_zscore = folder_output + zscore_file_name
        z_imp.to_csv(imputed_zscore, sep="\t")
......@@ -27,7 +27,7 @@ def launch_plink_ld(startpos, endpos, chr, reffile, folder):
fo = "{0}/chr{1}_{2}_{3}".format(folder, chr, startpos, endpos)
cmd = "p-link --noweb --bfile {0} --r --ld-snp-list ./snp_list.txt --ld-window 50 --ld-window-kb 3000 --ld-window-r2 0.4 --chr {1} --out {2}".format(reffile, chr, fo)
print(cmd)
sub.check_output(cmd, shell=True)
......
......@@ -21,18 +21,15 @@ def compute_var(sig_i_t, sig_t_inv, lamb, batch=True):
def check_inversion(sig_t, sig_t_inv):
    """
    Tell whether sig_t_inv behaves as a generalized inverse of sig_t

    Argument:
        sig_t : (matrix) the matrix that was inverted
        sig_t_inv : (matrix) the candidate (pseudo-)inverse of sig_t
    Return:
        True when sig_t . sig_t_inv . sig_t equals sig_t within the default
        tolerances of np.allclose
    """
    inv_times_sig = np.dot(sig_t_inv, sig_t)
    reconstructed = np.dot(sig_t, inv_times_sig)
    return np.allclose(sig_t, reconstructed)
def impg_model(zt, sig_t, sig_i_t, lamb=0.01, batch=True):
def impg_model(zt, sig_t, sig_i_t, lamb=0.01, rcond=0.01, batch=True):
"""
Argument:
zt : (vector) the vector of known Z scores
"""
snps = sig_t.columns
sig_t = sig_t.values
np.fill_diagonal(sig_t, (1+lamb))
sig_t_inv = sc.linalg.pinv(sig_t)
sig_t_inv = sc.linalg.pinv(sig_t, rcond=rcond)
if batch:
condition_number = np.array([np.linalg.cond(sig_t)]*sig_i_t.shape[0])
......@@ -44,7 +41,7 @@ def impg_model(zt, sig_t, sig_i_t, lamb=0.01, batch=True):
var, ld_score = compute_var(sig_i_t, sig_t_inv, lamb, batch)
mu = compute_mu(sig_i_t, sig_t_inv, zt)
if np.any(mu > 100):
if np.any(mu > 50):
print("ABERANT SNP SNiP ")
#mu = mu / (((1+lamb)-var)**0.5)
return({"var":var, "mu":mu, "ld_score": ld_score, "condition_number":condition_number, "correct_inversion":correct_inversion })
......@@ -16,7 +16,6 @@ def parse_region_position(ld_file):
Retrieve the region definition from a ld-file generated by impute_jass
Argument :
ld_file : A ld file generated by jass_impute
"""
(chrom, startpos, endpos ) = ld_file.split("/")[-1].split(".")[0].split('_')
return (chrom, startpos, endpos)
......@@ -91,7 +90,7 @@ def ld_region_centered_window_imputation(ld_file, ref_panel, zscore, window_size
return zscore.sort_values(by="pos")
def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, unknowns=pd.Series([])):
def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, lamb, rcond, unknowns=pd.Series([])):
"""
Each missing Snp is imputed by known snp found in a window centered on the SNP to impute
Argument
......@@ -133,7 +132,7 @@ def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, unknow
zt = zscore.loc[known,'Z']
if(len(known) > 0):
imp = impg_model(zt, sig_t, sig_i_t, batch=True)
imp = impg_model(zt, sig_t, sig_i_t, lamb=lamb, rcond=rcond, batch=True)
batch_df = pd.DataFrame({
'pos': ref_panel.loc[unknowns, 'pos'],
'A0': ref_panel.loc[unknowns, "Ref_all"],
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment