Commit eb9d89e8 authored by Hanna  JULIENNE's avatar Hanna JULIENNE
Browse files

fix issues

parent bcbe25a2
import impute_jass.ld_matrix as LD import impute_jass.ld_matrix as LD
import impute_jass.stat_models as model import impute_jass.stat_models as model
import impute_jass.windows import impute_jass.windows
from impute_jass.imputation_launcher import imputation_launcher from impute_jass.imputation_launcher import ImputationLauncher
...@@ -3,15 +3,16 @@ Function set to launch imputation on a complete chromosome or ...@@ -3,15 +3,16 @@ Function set to launch imputation on a complete chromosome or
on the genome on the genome
""" """
import glob import glob
from .windows import Ld_region_centered_window_imputation from .windows import ld_region_centered_window_imputation, impg_like_imputation, realigned_zfiles_on_panel
class imputation_launcher: class ImputationLauncher(object):
def __init__(self, window_size=10000): def __init__(self, window_size=10000, imputation_style="online", buffer=2500):
self.imputation_style = "online" self.imputation_style = imputation_style
self.window_size = window_size self.window_size = window_size
self.buffer = buffer
def chromosome_imputation(self, chrom, Zscores, ref_panel, ld_folder): def chromosome_imputation(self, chrom, zscore, ref_panel, ld_folder):
""" """
""" """
...@@ -20,8 +21,17 @@ class imputation_launcher: ...@@ -20,8 +21,17 @@ class imputation_launcher:
#ref_panel = pd.read_csv(ref_panel_file, sep="\t", names=['chr', "nothing", 'pos', 'Ref_all', 'alt_all'], index_col = 1) #ref_panel = pd.read_csv(ref_panel_file, sep="\t", names=['chr', "nothing", 'pos', 'Ref_all', 'alt_all'], index_col = 1)
pattern = "{0}/{1}*.ld".format(ld_folder, chrom) pattern = "{0}/{1}*.ld".format(ld_folder, chrom)
for LD_file in glob.glob(pattern)[:2]: if self.imputation_style == "online":
print("processing Region: {0}".format(LD_file)) def imputer(ld_file):
return ld_region_centered_window_imputation(ld_file, ref_panel, zscore, self.window_size)
elif self.imputation_style == "batch":
def imputer(ld_file):
return impg_like_imputation(ld_file, ref_panel, zscore, self.window_size, self.buffer)
Zscores = Ld_region_centered_window_imputation(LD_file, ref_panel, Zscores, self.window_size) for ld_file in glob.glob(pattern):
return Zscores print("processing Region: {0}".format(ld_file))
zscore = imputer(ld_file)
zscore = realigned_zfiles_on_panel(ref_panel, zscore)
return zscore
...@@ -3,42 +3,29 @@ function for SNP imputation ...@@ -3,42 +3,29 @@ function for SNP imputation
""" """
import numpy as np import numpy as np
def ImpG_model_batch(Zt, Sig_t, Sig_i_t, lamb=0.01): def compute_mu(sig_i_t, sig_t_inv, zt):
""" return np.dot(sig_i_t, np.dot(sig_t_inv, zt))
Argument:
Zt : (vector) the vector of known Z scores
"""
#np.fill_diagonal(Sig_t.values, 1.01)
#Sig_t.fillna(0, inplace=True)
Sig_t = Sig_t.values
np.fill_diagonal(Sig_t, (1+lamb))
Sig_t_inv =np.linalg.pinv(Sig_t)
Var = np.diag(Sig_t)[0] - np.einsum('ij,jk,ki->i', Sig_i_t, Sig_t_inv ,Sig_i_t.transpose()) def compute_var(sig_i_t, sig_t_inv, lamb, batch=True):
if batch:
var = (1 + lamb) - np.einsum('ij,jk,ki->i', sig_i_t, sig_t_inv ,sig_i_t.transpose())
else:
var = (1 + lamb) - np.dot(sig_i_t, np.dot(sig_t_inv, sig_i_t.transpose()))
return var
mu = np.dot(Sig_i_t, np.dot(Sig_t_inv, Zt)) def impg_model(zt, sig_t, sig_i_t, lamb=0.01, batch=True):
return({"Var":Var, "mu":mu})
def ImpG_model_snp(Zt, Sig_t, Sig_i_t, lamb=0.01):
""" """
Argument: Argument:
Zt : (vector) the vector of known Z scores zt : (vector) the vector of known Z scores
""" """
#np.fill_diagonal(Sig_t.values, 1.01) sig_t = sig_t.values
#Sig_t.fillna(0, inplace=True) np.fill_diagonal(sig_t, (1+lamb))
Sig_t = Sig_t.values
np.fill_diagonal(Sig_t, (1+lamb)) sig_t_inv = np.linalg.pinv(sig_t)
#I = np.identity(Sig_t.shape[0]) var = compute_var(sig_i_t, sig_t_inv, lamb, batch)
#Sig_t_inv =np.linalg.inv(Sig_t) #if var< 0:
Sig_t_inv = np.linalg.pinv(Sig_t) # var=0
Var = np.diag(Sig_t)[0] - np.dot(Sig_i_t, np.dot(Sig_t_inv, Sig_i_t.transpose()))
if Var< 0:
Var=0
#np.einsum('ij,jk,ki->i', Sig_i_t, Sig_t_inv ,Sig_i_t.transpose())
mu = np.dot(Sig_i_t, np.dot(Sig_t_inv, Zt)) mu = compute_mu(sig_i_t, sig_t_inv, zt)
mu = mu / ((1-Var)**0.5) mu = mu / (((1+lamb)-var)**0.5)
return({"Var":Var, "mu":mu}) return({"var":var, "mu":mu})
...@@ -5,162 +5,146 @@ implement the imputation window is sliding along the genome: ...@@ -5,162 +5,146 @@ implement the imputation window is sliding along the genome:
- centered_window: A sliding window centered on the Snp to impute - centered_window: A sliding window centered on the Snp to impute
""" """
from .stat_models import ImpG_model_batch, ImpG_model_snp from .stat_models import impg_model
from .ld_matrix import generate_sparse_matrix from .ld_matrix import generate_sparse_matrix
import pandas as pd import pandas as pd
import numpy as np import numpy as np
def parse_region_position(LD_file): def parse_region_position(ld_file):
""" """
Retrieve the region definition from a ld-file generated by impute_jass Retrieve the region definition from a ld-file generated by impute_jass
Argument : Argument :
LD_file : A LD file generated by jass_impute ld_file : A ld file generated by jass_impute
""" """
(chrom, startpos, endpos ) = LD_file.split("/")[-1].split(".")[0].split('_') (chrom, startpos, endpos ) = ld_file.split("/")[-1].split(".")[0].split('_')
return (chrom, startpos, endpos) return (chrom, startpos, endpos)
def realigned_zfiles_on_panel(ref_panel, Zscores): def realigned_zfiles_on_panel(ref_panel, zscore):
""" """
Check if the counted allele is the same in the reference panel and Check if the counted allele is the same in the reference panel and
the Zscore files. the Zscore files.
If not, the coded and other allele are inverted and the Zscores sign If not, the coded and other allele are inverted and the zscore sign
is inverted also. is inverted also.
""" """
allele_inverted = (ref_panel.loc[Zscores.index, 'Ref_all'] != Zscores.A0) allele_inverted = (ref_panel.loc[zscore.index, 'Ref_all'] != zscore.A0)
Zscores.loc[allele_inverted, "A0"] = ref_panel.alt_all zscore.loc[allele_inverted, "A0"] = ref_panel.alt_all
Zscores.loc[allele_inverted, "A1"] = ref_panel.Ref_all zscore.loc[allele_inverted, "A1"] = ref_panel.Ref_all
Zscores.loc[allele_inverted, "Z"] = - Zscores.loc[allele_inverted, "Z"] zscore.loc[allele_inverted, "Z"] = - zscore.loc[allele_inverted, "Z"]
return Zscores return zscore
def prepare_Zscore_for_imputation(ref_panel, Zscores): def prepare_zscore_for_imputation(ref_panel, zscore):
""" """
Prepare the known Z score by realigning them on the reference ref_panel Prepare the known Z score by realigning them on the reference ref_panel
the snps that are not present in the ref panel are filtered the snps that are not present in the ref panel are filtered
""" """
Zscores = realigned_zfiles_on_panel(ref_panel, Zscores) zscore = realigned_zfiles_on_panel(ref_panel, zscore)
Zscores['Var'] = 1 zscore['Var'] = 1
Zscores['Nsnp_to_impute'] = -1 zscore['Nsnp_to_impute'] = -1
Zscores = Zscores.loc[Zscores.index.intersection(ref_panel.index)] zscore = zscore.loc[zscore.index.intersection(ref_panel.index)]
return Zscores return zscore
def in_region(pos_vector, start, end): def in_region(pos_vector, start, end):
return ((start < pos_vector) & (pos_vector < end)) return ((start < pos_vector) & (pos_vector < end))
def Ld_region_centered_window_imputation(LD_file, ref_panel, Zscores, window_size, unknowns=pd.Series([])): def ld_region_centered_window_imputation(ld_file, ref_panel, zscore, window_size, unknowns=pd.Series([])):
""" """
Each missing Snp is imputed by known snp found in a window centered on the SNP to impute Each missing Snp is imputed by known snp found in a window centered on the SNP to impute
Argument Argument
""" """
(chrom, start_ld_block, end_ld_block) = parse_region_position(LD_file) (chrom, start_ld_block, end_ld_block) = parse_region_position(ld_file)
LD_mat = generate_sparse_matrix(LD_file, ref_panel) LD_mat = generate_sparse_matrix(ld_file, ref_panel)
zscore = prepare_zscore_for_imputation(ref_panel, zscore)
#Zscores = pd.read_csv(Zfile, index_col=0, sep="\t")
Zscores = prepare_Zscore_for_imputation(ref_panel, Zscores)
# Find Snp to impute # Find Snp to impute
if len(unknowns) == 0: if len(unknowns) == 0:
unknowns = LD_mat.index.difference(Zscores.index) unknowns = LD_mat.index.difference(zscore.index)
N_snp = len(unknowns) N_snp = len(unknowns)
print("### Imputation of {0} snps ###".format(len(unknowns))) print("### Imputation of {0} snps ###".format(len(unknowns)))
i = 0
for snp_unknown in unknowns: for i,snp_unknown in enumerate(unknowns):
# Boundary of the centered_window # Boundary of the centered_window
#print(((ref_panel.loc[snp_unknown,'pos'] - window_size), float(start_ld_block)))
start_pos = max((ref_panel.loc[snp_unknown,'pos'] - window_size), float(start_ld_block)) start_pos = max((ref_panel.loc[snp_unknown,'pos'] - window_size), float(start_ld_block))
end_pos = min(ref_panel.loc[snp_unknown,'pos'] + window_size, float(end_ld_block)) end_pos = min(ref_panel.loc[snp_unknown,'pos'] + window_size, float(end_ld_block))
#print(snp_unknown, start_pos, end_pos, start_ld_block, end_ld_block)
in_LD_reg_n_window = in_region(Zscores.pos, start_pos, end_pos) in_LD_reg_n_window = in_region(zscore.pos, start_pos, end_pos)
known = Zscores.loc[in_LD_reg_n_window].index known = zscore.loc[in_LD_reg_n_window].index
Sig_t = LD_mat.loc[known, known] sig_t = LD_mat.loc[known, known]
Sig_i_t = LD_mat.loc[snp_unknown, known] sig_i_t = LD_mat.loc[snp_unknown, known]
Zt = Zscores.loc[known,'Z'] zt = zscore.loc[known,'Z']
if(len(known) > 0): if(len(known) > 0):
imp = ImpG_model_snp(Zt, Sig_t, Sig_i_t) imp = impg_model(zt, sig_t, sig_i_t, batch=False)
Zscores.loc[snp_unknown] = [ref_panel.loc[snp_unknown, 'pos'], ref_panel.loc[snp_unknown, "Ref_all"], ref_panel.loc[snp_unknown, "alt_all"], imp['mu'], imp['Var'], len(known)] zscore.loc[snp_unknown] = [ref_panel.loc[snp_unknown, 'pos'], ref_panel.loc[snp_unknown, "Ref_all"], ref_panel.loc[snp_unknown, "alt_all"], imp['mu'], imp['var'], len(known)]
# Zscores.loc[snp_unknown, "pos"] = ref_panel.loc[snp_unknown, 'pos']
# Zscores.loc[snp_unknown, "A0"] = ref_panel.loc[snp_unknown, "Ref_all"]
# Zscores.loc[snp_unknown, "A1"] = ref_panel.loc[snp_unknown, "alt_all"]
# Zscores.loc[snp_unknown, "Z"] = imp['mu']
# Zscores.loc[snp_unknown, "Var"] = imp['Var']
# Zscores.loc[snp_unknown, 'Nsnp_to_impute'] = len(known)
i = i+1
if i%300 == 0: if i%300 == 0:
print("{0}\%".format(np.round(i/N_snp,4))) print("{0}\%".format(np.round(i/N_snp,4)))
return Zscores.sort_values(by="pos") return zscore.sort_values(by="pos")
def ImpG_like_imputation(LD_file, ref_panel, Zscores, window_size, buffer, unknowns=pd.Series([])): def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, unknowns=pd.Series([])):
""" """
Each missing Snp is imputed by known snp found in a window centered on the SNP to impute Each missing Snp is imputed by known snp found in a window centered on the SNP to impute
Argument Argument
""" """
(chrom, start_ld_block, end_ld_block) = parse_region_position(LD_file) (chrom, start_ld_block, end_ld_block) = parse_region_position(ld_file)
LD_mat = generate_sparse_matrix(LD_file, ref_panel) LD_mat = generate_sparse_matrix(ld_file, ref_panel)
Nwindows = ((int(end_ld_block)) - (int(start_ld_block)))//window_size Nwindows = ((int(end_ld_block)) - (int(start_ld_block)))//window_size
# adapt window size to cover the LD block # adapt window size to cover the LD block
window_resize = np.ceil((int(end_ld_block) - (int(start_ld_block)))/Nwindows) window_resize = np.ceil((int(end_ld_block) - (int(start_ld_block)))/Nwindows)
all_unknowns = ref_panel.loc[ref_panel.index.difference(zscore.index)]
all_unknowns = ref_panel.loc[ref_panel.index.difference(Zscores.index)] #zscore = pd.read_csv(Zfile, index_col=0, sep="\t")
#Zscores = pd.read_csv(Zfile, index_col=0, sep="\t") zscore = prepare_zscore_for_imputation(ref_panel, zscore)
Zscores = prepare_Zscore_for_imputation(ref_panel, Zscores)
print("### Imputation of {0} snps ###".format(unknowns.shape[0])) print("### Imputation of {0} snps ###".format(unknowns.shape[0]))
i = 0
for i in range(Nwindows): for i in range(Nwindows):
print(i) # Boundary of the sliding_window
# Boundary of the centered_window
start_windows = int(start_ld_block) + i*window_resize - buffer start_windows = int(start_ld_block) + i*window_resize - buffer
end_windows = int(start_ld_block) + (i+1)*window_resize + buffer end_windows = int(start_ld_block) + (i+1)*window_resize + buffer
start_pos = max(start_windows, float(start_ld_block)) start_pos = max(start_windows, float(start_ld_block))
end_pos = min(end_windows, float(end_ld_block)) end_pos = min(end_windows, float(end_ld_block))
in_LD_reg_n_window = in_region(Zscores.pos, start_pos, end_pos) in_LD_reg_n_window = in_region(zscore.pos, start_pos, end_pos)
unknown_in_LD_reg_n_window = in_region(all_unknowns.pos, start_pos, end_pos) unknown_in_LD_reg_n_window = in_region(all_unknowns.pos, start_pos, end_pos)
known = Zscores.loc[in_LD_reg_n_window].index known = zscore.loc[in_LD_reg_n_window].index
unknowns = all_unknowns.loc[unknown_in_LD_reg_n_window].index unknowns = all_unknowns.loc[unknown_in_LD_reg_n_window].index
Sig_t = LD_mat.loc[known, known] sig_t = LD_mat.loc[known, known]
Sig_i_t = LD_mat.loc[unknowns, known] sig_i_t = LD_mat.loc[unknowns, known]
Zt = Zscores.loc[known,'Z'] zt = zscore.loc[known,'Z']
if(len(known) > 0): if(len(known) > 0):
imp = ImpG_model_batch(Zt, Sig_t, Sig_i_t) imp = impg_model(zt, sig_t, sig_i_t, batch=True)
batch_df = pd.DataFrame({ batch_df = pd.DataFrame({
'pos': ref_panel.loc[unknowns, 'pos'], 'pos': ref_panel.loc[unknowns, 'pos'],
'A0': ref_panel.loc[unknowns, "Ref_all"], 'A0': ref_panel.loc[unknowns, "Ref_all"],
"A1": ref_panel.loc[unknowns,"alt_all"], "A1": ref_panel.loc[unknowns,"alt_all"],
"Z" : imp['mu'], "Z" : imp['mu'],
"Var": imp["Var"], "Var": imp["var"],
"Nsnp_to_impute" : len(known) "Nsnp_to_impute" : len(known)
}) })
Zscores = pd.concat([Zscores, batch_df]) # keep only snp in the core window
# Zscores.loc[unknowns, 'pos'] = ref_panel.loc[unknowns, 'pos'] start_windows = int(start_ld_block) + i*window_resize
# Zscores.loc[unknowns, 'A0'] = ref_panel.loc[unknowns, "Ref_all"] end_windows = int(start_ld_block) + (i+1)*window_resize
# Zscores.loc[unknowns, 'A1'] = ref_panel.loc[unknowns, "alt_all"] in_core_window = in_region(batch_df.pos, start_windows, end_windows)
# Zscores.loc[unknowns, 'Z'] = imp['mu'] zscore = pd.concat([zscore, batch_df.loc[in_core_window]])
# Zscores.loc[unknowns, 'Var'] = imp["Var"]
# Zscores.loc[unknowns, "Nsnp_to_impute"] = len(known)
i = i+1 i = i+1
if i%300 == 0: if i%10 == 0:
print("{0}\%".format(np.round(i/N_snp,4))) print("{0}\%".format(np.round(i/Nwindows,4)))
return Zscores.sort_values(by="pos") return zscore.sort_values(by="pos")
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment