Commit eb9d89e8 authored by Hanna  JULIENNE's avatar Hanna JULIENNE
Browse files

fix issues

parent bcbe25a2
import impute_jass.ld_matrix as LD
import impute_jass.stat_models as model
import impute_jass.windows
from impute_jass.imputation_launcher import imputation_launcher
from impute_jass.imputation_launcher import ImputationLauncher
......@@ -3,15 +3,16 @@ Function set to launch imputation on a complete chromosome or
on the genome
"""
import glob
from .windows import Ld_region_centered_window_imputation
from .windows import ld_region_centered_window_imputation, impg_like_imputation, realigned_zfiles_on_panel
class imputation_launcher:
class ImputationLauncher(object):
def __init__(self, window_size=10000):
self.imputation_style = "online"
def __init__(self, window_size=10000, imputation_style="online", buffer=2500):
self.imputation_style = imputation_style
self.window_size = window_size
self.buffer = buffer
def chromosome_imputation(self, chrom, Zscores, ref_panel, ld_folder):
def chromosome_imputation(self, chrom, zscore, ref_panel, ld_folder):
"""
"""
......@@ -20,8 +21,17 @@ class imputation_launcher:
#ref_panel = pd.read_csv(ref_panel_file, sep="\t", names=['chr', "nothing", 'pos', 'Ref_all', 'alt_all'], index_col = 1)
pattern = "{0}/{1}*.ld".format(ld_folder, chrom)
for LD_file in glob.glob(pattern)[:2]:
print("processing Region: {0}".format(LD_file))
if self.imputation_style == "online":
def imputer(ld_file):
return ld_region_centered_window_imputation(ld_file, ref_panel, zscore, self.window_size)
elif self.imputation_style == "batch":
def imputer(ld_file):
return impg_like_imputation(ld_file, ref_panel, zscore, self.window_size, self.buffer)
Zscores = Ld_region_centered_window_imputation(LD_file, ref_panel, Zscores, self.window_size)
return Zscores
for ld_file in glob.glob(pattern):
print("processing Region: {0}".format(ld_file))
zscore = imputer(ld_file)
zscore = realigned_zfiles_on_panel(ref_panel, zscore)
return zscore
......@@ -3,42 +3,29 @@ function for SNP imputation
"""
import numpy as np
def ImpG_model_batch(Zt, Sig_t, Sig_i_t, lamb=0.01):
"""
Argument:
Zt : (vector) the vector of known Z scores
"""
#np.fill_diagonal(Sig_t.values, 1.01)
#Sig_t.fillna(0, inplace=True)
Sig_t = Sig_t.values
np.fill_diagonal(Sig_t, (1+lamb))
Sig_t_inv =np.linalg.pinv(Sig_t)
def compute_mu(sig_i_t, sig_t_inv, zt):
return np.dot(sig_i_t, np.dot(sig_t_inv, zt))
Var = np.diag(Sig_t)[0] - np.einsum('ij,jk,ki->i', Sig_i_t, Sig_t_inv ,Sig_i_t.transpose())
def compute_var(sig_i_t, sig_t_inv, lamb, batch=True):
if batch:
var = (1 + lamb) - np.einsum('ij,jk,ki->i', sig_i_t, sig_t_inv ,sig_i_t.transpose())
else:
var = (1 + lamb) - np.dot(sig_i_t, np.dot(sig_t_inv, sig_i_t.transpose()))
return var
mu = np.dot(Sig_i_t, np.dot(Sig_t_inv, Zt))
return({"Var":Var, "mu":mu})
def ImpG_model_snp(Zt, Sig_t, Sig_i_t, lamb=0.01):
def impg_model(zt, sig_t, sig_i_t, lamb=0.01, batch=True):
"""
Argument:
Zt : (vector) the vector of known Z scores
zt : (vector) the vector of known Z scores
"""
#np.fill_diagonal(Sig_t.values, 1.01)
#Sig_t.fillna(0, inplace=True)
Sig_t = Sig_t.values
np.fill_diagonal(Sig_t, (1+lamb))
#I = np.identity(Sig_t.shape[0])
#Sig_t_inv =np.linalg.inv(Sig_t)
Sig_t_inv = np.linalg.pinv(Sig_t)
Var = np.diag(Sig_t)[0] - np.dot(Sig_i_t, np.dot(Sig_t_inv, Sig_i_t.transpose()))
if Var< 0:
Var=0
#np.einsum('ij,jk,ki->i', Sig_i_t, Sig_t_inv ,Sig_i_t.transpose())
sig_t = sig_t.values
np.fill_diagonal(sig_t, (1+lamb))
sig_t_inv = np.linalg.pinv(sig_t)
var = compute_var(sig_i_t, sig_t_inv, lamb, batch)
#if var< 0:
# var=0
mu = np.dot(Sig_i_t, np.dot(Sig_t_inv, Zt))
mu = mu / ((1-Var)**0.5)
return({"Var":Var, "mu":mu})
mu = compute_mu(sig_i_t, sig_t_inv, zt)
mu = mu / (((1+lamb)-var)**0.5)
return({"var":var, "mu":mu})
......@@ -5,162 +5,146 @@ implement the imputation window is sliding along the genome:
- centered_window: A sliding window centered on the Snp to impute
"""
from .stat_models import ImpG_model_batch, ImpG_model_snp
from .stat_models import impg_model
from .ld_matrix import generate_sparse_matrix
import pandas as pd
import numpy as np
def parse_region_position(LD_file):
def parse_region_position(ld_file):
"""
Retrieve the region definition from a ld-file generated by impute_jass
Argument :
LD_file : A LD file generated by jass_impute
ld_file : A ld file generated by jass_impute
"""
(chrom, startpos, endpos ) = LD_file.split("/")[-1].split(".")[0].split('_')
(chrom, startpos, endpos ) = ld_file.split("/")[-1].split(".")[0].split('_')
return (chrom, startpos, endpos)
def realigned_zfiles_on_panel(ref_panel, Zscores):
def realigned_zfiles_on_panel(ref_panel, zscore):
"""
Check if the counted allele is the same in the reference panel and
the Zscore files.
If not, the coded and other allele are inverted and the Zscores sign
If not, the coded and other allele are inverted and the zscore sign
is inverted also.
"""
allele_inverted = (ref_panel.loc[Zscores.index, 'Ref_all'] != Zscores.A0)
allele_inverted = (ref_panel.loc[zscore.index, 'Ref_all'] != zscore.A0)
Zscores.loc[allele_inverted, "A0"] = ref_panel.alt_all
Zscores.loc[allele_inverted, "A1"] = ref_panel.Ref_all
Zscores.loc[allele_inverted, "Z"] = - Zscores.loc[allele_inverted, "Z"]
zscore.loc[allele_inverted, "A0"] = ref_panel.alt_all
zscore.loc[allele_inverted, "A1"] = ref_panel.Ref_all
zscore.loc[allele_inverted, "Z"] = - zscore.loc[allele_inverted, "Z"]
return Zscores
return zscore
def prepare_Zscore_for_imputation(ref_panel, Zscores):
def prepare_zscore_for_imputation(ref_panel, zscore):
"""
Prepare the known Z score by realigning them on the reference ref_panel
the snps that are not present in the ref panel are filtered
"""
Zscores = realigned_zfiles_on_panel(ref_panel, Zscores)
Zscores['Var'] = 1
Zscores['Nsnp_to_impute'] = -1
Zscores = Zscores.loc[Zscores.index.intersection(ref_panel.index)]
return Zscores
zscore = realigned_zfiles_on_panel(ref_panel, zscore)
zscore['Var'] = 1
zscore['Nsnp_to_impute'] = -1
zscore = zscore.loc[zscore.index.intersection(ref_panel.index)]
return zscore
def in_region(pos_vector, start, end):
return ((start < pos_vector) & (pos_vector < end))
def Ld_region_centered_window_imputation(LD_file, ref_panel, Zscores, window_size, unknowns=pd.Series([])):
def ld_region_centered_window_imputation(ld_file, ref_panel, zscore, window_size, unknowns=pd.Series([])):
"""
Each missing Snp is imputed by known snp found in a window centered on the SNP to impute
Argument
"""
(chrom, start_ld_block, end_ld_block) = parse_region_position(LD_file)
(chrom, start_ld_block, end_ld_block) = parse_region_position(ld_file)
LD_mat = generate_sparse_matrix(LD_file, ref_panel)
#Zscores = pd.read_csv(Zfile, index_col=0, sep="\t")
Zscores = prepare_Zscore_for_imputation(ref_panel, Zscores)
LD_mat = generate_sparse_matrix(ld_file, ref_panel)
zscore = prepare_zscore_for_imputation(ref_panel, zscore)
# Find Snp to impute
if len(unknowns) == 0:
unknowns = LD_mat.index.difference(Zscores.index)
unknowns = LD_mat.index.difference(zscore.index)
N_snp = len(unknowns)
print("### Imputation of {0} snps ###".format(len(unknowns)))
i = 0
for snp_unknown in unknowns:
for i,snp_unknown in enumerate(unknowns):
# Boundary of the centered_window
#print(((ref_panel.loc[snp_unknown,'pos'] - window_size), float(start_ld_block)))
start_pos = max((ref_panel.loc[snp_unknown,'pos'] - window_size), float(start_ld_block))
end_pos = min(ref_panel.loc[snp_unknown,'pos'] + window_size, float(end_ld_block))
#print(snp_unknown, start_pos, end_pos, start_ld_block, end_ld_block)
in_LD_reg_n_window = in_region(Zscores.pos, start_pos, end_pos)
in_LD_reg_n_window = in_region(zscore.pos, start_pos, end_pos)
known = Zscores.loc[in_LD_reg_n_window].index
Sig_t = LD_mat.loc[known, known]
Sig_i_t = LD_mat.loc[snp_unknown, known]
Zt = Zscores.loc[known,'Z']
known = zscore.loc[in_LD_reg_n_window].index
sig_t = LD_mat.loc[known, known]
sig_i_t = LD_mat.loc[snp_unknown, known]
zt = zscore.loc[known,'Z']
if(len(known) > 0):
imp = ImpG_model_snp(Zt, Sig_t, Sig_i_t)
Zscores.loc[snp_unknown] = [ref_panel.loc[snp_unknown, 'pos'], ref_panel.loc[snp_unknown, "Ref_all"], ref_panel.loc[snp_unknown, "alt_all"], imp['mu'], imp['Var'], len(known)]
# Zscores.loc[snp_unknown, "pos"] = ref_panel.loc[snp_unknown, 'pos']
# Zscores.loc[snp_unknown, "A0"] = ref_panel.loc[snp_unknown, "Ref_all"]
# Zscores.loc[snp_unknown, "A1"] = ref_panel.loc[snp_unknown, "alt_all"]
# Zscores.loc[snp_unknown, "Z"] = imp['mu']
# Zscores.loc[snp_unknown, "Var"] = imp['Var']
# Zscores.loc[snp_unknown, 'Nsnp_to_impute'] = len(known)
i = i+1
imp = impg_model(zt, sig_t, sig_i_t, batch=False)
zscore.loc[snp_unknown] = [ref_panel.loc[snp_unknown, 'pos'], ref_panel.loc[snp_unknown, "Ref_all"], ref_panel.loc[snp_unknown, "alt_all"], imp['mu'], imp['var'], len(known)]
if i%300 == 0:
print("{0}\%".format(np.round(i/N_snp,4)))
return Zscores.sort_values(by="pos")
return zscore.sort_values(by="pos")
def ImpG_like_imputation(LD_file, ref_panel, Zscores, window_size, buffer, unknowns=pd.Series([])):
def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, unknowns=pd.Series([])):
"""
Each missing Snp is imputed by known snp found in a window centered on the SNP to impute
Argument
"""
(chrom, start_ld_block, end_ld_block) = parse_region_position(LD_file)
(chrom, start_ld_block, end_ld_block) = parse_region_position(ld_file)
LD_mat = generate_sparse_matrix(LD_file, ref_panel)
LD_mat = generate_sparse_matrix(ld_file, ref_panel)
Nwindows = ((int(end_ld_block)) - (int(start_ld_block)))//window_size
# adapt window size to cover the LD block
window_resize = np.ceil((int(end_ld_block) - (int(start_ld_block)))/Nwindows)
all_unknowns = ref_panel.loc[ref_panel.index.difference(Zscores.index)]
#Zscores = pd.read_csv(Zfile, index_col=0, sep="\t")
Zscores = prepare_Zscore_for_imputation(ref_panel, Zscores)
all_unknowns = ref_panel.loc[ref_panel.index.difference(zscore.index)]
#zscore = pd.read_csv(Zfile, index_col=0, sep="\t")
zscore = prepare_zscore_for_imputation(ref_panel, zscore)
print("### Imputation of {0} snps ###".format(unknowns.shape[0]))
i = 0
for i in range(Nwindows):
print(i)
# Boundary of the centered_window
# Boundary of the sliding_window
start_windows = int(start_ld_block) + i*window_resize - buffer
end_windows = int(start_ld_block) + (i+1)*window_resize + buffer
start_pos = max(start_windows, float(start_ld_block))
end_pos = min(end_windows, float(end_ld_block))
in_LD_reg_n_window = in_region(Zscores.pos, start_pos, end_pos)
in_LD_reg_n_window = in_region(zscore.pos, start_pos, end_pos)
unknown_in_LD_reg_n_window = in_region(all_unknowns.pos, start_pos, end_pos)
known = Zscores.loc[in_LD_reg_n_window].index
known = zscore.loc[in_LD_reg_n_window].index
unknowns = all_unknowns.loc[unknown_in_LD_reg_n_window].index
Sig_t = LD_mat.loc[known, known]
Sig_i_t = LD_mat.loc[unknowns, known]
Zt = Zscores.loc[known,'Z']
sig_t = LD_mat.loc[known, known]
sig_i_t = LD_mat.loc[unknowns, known]
zt = zscore.loc[known,'Z']
if(len(known) > 0):
imp = ImpG_model_batch(Zt, Sig_t, Sig_i_t)
imp = impg_model(zt, sig_t, sig_i_t, batch=True)
batch_df = pd.DataFrame({
'pos': ref_panel.loc[unknowns, 'pos'],
'A0': ref_panel.loc[unknowns, "Ref_all"],
"A1": ref_panel.loc[unknowns,"alt_all"],
"Z" : imp['mu'],
"Var": imp["Var"],
"Var": imp["var"],
"Nsnp_to_impute" : len(known)
})
Zscores = pd.concat([Zscores, batch_df])
# Zscores.loc[unknowns, 'pos'] = ref_panel.loc[unknowns, 'pos']
# Zscores.loc[unknowns, 'A0'] = ref_panel.loc[unknowns, "Ref_all"]
# Zscores.loc[unknowns, 'A1'] = ref_panel.loc[unknowns, "alt_all"]
# Zscores.loc[unknowns, 'Z'] = imp['mu']
# Zscores.loc[unknowns, 'Var'] = imp["Var"]
# Zscores.loc[unknowns, "Nsnp_to_impute"] = len(known)
# keep only snp in the core window
start_windows = int(start_ld_block) + i*window_resize
end_windows = int(start_ld_block) + (i+1)*window_resize
in_core_window = in_region(batch_df.pos, start_windows, end_windows)
zscore = pd.concat([zscore, batch_df.loc[in_core_window]])
i = i+1
if i%300 == 0:
print("{0}\%".format(np.round(i/N_snp,4)))
if i%10 == 0:
print("{0}\%".format(np.round(i/Nwindows,4)))
return Zscores.sort_values(by="pos")
return zscore.sort_values(by="pos")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment