Commit 15a3dd75 authored by Hanna  JULIENNE's avatar Hanna JULIENNE
Browse files

fixing variance erasing at each ld block

parent 31337253
......@@ -4,7 +4,7 @@ on the genome
"""
import glob
import pandas as pd
from .windows import ld_region_centered_window_imputation, impg_like_imputation, realigned_zfiles_on_panel
from .windows import ld_region_centered_window_imputation, prepare_zscore_for_imputation, impg_like_imputation, realigned_zfiles_on_panel
......@@ -13,7 +13,8 @@ class ImputationLauncher(object):
Class perform imputation of snp from summary statistic
"""
def __init__(self, window_size=10000, imputation_style="batch", buf=2500, lamb= 0.01, pinv_rcond = 0.01 ):
def __init__(self, window_size=10000, imputation_style="batch", buf=2500,
lamb= 0.01, pinv_rcond = 0.01 ):
"""
Initialise the imputation object. Fix the windows size, the buffer size
and the king of imputation employed
......@@ -29,7 +30,7 @@ class ImputationLauncher(object):
matrices to make it less singular
pinv_rcond (float): the rcond scipy.linalg.pinv function argument.
The scipy.linalg.pinv is used to invert
the correlationmatrices
the correlation matrices
"""
self.imputation_style = imputation_style
self.window_size = window_size
......@@ -37,6 +38,7 @@ class ImputationLauncher(object):
self.lamb = lamb
self.rcond = pinv_rcond
def chromosome_imputation(self, chrom, zscore, ref_panel, ld_folder):
"""
Impute the panel zscore score for one chromosome and with the specified
......@@ -51,19 +53,27 @@ class ImputationLauncher(object):
Imputed zscore dataframe
"""
pattern = "{0}/{1}_*.ld".format(ld_folder, chrom)
zscore = prepare_zscore_for_imputation(ref_panel, zscore)
zscore_results = zscore.copy(deep=True)
if self.imputation_style == "online":
def imputer(ld_file):
return ld_region_centered_window_imputation(ld_file, ref_panel, zscore, self.window_size)
return ld_region_centered_window_imputation(ld_file, ref_panel,
zscore,
self.window_size)
elif self.imputation_style == "batch":
def imputer(ld_file):
return impg_like_imputation(ld_file, ref_panel, zscore, self.window_size, self.buffer, self.lamb, self.rcond)
return impg_like_imputation(ld_file, ref_panel, zscore,
self.window_size, self.buffer,
self.lamb, self.rcond)
for ld_file in glob.glob(pattern):
print("processing Region: {0}".format(ld_file))
zscore = imputer(ld_file)
ld_batch = imputer(ld_file)
zscore_results = pd.concat([zscore_results, ld_batch])
zscore = realigned_zfiles_on_panel(ref_panel, zscore)
return zscore
zscore_results.sort_values(by="pos")
zscore_results = realigned_zfiles_on_panel(ref_panel, zscore_results)
return zscore_results
def genome_imputation(self, gwas_tag, ref_panel_folder, ld_folder, zscore_folder, folder_output):
......
......@@ -29,7 +29,7 @@ def compute_mu(sig_i_t, sig_t_inv, zt):
zt (np.array?): Zscores of known snp
Returns:
mu_i (np.array): a vector of length i containing the estimate of zscore
"""
return np.dot(sig_i_t, np.dot(sig_t_inv, zt))
......@@ -101,5 +101,4 @@ def impg_model(zt, sig_t, sig_i_t, lamb=0.01, rcond=0.01, batch=True):
var_norm = var_in_boundaries(var, lamb)
R2 = ((1+lamb)-var_norm)
mu = mu / np.sqrt(R2)
return({"var" : var, "mu" : mu, "ld_score" : ld_score, "condition_number" : condition_number, "correct_inversion":correct_inversion })
......@@ -27,7 +27,9 @@ def realigned_zfiles_on_panel(ref_panel, zscore):
If not, the coded and other allele are inverted and the zscore sign
is inverted also.
Args:
- ref_panel (pd.dataframe) : snp of reference on the imputed chromosome
- zscore (pd.dataframe):
"""
sub_ref_panel = ref_panel.loc[zscore.index]
allele_inverted = (sub_ref_panel['Ref_all'] != zscore.A0)
......@@ -40,14 +42,20 @@ def realigned_zfiles_on_panel(ref_panel, zscore):
def prepare_zscore_for_imputation(ref_panel, zscore):
"""
Prepare the known Z score by realigning them on the reference ref_panel
the snps that are not present in the ref panel are filtered
Prepare the known Z score by
- realigning them on the reference ref_panel
- filtering snps that are not present in the ref panel
- Adding columns that will contain information on imputation:
* Var : theoritical variance estimate of z
* Nsnp_to_impute : Number of known snp
* ld_score : the sum of the square correlation of the snp with all other
known snp (give an idea if the we have enough information to compute a
precise z estimate)
"""
zscore = realigned_zfiles_on_panel(ref_panel, zscore)
zscore['Var'] = -1
zscore['Var'] = -1.0
zscore['Nsnp_to_impute'] = -1
zscore['ld_score'] = -1
zscore['ld_score'] = -1.0
zscore = zscore.loc[zscore.index.intersection(ref_panel.index)]
return zscore
......@@ -57,7 +65,17 @@ def in_region(pos_vector, start, end):
def compute_window_and_size(start_ld_block, end_ld_block, window_size):
"""
Compute the number of window to pave the ld_block tightly
Compute the number of window to pave the Linkage desiquilibrium block
tightly with a window
size the closest to the one specified by the user
Args:
start_ld_block (int): the start of the Linkage desiquilibrium block
end_ld_block (int): the end of the Linkage desiquilibrium block
window_size (int) : size in bp of the window to compute imputation_style
all unknown snps in the window will be imputed from the known snp in
the windows
return:
a tuple ( number of windows (int), size of the window resize to the ld block (int))
"""
Nwindows = ((int(end_ld_block)) - (int(start_ld_block)))//window_size
......@@ -83,7 +101,8 @@ def format_result_df(imp, ref_panel, known, unknowns):
"correct_inversion":imp["correct_inversion"],
"Nsnp_to_impute" : len(known)
}
column_order = ['pos','A0',"A1","Z","Var", "ld_score", "condition_number", "correct_inversion", "Nsnp_to_impute"]
column_order = ['pos','A0',"A1","Z","Var", "ld_score", "condition_number",
"correct_inversion", "Nsnp_to_impute"]
batch_df = pd.DataFrame(result_dict, columns = column_order)
return batch_df
......@@ -95,23 +114,28 @@ def print_progression(i, Nwindows):
if i%(np.ceil(Nwindows/10)) == 0:
print("{0}\%".format(np.round(i/Nwindows,3)))
def empty_imputed_dataframe():
column_order = ['pos','A0',"A1","Z","Var", "ld_score", "condition_number",
"correct_inversion", "Nsnp_to_impute"]
zscore_results = pd.DataFrame(columns = column_order)
return zscore_results
def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, lamb,
rcond, file_output, unknowns=pd.Series([])):
rcond, unknowns=pd.Series([])):
"""
Each missing Snp is imputed by known snp found in a window centered on the SNP to impute
Argument
Argument.
Args:
ld_file (str): Linkage desiquilibrium matrix files
ref_panel (pd.dataframe): the dataframe containing reference panel
snps
"""
(chrom, start_ld_block, end_ld_block) = parse_region_position(ld_file)
LD_mat = generate_sparse_matrix(ld_file, ref_panel)
Nwindows, window_resize = compute_window_and_size(start_ld_block, end_ld_block, window_size)
all_unknowns = ref_panel.loc[ref_panel.index.difference(zscore.index)]
zscore = prepare_zscore_for_imputation(ref_panel, zscore)
zscore.to_csv(z_fo, sep='\t')
zscore_results = zscore.copy(deep=True)
# the dataframe containing results
zscore_results = empty_imputed_dataframe()
print("### Imputation of {0} snps ###".format(all_unknowns.shape[0]))
......@@ -145,7 +169,6 @@ def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, lamb,
# keep only SNP with non negligible explained variance
snp_well_predicted = (batch_df.Var < 0.9)
batch_df_filt = batch_df.loc[in_core_window & snp_well_predicted, zscore_results.columns]
batch_df_filt.to_csv(z_fo, sep='\t', mode = 'a')
zscore_results = pd.concat([zscore_results, batch_df_filt])
i = i+1
print_progression(i, Nwindows)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment