Commit 8c2374aa authored by Hanna  JULIENNE's avatar Hanna JULIENNE
Browse files

refactoring windows.py a little

parent 12959804
...@@ -8,17 +8,21 @@ def launch_chromosome_imputation(args): ...@@ -8,17 +8,21 @@ def launch_chromosome_imputation(args):
Function whose only purpose is to allow the calling of the ImputationLauncher.chromosome_imputation method Function whose only purpose is to allow the calling of the ImputationLauncher.chromosome_imputation method
from an entry point from an entry point
""" """
print(args) print("Imputation of {0} gwas for chromosome {1}".format(args.gwas, args.chrom))
imputer = ImputationLauncher( window_size=int(args.window_size), buf=int(args.buffer_size), imputer = ImputationLauncher( window_size=int(args.window_size), buf=int(args.buffer_size),
lamb= float(args.l2_regularization), pinv_rcond = float(args.eigen_treshold)) lamb= float(args.l2_regularization), pinv_rcond = float(args.eigen_treshold))
z_file = "{0}/z_{1}_{2}.txt".format(args.zscore_folder, args.gwas, args.chrom) z_file = "{0}/z_{1}_{2}.txt".format(args.zscore_folder, args.gwas, args.chrom)
zscore = pd.read_csv(z_file) zscore = pd.read_csv(z_file,index_col=0, sep="\t")
ref_panel_file = args.ref_folder + "/"+ args.chrom +".eur.1pct.bim" ref_panel_file = args.ref_folder + "/"+ args.chrom +".eur.1pct.bim"
ref_panel = pd.read_csv(ref_panel_file, sep="\t", names=['chr', "nothing", 'pos', 'Ref_all', 'alt_all'], index_col = 1) ref_panel = pd.read_csv(ref_panel_file, sep="\t", names=['chr', "nothing", 'pos', 'Ref_all', 'alt_all'], index_col = 1)
imputer.chromosome_imputation(args.chrom, zscore, ref_panel, args.ld_folder) imputed_zscore = imputer.chromosome_imputation(args.chrom, zscore, ref_panel, args.ld_folder)
print("Imputation DONE")
z_fo = "{0}/z_{1}_{2}.txt".format(args.output_folder, args.gwas, args.chrom)
imputed_zscore.to_csv(z_fo, sep='\t')
print("Save imputation done ")
def add_chromosome_imputation_argument(parser): def add_chromosome_imputation_argument(parser):
parser.add_argument('--chrom', required=True, help= "chromosome to impute to the chr\d+ format") parser.add_argument('--chrom', required=True, help= "chromosome to impute to the chr\d+ format")
......
...@@ -3,6 +3,7 @@ function for SNP imputation ...@@ -3,6 +3,7 @@ function for SNP imputation
""" """
import numpy as np import numpy as np
import scipy as sc import scipy as sc
import scipy.linalg
def compute_mu(sig_i_t, sig_t_inv, zt): def compute_mu(sig_i_t, sig_t_inv, zt):
return np.dot(sig_i_t, np.dot(sig_t_inv, zt)) return np.dot(sig_i_t, np.dot(sig_t_inv, zt))
...@@ -28,7 +29,7 @@ def impg_model(zt, sig_t, sig_i_t, lamb=0.01, rcond=0.01, batch=True): ...@@ -28,7 +29,7 @@ def impg_model(zt, sig_t, sig_i_t, lamb=0.01, rcond=0.01, batch=True):
""" """
sig_t = sig_t.values sig_t = sig_t.values
np.fill_diagonal(sig_t, (1+lamb)) np.fill_diagonal(sig_t, (1+lamb))
sig_t_inv = sc.linalg.pinv(sig_t, rcond=rcond) sig_t_inv = scipy.linalg.pinv(sig_t, rcond=rcond)
if batch: if batch:
condition_number = np.array([np.linalg.cond(sig_t)]*sig_i_t.shape[0]) condition_number = np.array([np.linalg.cond(sig_t)]*sig_i_t.shape[0])
......
...@@ -90,14 +90,12 @@ def ld_region_centered_window_imputation(ld_file, ref_panel, zscore, window_size ...@@ -90,14 +90,12 @@ def ld_region_centered_window_imputation(ld_file, ref_panel, zscore, window_size
return zscore.sort_values(by="pos") return zscore.sort_values(by="pos")
def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, lamb, rcond, unknowns=pd.Series([])):
def compute_window_and_size(start_ld_block, end_ld_block, window_size):
""" """
Each missing Snp is imputed by known snp found in a window centered on the SNP to impute Compute the number of window to pave the ld_block tightly
Argument
""" """
(chrom, start_ld_block, end_ld_block) = parse_region_position(ld_file)
LD_mat = generate_sparse_matrix(ld_file, ref_panel)
Nwindows = ((int(end_ld_block)) - (int(start_ld_block)))//window_size Nwindows = ((int(end_ld_block)) - (int(start_ld_block)))//window_size
# adapt window size to cover the LD block # adapt window size to cover the LD block
if Nwindows > 0: if Nwindows > 0:
...@@ -105,9 +103,49 @@ def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, lamb, ...@@ -105,9 +103,49 @@ def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, lamb,
else: else:
Nwindows = 1 Nwindows = 1
window_resize = np.ceil(int(end_ld_block) - (int(start_ld_block))) window_resize = np.ceil(int(end_ld_block) - (int(start_ld_block)))
return Nwindows, window_resize
def format_result_df(imp, ref_panel, known, unknowns):
result_dict = {
'pos': ref_panel.loc[unknowns, 'pos'],
'A0': ref_panel.loc[unknowns, "Ref_all"],
"A1": ref_panel.loc[unknowns,"alt_all"],
"Z" : imp['mu'],
"Var": imp["var"],
"ld_score" : imp["ld_score"],
"condition_number": imp['condition_number'],
"correct_inversion":imp["correct_inversion"],
"Nsnp_to_impute" : len(known)
}
column_order = ['pos','A0',"A1","Z","Var", "ld_score", "condition_number", "correct_inversion", "Nsnp_to_impute"]
batch_df = pd.DataFrame(result_dict, columns = column_order)
return batch_df
def print_progression(i, Nwindows):
"""
print the percentage of window treated in the ld block
"""
if i%(np.ceil(Nwindows/10)) == 0:
print("{0}\%".format(np.round(i/Nwindows,3)))
def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, lamb, rcond, unknowns=pd.Series([])):
"""
Each missing Snp is imputed by known snp found in a window centered on the SNP to impute
Argument
"""
(chrom, start_ld_block, end_ld_block) = parse_region_position(ld_file)
LD_mat = generate_sparse_matrix(ld_file, ref_panel)
Nwindows, window_resize = compute_window_and_size(start_ld_block, end_ld_block, window_size)
all_unknowns = ref_panel.loc[ref_panel.index.difference(zscore.index)] all_unknowns = ref_panel.loc[ref_panel.index.difference(zscore.index)]
#zscore = pd.read_csv(Zfile, index_col=0, sep="\t")
zscore = prepare_zscore_for_imputation(ref_panel, zscore) zscore = prepare_zscore_for_imputation(ref_panel, zscore)
zscore_results = zscore.copy(deep=True) zscore_results = zscore.copy(deep=True)
...@@ -133,28 +171,15 @@ def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, lamb, ...@@ -133,28 +171,15 @@ def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, lamb,
if(len(known) > 0): if(len(known) > 0):
imp = impg_model(zt, sig_t, sig_i_t, lamb=lamb, rcond=rcond, batch=True) imp = impg_model(zt, sig_t, sig_i_t, lamb=lamb, rcond=rcond, batch=True)
batch_df = pd.DataFrame({ batch_df = format_result_df(imp, ref_panel, known, unknowns):
'pos': ref_panel.loc[unknowns, 'pos'],
'A0': ref_panel.loc[unknowns, "Ref_all"],
"A1": ref_panel.loc[unknowns,"alt_all"],
"ws_start":start_windows,
"ws_end":end_windows,
"Z" : imp['mu'],
"Var": imp["var"],
"ld_score" : imp["ld_score"],
"condition_number": imp['condition_number'],
"correct_inversion":imp["correct_inversion"],
"Nsnp_to_impute" : len(known)
})
# keep only snp in the core window
start_windows = int(start_ld_block) + i*window_resize
end_windows = int(start_ld_block) + (i+1)*window_resize
in_core_window = in_region(batch_df.pos, start_windows, end_windows)
zscore_results = pd.concat([zscore_results, batch_df.loc[in_core_window]])
# keep only snp in the core window (and not in the buffer)
start_core_window = int(start_ld_block) + i*window_resize
end_core_window = int(start_ld_block) + (i+1)*window_resize
in_core_window = in_region(batch_df.pos, start_core_window, end_core_window)
zscore_results = pd.concat([zscore_results, batch_df.loc[in_core_window]])
i = i+1 i = i+1
if i%10 == 0: print_progression(i, Nwindows)
print("{0}\%".format(np.round(i/Nwindows,4)))
return zscore_results.sort_values(by="pos") return zscore_results.sort_values(by="pos")
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment