diff --git a/impute_jass/impute_jass/__main__.py b/impute_jass/impute_jass/__main__.py index 54007291b613545e1b24be1c92651ae97c08e119..5ead09607e96b25772218c25e600e47d5b618f30 100644 --- a/impute_jass/impute_jass/__main__.py +++ b/impute_jass/impute_jass/__main__.py @@ -8,17 +8,21 @@ def launch_chromosome_imputation(args): Function whose only purpose is to allow the calling of the ImputationLauncher.chromosome_imputation method from an entry point """ - print(args) + print("Imputation of {0} gwas for chromosome {1}".format(args.gwas, args.chrom)) imputer = ImputationLauncher( window_size=int(args.window_size), buf=int(args.buffer_size), lamb= float(args.l2_regularization), pinv_rcond = float(args.eigen_treshold)) z_file = "{0}/z_{1}_{2}.txt".format(args.zscore_folder, args.gwas, args.chrom) - zscore = pd.read_csv(z_file) - + zscore = pd.read_csv(z_file,index_col=0, sep="\t") ref_panel_file = args.ref_folder + "/"+ args.chrom +".eur.1pct.bim" ref_panel = pd.read_csv(ref_panel_file, sep="\t", names=['chr', "nothing", 'pos', 'Ref_all', 'alt_all'], index_col = 1) + - imputer.chromosome_imputation(args.chrom, zscore, ref_panel, args.ld_folder) + imputed_zscore = imputer.chromosome_imputation(args.chrom, zscore, ref_panel, args.ld_folder) + print("Imputation DONE") + z_fo = "{0}/z_{1}_{2}.txt".format(args.output_folder, args.gwas, args.chrom) + imputed_zscore.to_csv(z_fo, sep='\t') + print("Save imputation done ") def add_chromosome_imputation_argument(parser): parser.add_argument('--chrom', required=True, help= "chromosome to impute to the chr\d+ format") diff --git a/impute_jass/impute_jass/stat_models.py b/impute_jass/impute_jass/stat_models.py index a5c5343e3bd920ece9aa1cd8f3494881772d62b6..8234672c44122bc81895c3dc91d975300f79cb6a 100644 --- a/impute_jass/impute_jass/stat_models.py +++ b/impute_jass/impute_jass/stat_models.py @@ -3,6 +3,7 @@ function for SNP imputation """ import numpy as np import scipy as sc +import scipy.linalg def compute_mu(sig_i_t, sig_t_inv, zt): return np.dot(sig_i_t, np.dot(sig_t_inv, zt)) @@ -28,7 +29,7 @@ def impg_model(zt, sig_t, sig_i_t, lamb=0.01, rcond=0.01, batch=True): """ sig_t = sig_t.values np.fill_diagonal(sig_t, (1+lamb)) - sig_t_inv = sc.linalg.pinv(sig_t, rcond=rcond) + sig_t_inv = scipy.linalg.pinv(sig_t, rcond=rcond) if batch: condition_number = np.array([np.linalg.cond(sig_t)]*sig_i_t.shape[0]) diff --git a/impute_jass/impute_jass/windows.py b/impute_jass/impute_jass/windows.py index ca8cdd299d245d520f7471eca048b101d7f3c046..8607a6797a7184302d5810824134548b8c9157b9 100644 --- a/impute_jass/impute_jass/windows.py +++ b/impute_jass/impute_jass/windows.py @@ -90,14 +90,12 @@ def ld_region_centered_window_imputation(ld_file, ref_panel, zscore, window_size return zscore.sort_values(by="pos") -def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, lamb, rcond, unknowns=pd.Series([])): + +def compute_window_and_size(start_ld_block, end_ld_block, window_size): """ - Each missing Snp is imputed by known snp found in a window centered on the SNP to impute - Argument + Compute the number of window to pave the ld_block tightly """ - (chrom, start_ld_block, end_ld_block) = parse_region_position(ld_file) - LD_mat = generate_sparse_matrix(ld_file, ref_panel) Nwindows = ((int(end_ld_block)) - (int(start_ld_block)))//window_size # adapt window size to cover the LD block if Nwindows > 0: @@ -105,9 +103,49 @@ def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, lamb, else: Nwindows = 1 window_resize = np.ceil(int(end_ld_block) - (int(start_ld_block))) + + return Nwindows, window_resize + +def format_result_df(imp, ref_panel, known, unknowns): + + result_dict = { + 'pos': ref_panel.loc[unknowns, 'pos'], + 'A0': ref_panel.loc[unknowns, "Ref_all"], + "A1": ref_panel.loc[unknowns,"alt_all"], + "Z" : imp['mu'], + "Var": imp["var"], + "ld_score" : imp["ld_score"], + "condition_number": imp['condition_number'], + "correct_inversion":imp["correct_inversion"], + "Nsnp_to_impute" : len(known) + } + column_order = ['pos','A0',"A1","Z","Var", "ld_score", "condition_number", "correct_inversion", "Nsnp_to_impute"] + batch_df = pd.DataFrame(result_dict, columns = column_order) + return batch_df + + +def print_progression(i, Nwindows): + """ + print the percentage of window treated in the ld block + """ + if i%(np.ceil(Nwindows/10)) == 0: + print("{0}\%".format(np.round(i/Nwindows,3))) + + + +def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, lamb, rcond, unknowns=pd.Series([])): + """ + Each missing Snp is imputed by known snp found in a window centered on the SNP to impute + Argument + """ + (chrom, start_ld_block, end_ld_block) = parse_region_position(ld_file) + + LD_mat = generate_sparse_matrix(ld_file, ref_panel) + + Nwindows, window_resize = compute_window_and_size(start_ld_block, end_ld_block, window_size) all_unknowns = ref_panel.loc[ref_panel.index.difference(zscore.index)] - #zscore = pd.read_csv(Zfile, index_col=0, sep="\t") + zscore = prepare_zscore_for_imputation(ref_panel, zscore) zscore_results = zscore.copy(deep=True) @@ -133,28 +171,15 @@ def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, lamb, if(len(known) > 0): imp = impg_model(zt, sig_t, sig_i_t, lamb=lamb, rcond=rcond, batch=True) - batch_df = pd.DataFrame({ - 'pos': ref_panel.loc[unknowns, 'pos'], - 'A0': ref_panel.loc[unknowns, "Ref_all"], - "A1": ref_panel.loc[unknowns,"alt_all"], - "ws_start":start_windows, - "ws_end":end_windows, - "Z" : imp['mu'], - "Var": imp["var"], - "ld_score" : imp["ld_score"], - "condition_number": imp['condition_number'], - "correct_inversion":imp["correct_inversion"], - "Nsnp_to_impute" : len(known) - }) - # keep only snp in the core window - start_windows = int(start_ld_block) + i*window_resize - end_windows = int(start_ld_block) + (i+1)*window_resize - in_core_window = in_region(batch_df.pos, start_windows, end_windows) - zscore_results = pd.concat([zscore_results, batch_df.loc[in_core_window]]) + batch_df = format_result_df(imp, ref_panel, known, unknowns): + # keep only snp in the core window (and not in the buffer) + start_core_window = int(start_ld_block) + i*window_resize + end_core_window = int(start_ld_block) + (i+1)*window_resize + in_core_window = in_region(batch_df.pos, start_core_window, end_core_window) + zscore_results = pd.concat([zscore_results, batch_df.loc[in_core_window]]) i = i+1 - if i%10 == 0: - print("{0}\%".format(np.round(i/Nwindows,4))) + print_progression(i, Nwindows) return zscore_results.sort_values(by="pos")