diff --git a/impute_jass/impute_jass/imputation_launcher.py b/impute_jass/impute_jass/imputation_launcher.py index 2d7d1ecba5f4223fb7ff3c037915c7d4cc226bfc..3f3ce8177a9ac1970f07423e549e61a7b0706e71 100644 --- a/impute_jass/impute_jass/imputation_launcher.py +++ b/impute_jass/impute_jass/imputation_launcher.py @@ -4,7 +4,7 @@ on the genome """ import glob import pandas as pd -from .windows import ld_region_centered_window_imputation, impg_like_imputation, realigned_zfiles_on_panel +from .windows import ld_region_centered_window_imputation, prepare_zscore_for_imputation, impg_like_imputation, realigned_zfiles_on_panel @@ -13,7 +13,8 @@ class ImputationLauncher(object): Class perform imputation of snp from summary statistic """ - def __init__(self, window_size=10000, imputation_style="batch", buf=2500, lamb= 0.01, pinv_rcond = 0.01 ): + def __init__(self, window_size=10000, imputation_style="batch", buf=2500, + lamb= 0.01, pinv_rcond = 0.01 ): """ Initialise the imputation object. Fix the windows size, the buffer size and the king of imputation employed @@ -29,7 +30,7 @@ class ImputationLauncher(object): matrices to make it less singular pinv_rcond (float): the rcond scipy.linalg.pinv function argument. The scipy.linalg.pinv is used to invert - the correlationmatrices + the correlation matrices """ self.imputation_style = imputation_style self.window_size = window_size @@ -37,6 +38,7 @@ class ImputationLauncher(object): self.lamb = lamb self.rcond = pinv_rcond + def chromosome_imputation(self, chrom, zscore, ref_panel, ld_folder): """ Impute the panel zscore score for one chromosome and with the specified @@ -51,19 +53,27 @@ class ImputationLauncher(object): Imputed zscore dataframe """ pattern = "{0}/{1}_*.ld".format(ld_folder, chrom) + zscore = prepare_zscore_for_imputation(ref_panel, zscore) + zscore_results = zscore.copy(deep=True) if self.imputation_style == "online": def imputer(ld_file): - return ld_region_centered_window_imputation(ld_file, ref_panel, zscore, self.window_size) + return ld_region_centered_window_imputation(ld_file, ref_panel, + zscore, + self.window_size) elif self.imputation_style == "batch": def imputer(ld_file): - return impg_like_imputation(ld_file, ref_panel, zscore, self.window_size, self.buffer, self.lamb, self.rcond) + return impg_like_imputation(ld_file, ref_panel, zscore, + self.window_size, self.buffer, + self.lamb, self.rcond) for ld_file in glob.glob(pattern): print("processing Region: {0}".format(ld_file)) - zscore = imputer(ld_file) + ld_batch = imputer(ld_file) + zscore_results = pd.concat([zscore_results, ld_batch]) - zscore = realigned_zfiles_on_panel(ref_panel, zscore) - return zscore + zscore_results.sort_values(by="pos") + zscore_results = realigned_zfiles_on_panel(ref_panel, zscore_results) + return zscore_results def genome_imputation(self, gwas_tag, ref_panel_folder, ld_folder, zscore_folder, folder_output): diff --git a/impute_jass/impute_jass/stat_models.py b/impute_jass/impute_jass/stat_models.py index 7c520eb9011032737f926bdc2d14b11e46e98869..16cca0c924350774af00c23588ac0f5abd519191 100644 --- a/impute_jass/impute_jass/stat_models.py +++ b/impute_jass/impute_jass/stat_models.py @@ -29,7 +29,7 @@ def compute_mu(sig_i_t, sig_t_inv, zt): zt (np.array?): Zscores of known snp Returns: mu_i (np.array): a vector of length i containing the estimate of zscore - + """ return np.dot(sig_i_t, np.dot(sig_t_inv, zt)) @@ -101,5 +101,4 @@ def impg_model(zt, sig_t, sig_i_t, lamb=0.01, rcond=0.01, batch=True): var_norm = var_in_boundaries(var, lamb) R2 = ((1+lamb)-var_norm) mu = mu / np.sqrt(R2) - return({"var" : var, "mu" : mu, "ld_score" : ld_score, "condition_number" : condition_number, "correct_inversion":correct_inversion }) diff --git a/impute_jass/impute_jass/windows.py b/impute_jass/impute_jass/windows.py index 4ecf1041aa0478222112cac1c45278b3b59366ef..3f30ae026d6821900efa4fb1f56b4d0447da864e 100644 --- a/impute_jass/impute_jass/windows.py +++ b/impute_jass/impute_jass/windows.py @@ -27,7 +27,9 @@ def realigned_zfiles_on_panel(ref_panel, zscore): If not, the coded and other allele are inverted and the zscore sign is inverted also. - + Args: + - ref_panel (pd.dataframe) : snp of reference on the imputed chromosome + - zscore (pd.dataframe): """ sub_ref_panel = ref_panel.loc[zscore.index] allele_inverted = (sub_ref_panel['Ref_all'] != zscore.A0) @@ -40,14 +42,20 @@ def realigned_zfiles_on_panel(ref_panel, zscore): def prepare_zscore_for_imputation(ref_panel, zscore): """ - Prepare the known Z score by realigning them on the reference ref_panel - the snps that are not present in the ref panel are filtered - + Prepare the known Z score by + - realigning them on the reference ref_panel + - filtering snps that are not present in the ref panel + - Adding columns that will contain information on imputation: + * Var : theoritical variance estimate of z + * Nsnp_to_impute : Number of known snp + * ld_score : the sum of the square correlation of the snp with all other + known snp (give an idea if the we have enough information to compute a + precise z estimate) """ zscore = realigned_zfiles_on_panel(ref_panel, zscore) - zscore['Var'] = -1 + zscore['Var'] = -1.0 zscore['Nsnp_to_impute'] = -1 - zscore['ld_score'] = -1 + zscore['ld_score'] = -1.0 zscore = zscore.loc[zscore.index.intersection(ref_panel.index)] return zscore @@ -57,7 +65,17 @@ def in_region(pos_vector, start, end): def compute_window_and_size(start_ld_block, end_ld_block, window_size): """ - Compute the number of window to pave the ld_block tightly + Compute the number of window to pave the Linkage desiquilibrium block + tightly with a window + size the closest to the one specified by the user + Args: + start_ld_block (int): the start of the Linkage desiquilibrium block + end_ld_block (int): the end of the Linkage desiquilibrium block + window_size (int) : size in bp of the window to compute imputation_style + all unknown snps in the window will be imputed from the known snp in + the windows + return: + a tuple ( number of windows (int), size of the window resize to the ld block (int)) """ Nwindows = ((int(end_ld_block)) - (int(start_ld_block)))//window_size @@ -83,7 +101,8 @@ def format_result_df(imp, ref_panel, known, unknowns): "correct_inversion":imp["correct_inversion"], "Nsnp_to_impute" : len(known) } - column_order = ['pos','A0',"A1","Z","Var", "ld_score", "condition_number", "correct_inversion", "Nsnp_to_impute"] + column_order = ['pos','A0',"A1","Z","Var", "ld_score", "condition_number", + "correct_inversion", "Nsnp_to_impute"] batch_df = pd.DataFrame(result_dict, columns = column_order) return batch_df @@ -95,23 +114,28 @@ def print_progression(i, Nwindows): if i%(np.ceil(Nwindows/10)) == 0: print("{0}\%".format(np.round(i/Nwindows,3))) +def empty_imputed_dataframe(): + column_order = ['pos','A0',"A1","Z","Var", "ld_score", "condition_number", + "correct_inversion", "Nsnp_to_impute"] + zscore_results = pd.DataFrame(columns = column_order) + return zscore_results def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, lamb, - rcond, file_output, unknowns=pd.Series([])): + rcond, unknowns=pd.Series([])): """ Each missing Snp is imputed by known snp found in a window centered on the SNP to impute - Argument + Argument. + Args: + ld_file (str): Linkage desiquilibrium matrix files + ref_panel (pd.dataframe): the dataframe containing reference panel + snps """ (chrom, start_ld_block, end_ld_block) = parse_region_position(ld_file) - LD_mat = generate_sparse_matrix(ld_file, ref_panel) - Nwindows, window_resize = compute_window_and_size(start_ld_block, end_ld_block, window_size) - all_unknowns = ref_panel.loc[ref_panel.index.difference(zscore.index)] - zscore = prepare_zscore_for_imputation(ref_panel, zscore) - zscore.to_csv(z_fo, sep='\t') - zscore_results = zscore.copy(deep=True) + # the dataframe containing results + zscore_results = empty_imputed_dataframe() print("### Imputation of {0} snps ###".format(all_unknowns.shape[0])) @@ -145,7 +169,6 @@ def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, lamb, # keep only SNP with non negligible explained variance snp_well_predicted = (batch_df.Var < 0.9) batch_df_filt = batch_df.loc[in_core_window & snp_well_predicted, zscore_results.columns] - batch_df_filt.to_csv(z_fo, sep='\t', mode = 'a') zscore_results = pd.concat([zscore_results, batch_df_filt]) i = i+1 print_progression(i, Nwindows)