diff --git a/impute_jass/impute_jass/imputation_launcher.py b/impute_jass/impute_jass/imputation_launcher.py index d1759d99d0787a5d1406fa727bc71691a3961eaf..85a250bb394bb2bab3e39db37c7553807b435078 100644 --- a/impute_jass/impute_jass/imputation_launcher.py +++ b/impute_jass/impute_jass/imputation_launcher.py @@ -1,5 +1,5 @@ """ -Function set to launch imputation on a complete chromosome or +Function set to launch SNP imputation on a complete chromosome or on the genome """ import glob @@ -13,17 +13,14 @@ class ImputationLauncher(object): Class perform imputation of snp from summary statistic """ - def __init__(self, window_size=10000, imputation_style="batch", buf=2500, + def __init__(self, window_size=10000, buf=2500, lamb= 0.01, pinv_rcond = 0.01 ): """ Initialise the imputation object. Fix the windows size, the buffer size - and the king of imputation employed + and the kind of imputation employed + Args: window_size (int): size of the imputation window in bp - imputation_style (str): define if the windows while span the genome - in a non overlapping fashion ("batch") or - by being centered on each snp to impute - ('online') buffer (int): the size of the padding around the windows of imputation (relevant only for batch imputation) lamb (float): size of the increment added to snp correlation @@ -32,7 +29,7 @@ class ImputationLauncher(object): The scipy.linalg.pinv is used to invert the correlation matrices """ - self.imputation_style = imputation_style + self.window_size = window_size self.buffer = buf self.lamb = lamb @@ -55,16 +52,11 @@ class ImputationLauncher(object): pattern = "{0}/{1}_*.ld".format(ld_folder, chrom) zscore = prepare_zscore_for_imputation(ref_panel, zscore) zscore_results = zscore.copy(deep=True) - if self.imputation_style == "online": - def imputer(ld_file): - return ld_region_centered_window_imputation(ld_file, ref_panel, - zscore, - self.window_size) - elif self.imputation_style == "batch": - def imputer(ld_file): - return 
impg_like_imputation(ld_file, ref_panel, zscore, - self.window_size, self.buffer, - self.lamb, self.rcond) + + def imputer(ld_file): + return impg_like_imputation(ld_file, ref_panel, zscore, + self.window_size, self.buffer, + self.lamb, self.rcond) for ld_file in glob.glob(pattern): print("processing Region: {0}".format(ld_file)) diff --git a/impute_jass/impute_jass/windows.py b/impute_jass/impute_jass/windows.py index 3f30ae026d6821900efa4fb1f56b4d0447da864e..8368711bc98fa6d8a3716fe95ca301fe9618ea4e 100644 --- a/impute_jass/impute_jass/windows.py +++ b/impute_jass/impute_jass/windows.py @@ -127,7 +127,7 @@ def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, lamb, Args: ld_file (str): Linkage desiquilibrium matrix files ref_panel (pd.dataframe): the dataframe containing reference panel - snps + snps """ (chrom, start_ld_block, end_ld_block) = parse_region_position(ld_file) LD_mat = generate_sparse_matrix(ld_file, ref_panel) @@ -174,42 +174,3 @@ def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, lamb, print_progression(i, Nwindows) return zscore_results.sort_values(by="pos") - - -def ld_region_centered_window_imputation(ld_file, ref_panel, zscore, window_size, unknowns=pd.Series([])): - """ - Each missing Snp is imputed by known snp found in a window centered on the SNP to impute - Argument - """ - (chrom, start_ld_block, end_ld_block) = parse_region_position(ld_file) - - LD_mat = generate_sparse_matrix(ld_file, ref_panel) - zscore = prepare_zscore_for_imputation(ref_panel, zscore) - - # Find Snp to impute - if len(unknowns) == 0: - unknowns = LD_mat.index.difference(zscore.index) - - N_snp = len(unknowns) - print("### Imputation of {0} snps ###".format(len(unknowns))) - - for i,snp_unknown in enumerate(unknowns): - # Boundary of the centered_window - start_pos = max((ref_panel.loc[snp_unknown,'pos'] - window_size), float(start_ld_block)) - end_pos = min(ref_panel.loc[snp_unknown,'pos'] + window_size, float(end_ld_block)) 
- - in_LD_reg_n_window = in_region(zscore.pos, start_pos, end_pos) - - known = zscore.loc[in_LD_reg_n_window].index - sig_t = LD_mat.loc[known, known] - sig_i_t = LD_mat.loc[snp_unknown, known] - zt = zscore.loc[known,'Z'] - - if(len(known) > 0): - imp = impg_model(zt, sig_t, sig_i_t, batch=False) - zscore.loc[snp_unknown] = [ref_panel.loc[snp_unknown, 'pos'], ref_panel.loc[snp_unknown, "Ref_all"], ref_panel.loc[snp_unknown, "alt_all"], imp['mu'], imp['var'], len(known)] - - if i%300 == 0: - print("{0}\%".format(np.round(i/N_snp,4))) - - return zscore.sort_values(by="pos")