diff --git a/impute_jass/impute_jass/stat_models.py b/impute_jass/impute_jass/stat_models.py
index bfc6138d6680435f484219439b2b9b7bef43c9c2..918e1a2699d513f5a252767e715356e8eb5931a3 100644
--- a/impute_jass/impute_jass/stat_models.py
+++ b/impute_jass/impute_jass/stat_models.py
@@ -1,6 +1,5 @@
 """
 function for SNP imputation
-
 """
 
 import numpy as np
diff --git a/impute_jass/impute_jass/windows.py b/impute_jass/impute_jass/windows.py
index 946e3f233ae981afe5b0701720e753f25f304b74..4f6bc80507019a5e42e386c42eb94b1f82019c92 100644
--- a/impute_jass/impute_jass/windows.py
+++ b/impute_jass/impute_jass/windows.py
@@ -5,7 +5,10 @@ implement the imputation window is sliding along the genome:
 
 - centered_window: A sliding window centered on the Snp to impute
 """
+from .stat_models import ImpG_model_batch, ImpG_model_snp
+from .ld_matrix import generate_sparse_matrix
 
+import pandas as pd
 
 def parse_region_position(LD_file):
     """
@@ -17,8 +20,100 @@ def parse_region_position(LD_file):
     (chrom, startpos, endpos ) = LD_file.split("/")[-1].split(".")[0].split('_')
     return (chrom, startpos, endpos)
 
-def centered_window_imputation(LD_file, ref_panel_folder, Zfile):
+
+def realigned_zfiles_on_panel(ref_panel, Zscores):
     """
-    Each missing Snp is imputed by known snp found in a window centered on the SNP to impute
+    Align the counted allele of the Zscores on the reference panel.
+
+    For every SNP whose counted allele (A0) differs from the reference
+    allele of the panel, the coded and other alleles are swapped and the
+    sign of the Zscore is inverted.
+
+    Args:
+        ref_panel: table indexed by SNP identifier, with the columns
+            'Ref_all' and 'alt_all'
+        Zscores: table indexed by SNP identifier, with the columns
+            'A0', 'A1' and 'Z'; it is modified in place
+
+    Returns:
+        the realigned Zscores table (same object as the argument)
+    """
+    # only the SNPs described in the panel can be compared to it
+    shared = Zscores.index.intersection(ref_panel.index)
+    mismatch = ref_panel.loc[shared, 'Ref_all'] != Zscores.loc[shared, 'A0']
+    flipped = mismatch[mismatch].index
+
+    # after the swap the counted allele is the panel reference allele
+    Zscores.loc[flipped, "A0"] = ref_panel.loc[flipped, "Ref_all"]
+    Zscores.loc[flipped, "A1"] = ref_panel.loc[flipped, "alt_all"]
+    Zscores.loc[flipped, "Z"] = - Zscores.loc[flipped, "Z"]
+    return Zscores
+
+
+def centered_window_imputation(LD_file, ref_panel_folder, Zfile, window_size):
+    """
+    Each missing Snp is imputed by known snp found in a window centered on the SNP to impute
+
+    Args:
+        LD_file: path of the LD file; the chromosome is read from its
+            name (see parse_region_position)
+        ref_panel_folder: folder holding the '<chrom>.eur.1pct.bim'
+            reference panel files
+        Zfile: tab separated file of the typed SNPs, indexed by its
+            first column; expected columns are 'pos', 'A0', 'A1', 'Z'
+        window_size: half width of the window, in the unit of the
+            'pos' columns
+
+    Returns:
+        the typed and the imputed SNPs sorted by position; 'Var' is 1
+        for the typed SNPs and imp['Var'] for the imputed ones
     """
-    pass
+    (chrom, startpos, endpos) = parse_region_position(LD_file)
+
+    ref_panel_file = "{0}/{1}.eur.1pct.bim".format(ref_panel_folder, chrom)
+    print(ref_panel_file)
+    ref_panel = pd.read_csv(ref_panel_file, sep="\t", names=['chr', "nothing", 'pos', 'Ref_all', 'alt_all'], index_col = 1)
+
+    LD_mat = generate_sparse_matrix(LD_file, ref_panel)
+
+    Zscores = pd.read_csv(Zfile, index_col=0, sep="\t")
+
+    Zscores = realigned_zfiles_on_panel(ref_panel, Zscores)
+    Zscores['Var'] = 1
+
+    # dispatch snp between typed and untyped
+    unknowns = LD_mat.index.difference(Zscores.index)
+
+    print("### Imputation of {0} snps ###".format(len(unknowns)))
+
+    # imputed values are stored apart so that every window only relies
+    # on typed snps
+    imputed = {}
+    for snp_unknown in unknowns:
+        # Boundary of the centered_window
+        snp_pos = ref_panel.loc[snp_unknown, 'pos']
+        start_ld_block = snp_pos - window_size
+        end_ld_block = snp_pos + window_size
+
+        in_window = (start_ld_block < Zscores.pos) & (Zscores.pos < end_ld_block)
+        # a typed snp is usable only if the LD matrix describes it
+        known = Zscores.index[in_window].intersection(LD_mat.index)
+        if len(known) == 0:
+            # no typed snp in the window: nothing to impute from
+            continue
+
+        Sig_t = LD_mat.loc[known, known]
+        Sig_i_t = LD_mat.loc[snp_unknown, known]
+        Zt = Zscores.loc[known, 'Z']
+
+        imp = ImpG_model_snp(Zt, Sig_t, Sig_i_t)
+        # the imputed Z is relative to the panel alleles
+        imputed[snp_unknown] = {"pos": snp_pos,
+                                "A0": ref_panel.loc[snp_unknown, 'Ref_all'],
+                                "A1": ref_panel.loc[snp_unknown, 'alt_all'],
+                                "Z": imp['mu'],
+                                "Var": imp['Var']}
+
+    if imputed:
+        Zscores = pd.concat([Zscores, pd.DataFrame.from_dict(imputed, orient="index")])
+    return Zscores.sort_values(by="pos")