Commit 8b3491c6 authored by Hanna JULIENNE
Browse files

wrote centered window function

parent 36cb79b4
function for SNP imputation
import numpy as np
......@@ -5,7 +5,10 @@ implement the imputation window is sliding along the genome:
- centered_window: A sliding window centered on the Snp to impute
from .stat_models import ImpG_model_batch, ImpG_model_snp
from .ld_matrix import generate_sparse_matrix
import pandas as pd
def parse_region_position(LD_file):
......@@ -17,9 +20,60 @@ def parse_region_position(LD_file):
(chrom, startpos, endpos ) = LD_file.split("/")[-1].split(".")[0].split('_')
return (chrom, startpos, endpos)
def centered_window_imputation(LD_file, ref_panel_folder, Zfile):
def realigned_zfiles_on_panel(ref_panel, Zscores):
    """
    Check if the counted allele is the same in the reference panel and
    the Zscore files.

    If not, the coded and other allele are inverted and the Zscores sign
    is inverted also.

    Args:
        ref_panel (pd.DataFrame): reference panel indexed by SNP id; the
            columns 'Ref_all' and 'alt_all' are read. Every id of
            ``Zscores.index`` must be present in its index, otherwise the
            ``.loc`` lookup raises ``KeyError``.
        Zscores (pd.DataFrame): table indexed by SNP id with the columns
            'A0', 'A1' and 'Z'. It is modified in place.

    Returns:
        pd.DataFrame: the same ``Zscores`` object, after realignment.
    """
    # Panel reference allele of each SNP, in the row order of Zscores.
    panel_ref_allele = ref_panel.loc[Zscores.index, 'Ref_all']
    allele_inverted = (panel_ref_allele != Zscores.A0)

    # The right-hand sides are whole panel columns: pandas aligns them on the
    # SNP id, so each flagged row receives the value of its own SNP.
    # NOTE(review): for a SNP whose A0/A1 are exactly the panel alleles in
    # swapped order, these two assignments leave the labels as they were
    # (A0 already equals alt_all) while Z is negated. Confirm this is the
    # intended convention.
    Zscores.loc[allele_inverted, "A0"] = ref_panel.alt_all
    Zscores.loc[allele_inverted, "A1"] = ref_panel.Ref_all
    Zscores.loc[allele_inverted, "Z"] = - Zscores.loc[allele_inverted, "Z"]
    return Zscores
def centered_window_imputation(LD_file, ref_panel_folder, Zfile, window_size):
    """
    Each missing Snp is imputed by known snp found in a window centered on
    the SNP to impute.

    Args:
        LD_file (str): path of the LD file. The chromosome is taken from its
            file name through ``parse_region_position``.
        ref_panel_folder (str): currently not used, see the note in the body.
        Zfile (str): tab-separated Z-score file whose first column is the
            index. The columns 'pos', 'A0', 'A1' and 'Z' are read.
        window_size: half-width of the window, in the same unit as the 'pos'
            columns. Typed SNPs strictly between ``pos - window_size`` and
            ``pos + window_size`` are used.

    Returns:
        pd.DataFrame: the Z-score table, with one added row per imputed SNP
        ('pos', 'Z' and 'Var' filled in), sorted by 'pos'. Typed SNPs have
        'Var' set to 1.
    """
    chrom = parse_region_position(LD_file)[0]

    # NOTE(review): ref_panel_folder is ignored and the panel location is
    # hard-coded. Kept as is so existing callers are unaffected; confirm
    # whether the argument should replace this path.
    ref_panel_file = "/mnt/atlas/PCMA/1._DATA/ImpG_refpanel/{0}.eur.1pct.bim".format(chrom)
    ref_panel = pd.read_csv(
        ref_panel_file,
        sep="\t",
        names=['chr', "nothing", 'pos', 'Ref_all', 'alt_all'],
        index_col=1,
    )
    LD_mat = generate_sparse_matrix(LD_file, ref_panel)

    Zscores = pd.read_csv(Zfile, index_col=0, sep="\t")
    Zscores = realigned_zfiles_on_panel(ref_panel, Zscores)
    Zscores['Var'] = 1

    # dispatch snp between typed and untyped
    unknowns = LD_mat.index.difference(Zscores.index)
    print("### Imputation of {0} snps ###".format(len(unknowns)))

    # Snapshot of the typed positions: rows are appended to Zscores inside
    # the loop, and an imputed SNP must not serve as a predictor for the
    # SNPs imputed after it.
    typed_pos = Zscores['pos'].copy()

    for snp_unknown in unknowns:
        # Boundary of the centered_window
        center = ref_panel.loc[snp_unknown, 'pos']
        start_ld_block = center - window_size
        end_ld_block = center + window_size

        in_window = (start_ld_block < typed_pos) & (typed_pos < end_ld_block)
        known = typed_pos[in_window].index

        Sig_t = LD_mat.loc[known, known]
        Sig_i_t = LD_mat.loc[snp_unknown, known]
        Zt = Zscores.loc[known, 'Z']
        imp = ImpG_model_snp(Zt, Sig_t, Sig_i_t)

        # Results go to the full table (not to the Zt slice), together with
        # the panel position so that the final sort places the imputed SNP.
        Zscores.loc[snp_unknown, "pos"] = center
        Zscores.loc[snp_unknown, "Z"] = imp['mu']
        Zscores.loc[snp_unknown, "Var"] = imp['Var']

    return Zscores.sort_values(by="pos")
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment