windows.py 3.72 KB
Newer Older
1
2
3
4
"""
implement the imputation window is sliding along the genome:

- ImpG like: Non overlapping windows, the imputation is apply in batch to unknown snp in the window
5
- centered_window:  A sliding window centered on the Snp to impute
6
7

"""
8
9
from .stat_models import ImpG_model_batch, ImpG_model_snp
from .ld_matrix import generate_sparse_matrix
10

11
import pandas as pd
12
13
import numpy as np

14
15
16
17
18
19
20
21
22
23
24

def parse_region_position(LD_file):
    """
    Retrieve the region definition from a ld-file generated by impute_jass
    Argument :
        LD_file : A LD file generated by jass_impute

    """
    (chrom, startpos, endpos ) = LD_file.split("/")[-1].split(".")[0].split('_')
    return (chrom, startpos, endpos)

25
26

def realigned_zfiles_on_panel(ref_panel, Zscores):
27
    """
28
29
30
31
32
33
34
35
36
37
38
    Check if the counted allele is the same in the reference panel and
    the Zscore files.

    If not, the coded and other allele are inverted and the Zscores sign
    is inverted also.
    """
    allele_inverted = (ref_panel.loc[Zscores.index, 'Ref_all'] != Zscores.A0)

    Zscores.loc[allele_inverted, "A0"] = ref_panel.alt_all
    Zscores.loc[allele_inverted, "A1"] = ref_panel.Ref_all
    Zscores.loc[allele_inverted, "Z"] = - Zscores.loc[allele_inverted, "Z"]
39

40
41
    return Zscores

42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
def prepare_Zscore_for_imputation(ref_panel, Zscores):
    """
    Prepare the known Z score by realigning them on the reference ref_panel
    the snps that are not present in the ref panel are filtered
    """
    Zscores = realigned_zfiles_on_panel(ref_panel, Zscores)
    Zscores['Var'] = 1
    Zscores['Nsnp_to_impute'] = -1
    Zscores = Zscores.loc[Zscores.index.intersection(ref_panel.index)]
    return Zscores


def in_region(pos_vector, start, end):
    return ((start < pos_vector) & (pos_vector < end))

57
def centered_window_imputation(LD_file, ref_panel_folder, Zscores, window_size, unknowns=pd.Series([])):
58
59
60
    """
        Each missing Snp is imputed by known snp found in a window centered on the SNP to impute
        Argument
61
    """
62
    (chrom, start_ld_block, end_ld_block) = parse_region_position(LD_file)
63
64
65
66
67
68

    ref_panel_file = "/mnt/atlas/PCMA/1._DATA/ImpG_refpanel/{0}.eur.1pct.bim".format(chrom)
    ref_panel = pd.read_csv(ref_panel_file, sep="\t", names=['chr', "nothing", 'pos', 'Ref_all', 'alt_all'], index_col = 1)

    LD_mat = generate_sparse_matrix(LD_file, ref_panel)

69
    #Zscores = pd.read_csv(Zfile, index_col=0, sep="\t")
70
    Zscores = prepare_Zscore_for_imputation(ref_panel, Zscores)
71

72
    # Find Snp to impute
73
74
75
    if len(unknowns) == 0:
        unknowns = LD_mat.index.difference(Zscores.index)

76
    N_snp = len(unknowns)
77
    print("### Imputation of {0} snps ###".format(len(unknowns)))
78
    i = 0
79
80
81
82

    for snp_unknown in unknowns:
        # Boundary of the centered_window

83
84
85
86
87
88
89
        start_pos = ref_panel.loc[snp_unknown,'pos'] - window_size
        end_pos = ref_panel.loc[snp_unknown,'pos'] + window_size
        #print(snp_unknown, start_pos, end_pos, start_ld_block, end_ld_block)

        in_LD_reg_n_window = in_region(Zscores.pos, int(start_ld_block), int(end_ld_block)) & in_region(Zscores.pos, start_pos, end_pos)

        known = Zscores.loc[in_LD_reg_n_window].index
90
91
92
93
        Sig_t = LD_mat.loc[known, known]
        Sig_i_t = LD_mat.loc[snp_unknown, known]
        Zt = Zscores.loc[known,'Z']

94
95
        if(len(known) > 0):
            imp = ImpG_model_snp(Zt, Sig_t, Sig_i_t)
96
            Zscores.loc[snp_unknown, ["pos", "A0", "A1"]] = ref_panel.loc[snp_unknown, ['pos', "Ref_all", "alt_all"]]
97
98
            Zscores.loc[snp_unknown, "Z"] = imp['mu']
            Zscores.loc[snp_unknown, "Var"] = imp['Var']
99
            Zscores.loc[snp_unknown, 'Nsnp_to_impute'] = len(known)
100
101
        i = i+1

102
        if i%100 == 0:
103
104
            print("{0}\%".format(np.round(i/N_snp,4)))

105

106
    return Zscores.sort_values(by="pos")