windows.py 5.62 KB
Newer Older
1
2
3
4
"""
implement the imputation window is sliding along the genome:

- ImpG like: Non overlapping windows, the imputation is apply in batch to unknown snp in the window
5
- centered_window:  A sliding window centered on the Snp to impute
6
"""
7

Hanna  JULIENNE's avatar
Hanna JULIENNE committed
8
from .stat_models import impg_model
9
from .ld_matrix import generate_sparse_matrix
10

11
import pandas as pd
12
13
import numpy as np

14

Hanna  JULIENNE's avatar
Hanna JULIENNE committed
15
def parse_region_position(ld_file):
    """
    Retrieve the region definition from a ld-file generated by jass_impute

    The base name of the file (text after the last "/" and before the
    first ".") must be made of three '_'-separated fields:
    <chrom>_<startpos>_<endpos>

    Argument :
        ld_file : path of a ld file generated by jass_impute

    Return :
        (chrom, startpos, endpos) : the three fields, as strings

    Raise :
        ValueError : the base name does not hold exactly three fields
    """
    region_name = ld_file.split("/")[-1].split(".")[0]
    fields = region_name.split('_')
    if len(fields) != 3:
        raise ValueError(
            "Cannot parse region from '{0}': expected a file name like "
            "<chrom>_<start>_<end>.<ext>".format(ld_file)
        )
    chrom, startpos, endpos = fields
    return (chrom, startpos, endpos)

25

Hanna  JULIENNE's avatar
Hanna JULIENNE committed
26
def realigned_zfiles_on_panel(ref_panel, zscore):
    """
    Check if the counted allele is the same in the reference panel and
    the Zscore files.

    If not, the coded and other allele are set to the ones of the
    reference panel (A0 <- Ref_all, A1 <- alt_all) and the zscore sign
    is inverted also.

    Only the snps present in both tables are compared; snps of zscore
    that are absent from the reference panel are left untouched.

    Argument :
        ref_panel : DataFrame indexed by snp id, with the columns
            'Ref_all' and 'alt_all'
        zscore : DataFrame indexed by snp id, with the columns
            'A0', 'A1' and 'Z'

    Return :
        zscore, modified in place and returned for convenience
    """
    # Restrict the comparison to shared snps: looking up labels that are
    # missing from the panel with .loc would fail.
    shared = zscore.index.intersection(ref_panel.index)
    panel_ref = ref_panel.loc[shared, 'Ref_all']
    mismatch = (panel_ref.values != zscore.loc[shared, 'A0'].values)
    flipped = shared[mismatch]

    if len(flipped) > 0:
        # After realignment A0 must be the panel reference allele, which
        # is also the layout used for the imputed rows.
        zscore.loc[flipped, "A0"] = ref_panel.loc[flipped, 'Ref_all'].values
        zscore.loc[flipped, "A1"] = ref_panel.loc[flipped, 'alt_all'].values
        zscore.loc[flipped, "Z"] = - zscore.loc[flipped, "Z"]

    return zscore
41

Hanna  JULIENNE's avatar
Hanna JULIENNE committed
42
def prepare_zscore_for_imputation(ref_panel, zscore):
    """
    Prepare the known Z score by realigning them on the reference ref_panel
    the snps that are not present in the ref panel are filtered

    Two bookkeeping columns are added for the known snps:
    'Var' (set to 1) and 'Nsnp_to_impute' (set to -1).

    Argument :
        ref_panel : DataFrame indexed by snp id
        zscore : DataFrame of known Z scores indexed by snp id

    Return :
        a new DataFrame; the zscore passed by the caller is not modified
    """
    # Filter before realigning: the realignment looks the z-score snps up
    # in the panel, which fails for snps the panel does not contain.
    shared_snps = zscore.index.intersection(ref_panel.index)
    zscore = zscore.loc[shared_snps].copy()

    zscore = realigned_zfiles_on_panel(ref_panel, zscore)
    zscore['Var'] = 1
    zscore['Nsnp_to_impute'] = -1
    return zscore
52
53
54
55
56


def in_region(pos_vector, start, end):
    """
    Element-wise test of the positions lying strictly between start and end

    Both bounds are excluded.
    """
    after_start = pos_vector > start
    before_end = pos_vector < end
    return after_start & before_end

Hanna  JULIENNE's avatar
Hanna JULIENNE committed
57
def ld_region_centered_window_imputation(ld_file, ref_panel, zscore, window_size, unknowns=None):
    """
    Each missing Snp is imputed by known snp found in a window centered on the SNP to impute

    Argument :
        ld_file : path of the ld file of the region; its name gives the
            boundaries of the LD block (see parse_region_position)
        ref_panel : DataFrame indexed by snp id, with the columns
            'pos', 'Ref_all' and 'alt_all'
        zscore : DataFrame of the known Z scores, indexed by snp id
        window_size : half-width of the window, in the unit of 'pos'
        unknowns : ids of the snps to impute. When None or empty, every
            snp of the LD matrix that has no known Z score is imputed

    Return :
        the known and imputed snps in one DataFrame sorted by 'pos'
    """
    (chrom, start_ld_block, end_ld_block) = parse_region_position(ld_file)
    block_start = float(start_ld_block)
    block_end = float(end_ld_block)

    LD_mat = generate_sparse_matrix(ld_file, ref_panel)
    zscore = prepare_zscore_for_imputation(ref_panel, zscore)

    # Find Snp to impute
    if unknowns is None or len(unknowns) == 0:
        unknowns = LD_mat.index.difference(zscore.index)

    N_snp = len(unknowns)
    print("### Imputation of {0} snps ###".format(N_snp))

    # Imputed rows go to a separate table so that `zscore` keeps holding
    # observed snps only: an imputed value must not be reused as a known
    # Z score when imputing the following snps.
    zscore_results = zscore.copy()

    for i, snp_unknown in enumerate(unknowns):
        # Boundary of the centered_window, clipped to the LD block
        snp_pos = ref_panel.loc[snp_unknown, 'pos']
        start_pos = max(snp_pos - window_size, block_start)
        end_pos = min(snp_pos + window_size, block_end)

        in_LD_reg_n_window = in_region(zscore.pos, start_pos, end_pos)
        known = zscore.loc[in_LD_reg_n_window].index

        if len(known) > 0:
            sig_t = LD_mat.loc[known, known]
            sig_i_t = LD_mat.loc[snp_unknown, known]
            zt = zscore.loc[known, 'Z']

            imp = impg_model(zt, sig_t, sig_i_t, batch=False)
            # assumes the columns are, in order:
            # pos, A0, A1, Z, Var, Nsnp_to_impute -- TODO confirm
            zscore_results.loc[snp_unknown] = [
                snp_pos,
                ref_panel.loc[snp_unknown, "Ref_all"],
                ref_panel.loc[snp_unknown, "alt_all"],
                imp['mu'],
                imp['var'],
                len(known),
            ]

        if i % 300 == 0:
            print("{0}%".format(np.round(100 * i / N_snp, 2)))

    return zscore_results.sort_values(by="pos")
94

Hanna  JULIENNE's avatar
Hanna JULIENNE committed
95
def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, unknowns=None):
    """
    The LD block is cut in non overlapping windows and the missing Snps of
    each window are imputed in batch from the known snps found in the
    window extended by a buffer on each side

    Argument :
        ld_file : path of the ld file of the region; its name gives the
            boundaries of the LD block (see parse_region_position)
        ref_panel : DataFrame indexed by snp id, with the columns
            'pos', 'Ref_all' and 'alt_all'
        zscore : DataFrame of the known Z scores, indexed by snp id
        window_size : targeted size of a window, in the unit of 'pos';
            the actual size is adapted so that the windows cover the block
        buffer : size added on each side of a window to collect known snps
        unknowns : kept for interface compatibility; it is not used to
            select the snps to impute (every snp of the reference panel
            located in the LD block and without known Z score is imputed)

    Return :
        the known and imputed snps in one DataFrame sorted by 'pos'
    """
    (chrom, start_ld_block, end_ld_block) = parse_region_position(ld_file)
    block_start = int(start_ld_block)
    block_end = int(end_ld_block)
    block_length = block_end - block_start

    LD_mat = generate_sparse_matrix(ld_file, ref_panel)

    # At least one window, so that a block shorter than window_size does
    # not end up in a division by zero below.
    Nwindows = max(1, int(block_length // window_size))
    # adapt window size to cover the LD block
    window_resize = np.ceil(block_length / Nwindows)

    all_unknowns = ref_panel.loc[ref_panel.index.difference(zscore.index)]
    zscore = prepare_zscore_for_imputation(ref_panel, zscore)

    # Every window lies inside the LD block, so candidates outside of it
    # can be dropped once and for all.
    in_block = in_region(all_unknowns.pos, float(block_start), float(block_end))
    all_unknowns = all_unknowns.loc[in_block]

    print("### Imputation of {0} snps ###".format(all_unknowns.shape[0]))

    # Imputed batches are kept apart from `zscore`, which must hold
    # observed snps only: an imputed value must not be reused as a known
    # Z score in the following windows.
    batches = [zscore]

    for i in range(Nwindows):
        # Boundary of the core window and of the window with its buffer
        core_start = block_start + i * window_resize
        core_end = block_start + (i + 1) * window_resize

        start_pos = max(core_start - buffer, float(block_start))
        end_pos = min(core_end + buffer, float(block_end))

        in_LD_reg_n_window = in_region(zscore.pos, start_pos, end_pos)
        unknown_in_LD_reg_n_window = in_region(all_unknowns.pos, start_pos, end_pos)

        known = zscore.loc[in_LD_reg_n_window].index
        window_unknowns = all_unknowns.loc[unknown_in_LD_reg_n_window].index

        if len(known) > 0 and len(window_unknowns) > 0:
            sig_t = LD_mat.loc[known, known]
            sig_i_t = LD_mat.loc[window_unknowns, known]
            zt = zscore.loc[known, 'Z']

            imp = impg_model(zt, sig_t, sig_i_t, batch=True)
            batch_df = pd.DataFrame({
                'pos': ref_panel.loc[window_unknowns, 'pos'],
                'A0': ref_panel.loc[window_unknowns, "Ref_all"],
                "A1": ref_panel.loc[window_unknowns, "alt_all"],
                "Z": imp['mu'],
                "Var": imp["var"],
                "Nsnp_to_impute": len(known)
            })

            # keep only snp in the core window
            # NOTE(review): in_region excludes both bounds, so a snp located
            # exactly on a core window boundary is not kept by any window.
            in_core_window = in_region(batch_df.pos, core_start, core_end)
            batches.append(batch_df.loc[in_core_window])

        n_done = i + 1
        if n_done % 10 == 0:
            print("{0}%".format(np.round(100 * n_done / Nwindows, 2)))

    return pd.concat(batches).sort_values(by="pos")