windows.py 7.07 KB
Newer Older
1
2
3
4
"""
implement the imputation window is sliding along the genome:

- ImpG like: Non overlapping windows, the imputation is apply in batch to unknown snp in the window
5
- centered_window:  A sliding window centered on the Snp to impute
6
"""
7

Hanna  JULIENNE's avatar
Hanna JULIENNE committed
8
from .stat_models import impg_model
9
from .ld_matrix import generate_sparse_matrix
10

11
import pandas as pd
12
13
import numpy as np

Hanna  JULIENNE's avatar
Hanna JULIENNE committed
14
def parse_region_position(ld_file):
15
16
17
    """
    Retrieve the region definition from a ld-file generated by impute_jass
    Argument :
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
18
        ld_file : A ld file generated by jass_impute
19
    """
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
20
    (chrom, startpos, endpos ) = ld_file.split("/")[-1].split(".")[0].split('_')
21
22
    return (chrom, startpos, endpos)

Hanna  JULIENNE's avatar
Hanna JULIENNE committed
23
def realigned_zfiles_on_panel(ref_panel, zscore):
24
    """
25
26
27
    Check if the counted allele is the same in the reference panel and
    the Zscore files.

Hanna  JULIENNE's avatar
Hanna JULIENNE committed
28
    If not, the coded and other allele are inverted and the zscore sign
29
30
    is inverted also.
    """
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
31
32
    sub_ref_panel = ref_panel.loc[zscore.index]
    allele_inverted = (sub_ref_panel['Ref_all'] != zscore.A0)
33

Hanna  JULIENNE's avatar
Hanna JULIENNE committed
34
35
    zscore.loc[allele_inverted, "A0"] = sub_ref_panel.loc[allele_inverted].Ref_all
    zscore.loc[allele_inverted, "A1"] = sub_ref_panel.loc[allele_inverted].alt_all
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
36
    zscore.loc[allele_inverted, "Z"] = - zscore.loc[allele_inverted, "Z"]
37

Hanna  JULIENNE's avatar
Hanna JULIENNE committed
38
    return zscore
39

Hanna  JULIENNE's avatar
Hanna JULIENNE committed
40
def prepare_zscore_for_imputation(ref_panel, zscore):
41
42
43
    """
    Prepare the known Z score by realigning them on the reference ref_panel
    the snps that are not present in the ref panel are filtered
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
44
    
45
    """
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
46
    zscore = realigned_zfiles_on_panel(ref_panel, zscore)
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
47
    zscore['Var'] = -1
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
48
    zscore['Nsnp_to_impute'] = -1
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
49
    zscore['ld_score'] = -1
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
50
51
    zscore = zscore.loc[zscore.index.intersection(ref_panel.index)]
    return zscore
52
53
54
55

def in_region(pos_vector, start, end):
    return ((start < pos_vector) & (pos_vector < end))

56
57

def compute_window_and_size(start_ld_block, end_ld_block, window_size):
58
    """
59
    Compute the number of window to pave the ld_block tightly
60
61
62
63
    """

    Nwindows = ((int(end_ld_block)) - (int(start_ld_block)))//window_size
    # adapt window size to cover the LD block
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
64
65
66
67
68
    if Nwindows > 0:
        window_resize = np.ceil((int(end_ld_block) - (int(start_ld_block)))/Nwindows)
    else:
        Nwindows = 1
        window_resize = np.ceil(int(end_ld_block) - (int(start_ld_block)))
69

70
71
72
    return Nwindows, window_resize

def format_result_df(imp, ref_panel, known, unknowns):
73

74
75
76
77
78
79
80
81
82
83
84
    result_dict = {
                'pos': ref_panel.loc[unknowns, 'pos'],
                'A0': ref_panel.loc[unknowns, "Ref_all"],
                "A1": ref_panel.loc[unknowns,"alt_all"],
                "Z" : imp['mu'],
                "Var": imp["var"],
                "ld_score" : imp["ld_score"],
                "condition_number": imp['condition_number'],
                "correct_inversion":imp["correct_inversion"],
                "Nsnp_to_impute" : len(known)
            }
85
    column_order = ['pos','A0',"A1","Z","Var", "ld_score", "condition_number", "correct_inversion", "Nsnp_to_impute"]
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
    batch_df = pd.DataFrame(result_dict, columns = column_order)
    return batch_df


def print_progression(i, Nwindows):
    """
    print the percentage of window treated in the ld block
    """
    if i%(np.ceil(Nwindows/10)) == 0:
        print("{0}\%".format(np.round(i/Nwindows,3)))

def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, lamb, rcond, unknowns=pd.Series([])):
    """
        Each missing Snp is imputed by known snp found in a window centered on the SNP to impute
        Argument
    """
    (chrom, start_ld_block, end_ld_block) = parse_region_position(ld_file)

    LD_mat = generate_sparse_matrix(ld_file, ref_panel)

    Nwindows, window_resize = compute_window_and_size(start_ld_block, end_ld_block, window_size)
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
107

Hanna  JULIENNE's avatar
Hanna JULIENNE committed
108
    all_unknowns = ref_panel.loc[ref_panel.index.difference(zscore.index)]
109

Hanna  JULIENNE's avatar
Hanna JULIENNE committed
110
    zscore = prepare_zscore_for_imputation(ref_panel, zscore)
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
111
112
    zscore_results = zscore.copy(deep=True)

Hanna  JULIENNE's avatar
Hanna JULIENNE committed
113
    print("### Imputation of {0} snps ###".format(all_unknowns.shape[0]))
114
115

    for i in range(Nwindows):
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
116
        # Boundary of the sliding_window
117
118
119
120
121
122
        start_windows = int(start_ld_block) + i*window_resize - buffer
        end_windows = int(start_ld_block) + (i+1)*window_resize + buffer

        start_pos = max(start_windows, float(start_ld_block))
        end_pos = min(end_windows, float(end_ld_block))

Hanna  JULIENNE's avatar
Hanna JULIENNE committed
123
        in_LD_reg_n_window =  in_region(zscore.pos, start_pos, end_pos)
124
125
        unknown_in_LD_reg_n_window =  in_region(all_unknowns.pos, start_pos, end_pos)

Hanna  JULIENNE's avatar
Hanna JULIENNE committed
126
        known = zscore.loc[in_LD_reg_n_window].index
127
128
        unknowns = all_unknowns.loc[unknown_in_LD_reg_n_window].index

Hanna  JULIENNE's avatar
Hanna JULIENNE committed
129
130
131
        sig_t = LD_mat.loc[known, known]
        sig_i_t = LD_mat.loc[unknowns, known]
        zt = zscore.loc[known,'Z']
132
133

        if(len(known) > 0):
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
134
            imp = impg_model(zt, sig_t, sig_i_t, lamb=lamb, rcond=rcond, batch=True)
135
            batch_df = format_result_df(imp, ref_panel, known, unknowns)
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
136

137
138
139
140
            # keep only snp in the core window (and not in the buffer)
            start_core_window = int(start_ld_block) + i*window_resize
            end_core_window = int(start_ld_block) + (i+1)*window_resize
            in_core_window = in_region(batch_df.pos, start_core_window, end_core_window)
141

Hanna  JULIENNE's avatar
Hanna JULIENNE committed
142
            # keep only SNP with non negligible explained variance
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
143
            snp_well_predicted = (batch_df.Var < 0.9)
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
144
            batch_df_filt = batch_df.loc[in_core_window & snp_well_predicted, zscore_results.columns]
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
145
            zscore_results = pd.concat([zscore_results, batch_df_filt])
146
        i = i+1
147
        print_progression(i, Nwindows)
148

Hanna  JULIENNE's avatar
Hanna JULIENNE committed
149
    return zscore_results.sort_values(by="pos")
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188


def ld_region_centered_window_imputation(ld_file, ref_panel, zscore, window_size, unknowns=pd.Series([])):
    """
        Each missing Snp is imputed by known snp found in a window centered on the SNP to impute
        Argument
    """
    (chrom, start_ld_block, end_ld_block) = parse_region_position(ld_file)

    LD_mat = generate_sparse_matrix(ld_file, ref_panel)
    zscore = prepare_zscore_for_imputation(ref_panel, zscore)

    # Find Snp to impute
    if len(unknowns) == 0:
        unknowns = LD_mat.index.difference(zscore.index)

    N_snp = len(unknowns)
    print("### Imputation of {0} snps ###".format(len(unknowns)))

    for i,snp_unknown in enumerate(unknowns):
        # Boundary of the centered_window
        start_pos = max((ref_panel.loc[snp_unknown,'pos'] - window_size), float(start_ld_block))
        end_pos = min(ref_panel.loc[snp_unknown,'pos'] + window_size, float(end_ld_block))

        in_LD_reg_n_window =  in_region(zscore.pos, start_pos, end_pos)

        known = zscore.loc[in_LD_reg_n_window].index
        sig_t = LD_mat.loc[known, known]
        sig_i_t = LD_mat.loc[snp_unknown, known]
        zt = zscore.loc[known,'Z']

        if(len(known) > 0):
            imp = impg_model(zt, sig_t, sig_i_t, batch=False)
            zscore.loc[snp_unknown] = [ref_panel.loc[snp_unknown, 'pos'], ref_panel.loc[snp_unknown, "Ref_all"],  ref_panel.loc[snp_unknown, "alt_all"], imp['mu'], imp['var'], len(known)]

        if i%300 == 0:
            print("{0}\%".format(np.round(i/N_snp,4)))

    return zscore.sort_values(by="pos")