windows.py 6.97 KB
Newer Older
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
"""
implement the imputation window is sliding along the genome:

- ImpG like: Non overlapping windows, the imputation is apply in batch to unknown snp in the window
- centered_window:  A sliding window centered on the Snp to impute
"""

from .stat_models import impg_model
from .ld_matrix import generate_sparse_matrix

import pandas as pd
import numpy as np

def parse_region_position(ld_file):
    """
    Retrieve the region definition encoded in the name of a ld-file
    generated by impute_jass (expected basename: chrom_start_end.ext).
    Argument :
        ld_file : A ld file generated by jass_impute
    """
    region_tag = ld_file.split("/")[-1].split(".")[0]
    chrom, startpos, endpos = region_tag.split("_")
    return (chrom, startpos, endpos)

def realigned_zfiles_on_panel(ref_panel, zscore):
    """
    Align the counted allele of the Zscore file on the reference panel.

    Rows whose coded allele differs from the panel reference allele get
    their coded and other alleles replaced by the panel's, and their
    zscore sign flipped. The zscore dataframe is modified in place and
    returned.
    Args:
        - ref_panel (pd.dataframe) : snp of reference on the imputed chromosome
        - zscore (pd.dataframe):
    """
    panel_rows = ref_panel.loc[zscore.index]
    flipped = (panel_rows['Ref_all'] != zscore.A0)

    zscore.loc[flipped, "A0"] = panel_rows.loc[flipped, "Ref_all"]
    zscore.loc[flipped, "A1"] = panel_rows.loc[flipped, "alt_all"]
    zscore.loc[flipped, "Z"] = -zscore.loc[flipped, "Z"]

    return zscore

def prepare_zscore_for_imputation(ref_panel, zscore):
    """
    Prepare the known Z score by
    - realigning them on the reference ref_panel
    - filtering snps that are not present in the ref panel
    - Adding columns that will contain information on imputation:
        * Var : theoritical variance estimate of z
        * Nsnp_to_impute : Number of known snp
        * ld_score : the sum of the square correlation of the snp with all other
         known snp (give an idea if the we have enough information to compute a
         precise z estimate)
    """
    zscore = realigned_zfiles_on_panel(ref_panel, zscore)
    # sentinel values, overwritten window by window during imputation
    for col, default in (('Var', -1.0), ('Nsnp_to_impute', -1), ('ld_score', -1.0)):
        zscore[col] = default
    kept = zscore.index.intersection(ref_panel.index)
    return zscore.loc[kept]

def in_region(pos_vector, start, end):
    """Boolean mask of positions strictly inside the open interval (start, end)."""
    return (pos_vector > start) & (pos_vector < end)


def compute_window_and_size(start_ld_block, end_ld_block, window_size):
    """
    Compute the number of windows to pave the Linkage desiquilibrium block
    tightly, with a window size as close as possible to the one specified
    by the user.
    Args:
        start_ld_block (int): the start of the Linkage desiquilibrium block
        end_ld_block (int): the end of the Linkage desiquilibrium block
        window_size (int) : size in bp of the window to compute imputation_style
        all unknown snps in the window will be imputed from the known snp in
        the windows
    return:
         a tuple ( number of windows (int), size of the window resized to the ld block (int))
    """
    block_len = int(end_ld_block) - int(start_ld_block)
    Nwindows = block_len // window_size
    # adapt window size to cover the LD block
    if Nwindows > 0:
        # int() fixes the documented contract: np.ceil alone returned a numpy float
        window_resize = int(np.ceil(block_len / Nwindows))
    else:
        # LD block shorter than the requested window: use one window covering it
        Nwindows = 1
        window_resize = block_len
    return Nwindows, window_resize

def format_result_df(imp, ref_panel, known, unknowns):
    """
    Assemble the imputation output of one window into a dataframe.

    Args:
        imp: mapping with the imputation results ('mu', 'var', 'ld_score',
             'condition_number', 'correct_inversion') for the unknown snps
        ref_panel (pd.dataframe): reference panel snps (pos, Ref_all, alt_all)
        known: index of the known snps used for imputation
        unknowns: index of the imputed snps
    Returns:
        pd.DataFrame with one row per imputed snp
    """
    column_order = ['pos', 'A0', "A1", "Z", "Var", "ld_score",
                    "condition_number", "correct_inversion", "Nsnp_to_impute"]
    panel_unknown = ref_panel.loc[unknowns]
    values = {
        'pos': panel_unknown['pos'],
        'A0': panel_unknown["Ref_all"],
        "A1": panel_unknown["alt_all"],
        "Z": imp['mu'],
        "Var": imp["var"],
        "ld_score": imp["ld_score"],
        "condition_number": imp['condition_number'],
        "correct_inversion": imp["correct_inversion"],
        "Nsnp_to_impute": len(known),
    }
    return pd.DataFrame(values, columns=column_order)


def print_progression(i, Nwindows):
    """
    Print the percentage of windows treated in the ld block,
    roughly every 10% of progress.
    """
    if i % (np.ceil(Nwindows / 10)) == 0:
        # original printed a raw fraction with a literal backslash ("0.5\%");
        # report an actual percentage instead
        print("{0}%".format(np.round(100 * i / Nwindows, 1)))

def empty_imputed_dataframe():
    """Return an empty results dataframe holding the imputation output columns."""
    columns = ['pos', 'A0', "A1", "Z", "Var", "ld_score",
               "condition_number", "correct_inversion", "Nsnp_to_impute"]
    return pd.DataFrame(columns=columns)
def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, lamb,
                         rcond, unknowns=None):
    """
    Impute the missing snps of one Linkage desiquilibrium block, window by
    window: known snps in each window (extended by a buffer on both sides)
    impute the unknown snps of the window; only imputed snps falling in the
    core window (buffer excluded) and with Var < 0.9 are kept.

    Args:
        ld_file (str): Linkage desiquilibrium matrix file; its basename
            encodes the region as chrom_start_end
        ref_panel (pd.dataframe): the dataframe containing reference panel
            snps (indexed by snp id, with a 'pos' column)
        zscore (pd.dataframe): known zscores, indexed by snp id
        window_size (int): size in bp of the imputation window
        buffer (int): bp added on each side of the window
        lamb (float): regularization term passed to impg_model
        rcond (float): cutoff passed to impg_model
        unknowns: unused legacy parameter, recomputed for every window.
            Default changed from a mutable ``pd.Series([])`` (shared across
            calls, and deprecated without an explicit dtype) to ``None``.
    Returns:
        pd.DataFrame of imputed snps sorted by position
    """
    (chrom, start_ld_block, end_ld_block) = parse_region_position(ld_file)
    LD_mat = generate_sparse_matrix(ld_file, ref_panel)
    Nwindows, window_resize = compute_window_and_size(start_ld_block, end_ld_block, window_size)
    all_unknowns = ref_panel.loc[ref_panel.index.difference(zscore.index)]

    # the dataframe containing results
    zscore_results = empty_imputed_dataframe()

    print("### Imputation of {0} snps ###".format(all_unknowns.shape[0]))

    for i in range(Nwindows):
        # Boundary of the sliding window, extended by the buffer
        start_windows = int(start_ld_block) + i*window_resize - buffer
        end_windows = int(start_ld_block) + (i+1)*window_resize + buffer

        # clamp the buffered window to the LD block
        start_pos = max(start_windows, float(start_ld_block))
        end_pos = min(end_windows, float(end_ld_block))

        in_LD_reg_n_window = in_region(zscore.pos, start_pos, end_pos)
        unknown_in_LD_reg_n_window = in_region(all_unknowns.pos, start_pos, end_pos)

        known = zscore.loc[in_LD_reg_n_window].index
        unknowns = all_unknowns.loc[unknown_in_LD_reg_n_window].index

        sig_t = LD_mat.loc[known, known]
        sig_i_t = LD_mat.loc[unknowns, known]
        zt = zscore.loc[known, 'Z']

        if(len(known) > 0):
            imp = impg_model(zt, sig_t, sig_i_t, lamb=lamb, rcond=rcond, batch=True)
            batch_df = format_result_df(imp, ref_panel, known, unknowns)

            # keep only snp in the core window (and not in the buffer)
            start_core_window = int(start_ld_block) + i*window_resize
            end_core_window = int(start_ld_block) + (i+1)*window_resize
            in_core_window = in_region(batch_df.pos, start_core_window, end_core_window)

            # keep only SNP with non negligible explained variance
            snp_well_predicted = (batch_df.Var < 0.9)
            batch_df_filt = batch_df.loc[in_core_window & snp_well_predicted, zscore_results.columns]
            zscore_results = pd.concat([zscore_results, batch_df_filt])
        # report progress as windows completed (hence i+1)
        i = i+1
        print_progression(i, Nwindows)

    return zscore_results.sort_values(by="pos")