stat_models.py 3.54 KB
Newer Older
Hanna  JULIENNE's avatar
Hanna JULIENNE committed
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
"""
This module contain the statistical library for imputation.

Notation style of matrices subset and vectors are based on the publication:

Bogdan Pasaniuc, Noah Zaitlen, Huwenbo Shi, Gaurav Bhatia, Alexander Gusev,
Joseph Pickrell, Joel Hirschhorn, David P. Strachan, Nick Patterson,
Alkes L. Price;
Fast and accurate imputation of summary statistics enhances evidence
of functional enrichment, Bioinformatics, Volume 30, Issue 20, 15 October 2014,
Pages 2906–2914

"""

import numpy as np
import scipy as sc
import scipy.linalg

def compute_mu(sig_i_t, sig_t_inv, zt):
    """
    Compute the estimation of z-score from neighborring snp

    Args:
        sig_i_t (matrix?) : correlation matrix with line corresponding to
        unknown Snp (snp to impute) and column to known SNPs
        sig_t_inv (np.ndarray): inverse of the correlation matrix of known
        matrix
        zt (np.array?): Zscores of known snp
    Returns:
        mu_i (np.array): a vector of length i containing the estimate of zscore

    """
    return np.dot(sig_i_t, np.dot(sig_t_inv, zt))

def compute_var(sig_i_t, sig_t_inv, lamb, batch=True):
    """
    Compute the expected variance of the imputed SNPs
    Args:
        sig_i_t (matrix?) : correlation matrix with line corresponding to
        unknown Snp (snp to impute) and column to known SNPs
        sig_t_inv (np.ndarray): inverse of the correlation matrix of known
        matrix
        lamb (float): regularization term added to matrix

    """

    if batch:
        var = (1 + lamb) - np.einsum('ij,jk,ki->i', sig_i_t, sig_t_inv ,sig_i_t.transpose())
        ld_score = (sig_i_t**2).sum(1)

    else:
        var = (1 + lamb) - np.dot(sig_i_t, np.dot(sig_t_inv, sig_i_t.transpose()))
        ld_score = (sig_i_t**2).sum()

    return var, ld_score

def check_inversion(sig_t, sig_t_inv):
     return np.allclose(sig_t, np.dot(sig_t, np.dot(sig_t_inv, sig_t)))

def var_in_boundaries(var,lamb):
    """
    Forces the variance to be in the 0 to 1+lambda boundary
    theoritically we shouldn't have to do that
    """
    id_neg = np.where(var < 0)
    var_norm = var
    var[id_neg] = 0
    id_inf = np.where(var > (1+lamb))
    var[id_inf] = 1

    return var

def impg_model(zt, sig_t, sig_i_t, lamb=0.01, rcond=0.01,  batch=True):
    """
    Compute the variance
    Args:
        zt (np.array): the vector of known Z scores
        sig_t (np.ndarray) : the matrix of known Linkage desiquilibrium
         correlation
        sig_i_t (np.ndarray): inverse of the correlation matrix of known
        matrix
        lamb (float): regularization term added to the diagonal of the sig_t matrix
        rcond (float): threshold to filter eigenvector with a eigenvalue under rcond
        make inversion biased but much more numerically robust
    """
    sig_t = sig_t.values
    np.fill_diagonal(sig_t, (1+lamb))
    sig_t_inv = scipy.linalg.pinv(sig_t, rcond=rcond)

    if batch:
        condition_number = np.array([np.linalg.cond(sig_t)]*sig_i_t.shape[0])
        correct_inversion = np.array([check_inversion(sig_t, sig_t_inv)]*sig_i_t.shape[0])
    else:
        condition_number = np.linalg.cond(sig_t)
        correct_inversion = check_inversion(sig_t, sig_t_inv)
    var, ld_score = compute_var(sig_i_t, sig_t_inv, lamb, batch)
    mu = compute_mu(sig_i_t, sig_t_inv, zt)
    if np.any(mu > 30):
        print("ABERANT SNP SNiP")
    var_norm = var_in_boundaries(var, lamb)
    R2 = ((1+lamb)-var_norm)
    mu = mu / np.sqrt(R2)
    return({"var" : var, "mu" : mu, "ld_score" : ld_score, "condition_number" : condition_number, "correct_inversion":correct_inversion })