Skip to content
Snippets Groups Projects
Commit 916131e7 authored by Yuka Suzuki's avatar Yuka Suzuki
Browse files

Upload New File

parent 00fd7107
No related branches found
No related tags found
No related merge requests found
'''
This script is from https://github.com/JonJala/mtag/blob/master/mtag.py written by the authors below.
Slightly adjusted to run the analysis with simulated data.
Authors
Omeed Maghzian (Harvard University, Department of Economics)
Raymond Walters (Broad Institute of MIT and Harvard)
Patrick Turley (Broad Institute of MIT and Harvard)
Turley, et. al. (2018) Multi-Trait analysis of genome-wide association summary statistics using MTAG. Nature Genetics doi: https://doi.org/10.1038/s41588-017-0009-4.
'''
import pandas as pd
import numpy as np
import logging
import scipy.optimize
import scipy.stats
from scipy.stats import chi2
def _posDef_adjustment(mat, scaling_factor=0.99,max_it=1000):
'''
Checks whether the provided is pos semidefinite. If it is not, then it performs the the adjustment procedure descried in 1.2.2 of the Supplementary Note
scaling_factor: the multiplicative factor that all off-diagonal elements of the matrix are scaled by in the second step of the procedure.
max_it: max number of iterations set so that
'''
logging.info('Checking for positive definiteness ..')
assert mat.ndim == 2
assert mat.shape[0] == mat.shape[1]
is_pos_semidef = lambda m: np.all(np.linalg.eigvals(m) >= 0)
if is_pos_semidef(mat):
return mat
else:
logging.info('matrix is not positive definite, performing adjustment..')
P = mat.shape[0]
for i in range(P):
for j in range(i,P):
if np.abs(mat[i,j]) > np.sqrt(mat[i,i] * mat[j,j]):
mat[i,j] = scaling_factor*np.sign(mat[i,j])*np.sqrt(mat[i,i] * mat[j,j])
mat[j,i] = mat[i,j]
n=0
while not is_pos_semidef(mat) and n < max_it:
dg = np.diag(mat)
mat = scaling_factor * mat
mat[np.diag_indices(P)] = dg
n += 1
if n == max_it:
logging.info('Warning: max number of iterations reached in adjustment procedure. Sigma matrix used is still non-positive-definite.')
else:
logging.info('Completed in {} iterations'.format(n))
return mat
def gmm_omega(Zs, Ns, sigma_LD):
logging.info('Using GMM estimator of Omega ..')
N_mats = np.sqrt(np.einsum('mp,mq->mpq', Ns,Ns))
Z_outer = np.einsum('mp,mq->mpq',Zs, Zs)
return np.mean((Z_outer - sigma_LD) / N_mats, axis=0)
def jointEffect_probability(Z_score, omega_hat, sigma_hat,N_mats, S=None):
''' For each SNP m in each state s , computes the evaluates the multivariate normal distribution at the observed row of Z-scores
Calculate the distribution of (Z_m | s ) for all s in S, m in M. --> M x|S| matrix
The output is a M x n_S matrix of joint probabilities
'''
DTYPE = np.float64
(M,P) = Z_score.shape
if S is None: # 2D dimensional form
assert omega_hat.ndim == 2
omega_hat = omega_hat.reshape(1,P,P)
S = np.ones((1,P),dtype=bool)
(n_S,_) = S.shape
jointProbs = np.empty((M,n_S))
xRinvs = np.zeros([M,n_S,P], dtype=DTYPE)
logSqrtDetSigmas = np.zeros([M,n_S], dtype=DTYPE)
Ls = np.zeros([M,n_S,P,P], dtype=DTYPE)
cov_s = np.zeros([M,n_S,P,P], dtype=DTYPE)
Zs_rep = np.einsum('mp,s->msp',Z_score,np.ones(n_S)) # functionally equivalent to repmat
cov_s = np.einsum('mpq,spq->mspq',N_mats,omega_hat) + sigma_hat
Ls = np.linalg.cholesky(cov_s)
Rs = np.transpose(Ls, axes=(0,1,3,2))
xRinvs = np.linalg.solve(Ls, Zs_rep)
logSqrtDetSigmas = np.sum(np.log(np.diagonal(Rs,axis1=2,axis2=3)),axis=2).reshape(M,n_S)
quadforms = np.sum(xRinvs**2,axis=2).reshape(M,n_S)
jointProbs = np.exp(-0.5 * quadforms - logSqrtDetSigmas - P * np.log(2 * np.pi) / 2)
if n_S == 1:
jointProbs = jointProbs.flatten()
return jointProbs
def rebuild_omega(chol_elems, s=None):
'''Rebuild state-dependent Omega given combination of causal states
cholX_elements are the elements (entered row-wise) of the lower triangular cholesky decomposition of Omega_s
'''
if s is None:
P = int((-1 + np.sqrt(1.+ 8.*len(chol_elems)))/2.)
s = np.ones(P,dtype=bool)
P_c = P
else:
P_c = int(np.sum(s))
P = s.shape[1] if s.ndim == 2 else len(s)
cholL = np.zeros((P_c,P_c))
cholL[np.tril_indices(P_c)] = np.array(chol_elems)
cholL[np.diag_indices(P_c)] = np.exp(np.diag(cholL)) # exponentiate the diagnoal so cholL unique
for i in range(P_c):
for j in range(i): # multiply by exponentiated diags
cholL[i,j] = cholL[i,j]*np.sqrt(cholL[i,i]*cholL[j,j])
omega_c = np.dot(cholL, cholL.T)
# Expand to include zeros of matrix
omega = np.zeros((P,P))
s_caus_ind = np.argwhere(np.outer(s, s))
omega[(s_caus_ind[:,0],s_caus_ind[:,1])] = omega_c.flatten()
return omega
def _omega_neglogL(x,Zs,N_mats,sigma_LD):
omega_it = rebuild_omega(x)
joint_prob = jointEffect_probability(Zs,omega_it,sigma_LD,N_mats)
return - np.sum(np.log(joint_prob))
def numerical_omega(Zs,N_mats,sigma_LD,omega_start, tol=1.0e-10):
M,P = Zs.shape
solver_options = dict()
solver_options['fatol'] = 1.0e-8
solver_options['xatol'] = tol
solver_options['disp'] = False
solver_options['maxiter'] = P*(P+1)*500
x_start = flatten_out_omega(omega_start)
opt_results = scipy.optimize.minimize(_omega_neglogL,x_start,args=(Zs,N_mats,sigma_LD),method='Nelder-Mead',options=solver_options)
return rebuild_omega(opt_results.x), opt_results
def estimate_omega(Zs,Ns,sigma_LD):
logging.info('Beginning estimation of Omega ...')
M,P = Zs.shape
N_mats = np.sqrt(np.einsum('mp, mq -> mpq',Ns, Ns))
return _posDef_adjustment(gmm_omega(Zs,Ns,sigma_LD))
def mtag_analysis(Zs, Ns, omega_hat, sigma_LD):
M,P = Zs.shape
W_N = np.einsum('mp,pq->mpq',np.sqrt(Ns),np.eye(P))
W_N_inv = np.linalg.inv(W_N)
Sigma_N = np.einsum('mpq,mqr->mpr',np.einsum('mpq,qr->mpr',W_N_inv,sigma_LD),W_N_inv)
mtag_betas = np.zeros((M,P))
mtag_se = np.zeros((M,P))
mtag_factor = np.zeros((M,P))
mtag_pval = np.zeros((M,P))
mtag_z = np.zeros((M,P))
p_values = lambda z: 2*(scipy.stats.norm.cdf(-1.*np.abs(z)))
for p in range(P):
# Note that in the code, what I call "gamma should really be omega", but avoid the latter term due to possible confusion with big Omega
gamma_k = omega_hat[:,p]
tau_k_2 = omega_hat[p,p]
om_min_gam = omega_hat - np.outer(gamma_k,gamma_k)/tau_k_2
xx = om_min_gam + Sigma_N
inv_xx = np.linalg.inv(xx)
yy = gamma_k/tau_k_2
W_inv_Z = np.einsum('mqp,mp->mq',W_N_inv,Zs)
beta_denom = np.einsum('mp,p->m',np.einsum('q,mqp->mp',yy,inv_xx),yy)
mtag_factor[:,p] = np.einsum('mp,m->m',np.einsum('q,mqp->mp',yy,inv_xx), 1/beta_denom)
mtag_var_p = 1. / beta_denom
mtag_betas[:,p] = np.einsum('mp,mp->m',np.einsum('q,mqp->mp',yy,inv_xx), W_inv_Z) / beta_denom
mtag_se[:,p] = np.sqrt(mtag_var_p)
mtag_z[:,p] = mtag_betas[:,p]/mtag_se[:,p]
mtag_pval[:,p] = p_values(mtag_z[:,p])
# print("\n betas:",mtag_betas)
# print("\n se:",mtag_se)
# print("\n mtag_factor:", mtag_factor)
print("\n mtag_z:", mtag_z)
print("\n mtag_pval:", mtag_pval)
return mtag_z, mtag_pval
def mtag(Zs,Ns,sigma_hat):
omega_hat = estimate_omega(Zs, Ns, sigma_hat)
print("\n estimated omega:", omega_hat)
mtag_z, mtag_pval = mtag_analysis(Zs, Ns, omega_hat, sigma_hat)
print("lambda in MTAG raw p-vals:",chi2.sf(np.median(mtag_pval), df=1) / chi2.sf(0.5, df=1))
min_mtag_pval = mtag_pval.min(axis=1)
return min_mtag_pval
if __name__ == '__main__':
name='H0_null_k5_ce_0.25_ov_1.tsv' # to specify input file of Z-scores multiple traits
df = pd.read_csv('Z_score_'+name,delimiter='\t')
Zs = df[['z1','z2','z3','z4','z5']].values # extract Z-scores columns only
sigma_hat = pd.read_csv('COV_'+name,delimiter='\t').values # import null cov matrix.
N_samp = 50000 # sample size (pre-defined in the generation of the simulated data)
n_trait = 5
M = len(df) # number of SNPs
Ns = np.zeros([M,n_trait]) + N_samp
min_mtag_pval = mtag(Zs,Ns,sigma_hat) # min P value across traits (per SNP)
df['min_pval_mtag'] = min_mtag_pval
df.to_csv('Z_score_wMTAG_'+name,sep='\t',na_rep="NA",index=False)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment