diff --git a/Figures_manuscript/scripts/Fig_SUP_simulation_H0_MTAG.py b/Figures_manuscript/scripts/Fig_SUP_simulation_H0_MTAG.py new file mode 100644 index 0000000000000000000000000000000000000000..e5a6d84ddb03a920db7fe655663493126b4eb5e8 --- /dev/null +++ b/Figures_manuscript/scripts/Fig_SUP_simulation_H0_MTAG.py @@ -0,0 +1,214 @@ +''' +This script is from https://github.com/JonJala/mtag/blob/master/mtag.py written by the authors below. +Slightly adjusted to run the analysis with simulated data. + +Authors +Omeed Maghzian (Harvard University, Department of Economics) +Raymond Walters (Broad Institute of MIT and Harvard) +Patrick Turley (Broad Institute of MIT and Harvard) + +Turley, et. al. (2018) Multi-Trait analysis of genome-wide association summary statistics using MTAG. Nature Genetics doi: https://doi.org/10.1038/s41588-017-0009-4. +''' + +import pandas as pd +import numpy as np +import logging +import scipy.optimize +import scipy.stats +from scipy.stats import chi2 + +def _posDef_adjustment(mat, scaling_factor=0.99,max_it=1000): + ''' + Checks whether the provided is pos semidefinite. If it is not, then it performs the the adjustment procedure descried in 1.2.2 of the Supplementary Note + + scaling_factor: the multiplicative factor that all off-diagonal elements of the matrix are scaled by in the second step of the procedure. + max_it: max number of iterations set so that + ''' + logging.info('Checking for positive definiteness ..') + assert mat.ndim == 2 + assert mat.shape[0] == mat.shape[1] + is_pos_semidef = lambda m: np.all(np.linalg.eigvals(m) >= 0) + if is_pos_semidef(mat): + return mat + else: + logging.info('matrix is not positive definite, performing adjustment..') + P = mat.shape[0] + for i in range(P): + for j in range(i,P): + if np.abs(mat[i,j]) > np.sqrt(mat[i,i] * mat[j,j]): + mat[i,j] = scaling_factor*np.sign(mat[i,j])*np.sqrt(mat[i,i] * mat[j,j]) + mat[j,i] = mat[i,j] + n=0 + while not is_pos_semidef(mat) and n < max_it: + dg = np.diag(mat) + mat = scaling_factor * mat + mat[np.diag_indices(P)] = dg + n += 1 + if n == max_it: + logging.info('Warning: max number of iterations reached in adjustment procedure. Sigma matrix used is still non-positive-definite.') + else: + logging.info('Completed in {} iterations'.format(n)) + return mat + +def gmm_omega(Zs, Ns, sigma_LD): + logging.info('Using GMM estimator of Omega ..') + N_mats = np.sqrt(np.einsum('mp,mq->mpq', Ns,Ns)) + Z_outer = np.einsum('mp,mq->mpq',Zs, Zs) + return np.mean((Z_outer - sigma_LD) / N_mats, axis=0) + +def jointEffect_probability(Z_score, omega_hat, sigma_hat,N_mats, S=None): + ''' For each SNP m in each state s , computes the evaluates the multivariate normal distribution at the observed row of Z-scores + Calculate the distribution of (Z_m | s ) for all s in S, m in M. --> M x|S| matrix + The output is a M x n_S matrix of joint probabilities + ''' + + DTYPE = np.float64 + (M,P) = Z_score.shape + if S is None: # 2D dimensional form + assert omega_hat.ndim == 2 + omega_hat = omega_hat.reshape(1,P,P) + S = np.ones((1,P),dtype=bool) + + (n_S,_) = S.shape + jointProbs = np.empty((M,n_S)) + + xRinvs = np.zeros([M,n_S,P], dtype=DTYPE) + logSqrtDetSigmas = np.zeros([M,n_S], dtype=DTYPE) + Ls = np.zeros([M,n_S,P,P], dtype=DTYPE) + cov_s = np.zeros([M,n_S,P,P], dtype=DTYPE) + + Zs_rep = np.einsum('mp,s->msp',Z_score,np.ones(n_S)) # functionally equivalent to repmat + cov_s = np.einsum('mpq,spq->mspq',N_mats,omega_hat) + sigma_hat + + Ls = np.linalg.cholesky(cov_s) + Rs = np.transpose(Ls, axes=(0,1,3,2)) + + xRinvs = np.linalg.solve(Ls, Zs_rep) + + logSqrtDetSigmas = np.sum(np.log(np.diagonal(Rs,axis1=2,axis2=3)),axis=2).reshape(M,n_S) + + quadforms = np.sum(xRinvs**2,axis=2).reshape(M,n_S) + jointProbs = np.exp(-0.5 * quadforms - logSqrtDetSigmas - P * np.log(2 * np.pi) / 2) + + if n_S == 1: + jointProbs = jointProbs.flatten() + + return jointProbs + +def rebuild_omega(chol_elems, s=None): + '''Rebuild state-dependent Omega given combination of causal states + cholX_elements are the elements (entered row-wise) of the lower triangular cholesky decomposition of Omega_s + + ''' + if s is None: + P = int((-1 + np.sqrt(1.+ 8.*len(chol_elems)))/2.) + s = np.ones(P,dtype=bool) + P_c = P + else: + P_c = int(np.sum(s)) + P = s.shape[1] if s.ndim == 2 else len(s) + cholL = np.zeros((P_c,P_c)) + + cholL[np.tril_indices(P_c)] = np.array(chol_elems) + cholL[np.diag_indices(P_c)] = np.exp(np.diag(cholL)) # exponentiate the diagnoal so cholL unique + for i in range(P_c): + for j in range(i): # multiply by exponentiated diags + cholL[i,j] = cholL[i,j]*np.sqrt(cholL[i,i]*cholL[j,j]) + + omega_c = np.dot(cholL, cholL.T) + + # Expand to include zeros of matrix + omega = np.zeros((P,P)) + s_caus_ind = np.argwhere(np.outer(s, s)) + omega[(s_caus_ind[:,0],s_caus_ind[:,1])] = omega_c.flatten() + return omega + +def _omega_neglogL(x,Zs,N_mats,sigma_LD): + omega_it = rebuild_omega(x) + joint_prob = jointEffect_probability(Zs,omega_it,sigma_LD,N_mats) + return - np.sum(np.log(joint_prob)) + +def numerical_omega(Zs,N_mats,sigma_LD,omega_start, tol=1.0e-10): + M,P = Zs.shape + solver_options = dict() + solver_options['fatol'] = 1.0e-8 + solver_options['xatol'] = tol + solver_options['disp'] = False + solver_options['maxiter'] = P*(P+1)*500 + + x_start = flatten_out_omega(omega_start) + + opt_results = scipy.optimize.minimize(_omega_neglogL,x_start,args=(Zs,N_mats,sigma_LD),method='Nelder-Mead',options=solver_options) + + return rebuild_omega(opt_results.x), opt_results + +def estimate_omega(Zs,Ns,sigma_LD): + logging.info('Beginning estimation of Omega ...') + + M,P = Zs.shape + N_mats = np.sqrt(np.einsum('mp, mq -> mpq',Ns, Ns)) + + return _posDef_adjustment(gmm_omega(Zs,Ns,sigma_LD)) + +def mtag_analysis(Zs, Ns, omega_hat, sigma_LD): + M,P = Zs.shape + + W_N = np.einsum('mp,pq->mpq',np.sqrt(Ns),np.eye(P)) + W_N_inv = np.linalg.inv(W_N) + Sigma_N = np.einsum('mpq,mqr->mpr',np.einsum('mpq,qr->mpr',W_N_inv,sigma_LD),W_N_inv) + + mtag_betas = np.zeros((M,P)) + mtag_se = np.zeros((M,P)) + mtag_factor = np.zeros((M,P)) + mtag_pval = np.zeros((M,P)) + mtag_z = np.zeros((M,P)) + p_values = lambda z: 2*(scipy.stats.norm.cdf(-1.*np.abs(z))) + + for p in range(P): + # Note that in the code, what I call "gamma should really be omega", but avoid the latter term due to possible confusion with big Omega + gamma_k = omega_hat[:,p] + tau_k_2 = omega_hat[p,p] + om_min_gam = omega_hat - np.outer(gamma_k,gamma_k)/tau_k_2 + + xx = om_min_gam + Sigma_N + inv_xx = np.linalg.inv(xx) + yy = gamma_k/tau_k_2 + W_inv_Z = np.einsum('mqp,mp->mq',W_N_inv,Zs) + + beta_denom = np.einsum('mp,p->m',np.einsum('q,mqp->mp',yy,inv_xx),yy) + mtag_factor[:,p] = np.einsum('mp,m->m',np.einsum('q,mqp->mp',yy,inv_xx), 1/beta_denom) + mtag_var_p = 1. / beta_denom + + mtag_betas[:,p] = np.einsum('mp,mp->m',np.einsum('q,mqp->mp',yy,inv_xx), W_inv_Z) / beta_denom + mtag_se[:,p] = np.sqrt(mtag_var_p) + mtag_z[:,p] = mtag_betas[:,p]/mtag_se[:,p] + mtag_pval[:,p] = p_values(mtag_z[:,p]) + +# print("\n betas:",mtag_betas) +# print("\n se:",mtag_se) +# print("\n mtag_factor:", mtag_factor) + print("\n mtag_z:", mtag_z) + print("\n mtag_pval:", mtag_pval) + return mtag_z, mtag_pval + +def mtag(Zs,Ns,sigma_hat): + omega_hat = estimate_omega(Zs, Ns, sigma_hat) + print("\n estimated omega:", omega_hat) + mtag_z, mtag_pval = mtag_analysis(Zs, Ns, omega_hat, sigma_hat) + print("lambda in MTAG raw p-vals:",chi2.sf(np.median(mtag_pval), df=1) / chi2.sf(0.5, df=1)) + min_mtag_pval = mtag_pval.min(axis=1) + return min_mtag_pval + +if __name__ == '__main__': + name='H0_null_k5_ce_0.25_ov_1.tsv' # to specify input file of Z-scores multiple traits + df = pd.read_csv('Z_score_'+name,delimiter='\t') + Zs = df[['z1','z2','z3','z4','z5']].values # extract Z-scores columns only + sigma_hat = pd.read_csv('COV_'+name,delimiter='\t').values # import null cov matrix. + N_samp = 50000 # sample size (pre-defined in the generation of the simulated data) + n_trait = 5 + + M = len(df) # number of SNPs + Ns = np.zeros([M,n_trait]) + N_samp + min_mtag_pval = mtag(Zs,Ns,sigma_hat) # min P value across traits (per SNP) + df['min_pval_mtag'] = min_mtag_pval + df.to_csv('Z_score_wMTAG_'+name,sep='\t',na_rep="NA",index=False)