Commit 630b993b authored by Hanna JULIENNE's avatar Hanna JULIENNE
Browse files

script for dataset generation

parents
# coding: utf-8
# Direct imputation with the presence of causal SNPs
import numpy as np
import pandas as pd
import seaborn as sns
import scipy
import raiss
import os
import sys
from sklearn.preprocessing import StandardScaler
def exponential_decay(nsnp):
    """Build a symmetric nsnp x nsnp matrix with unit diagonal whose
    off-diagonal entries decay exponentially (exp(0) down to exp(-4))
    with the distance to the diagonal — a toy LD-like matrix."""
    profile = np.exp(-np.linspace(0, 4, nsnp))
    # Row i is the decay profile shifted so its peak sits on column i.
    rolled = np.array([np.roll(profile, shift) for shift in range(nsnp)])
    # Keep only the strictly-upper triangle, then mirror it to get symmetry.
    upper = np.triu(rolled, k=1)
    sym = upper + upper.T
    np.fill_diagonal(sym, 1)
    return sym
# # # Create a vector of causal effect
# Insert one causal SNPs
def compute_Y(beta, X, sig=1.0):
    """Simulate a phenotype: linear genetic effect X @ beta plus i.i.d.
    Gaussian noise with standard deviation `sig` (one draw per individual)."""
    n_ind = X.shape[0]
    eps = np.random.normal(size=n_ind, scale=sig)
    return X.dot(beta) + eps
def compute_Zscore(Y, X, sig):
    """Marginal Z-score for each SNP: projection of the phenotype Y on each
    genotype column, scaled by the column's L2 norm and the noise s.d."""
    def _single_z(col):
        return col.dot(Y) / (np.linalg.norm(col, 2) * sig)
    return np.array([_single_z(X.iloc[:, j]) for j in range(X.shape[1])])
def generate_one_causal(amplitude, LD_cor, X, sig=1.0):
    """Simulate a trait with a single causal SNP (placed at the middle
    position) of the given amplitude, on top of small random polygenic
    effects (s.d. 0.005). Returns the betas, the marginal Z-scores and the
    genetic variance fraction of the simulated phenotype."""
    nsnp = LD_cor.shape[0]
    beta = np.random.normal(size=nsnp, scale=0.005)
    beta[nsnp // 2] = amplitude
    Y = compute_Y(beta, X, sig)
    zscores = compute_Zscore(Y, X, sig)
    genetic_r2 = np.var(X.dot(beta)) / np.var(Y)
    return {"causal": beta, "Zscore": zscores, "R2_genet": genetic_r2}
def generate_two_causal(amplitude, LD_cor, X, sig=1.0):
    """Simulate a trait with two causal SNPs of equal amplitude (at the first
    and third quarter positions), on top of small random polygenic effects.
    Returns the betas, the marginal Z-scores and the genetic variance
    fraction of the simulated phenotype."""
    nsnp = LD_cor.shape[0]
    beta = np.random.normal(size=nsnp, scale=0.005)
    beta[nsnp // 4] = amplitude
    beta[3 * (nsnp // 4)] = amplitude
    Y = compute_Y(beta, X, sig)
    zscores = compute_Zscore(Y, X, sig)
    genetic_r2 = np.var(X.dot(beta)) / np.var(Y)
    return {"causal": beta, "Zscore": zscores, "R2_genet": genetic_r2}
def generate_two_opposite(amplitude, LD_cor, X, sig=1.0):
    """Simulate a trait with two causal SNPs of opposite amplitudes (+amp at
    the first quarter position, -amp at the third quarter), on top of small
    random polygenic effects. Returns the betas, the marginal Z-scores and
    the genetic variance fraction of the simulated phenotype."""
    nsnp = LD_cor.shape[0]
    beta = np.random.normal(size=nsnp, scale=0.005)
    beta[nsnp // 4] = amplitude
    beta[3 * (nsnp // 4)] = -amplitude
    Y = compute_Y(beta, X, sig)
    zscores = compute_Zscore(Y, X, sig)
    genetic_r2 = np.var(X.dot(beta)) / np.var(Y)
    return {"causal": beta, "Zscore": zscores, "R2_genet": genetic_r2}
def define_signal_generator(tag):
    """Map a simulation tag to the corresponding signal-generator function.

    Parameters
    ----------
    tag : str
        One of "one_causal", "two_causal" or "two_opposite".

    Returns
    -------
    callable
        The matching generate_* function.

    Raises
    ------
    ValueError
        If `tag` is not a known simulation type. (Previously an unknown tag
        silently returned None, which only crashed later at call time.)
    """
    if tag == "one_causal":
        return generate_one_causal
    if tag == "two_causal":
        return generate_two_causal
    if tag == "two_opposite":
        return generate_two_opposite
    raise ValueError("Unknown simulation tag: {!r}".format(tag))
def get_perf(rd, Zscore, LD_cor, ids_known, ids_masked):
    """Impute the masked Z-scores with RAISS at regularisation `rd` and
    measure imputation quality: Pearson correlation and L1 error (MAE)
    against the true Z-scores, overall and restricted to large signals
    (|Z| > 6)."""
    known_LD = pd.DataFrame(LD_cor[ids_known, :][:, ids_known])
    cross_LD = LD_cor[ids_masked, :][:, ids_known]
    imputed = raiss.stat_models.raiss_model(Zscore[ids_known], known_LD, cross_LD, rcond=rd)
    # Rescale the posterior mean by the residual s.d. of the imputation.
    reconstructed = imputed['mu'] * np.sqrt(1 - imputed['var'])
    truth = Zscore[ids_masked]
    strong = np.where(np.abs(truth) > 6)[0]
    return {
        'cor': np.corrcoef(reconstructed, truth)[0, 1],
        "MAE": np.linalg.norm(reconstructed - truth, 1) / len(ids_masked),  # L1 loss
        "cor_large": np.corrcoef(reconstructed[strong], truth[strong])[0, 1],
        "MAE_large": np.linalg.norm(reconstructed[strong] - truth[strong], 1) / len(strong),
    }
# Get best rd
def best_rd(zscore, LD_cor, ids_known, ids_masked):
    """Grid-search the `rcond` regularisation of the RAISS imputation and
    report performance at the value maximising the imputation correlation.

    Parameters
    ----------
    zscore : array of Z-scores for all SNPs.
    LD_cor : LD (correlation) matrix of the SNPs.
    ids_known, ids_masked : index arrays of the observed / masked SNPs.

    Returns
    -------
    dict with the best correlation / MAE (overall and restricted to |Z|>6)
    and "rcond", the regularisation value achieving the best correlation.
    """
    rd_grid = np.linspace(0.001, 0.75, 100)
    cor_perf = pd.Series(index=rd_grid, dtype=float)
    MAE_perf = pd.Series(index=rd_grid, dtype=float)
    cor_large = pd.Series(index=rd_grid, dtype=float)
    MAE_large = pd.Series(index=rd_grid, dtype=float)
    for rd in rd_grid:
        perf = get_perf(rd, zscore, LD_cor, ids_known, ids_masked)
        cor_perf[rd] = perf["cor"]
        MAE_perf[rd] = perf["MAE"]
        cor_large[rd] = perf["cor_large"]
        MAE_large[rd] = perf["MAE_large"]
    # Bug fix: np.argmax(Series) returns a *position* (0..99), so the old
    # code reported a row number as "rcond" instead of the actual rcond
    # value, and the positional lookups cor_perf[pos] break on a
    # float-labelled Series with modern pandas. idxmax() returns the rd
    # label of the maximum, which is what we want everywhere below.
    best = cor_perf.idxmax()
    return {"correlation": cor_perf[best],
            "MAE": MAE_perf[best],
            "correlation_large": cor_large[best],
            "MAE_large": MAE_large[best],
            "rcond": best}
if __name__ == "__main__":
    # Simulation type from the command line:
    # possible values are two_opposite, two_causal, one_causal
    tag = sys.argv[1]
    print("Simulating values for {}".format(tag))
    # Load the genotype matrix (individuals x SNPs), tab separated.
    X = pd.read_csv("./data/genotype.csv", sep="\t")
    nsnp = X.shape[1]
    # Standardise genotypes, then estimate LD as the empirical correlation matrix.
    scaler = StandardScaler()
    X = pd.DataFrame(scaler.fit_transform(X))
    LD_cor = X.corr().values
    n_rep = 100    # repetitions per amplitude value
    n_masked = 40  # number of SNPs whose Z-scores are masked then imputed
    n_amp = 50     # number of amplitude values on the grid below
    # Fixed random mask over the first 100 SNPs; the rest stay observed.
    ids_masked = np.random.choice(100, n_masked, replace=False)
    ids_known = np.setdiff1d(np.array(range(100)), ids_masked)
    # One row per (amplitude, repetition) pair. The index was previously
    # sized n_rep*max_amp = 1000 although the loops fill 5000 rows (the
    # extra rows were appended one by one through .loc).
    amplitude_effect = pd.DataFrame(index=np.arange(0, n_rep * n_amp, 1),
                                    columns=["amplitude", "correlation", "MAE",
                                             "correlation_large", "MAE_large"])
    row_id = 0  # renamed from `id`, which shadowed the builtin
    for amp in np.linspace(0, 0.25, n_amp):
        for rep in range(n_rep):
            print('search of best rd for amplitude {}'.format(amp))
            signal_generator = define_signal_generator(tag)
            signal = signal_generator(amp, LD_cor, X, sig=1)
            Zscore = signal["Zscore"]
            beta = signal["causal"]
            # Keep one example signal for inspection. Bug fix: `amp == 0.15`
            # was never true because 0.15 is not a point of
            # np.linspace(0, 0.25, 50), so these files were never written;
            # match the closest grid point instead (half grid step ~0.00255).
            if np.isclose(amp, 0.15, atol=0.0026):
                pd.Series(Zscore).to_csv("./results/Zscore_{0}.csv".format(tag))
                pd.Series(beta).to_csv("./results/Beta_{0}.csv".format(tag))
            res = best_rd(Zscore, LD_cor, ids_known, ids_masked)
            print("best rd : {}".format(res['rcond']))
            print("Correlation : {}".format(res['correlation']))
            print("MAE : {}".format(res['MAE']))
            print("R2_genet : {}".format(signal["R2_genet"]))
            amplitude_effect.loc[row_id, "amplitude"] = amp
            amplitude_effect.loc[row_id, "correlation"] = res['correlation']
            amplitude_effect.loc[row_id, "MAE"] = res["MAE"]
            amplitude_effect.loc[row_id, "correlation_large"] = res['correlation_large']
            amplitude_effect.loc[row_id, "MAE_large"] = res["MAE_large"]
            row_id = row_id + 1
    amplitude_effect.to_csv("./results/amplitude_effect_{0}.csv".format(tag))
# ---- R script: simulate correlated binary genotypes with `bindata` ----
library(Matrix)
library(bindata)
nvar = 2000 # Number of SNPs
# Allele frequencies drawn uniformly in [0.1, 0.9].
P = runif(nvar, 0.1, 0.9)
# Covariance matrix of the binary variables: Bernoulli variance p(1-p)
# on the diagonal, pairwise covariances filled in below.
cov_mat = matrix(rep(0, nvar^2), ncol=nvar)
diag(cov_mat) = P*(1-P)
# Feasible range (Frechet bounds) for the joint probability
# p11 = P(X_i = 1, X_j = 1) given the marginals p1 and p2.
get_possible_p11<- function(p1,p2){return(c(max(0, p1+p2-1), min(c(p1,p2))))}
for( i in 1:(nvar-1)){
for(j in (i+1):nvar)
{
print(paste(i,j))
p1 = P[i]
p2 = P[j]
p11_range = get_possible_p11(p1,p2)
print(p11_range)
# Pick p11 at 55% of its feasible range, i.e. slightly above the midpoint,
# so every pair ends up positively associated.
p11 = p11_range[1] + (p11_range[2] - p11_range[1]) * 0.55
# Covariance implied by the chosen joint probability.
C12 = p11 - p1 * p2
cov_mat[i, j] = C12
cov_mat[j, i] = C12
}
}
# Convert the covariance matrix to a correlation matrix.
CC <- cov2cor(cov_mat)
## Simulate 10000 x-y pairs, and check that they have the specified
## correlation structure
# Dampen correlations with distance to the diagonal: exponential decay
# over a band of `larg` SNPs, zero outside the band.
larg=30
dec_mat = matrix(rep(0, nvar^2), ncol=nvar)
dec <- exp(seq(0,-4, len=larg+1))
for(i in 1:nvar){
upb = min(i+larg, nvar)
dec_mat[i, i:upb] = dec[1:(upb-i+1)]
}
# Symmetrise the band; the diagonal (doubled by the addition) is reset to 1.
dec_mat = dec_mat + t(dec_mat)
diag(dec_mat) = 1
# Element-wise product: banded, decaying LD structure.
CC_dec =CC*dec_mat
# NOTE(review): `bincor` is computed but never used below — rmvbin is called
# with the correlation matrix directly; confirm this line can be removed.
bincor = bincorr2commonprob(P,CC_dec)
Nind=16000
print("## Generate genotype : ")
# Two correlated binary draws per individual; their sum gives 0/1/2 genotypes.
x1 <- rmvbin(Nind, margprob = P, bincorr = CC_dec)
x2 <- rmvbin(Nind, margprob = P, bincorr = CC_dec)
print("## save genotype:")
write.table(x1 + x2, "../../data/processed/Simulated/Genotypes/genotype2.csv", sep="\t")
write.table(CC_dec, "../../data/processed/Simulated/Genotypes/LD_matrix2.csv", sep="\t")
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment