Commit 18917981 authored by Hanna JULIENNE

script to generate Zscores

parent c015f608
@@ -3,11 +3,10 @@
import numpy as np
import pandas as pd
import seaborn as sns
import scipy
import raiss
import os
import sys
import re
from sklearn.preprocessing import StandardScaler
def exponential_decay(nsnp):
@@ -33,7 +32,8 @@ def compute_Zscore(Y, X, sig):
Zscores = np.zeros(X.shape[1])
for i in range(X.shape[1]):
x = X.iloc[:,i]
Zscores[i] = x.dot(Y) / (np.linalg.norm(x,2)*sig)
#X.iloc[:,i].corr(Y)# X.iloc[:,i].dot(Y) / X.iloc[:,i].dot(X.iloc[:,i])
#Zscores = Zscores*nind**0.5
#Zscores = (Zscores - Zscores.mean()) / np.std(Zscores)
return Zscores
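# For column-standardized X and residual standard deviation sig, each entry above is
# the marginal association Z-score Z_i = x_i . Y / (||x_i||_2 * sig). A minimal,
# hypothetical usage sketch (toy shapes only; individuals in rows, SNPs in columns):
#   X_toy = pd.DataFrame(np.random.randn(500, 10))
#   Y_toy = 0.2 * X_toy.iloc[:, 0] + np.random.randn(500)
#   Z_toy = compute_Zscore(Y_toy, X_toy, sig=1)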
@@ -73,87 +73,60 @@ def define_signal_generator(tag):
if tag=="two_opposite":
return generate_two_opposite
if __name__ == '__main__':
# import sample size for each study
def get_perf(rd, Zscore, LD_cor, ids_known, ids_masked):
imputed = raiss.stat_models.raiss_model(Zscore[ids_known], pd.DataFrame(LD_cor[ids_known,:][:,ids_known]), LD_cor[ids_masked,:][:,ids_known], rcond=rd)
reconstructed = imputed['mu']* np.sqrt(1-imputed['var'])
cor_perf = np.corrcoef(reconstructed, Zscore[ids_masked])[0,1]
MAE = np.linalg.norm(reconstructed - Zscore[ids_masked], 1) / len(ids_masked) # L1 loss
id_large = np.where(np.abs(Zscore[ids_masked]) > 6)[0]
cor_large = np.corrcoef(reconstructed[id_large], Zscore[ids_masked][id_large])[0,1]
MAE_large = np.linalg.norm(reconstructed[id_large] - Zscore[ids_masked][id_large], 1) / len(id_large)
return({'cor':cor_perf, "MAE": MAE, "cor_large":cor_large, "MAE_large":MAE_large})
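# As used here, raiss.stat_models.raiss_model imputes the masked Z-scores from the
# known ones via the conditional mean of a multivariate normal,
#   mu = LD[masked, known] @ pinv(LD[known, known], rcond) @ Z[known],
# where rcond is the pseudo-inverse cutoff. 'var' is the variance of the imputation
# error, and the code scales the conditional mean by sqrt(1 - var) before comparing
# it with the held-out Z-scores.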
# Get best rd
def best_rd(zscore, LD_cor, ids_known, ids_masked):
rd_list = np.linspace(0.001,0.75, 100)
cor_perf = pd.Series(index=rd_list)
MAE_perf = pd.Series(index=rd_list)
cor_large = pd.Series(index=rd_list)
MAE_large = pd.Series(index=rd_list)
for rd in rd_list:
perf = get_perf(rd, zscore, LD_cor, ids_known, ids_masked)
cor_perf[rd] = perf["cor"]
MAE_perf[rd] = perf["MAE"]
cor_large[rd] = perf["cor_large"]
MAE_large[rd] = perf["MAE_large"]
return({"correlation": cor_perf[np.argmax(cor_perf)],
"MAE": MAE_perf[np.argmax(cor_perf)],
"correlation_large": cor_large[np.argmax(cor_perf)],
"MAE_large": MAE_large[np.argmax(cor_perf)],
"rcond":np.argmax(cor_perf)})
if __name__=="__main__":
Ssize = pd.read_csv("../../data/external/meta_data/Ncases_B2_vs_ALL.tsv", sep="\t")
Ssize = Ssize.loc[[re.search("EUR", i) is not None for i in Ssize.Name], :]
Ssize["per"] = Ssize.n_cases / (Ssize.n_cases + Ssize.n_controls)
Ssize["N_effective"] = np.floor(Ssize.per * (1- Ssize.per) * (Ssize.n_cases + Ssize.n_controls))
Ssize.N_effective = Ssize.N_effective.astype(int, copy=False)
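# Worked example of the effective-sample-size formula N_eff = p * (1 - p) * N used
# above: with n_cases = 2000 and n_controls = 8000, p = 0.2 and
# N_eff = floor(0.2 * 0.8 * 10000) = 1600.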
# get the type of simulation from command line
tag = sys.argv[1] # possible values are two_opposite, two_causal, one_causal
print("Simulating values for {}".format(tag))
#get genotype
X = pd.read_csv("./data/genotype.csv", sep="\t")
nsnp = X.shape[1]
# Estimate LD from genotype:
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X))
LD_cor = X.corr().values
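# With standardized genotypes, X.corr() is the SNP-by-SNP Pearson LD matrix; an
# equivalent NumPy sketch would be:
#   LD_cor = np.corrcoef(X.values, rowvar=False)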
n_rep = 100
n_masked = 40
max_amp = 10
ids_masked = np.random.choice(100, n_masked, replace=False)
ids_known = np.setdiff1d(np.array(range(100)), ids_masked)
amplitude_effect = pd.DataFrame(index=np.arange(0, n_rep*max_amp, 1),
columns = ["amplitude", "correlation", "MAE", "correlation_large", "MAE_large"])
id = 0
for amp in np.linspace(0,0.25,50):
for rep in range(n_rep):
print('search of best rd for amplitude {}'.format(amp))
signal_generator = define_signal_generator(tag)
signal = signal_generator(amp, LD_cor, X, sig=1)
Zscore = signal["Zscore"]
beta = signal["causal"]
tag = "one_causal" # possibles values are two_opposite, two_causal, one_causal
for tag in ['two_opposite', 'two_causal', 'one_causal']:
print("Simulating values for {}".format(tag))
#get genotype
X = pd.read_csv("../../data/processed/Simulated/Genotypes/genotype2.csv", sep="\t")
LD_cor = pd.read_csv("../../data/processed/Simulated/Genotypes/LD_matrix2.csv", sep="\t")
LD_cor = LD_cor.values
nsnp = X.shape[1]
# Standardize the genotypes (LD is read from file above; estimating it from X is commented out below):
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X))
#LD_cor = X.corr().values
n_snp = LD_cor.shape[0]
n_masked = 20
amp = 1
print('search of best rd for amplitude {}'.format(amp))
signal_generator = define_signal_generator(tag)
signal = signal_generator(amp, LD_cor, X, sig=1)
Zscore = signal["Zscore"]
beta = signal["causal"]
if amp==0.15:
pd.Series(Zscore).to_csv("./results/Zscore_{0}.csv".format(tag))
pd.Series(beta).to_csv("./results/Beta_{0}.csv".format(tag))
All_Zscore = pd.DataFrame({"Beta_meta-analysis" : Zscore / X.shape[0]**0.5, "Z_meta-analysis" : Zscore})
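# For standardized genotypes, Z ~ beta_hat * sqrt(N), so dividing the Z-scores by
# sqrt(n_individuals) recovers approximate effect sizes on the standardized scale
# (e.g. Z = 5 with N = 10000 gives beta_hat ~ 0.05).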
res = best_rd(Zscore, LD_cor, ids_known, ids_masked)
print("best rd : {}".format(res['rcond']))
print("Correlation : {}".format(res['correlation']))
print("MAE : {}".format(res['MAE']))
print("R2_genet : {}".format(signal["R2_genet"]))
for i, neff in enumerate(Ssize.N_effective):
print(i, neff)
amplitude_effect.loc[id, "amplitude"] = amp
amplitude_effect.loc[id, "correlation"] = res['correlation']
amplitude_effect.loc[id, "MAE"] = res["MAE"]
amplitude_effect.loc[id, "correlation_large"] = res['correlation_large']
amplitude_effect.loc[id, "MAE_large"] = res["MAE_large"]
id_start = Ssize.N_effective[:i].sum()
id_end = Ssize.N_effective[:(i+1)].sum()
study = Ssize.iloc[i,0]
print("Number of sample for {}".format(study))
print(Ssize.N_effective.iloc[i])
X_study = X.iloc[id_start:id_end,]
signal = signal_generator(amp, LD_cor, X_study, sig=1)
Zscore = signal["Zscore"]
id = id +1
All_Zscore["Z_{0}".format(study)] = Zscore
All_Zscore["Beta_{0}".format(study)] = Zscore / X_study.shape[0]**0.5
print(All_Zscore.head())
amplitude_effect.to_csv("./results/amplitude_effect_{0}.csv".format(tag))
All_Zscore.reset_index().to_csv("../../data/processed/Simulated/Zscores/Zscore_{0}.csv".format(tag))