Commit 57d09a93 authored by Hanna  JULIENNE's avatar Hanna JULIENNE

set more realistic parameters for causality simulation

parent 28ad0ecf
This source diff could not be displayed because it is too large. You can view the blob instead.
......@@ -8,13 +8,13 @@ import scipy
import raiss
import os
import sys
from sklearn.preprocessing import StandardScaler
def exponential_decay(nsnp):
exp_decay = np.zeros((nsnp, nsnp))
dec = np.exp(-np.linspace(0,4,nsnp))
for i in range(nsnp):
exp_decay[i] = np.roll(dec, i)
tr = np.triu(exp_decay, k=1)
tr = tr + np.transpose(tr)
np.fill_diagonal(tr,1)
......@@ -22,46 +22,48 @@ def exponential_decay(nsnp):
# # # Create a vector of causal effect
# Insert one causal SNPs
def compute_Y(beta, X, sig=1.0):
noise= np.random.normal(size=X.shape[0], scale=sig)
Y = X.dot(beta) + noise
return Y
def compute_Zscore(Y, X):
def compute_Zscore(Y, X, sig):
nind = X.shape[0]
Zscores = np.zeros(X.shape[1])
for i in range(X.shape[1]):
Zscores[i] = X.iloc[:,i].corr(Y)# X.iloc[:,i].dot(Y) / X.iloc[:,i].dot(X.iloc[:,i])
Zscores = Zscores*nind**0.5
x = X.iloc[:,i]
Zscores[i] = x.dot(Y) / (np.linalg.norm(x,2)*sig)#X.iloc[:,i].corr(Y)# X.iloc[:,i].dot(Y) / X.iloc[:,i].dot(X.iloc[:,i])
#Zscores = Zscores*nind**0.5
#Zscores = (Zscores - Zscores.mean()) / np.std(Zscores)
return Zscores
def generate_one_causal(amplitude, LD_cor, X):
def generate_one_causal(amplitude, LD_cor, X, sig=1.0):
nsnp = LD_cor.shape[0]
beta = np.random.normal(size=nsnp)
beta = np.random.normal(size=nsnp,scale=0.005)
beta[nsnp//2] = amplitude
Y = compute_Y(beta, X)
Zscore = compute_Zscore(Y, X)
return({"causal":beta, "Zscore":Zscore})
Y = compute_Y(beta, X, sig)
Zscore = compute_Zscore(Y, X, sig)
return({"causal":beta, "Zscore":Zscore,"R2_genet":np.var(X.dot(beta))/np.var(Y)})
def generate_two_causal(amplitude, LD_cor, X):
def generate_two_causal(amplitude, LD_cor, X, sig=1.0):
nsnp = LD_cor.shape[0]
beta = np.random.normal(size=nsnp)
beta = np.random.normal(size=nsnp,scale=0.005)
beta[nsnp//4] = amplitude
beta[3*(nsnp//4)] = amplitude
Y = compute_Y(beta, X)
Zscore = compute_Zscore(Y, X)
return({"causal":beta, "Zscore":Zscore})
Y = compute_Y(beta, X, sig)
Zscore = compute_Zscore(Y, X, sig)
return({"causal":beta, "Zscore":Zscore,"R2_genet":np.var(X.dot(beta))/np.var(Y)})
def generate_two_opposite(amplitude, LD_cor, X):
def generate_two_opposite(amplitude, LD_cor, X, sig=1.0):
nsnp = LD_cor.shape[0]
beta = np.random.normal(size=nsnp)
beta = np.random.normal(size=nsnp,scale=0.005)
beta[nsnp//4] = amplitude
beta[3*(nsnp//4)] = -amplitude
Y = compute_Y(beta, X)
Zscore = compute_Zscore(Y, X)
return({"causal":beta, "Zscore":Zscore})
Y = compute_Y(beta, X, sig)
Zscore = compute_Zscore(Y, X, sig)
return({"causal":beta, "Zscore":Zscore,"R2_genet":np.var(X.dot(beta))/np.var(Y)})
def define_signal_generator(tag):
if tag=="one_causal":
......@@ -74,12 +76,13 @@ def define_signal_generator(tag):
def get_perf(rd, Zscore, LD_cor, ids_known, ids_masked):
imputed = raiss.stat_models.raiss_model(Zscore[ids_known], pd.DataFrame(LD_cor[ids_known,:][:,ids_known]), LD_cor[ids_masked,:][:,ids_known], rcond=rd)
cor_perf = np.corrcoef(imputed['mu'], Zscore[ids_masked])[0,1]
MAE = np.linalg.norm(imputed['mu'] - Zscore[ids_masked], 1) / len(ids_masked) # L1- loss
reconstructed = imputed['mu']* np.sqrt(1-imputed['var'])
cor_perf = np.corrcoef(reconstructed, Zscore[ids_masked])[0,1]
MAE = np.linalg.norm(reconstructed - Zscore[ids_masked], 1) / len(ids_masked) # L1- loss
id_large = np.where(np.abs(Zscore[ids_masked]) > 6)[0]
cor_large = np.corrcoef(imputed['mu'][id_large], Zscore[ids_masked][id_large])[0,1]
MAE_large = np.linalg.norm(imputed['mu'][id_large] - Zscore[ids_masked][id_large], 1) / len(id_large)
cor_large = np.corrcoef(reconstructed[id_large], Zscore[ids_masked][id_large])[0,1]
MAE_large = np.linalg.norm(reconstructed[id_large] - Zscore[ids_masked][id_large], 1) / len(id_large)
return({'cor':cor_perf, "MAE": MAE, "cor_large":cor_large, "MAE_large":MAE_large})
# Get best rd
......@@ -111,28 +114,33 @@ if __name__=="__main__":
#get genotype
X = pd.read_csv("./data/genotype.csv", sep="\t")
nsnp = X.shape[1]
X.shape[0]
# Estimate LD from genotype:
scaler = StandardScaler()
X = pd.DataFrame(scaler.fit_transform(X))
X
LD_cor = X.corr().values
n_rep = 100
n_masked = 40
max_amp = 50
max_amp = 10
ids_masked = np.random.choice(100, n_masked, replace=False)
ids_known = np.setdiff1d(np.array(range(100)), ids_masked)
amplitude_effect = pd.DataFrame(index=np.arange(0, n_rep*max_amp, 1), columns = ["amplitude", "correlation", "MAE", "correlation_large", "MAE_large"])
amplitude_effect = pd.DataFrame(index=np.arange(0, n_rep*max_amp, 1),
columns = ["amplitude", "correlation", "MAE", "correlation_large", "MAE_large"])
id = 0
for amp in range(max_amp):
np.linspace(1.8,2,3)
for amp in np.linspace(0,0.25,50):
for rep in range(n_rep):
print('search of best rd for amplitude {}'.format(amp))
signal_generator = define_signal_generator(tag)
signal = signal_generator(amp, LD_cor, X)
signal = signal_generator(amp, LD_cor, X, sig=1)
Zscore = signal["Zscore"]
beta = signal["causal"]
if amp==25:
if amp==0.15:
pd.Series(Zscore).to_csv("./results/Zscore_{0}.csv".format(tag))
pd.Series(beta).to_csv("./results/Beta_{0}.csv".format(tag))
......@@ -140,6 +148,7 @@ if __name__=="__main__":
print("best rd : {}".format(res['rcond']))
print("Correlation : {}".format(res['correlation']))
print("MAE : {}".format(res['MAE']))
print("R2_genet : {}".format(signal["R2_genet"]))
amplitude_effect.loc[id, "amplitude"] = amp
amplitude_effect.loc[id, "correlation"] = res['correlation']
......@@ -149,7 +158,4 @@ if __name__=="__main__":
id = id +1
amplitude_effect.to_csv("./results/amplitude_effect_{0}.csv".format(tag))
......@@ -67,7 +67,7 @@ for( tag in c("one_causal", "two_causal", "two_opposite"))
}
# MAE graph
p3 = ggplot(perf, aes(x=amplitude, y=MAE)) + geom_point(data=performance, aes(x=amplitude, y=MAE), size=0.2) + geom_line(color="royalblue1",size=1) +geom_errorbar(aes(ymin=MAE-sd_mae/(count^0.5), ymax=MAE+sd_mae/(count^0.5)),color="royalblue1", position=position_dodge(0.05))
p3 = p3 + ylab("")+ ylim(0,15)
p3 = p3 + ylab("")#+ ylim(0,15)
plot_list_for_reviewer[[paste0(tag, "_MAE")]] = p3
# cor_large graph
......@@ -77,7 +77,7 @@ for( tag in c("one_causal", "two_causal", "two_opposite"))
# MAE_large graph
p5 = ggplot(perf, aes(x=amplitude, y=MAE_large)) + geom_point(data=performance, aes(x=amplitude, y=MAE_large), size=0.2) + geom_line(color="royalblue1",size=1) +geom_errorbar(aes(ymin=MAE_large-sd_MAE_large/(count^0.5), ymax=MAE_large+sd_MAE_large/(count^0.5)),color="royalblue1", position=position_dodge(0.05))
p5 = p5 + ylab("")+ ylim(0,15)
p5 = p5 + ylab("")#+ ylim(0,15)
plot_list_for_reviewer[[paste0(tag, "_MAE_large")]] = p5
}
......
......@@ -2,7 +2,7 @@ source("./constraints.R")
library(Matrix)
nvar = 100
P = runif(nvar, 0.05, 0.95)
P = runif(nvar, 0.1, 0.9)
cov_mat = matrix(rep(0, nvar^2), ncol=nvar)
diag(cov_mat) = P*(1-P)
......@@ -18,7 +18,7 @@ for( i in 1:(nvar-1))
p11_range = get_possible_p11(p1,p2)
print(p11_range)
p11 = p11_range[1] + (p11_range[2] - p11_range[1]) * 0.75
p11 = p11_range[1] + (p11_range[2] - p11_range[1]) * 0.8
C12 = p11 - p1 * p2
cov_mat[i, j] = C12
......@@ -34,14 +34,15 @@ p11
## Simulate 10000 x-y pairs, and check that they have the specified
## correlation structure
bandSparse(nvar, nvar ,10 )
larg=25
bandSparse(nvar, nvar ,larg )
dec_mat = matrix(rep(0, nvar^2), ncol=nvar)
dec <- exp(seq(0,-4, len=nvar))
for(i in 1:nvar)
{
upb = min(i+10, nvar)
upb = min(i+larg, nvar)
dec_mat[i, i:upb] = 1 #dec[1:(nvar-i +1)]
}
......@@ -58,5 +59,5 @@ range(bincorr2commonprob(P, CC_dec))
x1 <- rmvbin(10000, margprob = P, bincorr = CC_dec)
x2 <- rmvbin(10000, margprob = P, bincorr = CC_dec)
write.table(x1 + x2, "genotype.csv", sep="\t")
write.table(CC_dec, "LD_matrix.csv", sep="\t")
write.table(x1 + x2, "genotype2.csv", sep="\t")
write.table(CC_dec, "LD_matrix2.csv", sep="\t")
0,-0.8869402161585944
1,0.6612390872476919
2,-0.8335827545945383
3,0.608329895185916
4,-0.3565991247915173
5,-0.9666506145673204
6,-1.0332139137068788
7,0.12959969600651824
8,-0.5743452686645449
9,-0.819768540048948
10,0.3166240551916395
11,-0.001034096845169535
12,-0.33714874226775554
13,1.1412254000650963
14,0.4609867113704986
15,1.1194474549784614
16,-0.7436053955612772
17,-1.2615122020387117
18,1.48342352546725
19,-0.14155180832367575
20,0.2853813484365789
21,-0.5103104091784904
22,1.7899155116797025
23,0.7943661363432482
24,-0.8446995865754815
25,25.0
26,0.579373108154632
27,-0.5691141496296795
28,1.2354022827422597
29,1.0606573530529075
30,0.8282054911958447
31,0.5935815019730685
32,0.025378418638609332
33,-0.08058290637278757
34,0.10061243556502907
35,-0.5477739696258856
36,2.2965829772221413
37,-0.4027922402181882
38,1.438566133990857
39,-0.496701446017469
40,-0.3868979305572139
41,-0.4108489340675367
42,0.679893702286411
43,0.028864108703664192
44,0.6104729637030691
45,-2.110676614652239
46,0.13411814689389556
47,-1.7161320919584688
48,-0.8471466817831336
49,1.3801224494279605
50,1.0632177893315686
51,-2.1887273818483663
52,-0.1513893110005833
53,-0.8474042472933034
54,0.8506221401022309
55,-0.6931308070482582
56,0.6727423446608372
57,0.1410861211357131
58,-1.6074991492059763
59,-0.16363766257680418
60,2.334849150191125
61,1.531318608250547
62,0.048406088914412906
63,1.6537384178833072
64,-0.419075457161441
65,-0.9744701833462486
66,-0.1311465586657584
67,0.16828715630822416
68,0.726867662105939
69,0.7360017259899292
70,0.6329112884596209
71,-0.22138098441855594
72,-0.8974451783257323
73,-0.797744271469423
74,0.06433133330985821
75,-25.0
76,-0.41806935699550474
77,-0.4909744151454624
78,-0.7183804900013461
79,-2.909689953602784
80,-0.05869701473301918
81,-0.47775701750714283
82,-0.9009327316093898
83,0.7023474585701137
84,0.13278437005378263
85,0.30536077272971385
86,0.8861855509074544
87,0.4193405912102993
88,-1.8960155653691282
89,-0.024086326106496192
90,-1.4967541399935167
91,-0.26099653663054495
92,0.31317998973730043
93,0.965000347623915
94,0.4172043647473007
95,0.16142085830463437
96,-0.3263661457564578
97,0.716929978261094
98,-0.16762360860595252
99,-0.2089843253852639
0,0.04694754977582493
1,0.04226844359442919
2,-0.021491228042423843
3,0.0698667588126381
4,-0.0044870850024749135
5,0.058366246866113296
6,-0.02077273846665261
7,0.11215650879283137
8,-0.03454439194825554
9,0.027654317680121172
10,-0.0736195780984284
11,0.05542268143134206
12,-0.09601829359152603
13,-0.035711718591383494
14,-0.03185092952388457
15,0.054940848746061435
16,0.0377160288906693
17,0.044050735224466835
18,-0.00570524803284919
19,0.03857436281529941
20,0.014809284510430499
21,-0.019084179214520453
22,-0.03954292664555769
23,0.03148590536817436
24,0.03703918604441982
25,2.0
26,-0.04049259600716007
27,0.010040185507156052
28,-0.000725309630104701
29,-0.06773859016224815
30,0.0399023754157354
31,0.02557902548446399
32,0.05410905808254046
33,0.03841561700140231
34,-0.03539255856646043
35,0.013868833091799468
36,-0.03281275738236295
37,-0.008571420373758118
38,0.036127937022497024
39,0.02325029753786089
40,-0.037555522580666696
41,-0.008656101252455405
42,0.018455881572649987
43,0.027402591317609106
44,0.0027645989617928572
45,-0.04130234169164259
46,0.0680871993552064
47,-0.043418704700576176
48,0.042913296663970875
49,-0.043218951735771914
50,-0.04632192200970746
51,0.08170664183343535
52,-0.0016497259790776266
53,0.10194670919553656
54,0.07203923146250264
55,-0.03914058399506972
56,0.015629764277463962
57,0.04403112745893175
58,-0.1268903800814886
59,0.03302646129844055
60,0.0008295450268236093
61,0.023451488764675326
62,0.07294067289556867
63,-0.04640712861720048
64,0.09695364918535421
65,-0.026635988182427985
66,-0.06158376328572244
67,-0.04020209552314207
68,-0.03871069171851738
69,0.02207768900710494
70,-0.0420339916135966
71,0.027697586730356064
72,0.02032879508674277
73,0.018529259611598906
74,-0.04173674945227409
75,-2.0
76,0.0017807211059030937
77,0.0011426313766364309
78,-0.02248398730614028
79,0.0002152937953432587
80,-0.002574233069625285
81,0.03695498580746412
82,0.04972808358012498
83,0.01000180548192501
84,0.031094932036629844
85,0.03213792202110334
86,-0.03412696503796042
87,0.10623627026977339
88,0.060156969744476034
89,0.04223574087240625
90,-0.01777897953430534
91,-0.004124258440330784
92,-0.031980073576833835
93,0.0038091234050792754
94,0.040020000564126905
95,-0.017564946698255434
96,-0.03737177979874396
97,0.004512622853279211
98,0.05079944397210188
99,0.06964244722767171
0,-3.095525062540547
1,-2.0394874538435586
2,-2.9856638042356756
3,-1.5690555188027293
4,-0.9917356944553491
5,-6.9233426279333194
6,-4.313352078108361
7,-3.1003176324515898
8,-2.242928676915172
9,-3.128921956835766
10,0.08875851874822568
11,1.5164008344821744
12,0.5217168623481246
13,4.879539443275154
14,0.8207325852675359
15,2.4010326580960872
16,22.177880164815235
17,-0.6085494351165335
18,19.20746937090012
19,21.103557814353564
20,29.9258262205742
21,31.148938068406483
22,19.263276638056766
23,28.723793279271863
24,15.31132946840329
25,71.36287449237025
26,37.84847967428527
27,0.2303686230877523
28,21.54443305440848
29,33.996890006359344
30,28.313580323896165
31,23.22262456923725
32,20.79998272598421
33,12.11190386774618
34,23.03515045268077
35,19.691059064349012
36,5.977267300520829
37,3.7431937143693803
38,3.9916512991437467
39,2.080229013295296
40,1.6346777831332013
41,-1.0626994686708333
42,-1.2919461845332778
43,-1.992878818014338
44,-0.697175772067122
45,-7.332656861773248
46,-2.049683697573815
47,-5.372481831406521
48,-3.0201168106951113
49,-0.34434050570413566
50,1.2352431532453043
51,-4.996265336400906
52,2.352199650194564
53,-0.6007378340234545
54,-1.147391325078494
55,-2.874829450355964
56,6.502298003845161
57,6.504860958753113
58,1.1680182319352206
59,3.7194915716654267
60,4.902182635976309
61,4.581128921123655
62,2.9329670955516365
63,0.6933557526400508
64,-0.04830495822108148
65,-1.6944318226847226
66,2.2234204431425484
67,-3.7713972211167714
68,-17.677228850888046
69,-18.653015110847786
70,-18.626744005547017
71,-6.278244536912221
72,-17.307442450111346
73,-1.0038368695661073
74,-35.95663501034087
75,-66.47326671978807
76,-10.263741334150602
77,0.10227431726584715
78,-39.40487294921039
79,-34.66369292407956
80,-20.666053051813943
81,0.19691158882543155
82,-29.33495976162811
83,-3.8648586567064367
84,-7.959486180708682
85,2.7258963836262726
86,-2.2449684641583882
87,-0.988685132056945
88,-9.421273469055095
89,-3.8497131201268022
90,-3.107573306374102
91,-1.7775196593989966
92,-3.995601719522443
93,2.5595976012321167
94,0.0239810350008681
95,-0.04246647710039686
96,1.0874333402464913
97,1.6712864285678304
98,-0.5532902453162698
99,-1.692207136972914
0,9.439063730990796
1,10.048695672201008
2,-0.41218284877694694
3,6.710076609648377
4,1.6565560703307407
5,-1.49095538164334
6,3.8374253539106595
7,7.230537934037819
8,9.306564819750221
9,5.742574028099734
10,7.679380030679864
11,15.775518130471895
12,2.2394528273280314
13,9.876676734943945
14,4.4826434719712145
15,13.735138129912333
16,69.75229470845842
17,11.413272795506934
18,43.977022928278444
19,52.828020203821275
20,74.94904355794017
21,79.45141323459313
22,37.18384543357539
23,75.15674685889562
24,45.47356368494674
25,203.11583293781507
26,93.63159980413529
27,2.2532596128847486
28,42.48083545005669
29,82.72740931661485
30,72.67110364170168
31,57.41784986883803
32,56.395012113471694
33,31.5595040245512
34,60.69751114930517
35,55.782817159558086
36,-0.1026526970952303
37,7.128138071841939
38,9.261758858021858
39,10.66592001652904
40,3.0747339491180132
41,-1.3801795536569885
42,1.6120440110247158
43,-4.02579239013612
44,-0.1062214597661674
45,-6.758457483343564
46,-0.8477557204137259
47,-8.009675949175042
48,-1.5569154576138813
49,-7.786216457177067
50,-5.246972447940347
51,8.136798580663799
52,7.711192432554995
53,11.540288794368681
54,5.332625289532787
55,2.1374671676123893
56,14.614629486286915
57,16.53885705068143
58,-7.406151298668459
59,0.9839406905007813
60,-3.312799965828185
61,1.0783560715984597
62,6.346415342886887
63,-6.8810102008162435
64,4.892451867782394
65,-1.7540003501635977
66,-0.37789098030939655
67,-16.369744479191237
68,-70.19888997616147
69,-57.826616011627934
70,-60.45401161556913
71,-9.089879861083773
72,-36.48885015508075
73,3.894602855354146
74,-108.5734009380406
75,-201.78226858246566
76,-22.090197601005784
77,8.454497247560465
78,-109.3310358899305
79,-82.18481598426881
80,-53.72245048650761
81,16.51299100952041
82,-68.84075147915338
83,-1.9521443188903347
84,-11.912400366502814
85,16.983836185927426
86,2.973923484524903
87,14.024674033696149
88,2.065122978778379
89,8.174792153731753
90,11.017317383635982
91,10.92380755478732
92,-8.397850554481614
93,13.034059887250272
94,13.174559930750616
95,7.179421816973769
96,4.082104758042993
97,12.512564507725685
98,11.281860953595793
99,7.623932116190209
Markdown is supported
0% or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment