Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Statistical-Genetics
hgcovid_imputation
Commits
630b993b
Commit
630b993b
authored
Jan 20, 2022
by
Hanna JULIENNE
Browse files
script for dataset generation
parents
Changes
2
Hide whitespace changes
Inline
Side-by-side
src/data/generate_Zscores.py
0 → 100644
View file @
630b993b
# coding: utf-8
# Direct imputation with the presence of causal SNPs
import
numpy
as
np
import
pandas
as
pd
import
seaborn
as
sns
import
scipy
import
raiss
import
os
import
sys
from
sklearn.preprocessing
import
StandardScaler
def
exponential_decay
(
nsnp
):
exp_decay
=
np
.
zeros
((
nsnp
,
nsnp
))
dec
=
np
.
exp
(
-
np
.
linspace
(
0
,
4
,
nsnp
))
for
i
in
range
(
nsnp
):
exp_decay
[
i
]
=
np
.
roll
(
dec
,
i
)
tr
=
np
.
triu
(
exp_decay
,
k
=
1
)
tr
=
tr
+
np
.
transpose
(
tr
)
np
.
fill_diagonal
(
tr
,
1
)
return
(
tr
)
# # # Create a vector of causal effect
# Insert one causal SNPs
def
compute_Y
(
beta
,
X
,
sig
=
1.0
):
noise
=
np
.
random
.
normal
(
size
=
X
.
shape
[
0
],
scale
=
sig
)
Y
=
X
.
dot
(
beta
)
+
noise
return
Y
def
compute_Zscore
(
Y
,
X
,
sig
):
nind
=
X
.
shape
[
0
]
Zscores
=
np
.
zeros
(
X
.
shape
[
1
])
for
i
in
range
(
X
.
shape
[
1
]):
x
=
X
.
iloc
[:,
i
]
Zscores
[
i
]
=
x
.
dot
(
Y
)
/
(
np
.
linalg
.
norm
(
x
,
2
)
*
sig
)
#X.iloc[:,i].corr(Y)# X.iloc[:,i].dot(Y) / X.iloc[:,i].dot(X.iloc[:,i])
#Zscores = Zscores*nind**0.5
#Zscores = (Zscores - Zscores.mean()) / np.std(Zscores)
return
Zscores
def
generate_one_causal
(
amplitude
,
LD_cor
,
X
,
sig
=
1.0
):
nsnp
=
LD_cor
.
shape
[
0
]
beta
=
np
.
random
.
normal
(
size
=
nsnp
,
scale
=
0.005
)
beta
[
nsnp
//
2
]
=
amplitude
Y
=
compute_Y
(
beta
,
X
,
sig
)
Zscore
=
compute_Zscore
(
Y
,
X
,
sig
)
return
({
"causal"
:
beta
,
"Zscore"
:
Zscore
,
"R2_genet"
:
np
.
var
(
X
.
dot
(
beta
))
/
np
.
var
(
Y
)})
def
generate_two_causal
(
amplitude
,
LD_cor
,
X
,
sig
=
1.0
):
nsnp
=
LD_cor
.
shape
[
0
]
beta
=
np
.
random
.
normal
(
size
=
nsnp
,
scale
=
0.005
)
beta
[
nsnp
//
4
]
=
amplitude
beta
[
3
*
(
nsnp
//
4
)]
=
amplitude
Y
=
compute_Y
(
beta
,
X
,
sig
)
Zscore
=
compute_Zscore
(
Y
,
X
,
sig
)
return
({
"causal"
:
beta
,
"Zscore"
:
Zscore
,
"R2_genet"
:
np
.
var
(
X
.
dot
(
beta
))
/
np
.
var
(
Y
)})
def
generate_two_opposite
(
amplitude
,
LD_cor
,
X
,
sig
=
1.0
):
nsnp
=
LD_cor
.
shape
[
0
]
beta
=
np
.
random
.
normal
(
size
=
nsnp
,
scale
=
0.005
)
beta
[
nsnp
//
4
]
=
amplitude
beta
[
3
*
(
nsnp
//
4
)]
=
-
amplitude
Y
=
compute_Y
(
beta
,
X
,
sig
)
Zscore
=
compute_Zscore
(
Y
,
X
,
sig
)
return
({
"causal"
:
beta
,
"Zscore"
:
Zscore
,
"R2_genet"
:
np
.
var
(
X
.
dot
(
beta
))
/
np
.
var
(
Y
)})
def
define_signal_generator
(
tag
):
if
tag
==
"one_causal"
:
return
generate_one_causal
if
tag
==
"two_causal"
:
return
generate_two_causal
if
tag
==
"two_opposite"
:
return
generate_two_opposite
def
get_perf
(
rd
,
Zscore
,
LD_cor
,
ids_known
,
ids_masked
):
imputed
=
raiss
.
stat_models
.
raiss_model
(
Zscore
[
ids_known
],
pd
.
DataFrame
(
LD_cor
[
ids_known
,:][:,
ids_known
]),
LD_cor
[
ids_masked
,:][:,
ids_known
],
rcond
=
rd
)
reconstructed
=
imputed
[
'mu'
]
*
np
.
sqrt
(
1
-
imputed
[
'var'
])
cor_perf
=
np
.
corrcoef
(
reconstructed
,
Zscore
[
ids_masked
])[
0
,
1
]
MAE
=
np
.
linalg
.
norm
(
reconstructed
-
Zscore
[
ids_masked
],
1
)
/
len
(
ids_masked
)
# L1- loss
id_large
=
np
.
where
(
np
.
abs
(
Zscore
[
ids_masked
])
>
6
)[
0
]
cor_large
=
np
.
corrcoef
(
reconstructed
[
id_large
],
Zscore
[
ids_masked
][
id_large
])[
0
,
1
]
MAE_large
=
np
.
linalg
.
norm
(
reconstructed
[
id_large
]
-
Zscore
[
ids_masked
][
id_large
],
1
)
/
len
(
id_large
)
return
({
'cor'
:
cor_perf
,
"MAE"
:
MAE
,
"cor_large"
:
cor_large
,
"MAE_large"
:
MAE_large
})
# Get best rd
def
best_rd
(
zscore
,
LD_cor
,
ids_known
,
ids_masked
):
rd_list
=
np
.
linspace
(
0.001
,
0.75
,
100
)
cor_perf
=
pd
.
Series
(
index
=
rd_list
)
MAE_perf
=
pd
.
Series
(
index
=
rd_list
)
cor_large
=
pd
.
Series
(
index
=
rd_list
)
MAE_large
=
pd
.
Series
(
index
=
rd_list
)
for
rd
in
rd_list
:
perf
=
get_perf
(
rd
,
zscore
,
LD_cor
,
ids_known
,
ids_masked
)
cor_perf
[
rd
]
=
perf
[
"cor"
]
MAE_perf
[
rd
]
=
perf
[
"MAE"
]
cor_large
[
rd
]
=
perf
[
"cor_large"
]
MAE_large
[
rd
]
=
perf
[
"MAE_large"
]
return
({
"correlation"
:
cor_perf
[
np
.
argmax
(
cor_perf
)],
"MAE"
:
MAE_perf
[
np
.
argmax
(
cor_perf
)],
"correlation_large"
:
cor_large
[
np
.
argmax
(
cor_perf
)],
"MAE_large"
:
MAE_large
[
np
.
argmax
(
cor_perf
)],
"rcond"
:
np
.
argmax
(
cor_perf
)})
if
__name__
==
"__main__"
:
# get the type of simulation from command line
tag
=
sys
.
argv
[
1
]
# possibles values are two_opposite, two_causal, one_causal
print
(
"Simulating values for {}"
.
format
(
tag
))
#get genotype
X
=
pd
.
read_csv
(
"./data/genotype.csv"
,
sep
=
"
\t
"
)
nsnp
=
X
.
shape
[
1
]
# Estimate LD from genotype:
scaler
=
StandardScaler
()
X
=
pd
.
DataFrame
(
scaler
.
fit_transform
(
X
))
X
.
head
()
LD_cor
=
X
.
corr
().
values
n_rep
=
100
n_masked
=
40
max_amp
=
10
ids_masked
=
np
.
random
.
choice
(
100
,
n_masked
,
replace
=
False
)
ids_known
=
np
.
setdiff1d
(
np
.
array
(
range
(
100
)),
ids_masked
)
amplitude_effect
=
pd
.
DataFrame
(
index
=
np
.
arange
(
0
,
n_rep
*
max_amp
,
1
),
columns
=
[
"amplitude"
,
"correlation"
,
"MAE"
,
"correlation_large"
,
"MAE_large"
])
id
=
0
np
.
linspace
(
1.8
,
2
,
3
)
for
amp
in
np
.
linspace
(
0
,
0.25
,
50
):
for
rep
in
range
(
n_rep
):
print
(
'search of best rd for amplitude {}'
.
format
(
amp
))
signal_generator
=
define_signal_generator
(
tag
)
signal
=
signal_generator
(
amp
,
LD_cor
,
X
,
sig
=
1
)
Zscore
=
signal
[
"Zscore"
]
beta
=
signal
[
"causal"
]
if
amp
==
0.15
:
pd
.
Series
(
Zscore
).
to_csv
(
"./results/Zscore_{0}.csv"
.
format
(
tag
))
pd
.
Series
(
beta
).
to_csv
(
"./results/Beta_{0}.csv"
.
format
(
tag
))
res
=
best_rd
(
Zscore
,
LD_cor
,
ids_known
,
ids_masked
)
print
(
"best rd : {}"
.
format
(
res
[
'rcond'
]))
print
(
"Correlation : {}"
.
format
(
res
[
'correlation'
]))
print
(
"MAE : {}"
.
format
(
res
[
'MAE'
]))
print
(
"R2_genet : {}"
.
format
(
signal
[
"R2_genet"
]))
amplitude_effect
.
loc
[
id
,
"amplitude"
]
=
amp
amplitude_effect
.
loc
[
id
,
"correlation"
]
=
res
[
'correlation'
]
amplitude_effect
.
loc
[
id
,
"MAE"
]
=
res
[
"MAE"
]
amplitude_effect
.
loc
[
id
,
"correlation_large"
]
=
res
[
'correlation_large'
]
amplitude_effect
.
loc
[
id
,
"MAE_large"
]
=
res
[
"MAE_large"
]
id
=
id
+
1
amplitude_effect
.
to_csv
(
"./results/amplitude_effect_{0}.csv"
.
format
(
tag
))
src/data/generate_genotype.R
0 → 100644
View file @
630b993b
library
(
Matrix
)
library
(
bindata
)
nvar
=
2000
# Number of SNPs
P
=
runif
(
nvar
,
0.1
,
0.9
)
cov_mat
=
matrix
(
rep
(
0
,
nvar
^
2
),
ncol
=
nvar
)
diag
(
cov_mat
)
=
P
*
(
1
-
P
)
get_possible_p11
<-
function
(
p1
,
p2
){
return
(
c
(
max
(
0
,
p1
+
p2
-1
),
min
(
c
(
p1
,
p2
))))}
for
(
i
in
1
:
(
nvar
-1
)){
for
(
j
in
(
i
+1
)
:
nvar
)
{
print
(
paste
(
i
,
j
))
p1
=
P
[
i
]
p2
=
P
[
j
]
p11_range
=
get_possible_p11
(
p1
,
p2
)
print
(
p11_range
)
p11
=
p11_range
[
1
]
+
(
p11_range
[
2
]
-
p11_range
[
1
])
*
0.55
C12
=
p11
-
p1
*
p2
cov_mat
[
i
,
j
]
=
C12
cov_mat
[
j
,
i
]
=
C12
}
}
CC
<-
cov2cor
(
cov_mat
)
## Simulate 10000 x-y pairs, and check that they have the specified
## correlation structure
larg
=
30
dec_mat
=
matrix
(
rep
(
0
,
nvar
^
2
),
ncol
=
nvar
)
dec
<-
exp
(
seq
(
0
,
-4
,
len
=
larg
+1
))
for
(
i
in
1
:
nvar
){
upb
=
min
(
i
+
larg
,
nvar
)
dec_mat
[
i
,
i
:
upb
]
=
dec
[
1
:
(
upb
-
i
+1
)]
}
dec_mat
=
dec_mat
+
t
(
dec_mat
)
diag
(
dec_mat
)
=
1
CC_dec
=
CC
*
dec_mat
bincor
=
bincorr2commonprob
(
P
,
CC_dec
)
Nind
=
16000
print
(
"## Generate genotype : "
)
x1
<-
rmvbin
(
Nind
,
margprob
=
P
,
bincorr
=
CC_dec
)
x2
<-
rmvbin
(
Nind
,
margprob
=
P
,
bincorr
=
CC_dec
)
print
(
"## save genotype:"
)
write.table
(
x1
+
x2
,
"../../data/processed/Simulated/Genotypes/genotype2.csv"
,
sep
=
"\t"
)
write.table
(
CC_dec
,
"../../data/processed/Simulated/Genotypes/LD_matrix2.csv"
,
sep
=
"\t"
)
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment