Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Statistical-Genetics
hgcovid_imputation
Commits
18917981
Commit
18917981
authored
Jan 26, 2022
by
Hanna JULIENNE
Browse files
script to generate Zscores
parent
c015f608
Changes
1
Hide whitespace changes
Inline
Side-by-side
src/data/generate_Zscores.py
View file @
18917981
...
...
@@ -3,11 +3,10 @@
import
numpy
as
np
import
pandas
as
pd
import
seaborn
as
sns
import
scipy
import
raiss
import
os
import
sys
import
re
from
sklearn.preprocessing
import
StandardScaler
def
exponential_decay
(
nsnp
):
...
...
@@ -33,7 +32,8 @@ def compute_Zscore(Y, X, sig):
Zscores
=
np
.
zeros
(
X
.
shape
[
1
])
for
i
in
range
(
X
.
shape
[
1
]):
x
=
X
.
iloc
[:,
i
]
Zscores
[
i
]
=
x
.
dot
(
Y
)
/
(
np
.
linalg
.
norm
(
x
,
2
)
*
sig
)
#X.iloc[:,i].corr(Y)# X.iloc[:,i].dot(Y) / X.iloc[:,i].dot(X.iloc[:,i])
Zscores
[
i
]
=
x
.
dot
(
Y
)
/
(
np
.
linalg
.
norm
(
x
,
2
)
*
sig
)
#X.iloc[:,i].corr(Y)# X.iloc[:,i].dot(Y) / X.iloc[:,i].dot(X.iloc[:,i])
#Zscores = Zscores*nind**0.5
#Zscores = (Zscores - Zscores.mean()) / np.std(Zscores)
return
Zscores
...
...
@@ -73,87 +73,60 @@ def define_signal_generator(tag):
if
tag
==
"two_opposite"
:
return
generate_two_opposite
if
__name__
==
'__main__'
:
# import sample size each study
def
get_perf
(
rd
,
Zscore
,
LD_cor
,
ids_known
,
ids_masked
):
imputed
=
raiss
.
stat_models
.
raiss_model
(
Zscore
[
ids_known
],
pd
.
DataFrame
(
LD_cor
[
ids_known
,:][:,
ids_known
]),
LD_cor
[
ids_masked
,:][:,
ids_known
],
rcond
=
rd
)
reconstructed
=
imputed
[
'mu'
]
*
np
.
sqrt
(
1
-
imputed
[
'var'
])
cor_perf
=
np
.
corrcoef
(
reconstructed
,
Zscore
[
ids_masked
])[
0
,
1
]
MAE
=
np
.
linalg
.
norm
(
reconstructed
-
Zscore
[
ids_masked
],
1
)
/
len
(
ids_masked
)
# L1- loss
id_large
=
np
.
where
(
np
.
abs
(
Zscore
[
ids_masked
])
>
6
)[
0
]
cor_large
=
np
.
corrcoef
(
reconstructed
[
id_large
],
Zscore
[
ids_masked
][
id_large
])[
0
,
1
]
MAE_large
=
np
.
linalg
.
norm
(
reconstructed
[
id_large
]
-
Zscore
[
ids_masked
][
id_large
],
1
)
/
len
(
id_large
)
return
({
'cor'
:
cor_perf
,
"MAE"
:
MAE
,
"cor_large"
:
cor_large
,
"MAE_large"
:
MAE_large
})
# Get best rd
def
best_rd
(
zscore
,
LD_cor
,
ids_known
,
ids_masked
):
rd_list
=
np
.
linspace
(
0.001
,
0.75
,
100
)
cor_perf
=
pd
.
Series
(
index
=
rd_list
)
MAE_perf
=
pd
.
Series
(
index
=
rd_list
)
cor_large
=
pd
.
Series
(
index
=
rd_list
)
MAE_large
=
pd
.
Series
(
index
=
rd_list
)
for
rd
in
rd_list
:
perf
=
get_perf
(
rd
,
zscore
,
LD_cor
,
ids_known
,
ids_masked
)
cor_perf
[
rd
]
=
perf
[
"cor"
]
MAE_perf
[
rd
]
=
perf
[
"MAE"
]
cor_large
[
rd
]
=
perf
[
"cor_large"
]
MAE_large
[
rd
]
=
perf
[
"MAE_large"
]
return
({
"correlation"
:
cor_perf
[
np
.
argmax
(
cor_perf
)],
"MAE"
:
MAE_perf
[
np
.
argmax
(
cor_perf
)],
"correlation_large"
:
cor_large
[
np
.
argmax
(
cor_perf
)],
"MAE_large"
:
MAE_large
[
np
.
argmax
(
cor_perf
)],
"rcond"
:
np
.
argmax
(
cor_perf
)})
if
__name__
==
"__main__"
:
Ssize
=
pd
.
read_csv
(
"../../data/external/meta_data/Ncases_B2_vs_ALL.tsv"
,
sep
=
"
\t
"
)
Ssize
=
Ssize
.
loc
[[
re
.
search
(
"EUR"
,
i
)
!=
None
for
i
in
Ssize
.
Name
],]
Ssize
[
"per"
]
=
Ssize
.
n_cases
/
(
Ssize
.
n_cases
+
Ssize
.
n_controls
)
Ssize
[
"N_effective"
]
=
np
.
floor
(
Ssize
.
per
*
(
1
-
Ssize
.
per
)
*
(
Ssize
.
n_cases
+
Ssize
.
n_controls
))
Ssize
.
N_effective
=
Ssize
.
N_effective
.
astype
(
int
,
copy
=
False
)
# get the type of simulation from command line
tag
=
sys
.
argv
[
1
]
# possibles values are two_opposite, two_causal, one_causal
print
(
"Simulating values for {}"
.
format
(
tag
))
#get genotype
X
=
pd
.
read_csv
(
"./data/genotype.csv"
,
sep
=
"
\t
"
)
nsnp
=
X
.
shape
[
1
]
# Estimate LD from genotype:
scaler
=
StandardScaler
()
X
=
pd
.
DataFrame
(
scaler
.
fit_transform
(
X
))
X
.
head
()
LD_cor
=
X
.
corr
().
values
n_rep
=
100
n_masked
=
40
max_amp
=
10
ids_masked
=
np
.
random
.
choice
(
100
,
n_masked
,
replace
=
False
)
ids_known
=
np
.
setdiff1d
(
np
.
array
(
range
(
100
)),
ids_masked
)
amplitude_effect
=
pd
.
DataFrame
(
index
=
np
.
arange
(
0
,
n_rep
*
max_amp
,
1
),
columns
=
[
"amplitude"
,
"correlation"
,
"MAE"
,
"correlation_large"
,
"MAE_large"
])
id
=
0
np
.
linspace
(
1.8
,
2
,
3
)
for
amp
in
np
.
linspace
(
0
,
0.25
,
50
):
for
rep
in
range
(
n_rep
):
print
(
'search of best rd for amplitude {}'
.
format
(
amp
))
signal_generator
=
define_signal_generator
(
tag
)
signal
=
signal_generator
(
amp
,
LD_cor
,
X
,
sig
=
1
)
Zscore
=
signal
[
"Zscore"
]
beta
=
signal
[
"causal"
]
tag
=
"one_causal"
# possibles values are two_opposite, two_causal, one_causal
for
tag
in
[
'two_opposite'
,
'two_causal'
,
'one_causal'
]:
print
(
"Simulating values for {}"
.
format
(
tag
))
#get genotype
X
=
pd
.
read_csv
(
"../../data/processed/Simulated/Genotypes/genotype2.csv"
,
sep
=
"
\t
"
)
LD_cor
=
pd
.
read_csv
(
"../../data/processed/Simulated/Genotypes/LD_matrix2.csv"
,
sep
=
"
\t
"
)
LD_cor
=
LD_cor
.
values
nsnp
=
X
.
shape
[
1
]
# Estimate LD from genotype:
scaler
=
StandardScaler
()
X
=
pd
.
DataFrame
(
scaler
.
fit_transform
(
X
))
#LD_cor = X.corr().values
n_snp
=
LD_cor
.
shape
[
0
]
n_masked
=
20
amp
=
1
print
(
'search of best rd for amplitude {}'
.
format
(
amp
))
signal_generator
=
define_signal_generator
(
tag
)
signal
=
signal_generator
(
amp
,
LD_cor
,
X
,
sig
=
1
)
Zscore
=
signal
[
"Zscore"
]
beta
=
signal
[
"causal"
]
if
amp
==
0.15
:
pd
.
Series
(
Zscore
).
to_csv
(
"./results/Zscore_{0}.csv"
.
format
(
tag
))
pd
.
Series
(
beta
).
to_csv
(
"./results/Beta_{0}.csv"
.
format
(
tag
))
All_Zscore
=
pd
.
DataFrame
({
"Beta_meta-analysis"
:
Zscore
/
X
.
shape
[
0
]
**
0.5
,
"Z_meta-analysis"
:
Zscore
})
res
=
best_rd
(
Zscore
,
LD_cor
,
ids_known
,
ids_masked
)
print
(
"best rd : {}"
.
format
(
res
[
'rcond'
]))
print
(
"Correlation : {}"
.
format
(
res
[
'correlation'
]))
print
(
"MAE : {}"
.
format
(
res
[
'MAE'
]))
print
(
"R2_genet : {}"
.
format
(
signal
[
"R2_genet"
]))
for
i
,
neff
in
enumerate
(
Ssize
.
N_effective
):
print
(
i
,
neff
)
amplitude_effect
.
loc
[
id
,
"amplitude"
]
=
amp
amplitude_effect
.
loc
[
id
,
"correlation"
]
=
res
[
'correlation'
]
amplitude_effect
.
loc
[
id
,
"MAE"
]
=
res
[
"MAE"
]
amplitude_effect
.
loc
[
id
,
"correlation_large"
]
=
res
[
'correlation_large'
]
amplitude_effect
.
loc
[
id
,
"MAE_large"
]
=
res
[
"MAE_large"
]
id_start
=
Ssize
.
N_effective
[:
i
].
sum
()
id_end
=
Ssize
.
N_effective
[:(
i
+
1
)].
sum
()
study
=
Ssize
.
iloc
[
i
,
0
]
print
(
"Number of sample for {}"
.
format
(
study
))
print
(
Ssize
.
N_effective
.
iloc
[
i
])
X_study
=
X
.
iloc
[
id_start
:
id_end
,]
signal
=
signal_generator
(
amp
,
LD_cor
,
X_study
,
sig
=
1
)
Zscore
=
signal
[
"Zscore"
]
id
=
id
+
1
All_Zscore
[
"Z_{0}"
.
format
(
study
)]
=
Zscore
All_Zscore
[
"Beta_{0}"
.
format
(
study
)]
=
Zscore
/
X_study
.
shape
[
0
]
**
0.5
print
(
All_Zscore
.
head
())
amplitude_effect
.
to_csv
(
"./
results/amplitude_effect
_{0}.csv"
.
format
(
tag
))
All_Zscore
.
reset_index
()
.
to_csv
(
".
.
/
../data/processed/Simulated/Zscores/Zscore
_{0}.csv"
.
format
(
tag
))
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment