Skip to content
GitLab
Menu
Projects
Groups
Snippets
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Statistical-Genetics
hgcovid_imputation
Commits
f083000c
Commit
f083000c
authored
Feb 17, 2022
by
Hanna JULIENNE
Browse files
factorize further code for simulating Zscore for more cohort structure
parent
b9936427
Changes
2
Hide whitespace changes
Inline
Side-by-side
README.md
View file @
f083000c
...
...
@@ -37,14 +37,24 @@ Project Organization
│ │
│ ├── data <- Scripts to download or generate data
│ │ └── make_dataset.py
| |__ 1 generate genotype with a specified LD structure
| |__ 2 generate_Zscores.py
| | Generate estimated Zscore by simulating a continuous phenotypes from genotype and
| normal noise. then estimate beta and Zscore with a linear regression. Cohort size
| are generated based on hgcovid cohort sizes
| _ 3 generate_Zscores_small_cohort.py
Variant of the precedent script to simulate Z-scores using only small cohorts. Hypothesis to test:
Can we recover a proper signal in the meta analysis using very noisy input
│ │
│ ├── features <- Scripts to turn raw data into features for modeling
│ │ └── build_features.py
│ │
│ ├── models <- Scripts to train models and then use trained models to make
│ │ │ predictions
│ │ ├── predict_model.py
│ │ └── train_model.py
│ ├── models
│ │ │
│ │ ├── Impute_simulated_signal.py
impute Zscores using simulated data : 1 mask 50 SNPs on 200, reimpute then save
│ │ └──
│ │
│ └── visualization <- Scripts to create exploratory and results oriented visualizations
│ └── visualize.py
...
...
src/data/generate_Zscores.py
View file @
f083000c
...
...
@@ -26,6 +26,7 @@ def compute_Y(beta, X, sig=1.0):
Y
=
X
.
dot
(
beta
)
+
noise
return
Y
def
compute_Zscore
(
Y
,
X
,
sig
):
nind
=
X
.
shape
[
0
]
...
...
@@ -33,22 +34,27 @@ def compute_Zscore(Y, X, sig):
for
i
in
range
(
X
.
shape
[
1
]):
x
=
X
.
iloc
[:,
i
]
Zscores
[
i
]
=
x
.
dot
(
Y
)
/
(
np
.
linalg
.
norm
(
x
,
2
)
*
sig
)
#X.iloc[:,i].corr(Y)# X.iloc[:,i].dot(Y) / X.iloc[:,i].dot(X.iloc[:,i])
#Zscores = Zscores*nind**0.5
#Zscores = (Zscores - Zscores.mean()) / np.std(Zscores)
return
Zscores
def
generate_one_causal
(
amplitude
,
LD_cor
,
X
,
sig
=
1.0
):
def
generate_null_signal
(
amp_dummy
,
LD_cor
,
X
,
sig
=
1.0
,
sig_genet
=
0.00001
):
nsnp
=
LD_cor
.
shape
[
0
]
beta
=
np
.
random
.
normal
(
size
=
nsnp
,
scale
=
sig_genet
)
Y
=
compute_Y
(
beta
,
X
,
sig
)
Zscore
=
compute_Zscore
(
Y
,
X
,
sig
)
return
({
"causal"
:
beta
,
"Zscore"
:
Zscore
,
"R2_genet"
:
np
.
var
(
X
.
dot
(
beta
))
/
np
.
var
(
Y
)})
def
generate_one_causal
(
amplitude
,
LD_cor
,
X
,
sig
=
1.0
,
sig_genet
=
0.00001
):
nsnp
=
LD_cor
.
shape
[
0
]
beta
=
np
.
random
.
normal
(
size
=
nsnp
,
scale
=
0.005
)
beta
=
np
.
random
.
normal
(
size
=
nsnp
,
scale
=
sig_genet
)
beta
[
nsnp
//
2
]
=
amplitude
Y
=
compute_Y
(
beta
,
X
,
sig
)
Zscore
=
compute_Zscore
(
Y
,
X
,
sig
)
return
({
"causal"
:
beta
,
"Zscore"
:
Zscore
,
"R2_genet"
:
np
.
var
(
X
.
dot
(
beta
))
/
np
.
var
(
Y
)})
def
generate_two_causal
(
amplitude
,
LD_cor
,
X
,
sig
=
1.0
):
def
generate_two_causal
(
amplitude
,
LD_cor
,
X
,
sig
=
1.0
,
sig_genet
=
0.00001
):
nsnp
=
LD_cor
.
shape
[
0
]
beta
=
np
.
random
.
normal
(
size
=
nsnp
,
scale
=
0.005
)
beta
=
np
.
random
.
normal
(
size
=
nsnp
,
scale
=
sig_genet
)
beta
[
nsnp
//
4
]
=
amplitude
beta
[
3
*
(
nsnp
//
4
)]
=
amplitude
Y
=
compute_Y
(
beta
,
X
,
sig
)
...
...
@@ -56,9 +62,9 @@ def generate_two_causal(amplitude, LD_cor, X, sig=1.0):
Zscore
=
compute_Zscore
(
Y
,
X
,
sig
)
return
({
"causal"
:
beta
,
"Zscore"
:
Zscore
,
"R2_genet"
:
np
.
var
(
X
.
dot
(
beta
))
/
np
.
var
(
Y
)})
def
generate_two_opposite
(
amplitude
,
LD_cor
,
X
,
sig
=
1.0
):
def
generate_two_opposite
(
amplitude
,
LD_cor
,
X
,
sig
=
1.0
,
sig_genet
=
0.00001
):
nsnp
=
LD_cor
.
shape
[
0
]
beta
=
np
.
random
.
normal
(
size
=
nsnp
,
scale
=
0.005
)
beta
=
np
.
random
.
normal
(
size
=
nsnp
,
scale
=
sig_genet
)
beta
[
nsnp
//
4
]
=
amplitude
beta
[
3
*
(
nsnp
//
4
)]
=
-
amplitude
Y
=
compute_Y
(
beta
,
X
,
sig
)
...
...
@@ -66,6 +72,8 @@ def generate_two_opposite(amplitude, LD_cor, X, sig=1.0):
return
({
"causal"
:
beta
,
"Zscore"
:
Zscore
,
"R2_genet"
:
np
.
var
(
X
.
dot
(
beta
))
/
np
.
var
(
Y
)})
def
define_signal_generator
(
tag
):
if
tag
==
"null"
:
return
generate_null_signal
if
tag
==
"one_causal"
:
return
generate_one_causal
if
tag
==
"two_causal"
:
...
...
@@ -73,33 +81,31 @@ def define_signal_generator(tag):
if
tag
==
"two_opposite"
:
return
generate_two_opposite
if
__name__
==
'__main__'
:
# import sample size each study
Ssize
=
pd
.
read_csv
(
"../../data/external/meta_data/Ncases_B2_vs_ALL.tsv"
,
sep
=
"
\t
"
)
Ssize
=
Ssize
.
loc
[[
re
.
search
(
"EUR"
,
i
)
!=
None
for
i
in
Ssize
.
Name
],]
Ssize
[
"per"
]
=
Ssize
.
n_cases
/
(
Ssize
.
n_cases
+
Ssize
.
n_controls
)
Ssize
[
"N_effective"
]
=
np
.
floor
(
Ssize
.
per
*
(
1
-
Ssize
.
per
)
*
(
Ssize
.
n_cases
+
Ssize
.
n_controls
))
Ssize
.
N_effective
=
Ssize
.
N_effective
.
astype
(
int
,
copy
=
False
)
Ssize
.
to_csv
(
"../../data/external/meta_data/N_effective.csv"
)
# get the type of simulation from command line
tag
=
"one_causal"
# possibles values are two_opposite, two_causal, one_causal
for
tag
in
[
'two_opposite'
,
'two_causal'
,
'one_causal'
]:
def
simulate_Z_scores
(
geno_file
,
LD_file
,
cohort_meta
,
Z_score_path
,
amp
=
0.05
):
"""
Parameters :
geno_file (str): path to the genotype file
LD_file (str): path to the genotype file
cohort_meta (pd.DataFrame): Description of the cohort (main point is
to simulate according to specified sample size). Must contain columns: N_effective, study
Z_score_path (str) : Path to the output folder + sim prefix to save the results
amp = amplitude of the causal signal
"""
# possibles values are null, two_opposite, two_causal, one_causal
for
tag
in
[
"null"
,
'two_opposite'
,
'two_causal'
,
'one_causal'
]:
print
(
"Simulating values for {}"
.
format
(
tag
))
#get genotype
X
=
pd
.
read_csv
(
"../../data/processed/Simulated/Genotypes/genotype2.csv"
,
sep
=
"
\t
"
)
LD_cor
=
pd
.
read_csv
(
"../../data/processed/Simulated/Genotypes/LD_matrix2.csv"
,
sep
=
"
\t
"
)
X
=
pd
.
read_csv
(
geno_file
,
sep
=
"
\t
"
)
LD_cor
=
pd
.
read_csv
(
LD_file
,
sep
=
"
\t
"
)
LD_cor
=
LD_cor
.
values
nsnp
=
X
.
shape
[
1
]
# Estimate LD from genotype:
scaler
=
StandardScaler
()
# transform to centered scaled
X
=
pd
.
DataFrame
(
scaler
.
fit_transform
(
X
))
#LD_cor = X.corr().values
n_snp
=
LD_cor
.
shape
[
0
]
n_masked
=
20
amp
=
1
print
(
'search of best rd for amplitude {}'
.
format
(
amp
))
signal_generator
=
define_signal_generator
(
tag
)
...
...
@@ -108,24 +114,39 @@ if __name__ == '__main__':
Zscore
=
signal
[
"Zscore"
]
beta
=
signal
[
"causal"
]
All_Zscore
=
pd
.
DataFrame
({
"Beta_
meta-analysi
s"
:
Zscore
/
X
.
shape
[
0
]
**
0.5
,
"Z
_meta-analysi
s"
:
Zscore
})
All_Zscore
=
pd
.
DataFrame
({
"Beta_
all_SNP
s"
:
Zscore
/
X
.
shape
[
0
]
**
0.5
,
"Z
score_all_SNP
s"
:
Zscore
})
for
i
,
neff
in
enumerate
(
Ssize
.
N_effective
):
for
i
,
neff
in
enumerate
(
cohort_meta
.
N_effective
):
print
(
i
,
neff
)
id_start
=
Ssize
.
N_effective
[:
i
].
sum
()
id_end
=
Ssize
.
N_effective
[:(
i
+
1
)].
sum
()
id_start
=
cohort_meta
.
N_effective
[:
i
].
sum
()
id_end
=
cohort_meta
.
N_effective
[:(
i
+
1
)].
sum
()
study
=
Ssize
.
iloc
[
i
,
0
]
study
=
cohort_meta
.
study
.
iloc
[
i
]
print
(
"Number of sample for {}"
.
format
(
study
))
print
(
Ssize
.
N_effective
.
iloc
[
i
])
print
(
cohort_meta
.
N_effective
.
iloc
[
i
])
X_study
=
X
.
iloc
[
id_start
:
id_end
,]
signal
=
signal_generator
(
amp
,
LD_cor
,
X_study
,
sig
=
1
)
Zscore
=
signal
[
"Zscore"
]
print
(
"R2_genet {}"
.
format
(
signal
[
"R2_genet"
]))
All_Zscore
[
"Z_{0}"
.
format
(
study
)]
=
Zscore
All_Zscore
[
"Beta_{0}"
.
format
(
study
)]
=
Zscore
/
X_study
.
shape
[
0
]
**
0.5
All_Zscore
.
to_csv
(
"../../data/processed/Simulated/Zscores/Zscore_{0}.csv"
.
format
(
tag
))
All_Zscore
.
to_csv
(
"{0}_{1}.csv"
.
format
(
Z_score_path
,
tag
))
if
__name__
==
'__main__'
:
# import sample size each study
Ssize
=
pd
.
read_csv
(
"../../data/external/meta_data/Ncases_B2_vs_ALL.tsv"
,
sep
=
"
\t
"
)
Ssize
=
Ssize
.
loc
[[
re
.
search
(
"EUR"
,
i
)
!=
None
for
i
in
Ssize
.
Name
],]
Ssize
[
"per"
]
=
Ssize
.
n_cases
/
(
Ssize
.
n_cases
+
Ssize
.
n_controls
)
Ssize
[
"study"
]
=
Ssize
.
Name
Ssize
[
"N_effective"
]
=
np
.
floor
(
Ssize
.
per
*
(
1
-
Ssize
.
per
)
*
(
Ssize
.
n_cases
+
Ssize
.
n_controls
))
Ssize
.
N_effective
=
Ssize
.
N_effective
.
astype
(
int
,
copy
=
False
)
Ssize
.
to_csv
(
"../../data/external/meta_data/N_effective.csv"
)
print
(
Ssize
.
head
())
simulate_Z_scores
(
"../../data/processed/Simulated/Genotypes/genotype2.csv"
,
"../../data/processed/Simulated/Genotypes/LD_matrix2.csv"
,
Ssize
,
"../../data/processed/Simulated/Zscores/Zscore"
,
amp
=
0.05
)
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment