Skip to content
GitLab
Menu
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Statistical-Genetics
RAISS
Commits
3901cc4d
Commit
3901cc4d
authored
Mar 10, 2022
by
Hanna JULIENNE
Browse files
improvement performance report
parent
98125ff6
Changes
1
Hide whitespace changes
Inline
Side-by-side
raiss/imputation_R2.py
View file @
3901cc4d
...
...
@@ -8,6 +8,7 @@
from
raiss.pipes
import
save_chromosome_imputation
import
multiprocessing
import
itertools
from
joblib
import
Parallel
,
delayed
import
pandas
as
pd
import
numpy
as
np
...
...
@@ -151,7 +152,9 @@ def z_amplitude_effect(zscore_folder, masked_folder, output_folder, ref_folder,
def
grid_search
(
zscore_folder
,
masked_folder
,
output_folder
,
ref_folder
,
ld_folder
,
gwas
,
chrom
=
"chr22"
,
eigen_ratio_grid
=
[
0.5
,
0.1
,
0.01
],
window_size
=
500000
,
eigen_ratio_grid
=
[
0.5
,
0.1
,
0.01
],
ld_threshold_grid
=
[
0
,
4
,
10
,
20
],
window_size
=
500000
,
buffer_size
=
125000
,
l2_regularization
=
0.1
,
R2_threshold
=
0.6
,
N_to_mask
=
5000
,
ref_panel_preffix
=
""
,
ref_panel_suffix
=
".eur.1pct.bim"
,
ld_type
=
"plink"
,
stratifying_vector
=
None
,
stratifying_bins
=
None
,
LD_threshold
=
4
):
...
...
@@ -172,6 +175,7 @@ def grid_search(zscore_folder, masked_folder, output_folder,
gwas (str): GWAS identifier in the following format: 'CONSORTIA_TRAIT'
chrom (str): chromosome in the format "chr.."
eigen_ratio_grid (list): list of eigen_ratio to test (must be between 0 and 1)
ld_threshold_grid (list): list of minimum_ld values to test (must be >= 0)
window_size, buffer_size, l2_regularization, R2_threshold: imputation parameters (see the raiss command line documentation)
N_to_mask (int): Number of SNPs masked in the initial dataset to compute the correlation between true value and imputed value
ref_panel_suffix (str): suffix
...
...
@@ -190,39 +194,45 @@ def grid_search(zscore_folder, masked_folder, output_folder,
z_masked
.
to_csv
(
z_masked_file
,
sep
=
"
\t
"
)
masked_SNP
=
res_masked
[
1
]
def
run_imputation
(
cond
):
tag
=
"_{}"
.
format
(
cond
)
def
run_imputation
(
param
):
cond
=
param
[
0
]
min_ld
=
param
[
1
]
tag
=
"_{0}_{1}"
.
format
(
cond
,
min_ld
)
save_chromosome_imputation
(
gwas
,
chrom
,
window_size
,
buffer_size
,
l2_regularization
,
cond
,
masked_folder
,
ref_folder
,
ld_folder
,
output_folder
,
R2_threshold
,
tag
,
ref_panel_preffix
,
ref_panel_suffix
,
ld_type
,
minimum_ld
=
LD_thresho
ld
)
ref_panel_suffix
,
ld_type
,
minimum_ld
=
min_
ld
)
n_cpu
=
multiprocessing
.
cpu_count
()
P
ara
llel
(
n_jobs
=
n_cpu
)(
delayed
(
run_imputation
)(
rd
)
for
rd
in
eigen_ratio
_grid
)
p
ara
m_grid
=
itertools
.
product
(
eigen_ratio_grid
,
ld_threshold
_grid
)
R2_serie
=
pd
.
DataFrame
({
'N_SNP'
:
np
.
nan
,
'fraction_imputed'
:
np
.
nan
,
'cor'
:
np
.
nan
,
'mean_absolute_error'
:
np
.
nan
,
'median_absolute_error'
:
np
.
nan
,
'min_absolute_error'
:
np
.
nan
,
'max_absolute_error'
:
np
.
nan
,
"SNP_max_error"
:
np
.
nan
},
index
=
eigen_ratio_grid
)
for
rd
in
eigen_ratio_grid
:
z_output
=
"{0}/z_{1}_{2}_{3}.txt"
.
format
(
output_folder
,
gwas
,
chrom
,
rd
)
dat_imp
=
pd
.
read_csv
(
z_output
,
sep
=
"
\t
"
,
index_col
=
0
)
print
(
rd
)
try
:
res
=
imputation_performance
(
dat_orig
,
dat_imp
,
masked_SNP
)
except
KeyError
:
# A KeyError means that none of the masked_SNP are present in the imputed dataframe
print
(
e
)
res
=
np
.
nan
R2_serie
.
loc
[
rd
,
'N_SNP'
]
=
res
[
"N_SNP"
]
R2_serie
.
loc
[
rd
,
'cor'
]
=
res
[
"cor"
]
R2_serie
.
loc
[
rd
,
'mean_absolute_error'
]
=
res
[
"mean_absolute_error"
]
R2_serie
.
loc
[
rd
,
'fraction_imputed'
]
=
res
[
"fraction_imputed"
]
R2_serie
.
loc
[
rd
,
'median_absolute_error'
]
=
res
[
"median_absolute_error"
]
R2_serie
.
loc
[
rd
,
'min_absolute_error'
]
=
res
[
"min_absolute_error"
]
R2_serie
.
loc
[
rd
,
'max_absolute_error'
]
=
res
[
"max_absolute_error"
]
R2_serie
.
loc
[
rd
,
'SNP_max_error'
]
=
res
[
"SNP_max_error"
]
print
(
len
(
masked_SNP
))
print
(
"Result for rd {0} = cor: {1}, fraction_imputed: {2}"
.
format
(
rd
,
res
[
"cor"
],
res
[
"fraction_imputed"
]))
'min_absolute_error'
:
np
.
nan
,
'max_absolute_error'
:
np
.
nan
,
"SNP_max_error"
:
np
.
nan
},
index
=
pd
.
MultiIndex
.
from_tuples
(
param_grid
,
names
=
[
"eigen_ratio"
,
"min_ld"
]))
param_grid
=
itertools
.
product
(
eigen_ratio_grid
,
ld_threshold_grid
)
Parallel
(
n_jobs
=
n_cpu
)(
delayed
(
run_imputation
)(
param
)
for
param
in
param_grid
)
for
min_ld
in
ld_threshold_grid
:
for
rd
in
eigen_ratio_grid
:
z_output
=
"{0}/z_{1}_{2}_{3}_{4}.txt"
.
format
(
output_folder
,
gwas
,
chrom
,
rd
,
min_ld
)
dat_imp
=
pd
.
read_csv
(
z_output
,
sep
=
"
\t
"
,
index_col
=
0
)
print
(
rd
)
try
:
res
=
imputation_performance
(
dat_orig
,
dat_imp
,
masked_SNP
)
except
KeyError
:
# A KeyError means that none of the masked_SNP are present in the imputed dataframe
print
(
e
)
res
=
np
.
nan
ind_loop
=
(
rd
,
min_ld
)
R2_serie
.
loc
[
ind_loop
,
'N_SNP'
]
=
res
[
"N_SNP"
]
R2_serie
.
loc
[
ind_loop
,
'cor'
]
=
res
[
"cor"
]
R2_serie
.
loc
[
ind_loop
,
'mean_absolute_error'
]
=
res
[
"mean_absolute_error"
]
R2_serie
.
loc
[
ind_loop
,
'fraction_imputed'
]
=
res
[
"fraction_imputed"
]
R2_serie
.
loc
[
ind_loop
,
'median_absolute_error'
]
=
res
[
"median_absolute_error"
]
R2_serie
.
loc
[
ind_loop
,
'min_absolute_error'
]
=
res
[
"min_absolute_error"
]
R2_serie
.
loc
[
ind_loop
,
'max_absolute_error'
]
=
res
[
"max_absolute_error"
]
R2_serie
.
loc
[
ind_loop
,
'SNP_max_error'
]
=
res
[
"SNP_max_error"
]
print
(
len
(
masked_SNP
))
print
(
"Result for ind_loop {0} = cor: {1}, fraction_imputed: {2}"
.
format
(
ind_loop
,
res
[
"cor"
],
res
[
"fraction_imputed"
]))
return
(
R2_serie
)
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment