Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Statistical-Genetics
RAISS
Commits
15a3dd75
Commit
15a3dd75
authored
Apr 16, 2018
by
Hanna JULIENNE
Browse files
fixing variance erasing at each ld block
parent
31337253
Changes
3
Hide whitespace changes
Inline
Side-by-side
impute_jass/impute_jass/imputation_launcher.py
View file @
15a3dd75
...
...
@@ -4,7 +4,7 @@ on the genome
"""
import
glob
import
pandas
as
pd
from
.windows
import
ld_region_centered_window_imputation
,
impg_like_imputation
,
realigned_zfiles_on_panel
from
.windows
import
ld_region_centered_window_imputation
,
prepare_zscore_for_imputation
,
impg_like_imputation
,
realigned_zfiles_on_panel
...
...
@@ -13,7 +13,8 @@ class ImputationLauncher(object):
Class perform imputation of snp from summary statistic
"""
def
__init__
(
self
,
window_size
=
10000
,
imputation_style
=
"batch"
,
buf
=
2500
,
lamb
=
0.01
,
pinv_rcond
=
0.01
):
def
__init__
(
self
,
window_size
=
10000
,
imputation_style
=
"batch"
,
buf
=
2500
,
lamb
=
0.01
,
pinv_rcond
=
0.01
):
"""
Initialise the imputation object. Fix the windows size, the buffer size
and the king of imputation employed
...
...
@@ -29,7 +30,7 @@ class ImputationLauncher(object):
matrices to make it less singular
pinv_rcond (float): the rcond scipy.linalg.pinv function argument.
The scipy.linalg.pinv is used to invert
the correlationmatrices
the correlation
matrices
"""
self
.
imputation_style
=
imputation_style
self
.
window_size
=
window_size
...
...
@@ -37,6 +38,7 @@ class ImputationLauncher(object):
self
.
lamb
=
lamb
self
.
rcond
=
pinv_rcond
def
chromosome_imputation
(
self
,
chrom
,
zscore
,
ref_panel
,
ld_folder
):
"""
Impute the panel zscore score for one chromosome and with the specified
...
...
@@ -51,19 +53,27 @@ class ImputationLauncher(object):
Imputed zscore dataframe
"""
pattern
=
"{0}/{1}_*.ld"
.
format
(
ld_folder
,
chrom
)
zscore
=
prepare_zscore_for_imputation
(
ref_panel
,
zscore
)
zscore_results
=
zscore
.
copy
(
deep
=
True
)
if
self
.
imputation_style
==
"online"
:
def
imputer
(
ld_file
):
return
ld_region_centered_window_imputation
(
ld_file
,
ref_panel
,
zscore
,
self
.
window_size
)
return
ld_region_centered_window_imputation
(
ld_file
,
ref_panel
,
zscore
,
self
.
window_size
)
elif
self
.
imputation_style
==
"batch"
:
def
imputer
(
ld_file
):
return
impg_like_imputation
(
ld_file
,
ref_panel
,
zscore
,
self
.
window_size
,
self
.
buffer
,
self
.
lamb
,
self
.
rcond
)
return
impg_like_imputation
(
ld_file
,
ref_panel
,
zscore
,
self
.
window_size
,
self
.
buffer
,
self
.
lamb
,
self
.
rcond
)
for
ld_file
in
glob
.
glob
(
pattern
):
print
(
"processing Region: {0}"
.
format
(
ld_file
))
zscore
=
imputer
(
ld_file
)
ld_batch
=
imputer
(
ld_file
)
zscore_results
=
pd
.
concat
([
zscore_results
,
ld_batch
])
zscore
=
realigned_zfiles_on_panel
(
ref_panel
,
zscore
)
return
zscore
zscore_results
.
sort_values
(
by
=
"pos"
)
zscore_results
=
realigned_zfiles_on_panel
(
ref_panel
,
zscore_results
)
return
zscore_results
def
genome_imputation
(
self
,
gwas_tag
,
ref_panel_folder
,
ld_folder
,
zscore_folder
,
folder_output
):
...
...
impute_jass/impute_jass/stat_models.py
View file @
15a3dd75
...
...
@@ -29,7 +29,7 @@ def compute_mu(sig_i_t, sig_t_inv, zt):
zt (np.array?): Zscores of known snp
Returns:
mu_i (np.array): a vector of length i containing the estimate of zscore
"""
return
np
.
dot
(
sig_i_t
,
np
.
dot
(
sig_t_inv
,
zt
))
...
...
@@ -101,5 +101,4 @@ def impg_model(zt, sig_t, sig_i_t, lamb=0.01, rcond=0.01, batch=True):
var_norm
=
var_in_boundaries
(
var
,
lamb
)
R2
=
((
1
+
lamb
)
-
var_norm
)
mu
=
mu
/
np
.
sqrt
(
R2
)
return
({
"var"
:
var
,
"mu"
:
mu
,
"ld_score"
:
ld_score
,
"condition_number"
:
condition_number
,
"correct_inversion"
:
correct_inversion
})
impute_jass/impute_jass/windows.py
View file @
15a3dd75
...
...
@@ -27,7 +27,9 @@ def realigned_zfiles_on_panel(ref_panel, zscore):
If not, the coded and other allele are inverted and the zscore sign
is inverted also.
Args:
- ref_panel (pd.dataframe) : snp of reference on the imputed chromosome
- zscore (pd.dataframe):
"""
sub_ref_panel
=
ref_panel
.
loc
[
zscore
.
index
]
allele_inverted
=
(
sub_ref_panel
[
'Ref_all'
]
!=
zscore
.
A0
)
...
...
@@ -40,14 +42,20 @@ def realigned_zfiles_on_panel(ref_panel, zscore):
def
prepare_zscore_for_imputation
(
ref_panel
,
zscore
):
"""
Prepare the known Z score by realigning them on the reference ref_panel
the snps that are not present in the ref panel are filtered
Prepare the known Z score by
- realigning them on the reference ref_panel
- filtering snps that are not present in the ref panel
- Adding columns that will contain information on imputation:
* Var : theoritical variance estimate of z
* Nsnp_to_impute : Number of known snp
* ld_score : the sum of the square correlation of the snp with all other
known snp (give an idea if the we have enough information to compute a
precise z estimate)
"""
zscore
=
realigned_zfiles_on_panel
(
ref_panel
,
zscore
)
zscore
[
'Var'
]
=
-
1
zscore
[
'Var'
]
=
-
1
.0
zscore
[
'Nsnp_to_impute'
]
=
-
1
zscore
[
'ld_score'
]
=
-
1
zscore
[
'ld_score'
]
=
-
1
.0
zscore
=
zscore
.
loc
[
zscore
.
index
.
intersection
(
ref_panel
.
index
)]
return
zscore
...
...
@@ -57,7 +65,17 @@ def in_region(pos_vector, start, end):
def
compute_window_and_size
(
start_ld_block
,
end_ld_block
,
window_size
):
"""
Compute the number of window to pave the ld_block tightly
Compute the number of window to pave the Linkage desiquilibrium block
tightly with a window
size the closest to the one specified by the user
Args:
start_ld_block (int): the start of the Linkage desiquilibrium block
end_ld_block (int): the end of the Linkage desiquilibrium block
window_size (int) : size in bp of the window to compute imputation_style
all unknown snps in the window will be imputed from the known snp in
the windows
return:
a tuple ( number of windows (int), size of the window resize to the ld block (int))
"""
Nwindows
=
((
int
(
end_ld_block
))
-
(
int
(
start_ld_block
)))
//
window_size
...
...
@@ -83,7 +101,8 @@ def format_result_df(imp, ref_panel, known, unknowns):
"correct_inversion"
:
imp
[
"correct_inversion"
],
"Nsnp_to_impute"
:
len
(
known
)
}
column_order
=
[
'pos'
,
'A0'
,
"A1"
,
"Z"
,
"Var"
,
"ld_score"
,
"condition_number"
,
"correct_inversion"
,
"Nsnp_to_impute"
]
column_order
=
[
'pos'
,
'A0'
,
"A1"
,
"Z"
,
"Var"
,
"ld_score"
,
"condition_number"
,
"correct_inversion"
,
"Nsnp_to_impute"
]
batch_df
=
pd
.
DataFrame
(
result_dict
,
columns
=
column_order
)
return
batch_df
...
...
@@ -95,23 +114,28 @@ def print_progression(i, Nwindows):
if
i
%
(
np
.
ceil
(
Nwindows
/
10
))
==
0
:
print
(
"{0}\%"
.
format
(
np
.
round
(
i
/
Nwindows
,
3
)))
def
empty_imputed_dataframe
():
column_order
=
[
'pos'
,
'A0'
,
"A1"
,
"Z"
,
"Var"
,
"ld_score"
,
"condition_number"
,
"correct_inversion"
,
"Nsnp_to_impute"
]
zscore_results
=
pd
.
DataFrame
(
columns
=
column_order
)
return
zscore_results
def
impg_like_imputation
(
ld_file
,
ref_panel
,
zscore
,
window_size
,
buffer
,
lamb
,
rcond
,
file_output
,
unknowns
=
pd
.
Series
([])):
rcond
,
unknowns
=
pd
.
Series
([])):
"""
Each missing Snp is imputed by known snp found in a window centered on the SNP to impute
Argument
Argument.
Args:
ld_file (str): Linkage desiquilibrium matrix files
ref_panel (pd.dataframe): the dataframe containing reference panel
snps
"""
(
chrom
,
start_ld_block
,
end_ld_block
)
=
parse_region_position
(
ld_file
)
LD_mat
=
generate_sparse_matrix
(
ld_file
,
ref_panel
)
Nwindows
,
window_resize
=
compute_window_and_size
(
start_ld_block
,
end_ld_block
,
window_size
)
all_unknowns
=
ref_panel
.
loc
[
ref_panel
.
index
.
difference
(
zscore
.
index
)]
zscore
=
prepare_zscore_for_imputation
(
ref_panel
,
zscore
)
zscore
.
to_csv
(
z_fo
,
sep
=
'
\t
'
)
zscore_results
=
zscore
.
copy
(
deep
=
True
)
# the dataframe containing results
zscore_results
=
empty_imputed_dataframe
()
print
(
"### Imputation of {0} snps ###"
.
format
(
all_unknowns
.
shape
[
0
]))
...
...
@@ -145,7 +169,6 @@ def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, lamb,
# keep only SNP with non negligible explained variance
snp_well_predicted
=
(
batch_df
.
Var
<
0.9
)
batch_df_filt
=
batch_df
.
loc
[
in_core_window
&
snp_well_predicted
,
zscore_results
.
columns
]
batch_df_filt
.
to_csv
(
z_fo
,
sep
=
'
\t
'
,
mode
=
'a'
)
zscore_results
=
pd
.
concat
([
zscore_results
,
batch_df_filt
])
i
=
i
+
1
print_progression
(
i
,
Nwindows
)
...
...
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment