Skip to content
GitLab
Projects
Groups
Snippets
/
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in
Toggle navigation
Menu
Open sidebar
Statistical-Genetics
RAISS
Commits
eb9d89e8
Commit
eb9d89e8
authored
Mar 13, 2018
by
Hanna JULIENNE
Browse files
fix issues
parent
bcbe25a2
Changes
4
Hide whitespace changes
Inline
Side-by-side
impute_jass/impute_jass/__init__.py
View file @
eb9d89e8
import
impute_jass.ld_matrix
as
LD
import
impute_jass.stat_models
as
model
import
impute_jass.windows
from
impute_jass.imputation_launcher
import
i
mputation
_l
auncher
from
impute_jass.imputation_launcher
import
I
mputation
L
auncher
impute_jass/impute_jass/imputation_launcher.py
View file @
eb9d89e8
...
...
@@ -3,15 +3,16 @@ Function set to launch imputation on a complete chromosome or
on the genome
"""
import
glob
from
.windows
import
L
d_region_centered_window_imputation
from
.windows
import
l
d_region_centered_window_imputation
,
impg_like_imputation
,
realigned_zfiles_on_panel
class
i
mputation
_l
auncher
:
class
I
mputation
L
auncher
(
object
)
:
def
__init__
(
self
,
window_size
=
10000
):
self
.
imputation_style
=
"online"
def
__init__
(
self
,
window_size
=
10000
,
imputation_style
=
"online"
,
buffer
=
2500
):
self
.
imputation_style
=
imputation_style
self
.
window_size
=
window_size
self
.
buffer
=
buffer
def
chromosome_imputation
(
self
,
chrom
,
Z
score
s
,
ref_panel
,
ld_folder
):
def
chromosome_imputation
(
self
,
chrom
,
z
score
,
ref_panel
,
ld_folder
):
"""
"""
...
...
@@ -20,8 +21,17 @@ class imputation_launcher:
#ref_panel = pd.read_csv(ref_panel_file, sep="\t", names=['chr', "nothing", 'pos', 'Ref_all', 'alt_all'], index_col = 1)
pattern
=
"{0}/{1}*.ld"
.
format
(
ld_folder
,
chrom
)
for
LD_file
in
glob
.
glob
(
pattern
)[:
2
]:
print
(
"processing Region: {0}"
.
format
(
LD_file
))
if
self
.
imputation_style
==
"online"
:
def
imputer
(
ld_file
):
return
ld_region_centered_window_imputation
(
ld_file
,
ref_panel
,
zscore
,
self
.
window_size
)
elif
self
.
imputation_style
==
"batch"
:
def
imputer
(
ld_file
):
return
impg_like_imputation
(
ld_file
,
ref_panel
,
zscore
,
self
.
window_size
,
self
.
buffer
)
Zscores
=
Ld_region_centered_window_imputation
(
LD_file
,
ref_panel
,
Zscores
,
self
.
window_size
)
return
Zscores
for
ld_file
in
glob
.
glob
(
pattern
):
print
(
"processing Region: {0}"
.
format
(
ld_file
))
zscore
=
imputer
(
ld_file
)
zscore
=
realigned_zfiles_on_panel
(
ref_panel
,
zscore
)
return
zscore
impute_jass/impute_jass/stat_models.py
View file @
eb9d89e8
...
...
@@ -3,42 +3,29 @@ function for SNP imputation
"""
import
numpy
as
np
def
ImpG_model_batch
(
Zt
,
Sig_t
,
Sig_i_t
,
lamb
=
0.01
):
"""
Argument:
Zt : (vector) the vector of known Z scores
"""
#np.fill_diagonal(Sig_t.values, 1.01)
#Sig_t.fillna(0, inplace=True)
Sig_t
=
Sig_t
.
values
np
.
fill_diagonal
(
Sig_t
,
(
1
+
lamb
))
Sig_t_inv
=
np
.
linalg
.
pinv
(
Sig_t
)
def compute_mu(sig_i_t, sig_t_inv, zt):
    """
    Return the product sig_i_t . (sig_t_inv . zt).

    Arguments:
        sig_i_t : matrix (or vector) multiplied on the left of the result;
            presumably the LD between the SNPs to impute and the known
            SNPs -- confirm against the caller
        sig_t_inv : (pseudo-)inverse of the matrix associated with the
            known SNPs
        zt : (vector) the vector of known Z scores
    """
    # Apply the inverse to the known Z scores first so that only
    # matrix-vector products are needed.
    weighted_zt = np.dot(sig_t_inv, zt)
    return np.dot(sig_i_t, weighted_zt)
Var
=
np
.
diag
(
Sig_t
)[
0
]
-
np
.
einsum
(
'ij,jk,ki->i'
,
Sig_i_t
,
Sig_t_inv
,
Sig_i_t
.
transpose
())
def compute_var(sig_i_t, sig_t_inv, lamb, batch=True):
    """
    Return (1 + lamb) minus the quadratic form sig_i_t . sig_t_inv . sig_i_t^T.

    Arguments:
        sig_i_t : 2-D array-like (one row per item) when ``batch`` is True,
            otherwise a single vector
        sig_t_inv : square matrix used in the quadratic form
        lamb : value added to 1 to form the leading term
        batch : if True, return one value per row of ``sig_i_t``;
            if False, return the single quadratic-form result
    """
    # Accept any array-like (lists, pandas objects, ndarrays); this leaves
    # ndarray inputs untouched.
    sig_i_t = np.asarray(sig_i_t)
    base = 1 + lamb
    if batch:
        # 'ij,jk,ki->i' keeps only the diagonal of the full triple product,
        # i.e. one quadratic form per row of sig_i_t.
        quad = np.einsum('ij,jk,ki->i', sig_i_t, sig_t_inv, sig_i_t.transpose())
        return base - quad
    quad = np.dot(sig_i_t, np.dot(sig_t_inv, sig_i_t.transpose()))
    return base - quad
mu
=
np
.
dot
(
Sig_i_t
,
np
.
dot
(
Sig_t_inv
,
Zt
))
return
({
"Var"
:
Var
,
"mu"
:
mu
})
def
ImpG_model_snp
(
Zt
,
Sig_t
,
Sig_i_t
,
lamb
=
0.01
):
def
impg_model
(
zt
,
sig_t
,
sig_i_t
,
lamb
=
0.01
,
batch
=
True
):
"""
Argument:
Z
t : (vector) the vector of known Z scores
z
t : (vector) the vector of known Z scores
"""
#np.fill_diagonal(Sig_t.values, 1.01)
#Sig_t.fillna(0, inplace=True)
Sig_t
=
Sig_t
.
values
np
.
fill_diagonal
(
Sig_t
,
(
1
+
lamb
))
#I = np.identity(Sig_t.shape[0])
#Sig_t_inv =np.linalg.inv(Sig_t)
Sig_t_inv
=
np
.
linalg
.
pinv
(
Sig_t
)
Var
=
np
.
diag
(
Sig_t
)[
0
]
-
np
.
dot
(
Sig_i_t
,
np
.
dot
(
Sig_t_inv
,
Sig_i_t
.
transpose
()))
if
Var
<
0
:
Var
=
0
#np.einsum('ij,jk,ki->i', Sig_i_t, Sig_t_inv ,Sig_i_t.transpose())
sig_t
=
sig_t
.
values
np
.
fill_diagonal
(
sig_t
,
(
1
+
lamb
))
sig_t_inv
=
np
.
linalg
.
pinv
(
sig_t
)
var
=
compute_var
(
sig_i_t
,
sig_t_inv
,
lamb
,
batch
)
#if var< 0:
# var=0
mu
=
np
.
dot
(
Sig_i_t
,
np
.
dot
(
S
ig_t_inv
,
Z
t
)
)
mu
=
mu
/
((
1
-
V
ar
)
**
0.5
)
return
({
"
V
ar"
:
V
ar
,
"mu"
:
mu
})
mu
=
compute_mu
(
sig_i_t
,
s
ig_t_inv
,
z
t
)
mu
=
mu
/
((
(
1
+
lamb
)
-
v
ar
)
**
0.5
)
return
({
"
v
ar"
:
v
ar
,
"mu"
:
mu
})
impute_jass/impute_jass/windows.py
View file @
eb9d89e8
...
...
@@ -5,162 +5,146 @@ implement the imputation window is sliding along the genome:
- centered_window: A sliding window centered on the Snp to impute
"""
from
.stat_models
import
ImpG_model_batch
,
I
mp
G
_model
_snp
from
.stat_models
import
i
mp
g
_model
from
.ld_matrix
import
generate_sparse_matrix
import
pandas
as
pd
import
numpy
as
np
def
parse_region_position
(
LD
_file
):
def
parse_region_position
(
ld
_file
):
"""
Retrieve the region definition from a ld-file generated by impute_jass
Argument :
LD
_file : A
LD
file generated by jass_impute
ld
_file : A
ld
file generated by jass_impute
"""
(
chrom
,
startpos
,
endpos
)
=
LD
_file
.
split
(
"/"
)[
-
1
].
split
(
"."
)[
0
].
split
(
'_'
)
(
chrom
,
startpos
,
endpos
)
=
ld
_file
.
split
(
"/"
)[
-
1
].
split
(
"."
)[
0
].
split
(
'_'
)
return
(
chrom
,
startpos
,
endpos
)
def
realigned_zfiles_on_panel
(
ref_panel
,
Z
score
s
):
def
realigned_zfiles_on_panel
(
ref_panel
,
z
score
):
"""
Check if the counted allele is the same in the reference panel and
the Zscore files.
If not, the coded and other allele are inverted and the
Z
score
s
sign
If not, the coded and other allele are inverted and the
z
score sign
is inverted also.
"""
allele_inverted
=
(
ref_panel
.
loc
[
Z
score
s
.
index
,
'Ref_all'
]
!=
Z
score
s
.
A0
)
allele_inverted
=
(
ref_panel
.
loc
[
z
score
.
index
,
'Ref_all'
]
!=
z
score
.
A0
)
Z
score
s
.
loc
[
allele_inverted
,
"A0"
]
=
ref_panel
.
alt_all
Z
score
s
.
loc
[
allele_inverted
,
"A1"
]
=
ref_panel
.
Ref_all
Z
score
s
.
loc
[
allele_inverted
,
"Z"
]
=
-
Z
score
s
.
loc
[
allele_inverted
,
"Z"
]
z
score
.
loc
[
allele_inverted
,
"A0"
]
=
ref_panel
.
alt_all
z
score
.
loc
[
allele_inverted
,
"A1"
]
=
ref_panel
.
Ref_all
z
score
.
loc
[
allele_inverted
,
"Z"
]
=
-
z
score
.
loc
[
allele_inverted
,
"Z"
]
return
Z
score
s
return
z
score
def
prepare_
Z
score_for_imputation
(
ref_panel
,
Z
score
s
):
def
prepare_
z
score_for_imputation
(
ref_panel
,
z
score
):
"""
Prepare the known Z score by realigning them on the reference ref_panel
the snps that are not present in the ref panel are filtered
"""
Z
score
s
=
realigned_zfiles_on_panel
(
ref_panel
,
Z
score
s
)
Z
score
s
[
'Var'
]
=
1
Z
score
s
[
'Nsnp_to_impute'
]
=
-
1
Z
score
s
=
Z
score
s
.
loc
[
Z
score
s
.
index
.
intersection
(
ref_panel
.
index
)]
return
Z
score
s
z
score
=
realigned_zfiles_on_panel
(
ref_panel
,
z
score
)
z
score
[
'Var'
]
=
1
z
score
[
'Nsnp_to_impute'
]
=
-
1
z
score
=
z
score
.
loc
[
z
score
.
index
.
intersection
(
ref_panel
.
index
)]
return
z
score
def in_region(pos_vector, start, end):
    """
    Tell which positions lie strictly between ``start`` and ``end``.

    Arguments:
        pos_vector : position(s) to test; the comparisons and ``&`` are
            applied as-is, so a scalar or any object supporting
            element-wise comparison can be passed
        start : lower bound (excluded)
        end : upper bound (excluded)

    Return the result of ``(start < pos_vector) & (pos_vector < end)``.
    """
    after_start = start < pos_vector
    before_end = pos_vector < end
    return after_start & before_end
def
L
d_region_centered_window_imputation
(
LD
_file
,
ref_panel
,
Z
score
s
,
window_size
,
unknowns
=
pd
.
Series
([])):
def
l
d_region_centered_window_imputation
(
ld
_file
,
ref_panel
,
z
score
,
window_size
,
unknowns
=
pd
.
Series
([])):
"""
Each missing Snp is imputed by known snp found in a window centered on the SNP to impute
Argument
"""
(
chrom
,
start_ld_block
,
end_ld_block
)
=
parse_region_position
(
LD
_file
)
(
chrom
,
start_ld_block
,
end_ld_block
)
=
parse_region_position
(
ld
_file
)
LD_mat
=
generate_sparse_matrix
(
LD_file
,
ref_panel
)
#Zscores = pd.read_csv(Zfile, index_col=0, sep="\t")
Zscores
=
prepare_Zscore_for_imputation
(
ref_panel
,
Zscores
)
LD_mat
=
generate_sparse_matrix
(
ld_file
,
ref_panel
)
zscore
=
prepare_zscore_for_imputation
(
ref_panel
,
zscore
)
# Find Snp to impute
if
len
(
unknowns
)
==
0
:
unknowns
=
LD_mat
.
index
.
difference
(
Z
score
s
.
index
)
unknowns
=
LD_mat
.
index
.
difference
(
z
score
.
index
)
N_snp
=
len
(
unknowns
)
print
(
"### Imputation of {0} snps ###"
.
format
(
len
(
unknowns
)))
i
=
0
for
snp_unknown
in
unknowns
:
for
i
,
snp_unknown
in
enumerate
(
unknowns
)
:
# Boundary of the centered_window
#print(((ref_panel.loc[snp_unknown,'pos'] - window_size), float(start_ld_block)))
start_pos
=
max
((
ref_panel
.
loc
[
snp_unknown
,
'pos'
]
-
window_size
),
float
(
start_ld_block
))
end_pos
=
min
(
ref_panel
.
loc
[
snp_unknown
,
'pos'
]
+
window_size
,
float
(
end_ld_block
))
#print(snp_unknown, start_pos, end_pos, start_ld_block, end_ld_block)
in_LD_reg_n_window
=
in_region
(
Z
score
s
.
pos
,
start_pos
,
end_pos
)
in_LD_reg_n_window
=
in_region
(
z
score
.
pos
,
start_pos
,
end_pos
)
known
=
Z
score
s
.
loc
[
in_LD_reg_n_window
].
index
S
ig_t
=
LD_mat
.
loc
[
known
,
known
]
S
ig_i_t
=
LD_mat
.
loc
[
snp_unknown
,
known
]
Z
t
=
Z
score
s
.
loc
[
known
,
'Z'
]
known
=
z
score
.
loc
[
in_LD_reg_n_window
].
index
s
ig_t
=
LD_mat
.
loc
[
known
,
known
]
s
ig_i_t
=
LD_mat
.
loc
[
snp_unknown
,
known
]
z
t
=
z
score
.
loc
[
known
,
'Z'
]
if
(
len
(
known
)
>
0
):
imp
=
ImpG_model_snp
(
Zt
,
Sig_t
,
Sig_i_t
)
Zscores
.
loc
[
snp_unknown
]
=
[
ref_panel
.
loc
[
snp_unknown
,
'pos'
],
ref_panel
.
loc
[
snp_unknown
,
"Ref_all"
],
ref_panel
.
loc
[
snp_unknown
,
"alt_all"
],
imp
[
'mu'
],
imp
[
'Var'
],
len
(
known
)]
# Zscores.loc[snp_unknown, "pos"] = ref_panel.loc[snp_unknown, 'pos']
# Zscores.loc[snp_unknown, "A0"] = ref_panel.loc[snp_unknown, "Ref_all"]
# Zscores.loc[snp_unknown, "A1"] = ref_panel.loc[snp_unknown, "alt_all"]
# Zscores.loc[snp_unknown, "Z"] = imp['mu']
# Zscores.loc[snp_unknown, "Var"] = imp['Var']
# Zscores.loc[snp_unknown, 'Nsnp_to_impute'] = len(known)
i
=
i
+
1
imp
=
impg_model
(
zt
,
sig_t
,
sig_i_t
,
batch
=
False
)
zscore
.
loc
[
snp_unknown
]
=
[
ref_panel
.
loc
[
snp_unknown
,
'pos'
],
ref_panel
.
loc
[
snp_unknown
,
"Ref_all"
],
ref_panel
.
loc
[
snp_unknown
,
"alt_all"
],
imp
[
'mu'
],
imp
[
'var'
],
len
(
known
)]
if
i
%
300
==
0
:
print
(
"{0}\%"
.
format
(
np
.
round
(
i
/
N_snp
,
4
)))
return
Z
score
s
.
sort_values
(
by
=
"pos"
)
return
z
score
.
sort_values
(
by
=
"pos"
)
def
I
mp
G
_like_imputation
(
LD
_file
,
ref_panel
,
Z
score
s
,
window_size
,
buffer
,
unknowns
=
pd
.
Series
([])):
def
i
mp
g
_like_imputation
(
ld
_file
,
ref_panel
,
z
score
,
window_size
,
buffer
,
unknowns
=
pd
.
Series
([])):
"""
Each missing Snp is imputed by known snp found in a window centered on the SNP to impute
Argument
"""
(
chrom
,
start_ld_block
,
end_ld_block
)
=
parse_region_position
(
LD
_file
)
(
chrom
,
start_ld_block
,
end_ld_block
)
=
parse_region_position
(
ld
_file
)
LD_mat
=
generate_sparse_matrix
(
LD
_file
,
ref_panel
)
LD_mat
=
generate_sparse_matrix
(
ld
_file
,
ref_panel
)
Nwindows
=
((
int
(
end_ld_block
))
-
(
int
(
start_ld_block
)))
//
window_size
# adapt window size to cover the LD block
window_resize
=
np
.
ceil
((
int
(
end_ld_block
)
-
(
int
(
start_ld_block
)))
/
Nwindows
)
all_unknowns
=
ref_panel
.
loc
[
ref_panel
.
index
.
difference
(
Zscores
.
index
)]
#Zscores = pd.read_csv(Zfile, index_col=0, sep="\t")
Zscores
=
prepare_Zscore_for_imputation
(
ref_panel
,
Zscores
)
all_unknowns
=
ref_panel
.
loc
[
ref_panel
.
index
.
difference
(
zscore
.
index
)]
#zscore = pd.read_csv(Zfile, index_col=0, sep="\t")
zscore
=
prepare_zscore_for_imputation
(
ref_panel
,
zscore
)
print
(
"### Imputation of {0} snps ###"
.
format
(
unknowns
.
shape
[
0
]))
i
=
0
for
i
in
range
(
Nwindows
):
print
(
i
)
# Boundary of the centered_window
# Boundary of the sliding_window
start_windows
=
int
(
start_ld_block
)
+
i
*
window_resize
-
buffer
end_windows
=
int
(
start_ld_block
)
+
(
i
+
1
)
*
window_resize
+
buffer
start_pos
=
max
(
start_windows
,
float
(
start_ld_block
))
end_pos
=
min
(
end_windows
,
float
(
end_ld_block
))
in_LD_reg_n_window
=
in_region
(
Z
score
s
.
pos
,
start_pos
,
end_pos
)
in_LD_reg_n_window
=
in_region
(
z
score
.
pos
,
start_pos
,
end_pos
)
unknown_in_LD_reg_n_window
=
in_region
(
all_unknowns
.
pos
,
start_pos
,
end_pos
)
known
=
Z
score
s
.
loc
[
in_LD_reg_n_window
].
index
known
=
z
score
.
loc
[
in_LD_reg_n_window
].
index
unknowns
=
all_unknowns
.
loc
[
unknown_in_LD_reg_n_window
].
index
S
ig_t
=
LD_mat
.
loc
[
known
,
known
]
S
ig_i_t
=
LD_mat
.
loc
[
unknowns
,
known
]
Z
t
=
Z
score
s
.
loc
[
known
,
'Z'
]
s
ig_t
=
LD_mat
.
loc
[
known
,
known
]
s
ig_i_t
=
LD_mat
.
loc
[
unknowns
,
known
]
z
t
=
z
score
.
loc
[
known
,
'Z'
]
if
(
len
(
known
)
>
0
):
imp
=
I
mp
G
_model
_batch
(
Z
t
,
S
ig_t
,
S
ig_i_t
)
imp
=
i
mp
g
_model
(
z
t
,
s
ig_t
,
s
ig_i_t
,
batch
=
True
)
batch_df
=
pd
.
DataFrame
({
'pos'
:
ref_panel
.
loc
[
unknowns
,
'pos'
],
'A0'
:
ref_panel
.
loc
[
unknowns
,
"Ref_all"
],
"A1"
:
ref_panel
.
loc
[
unknowns
,
"alt_all"
],
"Z"
:
imp
[
'mu'
],
"Var"
:
imp
[
"
V
ar"
],
"Var"
:
imp
[
"
v
ar"
],
"Nsnp_to_impute"
:
len
(
known
)
})
Zscores
=
pd
.
concat
([
Zscores
,
batch_df
])
# Zscores.loc[unknowns, 'pos'] = ref_panel.loc[unknowns, 'pos']
# Zscores.loc[unknowns, 'A0'] = ref_panel.loc[unknowns, "Ref_all"]
# Zscores.loc[unknowns, 'A1'] = ref_panel.loc[unknowns, "alt_all"]
# Zscores.loc[unknowns, 'Z'] = imp['mu']
# Zscores.loc[unknowns, 'Var'] = imp["Var"]
# Zscores.loc[unknowns, "Nsnp_to_impute"] = len(known)
# keep only snp in the core window
start_windows
=
int
(
start_ld_block
)
+
i
*
window_resize
end_windows
=
int
(
start_ld_block
)
+
(
i
+
1
)
*
window_resize
in_core_window
=
in_region
(
batch_df
.
pos
,
start_windows
,
end_windows
)
zscore
=
pd
.
concat
([
zscore
,
batch_df
.
loc
[
in_core_window
]])
i
=
i
+
1
if
i
%
30
0
==
0
:
print
(
"{0}\%"
.
format
(
np
.
round
(
i
/
N
_snp
,
4
)))
if
i
%
1
0
==
0
:
print
(
"{0}\%"
.
format
(
np
.
round
(
i
/
N
windows
,
4
)))
return
Z
score
s
.
sort_values
(
by
=
"pos"
)
return
z
score
.
sort_values
(
by
=
"pos"
)
Write
Preview
Supports
Markdown
0%
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment