Skip to content
GitLab
Explore
Sign in
Primary navigation
Search or go to…
Project
J
jass_preprocessing
Manage
Activity
Members
Labels
Plan
Issues
Issue boards
Milestones
Wiki
Code
Merge requests
Repository
Branches
Commits
Tags
Repository graph
Compare revisions
Snippets
Build
Pipelines
Jobs
Pipeline schedules
Artifacts
Deploy
Releases
Container registry
Model registry
Operate
Environments
Monitor
Incidents
Analyze
Value stream analytics
Contributor analytics
CI/CD analytics
Repository analytics
Model experiments
Help
Help
Support
GitLab documentation
Compare GitLab plans
Community forum
Contribute to GitLab
Provide feedback
Keyboard shortcuts
?
Snippets
Groups
Projects
Show more breadcrumbs
Statistical-Genetics
jass_preprocessing
Commits
1492680a
Commit
1492680a
authored
7 years ago
by
Vincent LAVILLE
Browse files
Options
Downloads
Patches
Plain Diff
Working copy
parent
6df4e558
Branches
Branches containing commit
Tags
Tags containing commit
No related merge requests found
Changes
1
Hide whitespace changes
Inline
Side-by-side
Showing
1 changed file
pyPCMA_1_format_v1.4_cur_work.py
+377
-0
377 additions, 0 deletions
pyPCMA_1_format_v1.4_cur_work.py
with
377 additions
and
0 deletions
pyPCMA_1_format_v1.4_cur_work.py
0 → 100755
+
377
−
0
View file @
1492680a
"""
Read raw GWAS summary statistics, filter and format
Write clean GWAS
"""
__updated__
=
'
2017-08-29
'
import
h5py
import
numpy
as
np
import
scipy.stats
as
ss
import
sys
import
math
import
os
import
pandas
as
pd
import
matplotlib.pyplot
as
plt
print
(
'
Number of arguments:
'
,
len
(
sys
.
argv
),
'
arguments.
'
)
print
(
'
Argument List:
'
,
str
(
sys
.
argv
))
perSS
=
0.7
netPath
=
"
/mnt/atlas/
"
# '/home/genstat/ATLAS/'
#netPath = '/pasteur/projets/policy01/'
GWAS_labels
=
netPath
+
'
PCMA/1._DATA/RAW.GWAS/GWAS_LABELS_MAP.txt
'
GWAS_path
=
netPath
+
'
PCMA/1._DATA/RAW.GWAS/
'
REF_filename
=
netPath
+
'
PCMA/0._REF/1KGENOME/1K_SNP_INFO_EUR_MAF0.05.hdf5
'
pathOUT
=
netPath
+
'
PCMA/1._DATA/RAW.summary/
'
outFileName
=
netPath
+
'
PCMA/1._DATA/ZSCORE_merged_ALL_NO_strand_ambiguous.hdf5
'
def_missing
=
[
''
,
'
#N/A
'
,
'
#N/A
'
,
'
N/A
'
,
'
#NA
'
,
'
-1.#IND
'
,
'
-1.#QNAN
'
,
'
-NaN
'
,
'
-nan
'
,
'
1.#IND
'
,
'
1.#QNAN
'
,
'
N/A
'
,
'
NA
'
,
'
NULL
'
,
'
NaN
'
,
'
nan
'
,
'
na
'
,
'
.
'
]
out_summary
=
"
summary_GWAS.csv
"
ImpG_output_Folder
=
netPath
+
'
PCMA/1._DATA/ImpG_zfiles/
'
# FUNCTION: find path for a given GWAS file ================================
def
walkfs
(
startdir
,
findfile
):
dircount
=
0
filecount
=
0
for
root
,
dirs
,
files
in
os
.
walk
(
startdir
):
if
findfile
in
files
:
return
dircount
,
filecount
+
files
.
index
(
findfile
),
os
.
path
.
join
(
root
,
findfile
)
dircount
+=
1
filecount
+=
len
(
files
)
# nothing found, return None instead of a full path for the file
return
dircount
,
filecount
,
None
def
dna_complement_base
(
inputbase
):
if
(
inputbase
==
'
A
'
):
return
(
'
T
'
)
if
(
inputbase
==
'
T
'
):
return
(
'
A
'
)
if
(
inputbase
==
'
G
'
):
return
(
'
C
'
)
if
(
inputbase
==
'
C
'
):
return
(
'
G
'
)
return
(
'
Not ATGC
'
)
def
dna_complement
(
input
):
return
([
dna_complement_base
(
x
)
for
x
in
input
])
##############################################################
# READ PARAM FILES
#---open GWAS dictionnary
column_dict
=
pd
.
read_csv
(
GWAS_labels
,
sep
=
'
\t
'
,
na_values
=
'
na
'
)
#---Public GWAS
GWAS_table
=
[
'
GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt
'
,
'
SNP_gwas_mc_merge_nogc.tbl.uniq
'
,
'
GIANT_2015_HIP_COMBINED_EUR.txt
'
,
'
GIANT_2015_WC_COMBINED_EUR.txt
'
,
'
GIANT_2015_WHR_COMBINED_EUR.txt
'
,
'
HDL_ONE_Europeans.tbl
'
,
'
LDL_ONE_Europeans.tbl
'
,
'
TC_ONE_Europeans.tbl
'
,
'
TG_ONE_Europeans.tbl
'
,
'
GWAS_DBP_recoded.txt
'
,
'
GWAS_SBP_recoded.txt
'
,
'
GWAS_PP_recoded.txt
'
,
'
GWAS_MAP_recoded.txt
'
,
'
MAGIC_FastingGlucose.txt
'
,
'
MAGIC_ln_FastingInsulin.txt
'
,
'
MAGIC_ln_HOMA-B.txt
'
,
'
MAGIC_ln_HOMA-IR.txt
'
,
'
MAGIC_2hrGlucose_AdjustedForBMI.txt
'
,
'
MAGIC_INSULIN_SECRETION_CIR_for_release_HMrel27.txt
'
,
'
MAGIC_proinsulin_for_release_HMrel27.txt
'
,
'
MAGIC_HbA1C.txt
'
,
'
DIAGRAMv3.2012DEC17.txt
'
,
'
fa2stu.MAF0.005.pos.out
'
,
'
fn2stu.MAF0.005.pos.out
'
,
'
ls2stu.MAF0.005.pos.out
'
,
'
gabriel_asthma_Reformated.txt
'
,
'
EUR.IBD.gwas_filtered.assoc
'
,
'
EUR.CD.gwas_filtered.assoc
'
,
'
EUR.UC.gwas_filtered.assoc
'
,
'
rall.txt
'
,
'
pgc.bip.full.2012-04.txt
'
,
'
pgc.mdd.full.2012-04.txt
'
,
'
IGAP_stage_1.txt
'
,
'
Menopause_HapMap_for_website_18112015.txt
'
,
'
CARDIoGRAM_GWAS_RESULTS.txt
'
,
'
C4D_CAD_DISCOVERY_METAANALYSIS_UPDATE.TXT
'
,
'
SSGAC_EduYears_Rietveld2013_publicrelease.txt
'
,
'
SSGAC_College_Rietveld2013_publicrelease.txt
'
,
'
POAG_allGender_nonGC.txt
'
,
'
IOP_allGender_EURonly.txt
'
,
'
IMSGC-GWAS.txt
'
,
'
RA-EUR-GWAS.txt
'
,
'
CKD.txt
'
,
'
eGFRcrea_overall.txt
'
,
'
eGFRcys_overall.txt
'
]
'''
GWAS_table = [
'
GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt
'
,
'
SNP_gwas_mc_merge_nogc.tbl.uniq
'
,
'
GIANT_2015_HIP_COMBINED_EUR.txt
'
,
'
GIANT_2015_WC_COMBINED_EUR.txt
'
]
'''
# NEED ADDITIONAL PRE-CLEANING
# 'RA_GWASmeta_European_v2.txt',
# 'pgc.adhd.full.2012-10.txt',
# 'Menarche_Nature2014_GWASMetaResults_17122014.txt',
# Autism multi-traits GWAS
'''
GWAS_table = [
'
GWAS_case_control_european_no_CHOP.assoc.logistic
'
,
'
GWAS_controls_european_AANAT.assoc.linear
'
,
'
GWAS_controls_european_ASMT_log.assoc.linear
'
,
'
GWAS_controls_european_melatonin_log.assoc.linear
'
,
'
GWAS_controls_european_NAS.assoc.linear
'
,
'
GWAS_controls_european_serotonin_log.assoc.linear
'
]
outFileName = netPath+
'
PCMA/1._DATA/ZSCORE_merged_AUTISM.hdf5
'
'''
#---get internal GWAS link
Glink
=
[]
for
GWAS
in
range
(
0
,
len
(
GWAS_table
)):
GWAS_filename
=
GWAS_table
[
GWAS
]
Glink
.
append
({
'
filename
'
:
GWAS_filename
,
'
internalDataLink
'
:
walkfs
(
GWAS_path
,
GWAS_filename
)[
2
]})
Glink
=
pd
.
DataFrame
(
Glink
,
columns
=
(
'
filename
'
,
'
internalDataLink
'
))
#---get descript GWAS table
sub_dict
=
column_dict
[(
column_dict
[
'
filename
'
]).
isin
(
GWAS_table
)]
sub_dict
=
pd
.
merge
(
sub_dict
,
Glink
,
how
=
'
inner
'
,
on
=
'
filename
'
)
summary_phe
=
sub_dict
[[
'
outcome
'
,
'
fullName
'
,
'
consortia
'
,
'
type
'
,
'
reference
'
,
'
linkRef
'
,
'
dataLink
'
,
'
internalDataLink
'
]]
#cg = out.create_group('summary_phe')
#summary_phe.to_hdf(outFileName, 'summary_table', format='table', dropna=False)
summary_phe
.
to_csv
(
out_summary
)
# can be read with pd.read_hdf(outFileName,'summary_phe')
#---open hdf5 files
ref
=
h5py
.
File
(
REF_filename
)
out
=
h5py
.
File
(
outFileName
,
"
w
"
)
for
chrom
in
range
(
1
,
23
):
chrom_str
=
'
chr%d
'
%
chrom
cg
=
out
.
create_group
(
chrom_str
)
cg
.
create_dataset
(
'
snp_ids
'
,
data
=
ref
[
chrom_str
][
'
sids
'
][...])
cg
.
create_dataset
(
'
position
'
,
data
=
ref
[
chrom_str
][
'
position
'
][...])
##############################################################
# LOOP OVER ALL GWAS
for
GWAS
in
range
(
0
,
len
(
GWAS_table
)):
##############################################################
# READ GWAS
GWAS_filename
=
GWAS_table
[
GWAS
]
GWAS_link
=
walkfs
(
GWAS_path
,
GWAS_filename
)[
2
]
print
(
'
\n
'
)
print
(
'
**** PARSING %s ******
'
%
GWAS_filename
)
#---check GWAS labels dictionnary
my_labels
=
column_dict
[
column_dict
[
'
filename
'
]
==
GWAS_filename
]
target_lab
=
my_labels
.
values
.
tolist
()[
0
]
my_study
=
my_labels
[
'
consortia
'
].
values
[
0
]
+
\
'
_
'
+
my_labels
[
'
outcome
'
].
values
[
0
]
#---parse header and get column name and order
with
open
(
GWAS_link
)
as
f
:
count_line
=
0
line
=
f
.
readline
()
header
=
line
.
split
()
list_col
=
{}
list_lab
=
{}
for
l
in
range
(
0
,
len
(
target_lab
)):
for
h
in
range
(
0
,
len
(
header
)):
if
header
[
h
]
==
target_lab
[
l
]:
list_lab
[
my_labels
.
columns
.
tolist
()[
l
]]
=
h
list_col
[
h
]
=
my_labels
.
columns
.
tolist
()[
l
]
#---load selected columns, rename columns, and set snpid as index
order
=
sorted
(
list_col
.
keys
())
fullGWAS
=
pd
.
read_csv
(
GWAS_link
,
delim_whitespace
=
True
,
na_values
=
def_missing
,
usecols
=
list_lab
.
values
(),
index_col
=
0
,
names
=
[
list_col
[
i
]
for
i
in
order
],
header
=
0
)
fullGWAS
=
fullGWAS
[
~
fullGWAS
.
index
.
duplicated
(
keep
=
'
first
'
)]
##############################################################
# read REF SNP and MERGE
SNP_in
=
0
SNP_rem
=
0
for
chrom
in
range
(
1
,
23
):
print
(
'
Working on Chromosome %d
'
%
chrom
)
chrom_str
=
'
chr%d
'
%
chrom
Nsnp
=
len
(
ref
[
chrom_str
][
'
position
'
])
# create alernatives SNPid for the SNPid mapping
# == map based on rsid
chrPos1
=
pd
.
DataFrame
(
ref
[
chrom_str
][
'
sids
'
][...],
columns
=
[
'
snpid
'
],
dtype
=
str
)
# == map based on chr:position
chrPos2
=
pd
.
DataFrame
(
pd
.
Series
([
str
(
chrom
)]
*
Nsnp
).
str
.
cat
(
ref
[
chrom_str
][
'
position
'
][...].
astype
(
str
),
sep
=
'
:
'
),
columns
=
[
'
snpid
'
])
# merge the alternative mappings
merge_GWAS
=
pd
.
merge
(
chrPos1
,
fullGWAS
,
how
=
'
left
'
,
right_index
=
True
,
left_on
=
'
snpid
'
,
sort
=
False
,
indicator
=
True
)
altID_1
=
pd
.
merge
(
chrPos2
,
fullGWAS
,
how
=
'
left
'
,
right_index
=
True
,
left_on
=
'
snpid
'
,
sort
=
False
,
indicator
=
True
)
sub_S
=
(
altID_1
[
'
_merge
'
]
==
"
both
"
)
if
(
sub_S
.
any
()):
merge_GWAS
[
sub_S
]
=
altID_1
.
iloc
[
np
.
where
(
sub_S
)[
0
]]
SNP_in
+=
sum
((
merge_GWAS
[
'
_merge
'
]
==
"
both
"
)
*
1
)
# check direction of Zscore
#%%%%%%%%%STILL NEED TO CHECK ALTERNATIVE ALLELE%%%%%%%%%%%%%%%%
#upper_Allele = [x.upper() for x in ((merge_GWAS['a1']).values).astype(str)]
#sign1 = ((ref[chrom_str]['ref'].value == upper_Allele)*1-0.5)*2
gwasAllele1
=
[
x
.
upper
()
for
x
in
((
merge_GWAS
[
'
a1
'
]).
values
).
astype
(
str
)]
gwasAllele2
=
[
x
.
upper
()
for
x
in
((
merge_GWAS
[
'
a2
'
]).
values
).
astype
(
str
)]
gwasAllele1_c
=
dna_complement
(
gwasAllele1
)
gwasAllele2_c
=
dna_complement
(
gwasAllele2
)
refAllele1
=
ref
[
chrom_str
][
'
ref
'
].
value
.
astype
(
str
)
refAllele2
=
ref
[
chrom_str
][
'
alt
'
].
value
.
astype
(
str
)
case1
=
((
gwasAllele1
==
refAllele1
)
&
(
gwasAllele2
==
refAllele2
))
case2
=
((
gwasAllele2
==
refAllele1
)
&
(
gwasAllele1
==
refAllele2
))
case3
=
((
gwasAllele1_c
==
refAllele1
)
&
(
gwasAllele2_c
==
refAllele2
))
case4
=
((
gwasAllele2_c
==
refAllele1
)
&
(
gwasAllele1_c
==
refAllele2
))
sign1
=
np
.
repeat
(
np
.
nan
,
Nsnp
)
sign1
[
case1
]
=
np
.
repeat
(
+
1.0
,
sum
(
case1
))
sign1
[
case2
]
=
np
.
repeat
(
-
1.0
,
sum
(
case2
))
sign1
[
case3
]
=
np
.
repeat
(
+
1.0
,
sum
(
case3
))
sign1
[
case4
]
=
np
.
repeat
(
-
1.0
,
sum
(
case4
))
# get sign of the stat or OR (for the later need to arbitrarily decide when OR=1.00)
if
(
'
z
'
in
fullGWAS
.
columns
):
sign3
=
np
.
sign
(
merge_GWAS
.
z
)
elif
(
'
OR
'
in
fullGWAS
.
columns
):
sign3
=
np
.
sign
(
merge_GWAS
.
OR
-
1
+
(
np
.
random
.
rand
(
len
(
merge_GWAS
.
OR
))
-
0.5
)
*
0.0001
)
else
:
raise
ValueError
(
'
Both the statistic and OR are missing: ZSCORE cannot be derived
'
)
if
(
'
pval
'
not
in
merge_GWAS
):
raise
ValueError
(
'
P-value column was not found
'
)
#ZSCORE = np.sqrt(ss.chi2.ppf(1-merge_GWAS['pval'],1)) * sign1 * sign3
ZSCORE
=
np
.
sqrt
(
ss
.
chi2
.
isf
(
merge_GWAS
[
'
pval
'
],
1
))
*
sign1
*
sign3
# filter base on sample size =====================================================
#---sample size does not exists
if
'
n
'
in
fullGWAS
.
columns
:
myN
=
merge_GWAS
.
n
#--- freq, case-cont N exist
elif
((
'
ncas
'
in
fullGWAS
.
columns
)
&
(
'
ncont
'
in
fullGWAS
.
columns
)):
sumN
=
merge_GWAS
.
ncas
+
merge_GWAS
.
ncont
perCase
=
merge_GWAS
.
ncas
/
sumN
myN
=
sumN
*
perCase
*
(
1
-
perCase
)
#---se, freq exist and QT
elif
((
'
se
'
in
fullGWAS
.
columns
)
&
(
'
freq
'
in
fullGWAS
.
columns
)):
myN
=
1
/
(
merge_GWAS
.
freq
*
(
1
-
merge_GWAS
.
freq
)
*
2
*
merge_GWAS
.
se
**
2
)
#---se exists but no freq, // use ref panel freq
elif
((
'
se
'
in
fullGWAS
.
columns
)
&
(
'
freq
'
not
in
fullGWAS
.
columns
)):
myN
=
1
/
(
ref
[
chrom_str
][
'
eur_mafs
'
][...]
*
(
1
-
ref
[
chrom_str
][
'
eur_mafs
'
][...])
*
2
*
merge_GWAS
.
se
**
2
)
else
:
raise
ValueError
(
'
Both Sample size and SE(beta) are missing: sample size filtering cannot be applied
'
)
# replace infinite value by the max finite one
if
(
sum
(
np
.
isinf
(
myN
))
>
0
):
myN
[
np
.
isinf
(
myN
)]
=
np
.
max
(
myN
[
np
.
isfinite
(
myN
)])
myW_thres
=
np
.
percentile
(
myN
.
dropna
(),
90
)
#myW_thres = np.max(myN)
ss_thres
=
perSS
*
myW_thres
if
chrom
==
1
:
myN_genome
=
myN
else
:
myN_genome
=
np
.
concatenate
((
myN_genome
,
myN
),
axis
=
0
)
toRem
=
np
.
where
(
myN
<=
ss_thres
)
SNP_rem
+=
len
(
toRem
[
0
])
if
(
len
(
toRem
[
0
])
>
0
):
ZSCORE
.
iloc
[
toRem
]
=
np
.
nan
# add created data to the group
cg
=
out
[
chrom_str
]
cg
.
create_dataset
(
'
z
'
+
'
_
'
+
my_study
,
data
=
ZSCORE
)
ImpG_output
=
pd
.
DataFrame
({
'
rsID
'
:
chrPos1
[
'
snpid
'
],
'
pos
'
:
ref
[
chrom_str
][
'
position
'
][...],
'
A0
'
:
[
str
(
i
)[
2
]
for
i
in
ref
[
chrom_str
][
'
ref
'
][...]],
'
A1
'
:
[
str
(
i
)[
2
]
for
i
in
ref
[
chrom_str
][
'
alt
'
][...]],
'
Z
'
:
ZSCORE
.
values
},
columns
=
[
'
rsID
'
,
'
pos
'
,
'
A0
'
,
"
A1
"
,
"
Z
"
])
ImpG_output
.
dropna
(
subset
=
[
"
Z
"
],
how
=
"
any
"
,
inplace
=
True
)
impg_output_file
=
ImpG_output_Folder
+
'
z_
'
+
my_study
+
'
_chr_
'
+
str
(
chrom
)
+
"
.txt
"
print
(
"
WRITING CHR {} results for {} to:
"
.
format
(
chrom
,
my_study
))
ImpG_output
.
to_csv
(
impg_output_file
,
sep
=
"
\t
"
,
index
=
False
)
# ##############################################################
# # SUMMARY + PLOTS
# print("There was %d SNPs available, of which %d were filtered out because of small sample size" % (
# SNP_in, SNP_rem))
#
# # the histogram of the #sample per SNP
# fig = plt.figure()
# fig.suptitle(GWAS_filename, fontsize=14, fontweight='bold')
# nn = (pd.DataFrame(myN_genome)).dropna()
# n, bins, patches = plt.hist(nn, 50, normed=1, facecolor='blue', alpha=0.75)
#
# Ymax = np.max(n)*1.2
#
# plt.ylim((0, Ymax))
# plt.xlabel('Infered weight (i.e. Sample size)')
# plt.ylabel('Frequency')
# plt.grid(True)
# plt.axvline(x=myW_thres, linewidth=2, linestyle='dashed', color='r')
# plt.axvline(x=ss_thres, linewidth=2, linestyle='dashed', color='g')
#
# Xpos = np.min(nn) + 0.01*(np.max(nn) - np.min(nn))
# ax = fig.add_subplot(111)
#
# ax.text(Xpos, Ymax*0.96, 'max(N)='+str(np.max(nn)
# [0]), verticalalignment='top', horizontalalignment='left')
# ax.text(Xpos, Ymax*0.92, 'min(N)='+str(np.min(nn)
# [0]), verticalalignment='top', horizontalalignment='left')
# ax.text(Xpos, Ymax*0.88, '#SNPs='+str(len(nn)),
# verticalalignment='top', horizontalalignment='left')
#
# # plt.show()
# fig.savefig(pathOUT+GWAS_filename+'_SampleSizeDist.png', dpi=fig.dpi)
# plt.close(fig)
# CLOSE h5 FILES
out
.
close
()
ref
.
close
()
print
(
'
\n
'
)
This diff is collapsed.
Click to expand it.
Preview
0%
Loading
Try again
or
attach a new file
.
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Save comment
Cancel
Please
register
or
sign in
to comment