"""
Read raw GWAS summary statistics, filter and format
Write clean GWAS
"""
__updated__ = '2017-08-29'
import h5py
import numpy as np
import scipy.stats as ss
import sys
import math
import os
import pandas as pd
import matplotlib.pyplot as plt
print('Number of arguments:', len(sys.argv), 'arguments.')
print('Argument List:', str(sys.argv))
perSS = 0.7
netPath = "/mnt/atlas/" # '/home/genstat/ATLAS/'
#netPath = '/pasteur/projets/policy01/'
GWAS_labels = netPath+'PCMA/1._DATA/RAW.GWAS/GWAS_LABELS_MAP.txt'
GWAS_path = netPath+'PCMA/1._DATA/RAW.GWAS/'
REF_filename = netPath+'PCMA/0._REF/1KGENOME/1K_SNP_INFO_EUR_MAF0.05.hdf5'
pathOUT = netPath+'PCMA/1._DATA/RAW.summary/'
outFileName = netPath+'PCMA/1._DATA/ZSCORE_merged_ALL_NO_strand_ambiguous.hdf5'
def_missing = ['', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN',
               '-nan', '1.#IND', '1.#QNAN', 'NA', 'NULL', 'NaN', 'nan', 'na', '.']
out_summary = "summary_GWAS.csv"
ImpG_output_Folder = netPath+ 'PCMA/1._DATA/ImpG_zfiles/'
# FUNCTION: find path for a given GWAS file ================================
def walkfs(startdir, findfile):
    """Walk `startdir` recursively; return (#dirs scanned, file index, full path)."""
    dircount = 0
    filecount = 0
    for root, dirs, files in os.walk(startdir):
        if findfile in files:
            return dircount, filecount + files.index(findfile), os.path.join(root, findfile)
        dircount += 1
        filecount += len(files)
    # nothing found: return None instead of a full path for the file
    return dircount, filecount, None
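# usage: dircount, fileindex, filepath = walkfs(GWAS_path, GWAS_filename)
# filepath is None when the file is not found anywhere under startdir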
def dna_complement_base(inputbase):
    """Return the complementary DNA base, or 'Not ATGC' for anything else."""
    if inputbase == 'A':
        return 'T'
    if inputbase == 'T':
        return 'A'
    if inputbase == 'G':
        return 'C'
    if inputbase == 'C':
        return 'G'
    return 'Not ATGC'


def dna_complement(sequence):
    """Complement every base of a list of single-character bases."""
    return [dna_complement_base(x) for x in sequence]
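# e.g. dna_complement(['A', 'C', 'T']) returns ['T', 'G', 'A']; any non-ATGC
# character maps to the sentinel string 'Not ATGC'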
##############################################################
# READ PARAM FILES
#---open GWAS dictionary
column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na')
#---Public GWAS
GWAS_table = ['GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt',
'SNP_gwas_mc_merge_nogc.tbl.uniq',
'GIANT_2015_HIP_COMBINED_EUR.txt',
'GIANT_2015_WC_COMBINED_EUR.txt',
'GIANT_2015_WHR_COMBINED_EUR.txt',
'HDL_ONE_Europeans.tbl',
'LDL_ONE_Europeans.tbl',
'TC_ONE_Europeans.tbl',
'TG_ONE_Europeans.tbl',
'GWAS_DBP_recoded.txt',
'GWAS_SBP_recoded.txt',
'GWAS_PP_recoded.txt',
'GWAS_MAP_recoded.txt',
'MAGIC_FastingGlucose.txt',
'MAGIC_ln_FastingInsulin.txt',
'MAGIC_ln_HOMA-B.txt',
'MAGIC_ln_HOMA-IR.txt',
'MAGIC_2hrGlucose_AdjustedForBMI.txt',
'MAGIC_INSULIN_SECRETION_CIR_for_release_HMrel27.txt',
'MAGIC_proinsulin_for_release_HMrel27.txt',
'MAGIC_HbA1C.txt',
'DIAGRAMv3.2012DEC17.txt',
'fa2stu.MAF0.005.pos.out',
'fn2stu.MAF0.005.pos.out',
'ls2stu.MAF0.005.pos.out',
'gabriel_asthma_Reformated.txt',
'EUR.IBD.gwas_filtered.assoc',
'EUR.CD.gwas_filtered.assoc',
'EUR.UC.gwas_filtered.assoc',
'rall.txt',
'pgc.bip.full.2012-04.txt',
'pgc.mdd.full.2012-04.txt',
'IGAP_stage_1.txt',
'Menopause_HapMap_for_website_18112015.txt',
'CARDIoGRAM_GWAS_RESULTS.txt',
'C4D_CAD_DISCOVERY_METAANALYSIS_UPDATE.TXT',
'SSGAC_EduYears_Rietveld2013_publicrelease.txt',
'SSGAC_College_Rietveld2013_publicrelease.txt',
'POAG_allGender_nonGC.txt',
'IOP_allGender_EURonly.txt',
'IMSGC-GWAS.txt',
'RA-EUR-GWAS.txt',
'CKD.txt',
'eGFRcrea_overall.txt',
'eGFRcys_overall.txt']
'''
GWAS_table = ['GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt',
'SNP_gwas_mc_merge_nogc.tbl.uniq',
'GIANT_2015_HIP_COMBINED_EUR.txt',
'GIANT_2015_WC_COMBINED_EUR.txt']
'''
# NEED ADDITIONAL PRE-CLEANING
# 'RA_GWASmeta_European_v2.txt',
# 'pgc.adhd.full.2012-10.txt',
# 'Menarche_Nature2014_GWASMetaResults_17122014.txt',
# Autism multi-traits GWAS
'''
GWAS_table = ['GWAS_case_control_european_no_CHOP.assoc.logistic',
'GWAS_controls_european_AANAT.assoc.linear',
'GWAS_controls_european_ASMT_log.assoc.linear',
'GWAS_controls_european_melatonin_log.assoc.linear',
'GWAS_controls_european_NAS.assoc.linear',
'GWAS_controls_european_serotonin_log.assoc.linear']
outFileName = netPath+'PCMA/1._DATA/ZSCORE_merged_AUTISM.hdf5'
'''
#---get internal GWAS link
Glink = []
for GWAS in range(0, len(GWAS_table)):
    GWAS_filename = GWAS_table[GWAS]
    Glink.append({'filename': GWAS_filename,
                  'internalDataLink': walkfs(GWAS_path, GWAS_filename)[2]})
Glink = pd.DataFrame(Glink, columns=('filename', 'internalDataLink'))
#---get descript GWAS table
sub_dict = column_dict[(column_dict['filename']).isin(GWAS_table)]
sub_dict = pd.merge(sub_dict, Glink, how='inner', on='filename')
summary_phe = sub_dict[['outcome', 'fullName', 'consortia',
'type', 'reference', 'linkRef', 'dataLink', 'internalDataLink']]
#cg = out.create_group('summary_phe')
#summary_phe.to_hdf(outFileName, 'summary_table', format='table', dropna=False)
summary_phe.to_csv(out_summary)
# can be read with pd.read_hdf(outFileName,'summary_phe')
#---open hdf5 files
ref = h5py.File(REF_filename, 'r')
out = h5py.File(outFileName, "w")
for chrom in range(1, 23):
    chrom_str = 'chr%d' % chrom
    cg = out.create_group(chrom_str)
    cg.create_dataset('snp_ids', data=ref[chrom_str]['sids'][...])
    cg.create_dataset('position', data=ref[chrom_str]['position'][...])
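# resulting HDF5 layout: /chr<k>/snp_ids and /chr<k>/position for k in 1..22,
# plus one 'z_<consortia>_<outcome>' dataset per GWAS appended in the loop below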
##############################################################
# LOOP OVER ALL GWAS
for GWAS in range(0, len(GWAS_table)):
    ##############################################################
    # READ GWAS
    GWAS_filename = GWAS_table[GWAS]
    GWAS_link = walkfs(GWAS_path, GWAS_filename)[2]
    print('\n')
    print('**** PARSING %s ******' % GWAS_filename)
    #---check GWAS labels dictionary
    my_labels = column_dict[column_dict['filename'] == GWAS_filename]
    target_lab = my_labels.values.tolist()[0]
    my_study = my_labels['consortia'].values[0] + \
        '_' + my_labels['outcome'].values[0]
    #---parse header and get column name and order
    with open(GWAS_link) as f:
        header = f.readline().split()
    list_col = {}
    list_lab = {}
    for l in range(0, len(target_lab)):
        for h in range(0, len(header)):
            if header[h] == target_lab[l]:
                list_lab[my_labels.columns.tolist()[l]] = h
                list_col[h] = my_labels.columns.tolist()[l]
    #---load selected columns, rename them, and set snpid as index
    order = sorted(list_col.keys())
    fullGWAS = pd.read_csv(GWAS_link, delim_whitespace=True, na_values=def_missing,
                           usecols=list(list_lab.values()), index_col=0,
                           names=[list_col[i] for i in order], header=0)
    # keep the first occurrence of duplicated SNP ids
    fullGWAS = fullGWAS[~fullGWAS.index.duplicated(keep='first')]
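    # fullGWAS now uses the canonical column names from GWAS_LABELS_MAP.txt; the
    # ones consumed below are a1/a2 (alleles), pval, a direction column (z or OR)
    # and sample-size information (n, or ncas/ncont, or se with optional freq)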
    ##############################################################
    # read REF SNP and MERGE
    SNP_in = 0
    SNP_rem = 0
    for chrom in range(1, 23):
        print('Working on Chromosome %d' % chrom)
        chrom_str = 'chr%d' % chrom
        Nsnp = len(ref[chrom_str]['position'])
        # create alternative SNP ids for the SNP id mapping
        # == map based on rsid
        chrPos1 = pd.DataFrame(ref[chrom_str]['sids'][...], columns=['snpid'], dtype=str)
        # == map based on chr:position
        chrPos2 = pd.DataFrame(pd.Series([str(chrom)] * Nsnp).str.cat(
            ref[chrom_str]['position'][...].astype(str), sep=':'), columns=['snpid'])
        # merge the two alternative mappings, giving priority to chr:position hits
        merge_GWAS = pd.merge(chrPos1, fullGWAS, how='left',
                              right_index=True, left_on='snpid', sort=False, indicator=True)
        altID_1 = pd.merge(chrPos2, fullGWAS, how='left', right_index=True,
                           left_on='snpid', sort=False, indicator=True)
        sub_S = (altID_1['_merge'] == "both")
        if sub_S.any():
            merge_GWAS[sub_S] = altID_1.iloc[np.where(sub_S)[0]]
        SNP_in += sum((merge_GWAS['_merge'] == "both") * 1)
        # check direction of Zscore
        #%%%%%%%%% STILL NEED TO CHECK ALTERNATIVE ALLELE %%%%%%%%%%%%%%%%
        #upper_Allele = [x.upper() for x in ((merge_GWAS['a1']).values).astype(str)]
        #sign1 = ((ref[chrom_str]['ref'][...] == upper_Allele)*1-0.5)*2
        gwasAllele1 = [x.upper() for x in merge_GWAS['a1'].values.astype(str)]
        gwasAllele2 = [x.upper() for x in merge_GWAS['a2'].values.astype(str)]
        gwasAllele1_c = dna_complement(gwasAllele1)
        gwasAllele2_c = dna_complement(gwasAllele2)
        refAllele1 = ref[chrom_str]['ref'][...].astype(str)
        refAllele2 = ref[chrom_str]['alt'][...].astype(str)
        case1 = ((gwasAllele1 == refAllele1) & (gwasAllele2 == refAllele2))
        case2 = ((gwasAllele2 == refAllele1) & (gwasAllele1 == refAllele2))
        case3 = ((gwasAllele1_c == refAllele1) & (gwasAllele2_c == refAllele2))
        case4 = ((gwasAllele2_c == refAllele1) & (gwasAllele1_c == refAllele2))
        sign1 = np.repeat(np.nan, Nsnp)
        sign1[case1] = +1.0
        sign1[case2] = -1.0
        sign1[case3] = +1.0
        sign1[case4] = -1.0
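        # sign1 reconciles the GWAS effect allele with the reference panel:
        #   case1  (a1,a2) == (ref,alt)           -> +1 (same orientation)
        #   case2  (a1,a2) == (alt,ref)           -> -1 (alleles swapped)
        #   case3  complements match (ref,alt)    -> +1 (strand flip)
        #   case4  complements match (alt,ref)    -> -1 (strand flip + swap)
        # unmatched SNPs keep sign1 = NaN; strand-ambiguous pairs (A/T, G/C)
        # would satisfy two cases at once, hence their exclusion upstream
        # (cf. the NO_strand_ambiguous output file name)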
        # get the sign of the stat or of the OR (for the latter we need to
        # arbitrarily break ties when OR=1.00, done with a tiny random jitter)
        if 'z' in fullGWAS.columns:
            sign3 = np.sign(merge_GWAS.z)
        elif 'OR' in fullGWAS.columns:
            sign3 = np.sign(merge_GWAS.OR - 1 +
                            (np.random.rand(len(merge_GWAS.OR)) - 0.5) * 0.0001)
        else:
            raise ValueError(
                'Both the statistic and OR are missing: ZSCORE cannot be derived')
        if 'pval' not in merge_GWAS:
            raise ValueError('P-value column was not found')
        #ZSCORE = np.sqrt(ss.chi2.ppf(1-merge_GWAS['pval'], 1)) * sign1 * sign3
        ZSCORE = np.sqrt(ss.chi2.isf(merge_GWAS['pval'], 1)) * sign1 * sign3
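        # chi2.isf(pval, 1) is the inverse survival function of a 1-df chi-square,
        # so its square root is |Z|; isf(p) is used rather than ppf(1-p) (the
        # commented line above) because it keeps precision for very small p-values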
        # filter based on sample size =====================================================
        #--- sample size column exists
        if 'n' in fullGWAS.columns:
            myN = merge_GWAS.n
        #--- case and control counts exist
        elif ('ncas' in fullGWAS.columns) & ('ncont' in fullGWAS.columns):
            sumN = merge_GWAS.ncas + merge_GWAS.ncont
            perCase = merge_GWAS.ncas / sumN
            myN = sumN * perCase * (1 - perCase)
        #--- se and freq exist (quantitative trait)
        elif ('se' in fullGWAS.columns) & ('freq' in fullGWAS.columns):
            myN = 1 / (merge_GWAS.freq * (1 - merge_GWAS.freq) * 2 * merge_GWAS.se**2)
        #--- se exists but no freq // use the reference panel frequencies
        elif ('se' in fullGWAS.columns) & ('freq' not in fullGWAS.columns):
            myN = 1 / (ref[chrom_str]['eur_mafs'][...] *
                       (1 - ref[chrom_str]['eur_mafs'][...]) * 2 * merge_GWAS.se**2)
        else:
            raise ValueError(
                'Both sample size and SE(beta) are missing: sample size filtering cannot be applied')
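        # rationale for the inferred weights above (standard GWAS approximations):
        #   case-control:  N_eff = (ncas+ncont) * phi * (1-phi), phi = ncas/(ncas+ncont)
        #   quantitative:  var(beta_hat) ~= 1/(2*N*p*(1-p))  =>  N ~= 1/(2*p*(1-p)*se^2)
        # both are proxies for per-SNP information content rather than exact Ns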
        # replace infinite values by the largest finite one
        if sum(np.isinf(myN)) > 0:
            myN[np.isinf(myN)] = np.max(myN[np.isfinite(myN)])
        myW_thres = np.percentile(myN.dropna(), 90)
        #myW_thres = np.max(myN)
        ss_thres = perSS * myW_thres
        if chrom == 1:
            myN_genome = myN
        else:
            myN_genome = np.concatenate((myN_genome, myN), axis=0)
        toRem = np.where(myN <= ss_thres)
        SNP_rem += len(toRem[0])
        if len(toRem[0]) > 0:
            ZSCORE.iloc[toRem] = np.nan
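        # filtering rule: a SNP is dropped (Z set to NaN) when its inferred sample
        # size falls below perSS (70%) of the study's 90th-percentile sample size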
        # add the created data to the chromosome group
        cg = out[chrom_str]
        cg.create_dataset('z' + '_' + my_study, data=ZSCORE)
        ImpG_output = pd.DataFrame({
            'rsID': chrPos1['snpid'],
            'pos': ref[chrom_str]['position'][...],
            # str(b'A')[2] == 'A': extract the allele letter from the bytes repr
            'A0': [str(i)[2] for i in ref[chrom_str]['ref'][...]],
            'A1': [str(i)[2] for i in ref[chrom_str]['alt'][...]],
            'Z': ZSCORE.values
        }, columns=['rsID', 'pos', 'A0', 'A1', 'Z'])
        ImpG_output.dropna(subset=['Z'], how='any', inplace=True)
        impg_output_file = ImpG_output_Folder + 'z_' + my_study + '_chr_' + str(chrom) + '.txt'
        print('WRITING CHR {} results for {} to: {}'.format(chrom, my_study, impg_output_file))
        ImpG_output.to_csv(impg_output_file, sep='\t', index=False)
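        # the resulting ImpG z-files are tab-separated with header rsID/pos/A0/A1/Z,
        # one file per study and chromosome; a line could read (values illustrative):
        #   rs0000000   123456   A   G   1.85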
    # ##############################################################
    # # SUMMARY + PLOTS
    # print("There were %d SNPs available, of which %d were filtered out because of small sample size" % (
    #     SNP_in, SNP_rem))
    #
    # # the histogram of the #samples per SNP
    # fig = plt.figure()
    # fig.suptitle(GWAS_filename, fontsize=14, fontweight='bold')
    # nn = (pd.DataFrame(myN_genome)).dropna()
    # n, bins, patches = plt.hist(nn, 50, normed=1, facecolor='blue', alpha=0.75)
    #
    # Ymax = np.max(n) * 1.2
    #
    # plt.ylim((0, Ymax))
    # plt.xlabel('Inferred weight (i.e. Sample size)')
    # plt.ylabel('Frequency')
    # plt.grid(True)
    # plt.axvline(x=myW_thres, linewidth=2, linestyle='dashed', color='r')
    # plt.axvline(x=ss_thres, linewidth=2, linestyle='dashed', color='g')
    #
    # Xpos = np.min(nn) + 0.01 * (np.max(nn) - np.min(nn))
    # ax = fig.add_subplot(111)
    #
    # ax.text(Xpos, Ymax * 0.96, 'max(N)=' + str(np.max(nn)[0]),
    #         verticalalignment='top', horizontalalignment='left')
    # ax.text(Xpos, Ymax * 0.92, 'min(N)=' + str(np.min(nn)[0]),
    #         verticalalignment='top', horizontalalignment='left')
    # ax.text(Xpos, Ymax * 0.88, '#SNPs=' + str(len(nn)),
    #         verticalalignment='top', horizontalalignment='left')
    #
    # # plt.show()
    # fig.savefig(pathOUT + GWAS_filename + '_SampleSizeDist.png', dpi=fig.dpi)
    # plt.close(fig)
# CLOSE h5 FILES
out.close()
ref.close()
print('\n')