Commit 2be0f8b7 authored by Hanna JULIENNE

clarified that beta or Z can be specified interchangeably

parent b488e211
@@ -112,7 +112,7 @@ Input
 * n: name of the column storing the sample size by variants (optional, will be inferred from the MAF, genetic effect and standard deviation if absent)
 * ncas: For binary traits, name of the column storing the number of cases by variants (optional)
 * ncont: For binary traits, name of the column storing the number of controls by variants (optional)
-* z: name of the column storing the genetic effect (beta) in the gwas file
+* beta: name of the column storing the genetic effect (beta) in the gwas file
 * OR : For binary traits, Odd ratio when available. Not to be confounded with the genetic effect size or 'beta'.
 * index-type: precise the type of index
 * imputation_quality: (Optional) column containing individual-based imputation quality. Will be used to filter low quality imputation data from GWASs if the option --imputation-quality-treshold is used
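Aside: the sample-size inference mentioned for the optional n column follows the usual quantitative-trait approximation Var(beta) ~ 1/(2*N*MAF*(1-MAF)), the same formula the pipeline script below applies. A minimal sketch, with a hypothetical helper name:

import numpy as np

def infer_sample_size(maf, beta_se):
    # hypothetical helper: N ~ 1 / (2 * MAF * (1 - MAF) * se(beta)**2)
    maf = np.asarray(maf, dtype=float)
    se = np.asarray(beta_se, dtype=float)
    return 1.0 / (2.0 * maf * (1.0 - maf) * se**2)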
@@ -14,8 +14,8 @@ def compute_z_score(mgwas):
     add the corresponding column to the mgwas dataframe
     """
-    if 'z' in mgwas.columns:
-        sign_vect = np.sign(mgwas.z)
+    if 'beta_or_Z' in mgwas.columns:
+        sign_vect = np.sign(mgwas.beta_or_Z)
     else:
         if "OR" in mgwas.columns:
             sign_vect = np.sign(mgwas["OR"] - 1.0 + 10**(-8))
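Since compute_z_score uses only the sign of the effect column (the z-score magnitude is rebuilt from the p-value), beta and Z are indeed interchangeable here. A minimal sketch of that identity, with a hypothetical function name:

import numpy as np
import scipy.stats as ss

def z_from_pval_and_effect(pval, effect):
    # |Z| from the two-sided p-value via the chi2 (1 df) inverse survival
    # function; the direction comes from beta or Z, which share the same sign
    return np.sign(effect) * np.sqrt(ss.chi2.isf(pval, 1))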
@@ -146,7 +146,6 @@ def read_gwas( gwas_internal_link, column_map, imputation_treshold=None):
     fullGWAS = pd.read_csv(gwas_internal_link, delim_whitespace=True,
                            usecols = column_map.values,
                            compression=compression,
-                           #column_dict['label_position'].keys(),
                            names= column_map.index,
                            header=0, na_values= ['', '#N/A', '#N/A', 'N/A','#NA', '-1.#IND', '-1.#QNAN',
                                     '-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A',
"""
Read raw GWAS summary statistics, filter and format
Write clean GWAS
"""
__updated__ = '2018-02-19'
import h5py
import numpy as np
import scipy.stats as ss
import sys
import math
import os
import pandas as pd
import matplotlib.pyplot as plt
perSS = 0.7
netPath = "/mnt/atlas/" # '/home/genstat/ATLAS/'
#netPath = '/pasteur/projets/policy01/'
GWAS_labels = netPath+'PCMA/1._DATA/RAW.GWAS/GWAS_LABELS_MAP.txt'
GWAS_path = netPath+'PCMA/1._DATA/RAW.GWAS/'
REF_filename = netPath+'PCMA/0._REF/1KGENOME/1K_SNP_INFO_EUR_MAF0.05.hdf5'
pathOUT = netPath+'PCMA/1._DATA/RAW.summary/'
outFileName = netPath+'PCMA/1._DATA/ZSCORE_merged_ALL_NO_strand_ambiguous.hdf5'
def_missing = ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN',
'-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'nan', 'na', '.']
out_summary = "summary_GWAS.csv"
ImpG_output_Folder = netPath+ 'PCMA/1._DATA/ImpG_zfiles/'
# FUNCTION: find path for a given GWAS file ================================
def walkfs(startdir, findfile):
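    """Locate findfile under startdir.

    Returns (dircount, filecount, full_path); full_path is None when the
    file is not found.
    """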
    dircount = 0
    filecount = 0
    for root, dirs, files in os.walk(startdir):
        if findfile in files:
            return dircount, filecount + files.index(findfile), os.path.join(root, findfile)
        dircount += 1
        filecount += len(files)
    # nothing found, return None instead of a full path for the file
    return dircount, filecount, None
def dna_complement_base(inputbase):
    if inputbase == 'A':
        return 'T'
    if inputbase == 'T':
        return 'A'
    if inputbase == 'G':
        return 'C'
    if inputbase == 'C':
        return 'G'
    return 'Not ATGC'
def dna_complement(input):
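    """Return the base-by-base complement of a DNA sequence (list of bases)."""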
    return [dna_complement_base(x) for x in input]
##############################################################
# READ PARAM FILES
#---open GWAS dictionary
column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na')
#---Public GWAS
GWAS_table = ['GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt',
'SNP_gwas_mc_merge_nogc.tbl.uniq',
'GIANT_2015_HIP_COMBINED_EUR.txt',
'GIANT_2015_WC_COMBINED_EUR.txt',
'GIANT_2015_WHR_COMBINED_EUR.txt',
'HDL_ONE_Europeans.tbl',
'LDL_ONE_Europeans.tbl',
'TC_ONE_Europeans.tbl',
'TG_ONE_Europeans.tbl',
'GWAS_DBP_recoded.txt',
'GWAS_SBP_recoded.txt',
'GWAS_PP_recoded.txt',
'GWAS_MAP_recoded.txt',
'MAGIC_FastingGlucose.txt',
'MAGIC_ln_FastingInsulin.txt',
'MAGIC_ln_HOMA-B.txt',
'MAGIC_ln_HOMA-IR.txt',
'MAGIC_2hrGlucose_AdjustedForBMI.txt',
'MAGIC_INSULIN_SECRETION_CIR_for_release_HMrel27.txt',
'MAGIC_proinsulin_for_release_HMrel27.txt',
'MAGIC_HbA1C.txt',
'DIAGRAMv3.2012DEC17.txt',
'fa2stu.MAF0.005.pos.out',
'fn2stu.MAF0.005.pos.out',
'ls2stu.MAF0.005.pos.out',
'gabriel_asthma_Reformated.txt',
'EUR.IBD.gwas_filtered.assoc',
'EUR.CD.gwas_filtered.assoc',
'EUR.UC.gwas_filtered.assoc',
'rall.txt',
'pgc.bip.full.2012-04.txt',
'pgc.mdd.full.2012-04.txt',
'IGAP_stage_1.txt',
'Menopause_HapMap_for_website_18112015.txt',
'CARDIoGRAM_GWAS_RESULTS.txt',
'C4D_CAD_DISCOVERY_METAANALYSIS_UPDATE.TXT',
'SSGAC_EduYears_Rietveld2013_publicrelease.txt',
'SSGAC_College_Rietveld2013_publicrelease.txt',
'POAG_allGender_nonGC.txt',
'IOP_allGender_EURonly.txt',
'IMSGC-GWAS.txt',
'RA-EUR-GWAS.txt',
'CKD.txt',
'eGFRcrea_overall.txt',
'eGFRcys_overall.txt']
'''
GWAS_table = ['GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt',
'SNP_gwas_mc_merge_nogc.tbl.uniq',
'GIANT_2015_HIP_COMBINED_EUR.txt',
'GIANT_2015_WC_COMBINED_EUR.txt']
'''
# NEED ADDITIONAL PRE-CLEANING
# 'RA_GWASmeta_European_v2.txt',
# 'pgc.adhd.full.2012-10.txt',
# 'Menarche_Nature2014_GWASMetaResults_17122014.txt',
# Autism multi-traits GWAS
'''
GWAS_table = ['GWAS_case_control_european_no_CHOP.assoc.logistic',
'GWAS_controls_european_AANAT.assoc.linear',
'GWAS_controls_european_ASMT_log.assoc.linear',
'GWAS_controls_european_melatonin_log.assoc.linear',
'GWAS_controls_european_NAS.assoc.linear',
'GWAS_controls_european_serotonin_log.assoc.linear']
outFileName = netPath+'PCMA/1._DATA/ZSCORE_merged_AUTISM.hdf5'
'''
#---get internal GWAS link
Glink = []
for GWAS in range(0, len(GWAS_table)):
    GWAS_filename = GWAS_table[GWAS]
    Glink.append({'filename': GWAS_filename,
                  'internalDataLink': walkfs(GWAS_path, GWAS_filename)[2]})
Glink = pd.DataFrame(Glink, columns=('filename', 'internalDataLink'))
#---get descript GWAS table
sub_dict = column_dict[(column_dict['filename']).isin(GWAS_table)]
sub_dict = pd.merge(sub_dict, Glink, how='inner', on='filename')
summary_phe = sub_dict[['outcome', 'fullName', 'consortia',
                        'type', 'reference', 'linkRef', 'dataLink', 'internalDataLink']]
#cg = out.create_group('summary_phe')
#summary_phe.to_hdf(outFileName, 'summary_table', format='table', dropna=False)
summary_phe.to_csv(out_summary)
# can be read with pd.read_hdf(outFileName,'summary_phe')
#---open hdf5 files
ref = h5py.File(REF_filename, 'r')
out = h5py.File(outFileName, "w")
for chrom in range(1, 23):
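    # mirror the reference panel SNP ids and positions into the output file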
    chrom_str = 'chr%d' % chrom
    cg = out.create_group(chrom_str)
    cg.create_dataset('snp_ids', data=ref[chrom_str]['sids'][...])
    cg.create_dataset('position', data=ref[chrom_str]['position'][...])
##############################################################
# LOOP OVER ALL GWAS
for GWAS in range(0, len(GWAS_table)):
    ##############################################################
    # READ GWAS
    GWAS_filename = GWAS_table[GWAS]
    GWAS_link = walkfs(GWAS_path, GWAS_filename)[2]
    print('\n')
    print('**** PARSING %s ******' % GWAS_filename)
    #---check GWAS labels dictionary
    my_labels = column_dict[column_dict['filename'] == GWAS_filename]
    target_lab = my_labels.values.tolist()[0]
    my_study = my_labels['consortia'].values[0] + \
        '_' + my_labels['outcome'].values[0]
    #---parse header and get column name and order
    with open(GWAS_link) as f:
        count_line = 0
        line = f.readline()
        header = line.split()
    list_col = {}
    list_lab = {}
    for l in range(0, len(target_lab)):
        for h in range(0, len(header)):
            if header[h] == target_lab[l]:
                list_lab[my_labels.columns.tolist()[l]] = h
                list_col[h] = my_labels.columns.tolist()[l]
    #---load selected columns, rename them, and set snpid as index
    order = sorted(list_col.keys())
    fullGWAS = pd.read_csv(GWAS_link, delim_whitespace=True, na_values=def_missing,
                           usecols=list_lab.values(), index_col=0,
                           names=[list_col[i] for i in order], header=0)
    fullGWAS = fullGWAS[~fullGWAS.index.duplicated(keep='first')]
    ##############################################################
    # read REF SNP and MERGE
    SNP_in = 0
    SNP_rem = 0
    for chrom in range(1, 23):
        print('Working on Chromosome %d' % chrom)
        chrom_str = 'chr%d' % chrom
        Nsnp = len(ref[chrom_str]['position'])
        # create alternative SNPids for the SNPid mapping
        # == map based on rsid
        chrPos1 = pd.DataFrame(ref[chrom_str]['sids'][...], columns=['snpid'], dtype=str)
        # == map based on chr:position
        chrPos2 = pd.DataFrame(pd.Series([str(chrom)]*Nsnp).str.cat(
            ref[chrom_str]['position'][...].astype(str), sep=':'), columns=['snpid'])
        # merge the alternative mappings
        merge_GWAS = pd.merge(chrPos1, fullGWAS, how='left',
                              right_index=True, left_on='snpid', sort=False, indicator=True)
        altID_1 = pd.merge(chrPos2, fullGWAS, how='left', right_index=True,
                           left_on='snpid', sort=False, indicator=True)
        sub_S = (altID_1['_merge'] == "both")
        if sub_S.any():
            merge_GWAS[sub_S] = altID_1.iloc[np.where(sub_S)[0]]
        SNP_in += sum((merge_GWAS['_merge'] == "both")*1)
        # check direction of Zscore
        #%%%%%%%%% STILL NEED TO CHECK ALTERNATIVE ALLELE %%%%%%%%%%%%%%%%
        #upper_Allele = [x.upper() for x in merge_GWAS['a1'].values.astype(str)]
        #sign1 = ((ref[chrom_str]['ref'][...] == upper_Allele)*1-0.5)*2
        gwasAllele1 = [x.upper() for x in merge_GWAS['a1'].values.astype(str)]
        gwasAllele2 = [x.upper() for x in merge_GWAS['a2'].values.astype(str)]
        gwasAllele1_c = dna_complement(gwasAllele1)
        gwasAllele2_c = dna_complement(gwasAllele2)
        refAllele1 = ref[chrom_str]['ref'][...].astype(str)
        refAllele2 = ref[chrom_str]['alt'][...].astype(str)
        case1 = ((gwasAllele1 == refAllele1) & (gwasAllele2 == refAllele2))
        case2 = ((gwasAllele2 == refAllele1) & (gwasAllele1 == refAllele2))
        case3 = ((gwasAllele1_c == refAllele1) & (gwasAllele2_c == refAllele2))
        case4 = ((gwasAllele2_c == refAllele1) & (gwasAllele1_c == refAllele2))
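        # case1/case3: GWAS alleles match the reference directly or via strand
        # complement -> keep the effect sign; case2/case4: alleles are swapped
        # -> flip the sign; anything else (unmatched or ambiguous) stays NaN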
        sign1 = np.repeat(np.nan, Nsnp)
        sign1[case1] = np.repeat(+1.0, sum(case1))
        sign1[case2] = np.repeat(-1.0, sum(case2))
        sign1[case3] = np.repeat(+1.0, sum(case3))
        sign1[case4] = np.repeat(-1.0, sum(case4))
        # get sign of the stat or OR (for the latter we need to arbitrarily decide when OR=1.00)
        if 'z' in fullGWAS.columns:
            sign3 = np.sign(merge_GWAS.z)
        elif 'OR' in fullGWAS.columns:
            sign3 = np.sign(merge_GWAS.OR - 1 +
                            (np.random.rand(len(merge_GWAS.OR))-0.5)*0.0001)
        else:
            raise ValueError(
                'Both the statistic and OR are missing: ZSCORE cannot be derived')
        if 'pval' not in merge_GWAS:
            raise ValueError('P-value column was not found')
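        # |Z| = sqrt of the chi2 quantile of the p-value (1 df); the sign
        # combines allele alignment with the reference (sign1) and the
        # direction of the reported effect (sign3)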
        #ZSCORE = np.sqrt(ss.chi2.ppf(1-merge_GWAS['pval'], 1)) * sign1 * sign3
        ZSCORE = np.sqrt(ss.chi2.isf(merge_GWAS['pval'], 1)) * sign1 * sign3
        # filter based on sample size ====================================================
        #---sample size column exists
        if 'n' in fullGWAS.columns:
            myN = merge_GWAS.n
        #---case and control counts exist
        elif ('ncas' in fullGWAS.columns) & ('ncont' in fullGWAS.columns):
            sumN = merge_GWAS.ncas + merge_GWAS.ncont
            perCase = merge_GWAS.ncas / sumN
            myN = sumN * perCase * (1-perCase)
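        # (effective sample size for binary traits: N_eff = N * phi * (1 - phi),
        #  with phi the case fraction)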
        #---se and freq exist and trait is quantitative
        elif ('se' in fullGWAS.columns) & ('freq' in fullGWAS.columns):
            myN = 1/(merge_GWAS.freq * (1-merge_GWAS.freq) * 2 * merge_GWAS.se**2)
        #---se exists but no freq // use ref panel freq
        elif ('se' in fullGWAS.columns) & ('freq' not in fullGWAS.columns):
            myN = 1/(ref[chrom_str]['eur_mafs'][...] *
                     (1-ref[chrom_str]['eur_mafs'][...]) * 2 * merge_GWAS.se**2)
        else:
            raise ValueError(
                'Both sample size and SE(beta) are missing: sample size filtering cannot be applied')
        # replace infinite values by the largest finite one
        if sum(np.isinf(myN)) > 0:
            myN[np.isinf(myN)] = np.max(myN[np.isfinite(myN)])
        myW_thres = np.percentile(myN.dropna(), 90)
        #myW_thres = np.max(myN)
        ss_thres = perSS * myW_thres
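        # SNPs whose inferred sample size falls below perSS (70%) of the
        # 90th-percentile sample size are masked below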
        if chrom == 1:
            myN_genome = myN
        else:
            myN_genome = np.concatenate((myN_genome, myN), axis=0)
        toRem = np.where(myN <= ss_thres)
        SNP_rem += len(toRem[0])
        if len(toRem[0]) > 0:
            ZSCORE.iloc[toRem] = np.nan
        # add the created data to the chromosome group
        cg = out[chrom_str]
        cg.create_dataset('z' + '_' + my_study, data=ZSCORE)
        ImpG_output = pd.DataFrame({
            'rsID': chrPos1['snpid'],
            'pos': ref[chrom_str]['position'][...],
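            # h5py returns bytes: str(b'A') == "b'A'", so index 2 picks the base letter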
            'A0': [str(i)[2] for i in ref[chrom_str]['ref'][...]],
            'A1': [str(i)[2] for i in ref[chrom_str]['alt'][...]],
            'Z': ZSCORE.values
        }, columns=['rsID', 'pos', 'A0', 'A1', 'Z'])
        ImpG_output.dropna(subset=["Z"], how="any", inplace=True)
        impg_output_file = ImpG_output_Folder + 'z_' + my_study + '_chr_' + str(chrom) + ".txt"
        print("WRITING CHR {} results for {} to:".format(chrom, my_study))
        ImpG_output.to_csv(impg_output_file, sep="\t", index=False)
# ##############################################################
# # SUMMARY + PLOTS
# print("There was %d SNPs available, of which %d were filtered out because of small sample size" % (
# SNP_in, SNP_rem))
# # the histogram of the #sample per SNP
# fig = plt.figure()
# fig.suptitle(GWAS_filename, fontsize=14, fontweight='bold')
# nn = (pd.DataFrame(myN_genome)).dropna()
# n, bins, patches = plt.hist(nn, 50, normed=1, facecolor='blue', alpha=0.75)
#
# Ymax = np.max(n)*1.2
#
# plt.ylim((0, Ymax))
# plt.xlabel('Inferred weight (i.e. Sample size)')
# plt.ylabel('Frequency')
# plt.grid(True)
# plt.axvline(x=myW_thres, linewidth=2, linestyle='dashed', color='r')
# plt.axvline(x=ss_thres, linewidth=2, linestyle='dashed', color='g')
#
# Xpos = np.min(nn) + 0.01*(np.max(nn) - np.min(nn))
# ax = fig.add_subplot(111)
#
# ax.text(Xpos, Ymax*0.96, 'max(N)='+str(np.max(nn)
# [0]), verticalalignment='top', horizontalalignment='left')
# ax.text(Xpos, Ymax*0.92, 'min(N)='+str(np.min(nn)
# [0]), verticalalignment='top', horizontalalignment='left')
# ax.text(Xpos, Ymax*0.88, '#SNPs='+str(len(nn)),
# verticalalignment='top', horizontalalignment='left')
#
# # plt.show()
# fig.savefig(pathOUT+GWAS_filename+'_SampleSizeDist.png', dpi=fig.dpi)
# plt.close(fig)
# CLOSE h5 FILES
out.close()
ref.close()
print('\n')