diff --git a/pyPCMA_1_format_v1.4_cur_work.py b/pyPCMA_1_format_v1.4_cur_work.py
new file mode 100755
index 0000000000000000000000000000000000000000..47307a55869f9b1a451f378b2882965ef6a4455f
--- /dev/null
+++ b/pyPCMA_1_format_v1.4_cur_work.py
@@ -0,0 +1,377 @@
+"""
+Read raw GWAS summary statistics, filter and format them,
+and write the clean GWAS z-scores to HDF5.
+"""
+__updated__ = '2017-08-29'
+
+import os
+import sys
+
+import h5py
+import numpy as np
+import pandas as pd
+import scipy.stats as ss
+import matplotlib.pyplot as plt  # only used by the commented-out summary plots
+
+print('Number of arguments:', len(sys.argv), 'arguments.')
+print('Argument List:', str(sys.argv))
+
+# SNPs are dropped when their (inferred) sample size falls below
+# perSS times the study-wide 90th percentile
+perSS = 0.7
+netPath = "/mnt/atlas/"  # '/home/genstat/ATLAS/'
+#netPath = '/pasteur/projets/policy01/'
+GWAS_labels = netPath + 'PCMA/1._DATA/RAW.GWAS/GWAS_LABELS_MAP.txt'
+GWAS_path = netPath + 'PCMA/1._DATA/RAW.GWAS/'
+REF_filename = netPath + 'PCMA/0._REF/1KGENOME/1K_SNP_INFO_EUR_MAF0.05.hdf5'
+pathOUT = netPath + 'PCMA/1._DATA/RAW.summary/'
+
+outFileName = netPath + 'PCMA/1._DATA/ZSCORE_merged_ALL_NO_strand_ambiguous.hdf5'
+def_missing = ['', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN',
+               '-nan', '1.#IND', '1.#QNAN', 'NA', 'NULL', 'NaN', 'nan', 'na', '.']
+out_summary = "summary_GWAS.csv"
+ImpG_output_Folder = netPath + 'PCMA/1._DATA/ImpG_zfiles/'
+
+
+# FUNCTION: find path for a given GWAS file ================================
+def walkfs(startdir, findfile):
+    """Walk startdir recursively; return (dircount, fileindex, fullpath),
+    where fullpath is None when the file was not found."""
+    dircount = 0
+    filecount = 0
+    for root, dirs, files in os.walk(startdir):
+        if findfile in files:
+            return dircount, filecount + files.index(findfile), os.path.join(root, findfile)
+        dircount += 1
+        filecount += len(files)
+    # nothing found, return None instead of a full path for the file
+    return dircount, filecount, None
+
+
+def dna_complement_base(inputbase):
+    """Complement a single nucleotide; returns 'Not ATGC' for anything else."""
+    if inputbase == 'A':
+        return 'T'
+    if inputbase == 'T':
+        return 'A'
+    if inputbase == 'G':
+        return 'C'
+    if inputbase == 'C':
+        return 'G'
+    return 'Not ATGC'
+
+
+def dna_complement(sequence):
+    """Complement every base of an iterable of bases."""
+    return [dna_complement_base(x) for x in sequence]
+
+
+##############################################################
+# READ PARAM FILES
+# ---open GWAS dictionary
+column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na')
+
+# ---Public GWAS
+GWAS_table = ['GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt',
+              'SNP_gwas_mc_merge_nogc.tbl.uniq',
+              'GIANT_2015_HIP_COMBINED_EUR.txt',
+              'GIANT_2015_WC_COMBINED_EUR.txt',
+              'GIANT_2015_WHR_COMBINED_EUR.txt',
+              'HDL_ONE_Europeans.tbl',
+              'LDL_ONE_Europeans.tbl',
+              'TC_ONE_Europeans.tbl',
+              'TG_ONE_Europeans.tbl',
+              'GWAS_DBP_recoded.txt',
+              'GWAS_SBP_recoded.txt',
+              'GWAS_PP_recoded.txt',
+              'GWAS_MAP_recoded.txt',
+              'MAGIC_FastingGlucose.txt',
+              'MAGIC_ln_FastingInsulin.txt',
+              'MAGIC_ln_HOMA-B.txt',
+              'MAGIC_ln_HOMA-IR.txt',
+              'MAGIC_2hrGlucose_AdjustedForBMI.txt',
+              'MAGIC_INSULIN_SECRETION_CIR_for_release_HMrel27.txt',
+              'MAGIC_proinsulin_for_release_HMrel27.txt',
+              'MAGIC_HbA1C.txt',
+              'DIAGRAMv3.2012DEC17.txt',
+              'fa2stu.MAF0.005.pos.out',
+              'fn2stu.MAF0.005.pos.out',
+              'ls2stu.MAF0.005.pos.out',
+              'gabriel_asthma_Reformated.txt',
+              'EUR.IBD.gwas_filtered.assoc',
+              'EUR.CD.gwas_filtered.assoc',
+              'EUR.UC.gwas_filtered.assoc',
+              'rall.txt',
+              'pgc.bip.full.2012-04.txt',
+              'pgc.mdd.full.2012-04.txt',
+              'IGAP_stage_1.txt',
+              'Menopause_HapMap_for_website_18112015.txt',
+              'CARDIoGRAM_GWAS_RESULTS.txt',
+              'C4D_CAD_DISCOVERY_METAANALYSIS_UPDATE.TXT',
+              'SSGAC_EduYears_Rietveld2013_publicrelease.txt',
+              'SSGAC_College_Rietveld2013_publicrelease.txt',
+              'POAG_allGender_nonGC.txt',
+              'IOP_allGender_EURonly.txt',
+              'IMSGC-GWAS.txt',
+              'RA-EUR-GWAS.txt',
+              'CKD.txt',
+              'eGFRcrea_overall.txt',
+              'eGFRcys_overall.txt']
+
+'''
+GWAS_table = ['GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt',
+              'SNP_gwas_mc_merge_nogc.tbl.uniq',
+              'GIANT_2015_HIP_COMBINED_EUR.txt',
+              'GIANT_2015_WC_COMBINED_EUR.txt']
+'''
+
+# NEED ADDITIONAL PRE-CLEANING:
+# 'RA_GWASmeta_European_v2.txt',
+# 'pgc.adhd.full.2012-10.txt',
+# 'Menarche_Nature2014_GWASMetaResults_17122014.txt',
+
+# Autism multi-trait GWAS
+'''
+GWAS_table = ['GWAS_case_control_european_no_CHOP.assoc.logistic',
+              'GWAS_controls_european_AANAT.assoc.linear',
+              'GWAS_controls_european_ASMT_log.assoc.linear',
+              'GWAS_controls_european_melatonin_log.assoc.linear',
+              'GWAS_controls_european_NAS.assoc.linear',
+              'GWAS_controls_european_serotonin_log.assoc.linear']
+outFileName = netPath+'PCMA/1._DATA/ZSCORE_merged_AUTISM.hdf5'
+'''
+
+# ---get internal GWAS link (full path on disk for each file)
+Glink = []
+for GWAS in range(0, len(GWAS_table)):
+    GWAS_filename = GWAS_table[GWAS]
+    Glink.append({'filename': GWAS_filename,
+                  'internalDataLink': walkfs(GWAS_path, GWAS_filename)[2]})
+Glink = pd.DataFrame(Glink, columns=('filename', 'internalDataLink'))
+
+# ---get descriptive GWAS table
+sub_dict = column_dict[(column_dict['filename']).isin(GWAS_table)]
+sub_dict = pd.merge(sub_dict, Glink, how='inner', on='filename')
+
+summary_phe = sub_dict[['outcome', 'fullName', 'consortia',
+                        'type', 'reference', 'linkRef', 'dataLink', 'internalDataLink']]
+#cg = out.create_group('summary_phe')
+#summary_phe.to_hdf(outFileName, 'summary_table', format='table', dropna=False)
+# could then be read back with pd.read_hdf(outFileName, 'summary_phe')
+summary_phe.to_csv(out_summary)
+
+# ---open hdf5 files and copy the reference SNP info for each chromosome
+ref = h5py.File(REF_filename, 'r')
+out = h5py.File(outFileName, "w")
+for chrom in range(1, 23):
+    chrom_str = 'chr%d' % chrom
+    cg = out.create_group(chrom_str)
+    cg.create_dataset('snp_ids', data=ref[chrom_str]['sids'][...])
+    cg.create_dataset('position', data=ref[chrom_str]['position'][...])
+
+
+##############################################################
+# LOOP OVER ALL GWAS
+for GWAS in range(0, len(GWAS_table)):
+
+    ##############################################################
+    # READ GWAS
+    GWAS_filename = GWAS_table[GWAS]
+    GWAS_link = walkfs(GWAS_path, GWAS_filename)[2]
+    print('\n')
+    print('**** PARSING %s ******' % GWAS_filename)
+
+    # ---check GWAS labels dictionary
+    my_labels = column_dict[column_dict['filename'] == GWAS_filename]
+    target_lab = my_labels.values.tolist()[0]
+    my_study = my_labels['consortia'].values[0] + \
+        '_' + my_labels['outcome'].values[0]
+
+    # ---parse the header and map raw column names to standardized labels
+    with open(GWAS_link) as f:
+        line = f.readline()
+        header = line.split()
+        list_col = {}  # column position -> standardized label
+        list_lab = {}  # standardized label -> column position
+        for l in range(0, len(target_lab)):
+            for h in range(0, len(header)):
+                if header[h] == target_lab[l]:
+                    list_lab[my_labels.columns.tolist()[l]] = h
+                    list_col[h] = my_labels.columns.tolist()[l]
+
+    # ---load selected columns, rename them, and set snpid as index
+    order = sorted(list_col.keys())
+    fullGWAS = pd.read_csv(GWAS_link, delim_whitespace=True, na_values=def_missing,
+                           usecols=list(list_lab.values()), index_col=0,
+                           names=[list_col[i] for i in order], header=0)
+    # drop duplicated SNP ids, keeping the first occurrence
+    fullGWAS = fullGWAS[~fullGWAS.index.duplicated(keep='first')]
+
+    ##############################################################
+    # read REF SNP and MERGE
+    SNP_in = 0   # SNPs matched to the reference panel
+    SNP_rem = 0  # SNPs removed by the sample-size filter
+
+    for chrom in range(1, 23):
+        print('Working on Chromosome %d' % chrom)
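+        # Overview of the per-chromosome pass below: map the GWAS SNPs onto
+        # the reference panel (by rsid, then by chr:pos for the SNPs the rsid
+        # lookup misses), orient the effect allele against the panel, derive
+        # a signed Z-score from the p-value, and mask SNPs whose inferred
+        # sample size is too low before writing to the HDF5 output.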
+        chrom_str = 'chr%d' % chrom
+        Nsnp = len(ref[chrom_str]['position'])
+
+        # create alternative SNP ids for the SNP id mapping
+        # == map based on rsid
+        chrPos1 = pd.DataFrame(ref[chrom_str]['sids'][...], columns=['snpid'], dtype=str)
+        # == map based on chr:position
+        chrPos2 = pd.DataFrame(pd.Series([str(chrom)] * Nsnp).str.cat(
+            ref[chrom_str]['position'][...].astype(str), sep=':'), columns=['snpid'])
+
+        # merge the alternative mappings: keep the chr:pos match wherever
+        # the rsid lookup failed but the positional lookup succeeded
+        merge_GWAS = pd.merge(chrPos1, fullGWAS, how='left',
+                              right_index=True, left_on='snpid', sort=False, indicator=True)
+        altID_1 = pd.merge(chrPos2, fullGWAS, how='left', right_index=True,
+                           left_on='snpid', sort=False, indicator=True)
+        sub_S = (altID_1['_merge'] == "both")
+        if sub_S.any():
+            merge_GWAS[sub_S] = altID_1.iloc[np.where(sub_S)[0]]
+
+        SNP_in += sum((merge_GWAS['_merge'] == "both") * 1)
+
+        # check the direction of the Z-score: align the GWAS alleles with the
+        # reference panel alleles, allowing for strand flips (DNA complement)
+        # %%%%%%%%% STILL NEED TO CHECK ALTERNATIVE ALLELE %%%%%%%%%%%%%%%%
+        #upper_Allele = [x.upper() for x in ((merge_GWAS['a1']).values).astype(str)]
+        #sign1 = ((ref[chrom_str]['ref'].value == upper_Allele)*1-0.5)*2
+        gwasAllele1 = [x.upper() for x in ((merge_GWAS['a1']).values).astype(str)]
+        gwasAllele2 = [x.upper() for x in ((merge_GWAS['a2']).values).astype(str)]
+        gwasAllele1_c = dna_complement(gwasAllele1)
+        gwasAllele2_c = dna_complement(gwasAllele2)
+
+        refAllele1 = ref[chrom_str]['ref'][...].astype(str)
+        refAllele2 = ref[chrom_str]['alt'][...].astype(str)
+
+        case1 = ((gwasAllele1 == refAllele1) & (gwasAllele2 == refAllele2))
+        case2 = ((gwasAllele2 == refAllele1) & (gwasAllele1 == refAllele2))
+        case3 = ((gwasAllele1_c == refAllele1) & (gwasAllele2_c == refAllele2))
+        case4 = ((gwasAllele2_c == refAllele1) & (gwasAllele1_c == refAllele2))
+
+        # +1 when the GWAS effect allele matches the reference allele
+        # (directly or via its complement), -1 when the alleles are swapped,
+        # NaN when the alleles cannot be reconciled at all
+        sign1 = np.repeat(np.nan, Nsnp)
+        sign1[case1] = +1.0
+        sign1[case2] = -1.0
+        sign1[case3] = +1.0
+        sign1[case4] = -1.0
+
+        # get the sign of the statistic or of the OR (for the latter we need
+        # to arbitrarily break ties at OR=1.00, hence the small random jitter)
+        if 'z' in fullGWAS.columns:
+            sign3 = np.sign(merge_GWAS.z)
+        elif 'OR' in fullGWAS.columns:
+            sign3 = np.sign(merge_GWAS.OR - 1 +
+                            (np.random.rand(len(merge_GWAS.OR)) - 0.5) * 0.0001)
+        else:
+            raise ValueError(
+                'Both the statistic and OR are missing: ZSCORE cannot be derived')
+
+        if 'pval' not in merge_GWAS:
+            raise ValueError('P-value column was not found')
+
+        # |Z| = sqrt of the chi2(1) quantile of the p-value; isf avoids the
+        # precision loss of ppf(1-p) for very small p-values
+        #ZSCORE = np.sqrt(ss.chi2.ppf(1-merge_GWAS['pval'], 1)) * sign1 * sign3
+        ZSCORE = np.sqrt(ss.chi2.isf(merge_GWAS['pval'], 1)) * sign1 * sign3
+
+        # filter based on sample size =====================================
+        # ---sample size column exists
+        if 'n' in fullGWAS.columns:
+            myN = merge_GWAS.n
+        # ---case-control N exist: use the effective sample size
+        elif ('ncas' in fullGWAS.columns) & ('ncont' in fullGWAS.columns):
+            sumN = merge_GWAS.ncas + merge_GWAS.ncont
+            perCase = merge_GWAS.ncas / sumN
+            myN = sumN * perCase * (1 - perCase)
+        # ---se and freq exist and the trait is quantitative
+        elif ('se' in fullGWAS.columns) & ('freq' in fullGWAS.columns):
+            myN = 1 / (merge_GWAS.freq * (1 - merge_GWAS.freq) * 2 * merge_GWAS.se**2)
+        # ---se exists but no freq // use the reference panel freq instead
+        elif ('se' in fullGWAS.columns) & ('freq' not in fullGWAS.columns):
+            myN = 1 / (ref[chrom_str]['eur_mafs'][...] *
+                       (1 - ref[chrom_str]['eur_mafs'][...]) * 2 * merge_GWAS.se**2)
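+        # The two SE-based branches above invert the standard GWAS identity
+        # (a sketch of the reasoning, assuming an additive model on a
+        # standardized quantitative trait):
+        #     var(beta_hat) ~= 1 / (2 * N * p * (1 - p)),
+        # where p is the effect-allele frequency, so
+        #     N ~= 1 / (2 * p * (1 - p) * SE^2).
+        # When the GWAS carries no frequency column the reference-panel EUR
+        # MAF stands in for p, which is only an approximation.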
+        else:
+            raise ValueError(
+                'Both sample size and SE(beta) are missing: sample size filtering cannot be applied')
+
+        # replace infinite values by the largest finite one
+        if sum(np.isinf(myN)) > 0:
+            myN[np.isinf(myN)] = np.max(myN[np.isfinite(myN)])
+
+        myW_thres = np.percentile(myN.dropna(), 90)
+        #myW_thres = np.max(myN)
+        ss_thres = perSS * myW_thres
+
+        if chrom == 1:
+            myN_genome = myN
+        else:
+            myN_genome = np.concatenate((myN_genome, myN), axis=0)
+
+        # mask the Z-score of SNPs whose sample size is below the threshold
+        toRem = np.where(myN <= ss_thres)[0]
+        SNP_rem += len(toRem)
+        if len(toRem) > 0:
+            ZSCORE.iloc[toRem] = np.nan
+
+        # add the created data to the per-chromosome group
+        cg = out[chrom_str]
+        cg.create_dataset('z' + '_' + my_study, data=ZSCORE)
+
+        # write the per-chromosome z-file for ImpG
+        ImpG_output = pd.DataFrame({
+            'rsID': chrPos1['snpid'],
+            'pos': ref[chrom_str]['position'][...],
+            # decode the byte-string alleles stored in the reference panel
+            'A0': ref[chrom_str]['ref'][...].astype(str),
+            'A1': ref[chrom_str]['alt'][...].astype(str),
+            'Z': ZSCORE.values
+        }, columns=['rsID', 'pos', 'A0', 'A1', 'Z'])
+        ImpG_output.dropna(subset=['Z'], how='any', inplace=True)
+        impg_output_file = ImpG_output_Folder + 'z_' + my_study + '_chr_' + str(chrom) + '.txt'
+        print("WRITING CHR {} results for {} to: {}".format(chrom, my_study, impg_output_file))
+        ImpG_output.to_csv(impg_output_file, sep="\t", index=False)
+
+    # ##############################################################
+    # # SUMMARY + PLOTS
+    # print("There were %d SNPs available, of which %d were filtered out because of small sample size" % (
+    #     SNP_in, SNP_rem))
+    #
+    # # histogram of the per-SNP sample size
+    # fig = plt.figure()
+    # fig.suptitle(GWAS_filename, fontsize=14, fontweight='bold')
+    # nn = (pd.DataFrame(myN_genome)).dropna()
+    # n, bins, patches = plt.hist(nn, 50, normed=1, facecolor='blue', alpha=0.75)
+    #
+    # Ymax = np.max(n)*1.2
+    #
+    # plt.ylim((0, Ymax))
+    # plt.xlabel('Inferred weight (i.e. sample size)')
+    # plt.ylabel('Frequency')
+    # plt.grid(True)
+    # plt.axvline(x=myW_thres, linewidth=2, linestyle='dashed', color='r')
+    # plt.axvline(x=ss_thres, linewidth=2, linestyle='dashed', color='g')
+    #
+    # Xpos = np.min(nn) + 0.01*(np.max(nn) - np.min(nn))
+    # ax = fig.add_subplot(111)
+    #
+    # ax.text(Xpos, Ymax*0.96, 'max(N)='+str(np.max(nn)[0]),
+    #         verticalalignment='top', horizontalalignment='left')
+    # ax.text(Xpos, Ymax*0.92, 'min(N)='+str(np.min(nn)[0]),
+    #         verticalalignment='top', horizontalalignment='left')
+    # ax.text(Xpos, Ymax*0.88, '#SNPs='+str(len(nn)),
+    #         verticalalignment='top', horizontalalignment='left')
+    #
+    # # plt.show()
+    # fig.savefig(pathOUT+GWAS_filename+'_SampleSizeDist.png', dpi=fig.dpi)
+    # plt.close(fig)
+
+
+# CLOSE h5 FILES
+out.close()
+ref.close()
+
+print('\n')
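+
+# Example (an illustrative sketch, not executed by the pipeline): the merged
+# z-scores can be read back per chromosome with h5py. The dataset name is
+# 'z_' + consortia + '_' + outcome from the labels file, so the
+# 'z_GIANT_height' used below is only a hypothetical example.
+#
+# with h5py.File(outFileName, 'r') as h5:
+#     snp_ids = h5['chr1']['snp_ids'][...]
+#     zscores = h5['chr1']['z_GIANT_height'][...]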