"""
Read raw GWAS summary statistics, filter and format
Write clean GWAS
"""
__updated__ = '2017-08-29'
import h5py
import numpy as np
import scipy.stats as ss
import sys
import math
import os
import pandas as pd
import matplotlib.pyplot as plt
print('Number of arguments:', len(sys.argv), 'arguments.')
print('Argument List:', str(sys.argv))
perSS = 0.7
netPath = "/mnt/atlas/" # '/home/genstat/ATLAS/'
#netPath = '/pasteur/projets/policy01/'
GWAS_labels = netPath+'PCMA/1._DATA/RAW.GWAS/GWAS_LABELS_MAP.txt'
GWAS_path = netPath+'PCMA/1._DATA/RAW.GWAS/'
REF_filename = netPath+'PCMA/0._REF/1KGENOME/1K_SNP_INFO_EUR_MAF0.05.hdf5'
pathOUT = netPath+'PCMA/1._DATA/RAW.summary/'
outFileName = netPath+'PCMA/1._DATA/ZSCORE_merged_ALL_NO_strand_ambiguous.hdf5'
def_missing = ['', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN',
               '-nan', '1.#IND', '1.#QNAN', 'NA', 'NULL', 'NaN', 'nan', 'na', '.']
out_summary = "summary_GWAS.csv"
ImpG_output_Folder = netPath+ 'PCMA/1._DATA/ImpG_zfiles/'
# FUNCTION: find path for a given GWAS file ================================
def walkfs(startdir, findfile):
    """Walk `startdir` recursively; return (#dirs scanned, file index, full path)."""
    dircount = 0
    filecount = 0
    for root, dirs, files in os.walk(startdir):
        if findfile in files:
            return dircount, filecount + files.index(findfile), os.path.join(root, findfile)
        dircount += 1
        filecount += len(files)
    # nothing found: return None instead of a full path for the file
    return dircount, filecount, None
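# usage: dircount, fileindex, filepath = walkfs(GWAS_path, GWAS_filename)
# filepath is None when the file is not found anywhere under startdir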
def dna_complement_base(inputbase):
    """Return the complementary DNA base, or 'Not ATGC' for anything else."""
    if inputbase == 'A':
        return 'T'
    if inputbase == 'T':
        return 'A'
    if inputbase == 'G':
        return 'C'
    if inputbase == 'C':
        return 'G'
    return 'Not ATGC'


def dna_complement(sequence):
    """Complement every base of a list of single-character bases."""
    return [dna_complement_base(x) for x in sequence]
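# e.g. dna_complement(['A', 'C', 'T']) returns ['T', 'G', 'A']; any non-ATGC
# character maps to the sentinel string 'Not ATGC'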
##############################################################
# READ PARAM FILES
#---open GWAS dictionary
column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na')
#---Public GWAS
GWAS_table = ['GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt',
'SNP_gwas_mc_merge_nogc.tbl.uniq',
'GIANT_2015_HIP_COMBINED_EUR.txt',
'GIANT_2015_WC_COMBINED_EUR.txt',
'GIANT_2015_WHR_COMBINED_EUR.txt',
'HDL_ONE_Europeans.tbl',
'LDL_ONE_Europeans.tbl',
'TC_ONE_Europeans.tbl',
'TG_ONE_Europeans.tbl',
'GWAS_DBP_recoded.txt',
'GWAS_SBP_recoded.txt',
'GWAS_PP_recoded.txt',
'GWAS_MAP_recoded.txt',
'MAGIC_FastingGlucose.txt',
'MAGIC_ln_FastingInsulin.txt',
'MAGIC_ln_HOMA-B.txt',
'MAGIC_ln_HOMA-IR.txt',
'MAGIC_2hrGlucose_AdjustedForBMI.txt',
'MAGIC_INSULIN_SECRETION_CIR_for_release_HMrel27.txt',
'MAGIC_proinsulin_for_release_HMrel27.txt',
'MAGIC_HbA1C.txt',
'DIAGRAMv3.2012DEC17.txt',
'fa2stu.MAF0.005.pos.out',
'fn2stu.MAF0.005.pos.out',
'ls2stu.MAF0.005.pos.out',
'gabriel_asthma_Reformated.txt',
'EUR.IBD.gwas_filtered.assoc',
'EUR.CD.gwas_filtered.assoc',
'EUR.UC.gwas_filtered.assoc',
'rall.txt',
'pgc.bip.full.2012-04.txt',
'pgc.mdd.full.2012-04.txt',
'IGAP_stage_1.txt',
'Menopause_HapMap_for_website_18112015.txt',
'CARDIoGRAM_GWAS_RESULTS.txt',
'C4D_CAD_DISCOVERY_METAANALYSIS_UPDATE.TXT',
'SSGAC_EduYears_Rietveld2013_publicrelease.txt',
'SSGAC_College_Rietveld2013_publicrelease.txt',
'POAG_allGender_nonGC.txt',
'IOP_allGender_EURonly.txt',
'IMSGC-GWAS.txt',
'RA-EUR-GWAS.txt',
'CKD.txt',
'eGFRcrea_overall.txt',
'eGFRcys_overall.txt']
'''
GWAS_table = ['GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt',
'SNP_gwas_mc_merge_nogc.tbl.uniq',
'GIANT_2015_HIP_COMBINED_EUR.txt',
'GIANT_2015_WC_COMBINED_EUR.txt']
'''
# NEED ADDITIONAL PRE-CLEANING
# 'RA_GWASmeta_European_v2.txt',
# 'pgc.adhd.full.2012-10.txt',
# 'Menarche_Nature2014_GWASMetaResults_17122014.txt',
# Autism multi-traits GWAS
'''
GWAS_table = ['GWAS_case_control_european_no_CHOP.assoc.logistic',
'GWAS_controls_european_AANAT.assoc.linear',
'GWAS_controls_european_ASMT_log.assoc.linear',
'GWAS_controls_european_melatonin_log.assoc.linear',
'GWAS_controls_european_NAS.assoc.linear',
'GWAS_controls_european_serotonin_log.assoc.linear']
outFileName = netPath+'PCMA/1._DATA/ZSCORE_merged_AUTISM.hdf5'
'''
#---get internal GWAS link
Glink = []
for GWAS in range(0, len(GWAS_table)):
    GWAS_filename = GWAS_table[GWAS]
    Glink.append({'filename': GWAS_filename,
                  'internalDataLink': walkfs(GWAS_path, GWAS_filename)[2]})
Glink = pd.DataFrame(Glink, columns=('filename', 'internalDataLink'))
#---get descript GWAS table
sub_dict = column_dict[(column_dict['filename']).isin(GWAS_table)]
sub_dict = pd.merge(sub_dict, Glink, how='inner', on='filename')
summary_phe = sub_dict[['outcome', 'fullName', 'consortia',
'type', 'reference', 'linkRef', 'dataLink', 'internalDataLink']]
#cg = out.create_group('summary_phe')
#summary_phe.to_hdf(outFileName, 'summary_table', format='table', dropna=False)
summary_phe.to_csv(out_summary)
# can be read with pd.read_hdf(outFileName,'summary_phe')
#---open hdf5 files
ref = h5py.File(REF_filename, 'r')
out = h5py.File(outFileName, "w")
for chrom in range(1, 23):
    chrom_str = 'chr%d' % chrom
    cg = out.create_group(chrom_str)
    cg.create_dataset('snp_ids', data=ref[chrom_str]['sids'][...])
    cg.create_dataset('position', data=ref[chrom_str]['position'][...])
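# resulting HDF5 layout: /chr<k>/snp_ids and /chr<k>/position for k in 1..22,
# plus one 'z_<consortia>_<outcome>' dataset per GWAS appended in the loop below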
##############################################################
# LOOP OVER ALL GWAS
for GWAS in range(0, len(GWAS_table)):
    ##############################################################
    # READ GWAS
    GWAS_filename = GWAS_table[GWAS]
    GWAS_link = walkfs(GWAS_path, GWAS_filename)[2]
    print('\n')
    print('**** PARSING %s ******' % GWAS_filename)
    #---check GWAS labels dictionary
    my_labels = column_dict[column_dict['filename'] == GWAS_filename]
    target_lab = my_labels.values.tolist()[0]
    my_study = my_labels['consortia'].values[0] + \
        '_' + my_labels['outcome'].values[0]
    #---parse header and get column name and order
    with open(GWAS_link) as f:
        header = f.readline().split()
    list_col = {}
    list_lab = {}
    for l in range(0, len(target_lab)):
        for h in range(0, len(header)):
            if header[h] == target_lab[l]:
                list_lab[my_labels.columns.tolist()[l]] = h
                list_col[h] = my_labels.columns.tolist()[l]
    #---load selected columns, rename them, and set snpid as index
    order = sorted(list_col.keys())
    fullGWAS = pd.read_csv(GWAS_link, delim_whitespace=True, na_values=def_missing,
                           usecols=list(list_lab.values()), index_col=0,
                           names=[list_col[i] for i in order], header=0)
    # keep the first occurrence of duplicated SNP ids
    fullGWAS = fullGWAS[~fullGWAS.index.duplicated(keep='first')]
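    # fullGWAS now uses the canonical column names from GWAS_LABELS_MAP.txt; the
    # ones consumed below are a1/a2 (alleles), pval, a direction column (z or OR)
    # and sample-size information (n, or ncas/ncont, or se with optional freq)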
    ##############################################################
    # read REF SNP and MERGE
    SNP_in = 0
    SNP_rem = 0
    for chrom in range(1, 23):
        print('Working on Chromosome %d' % chrom)
        chrom_str = 'chr%d' % chrom
        Nsnp = len(ref[chrom_str]['position'])
        # create alternative SNP ids for the SNP id mapping
        # == map based on rsid
        chrPos1 = pd.DataFrame(ref[chrom_str]['sids'][...], columns=['snpid'], dtype=str)
        # == map based on chr:position
        chrPos2 = pd.DataFrame(pd.Series([str(chrom)] * Nsnp).str.cat(
            ref[chrom_str]['position'][...].astype(str), sep=':'), columns=['snpid'])
        # merge the two alternative mappings, giving priority to chr:position hits
        merge_GWAS = pd.merge(chrPos1, fullGWAS, how='left',
                              right_index=True, left_on='snpid', sort=False, indicator=True)
        altID_1 = pd.merge(chrPos2, fullGWAS, how='left', right_index=True,
                           left_on='snpid', sort=False, indicator=True)
        sub_S = (altID_1['_merge'] == "both")
        if sub_S.any():
            merge_GWAS[sub_S] = altID_1.iloc[np.where(sub_S)[0]]
        SNP_in += sum((merge_GWAS['_merge'] == "both") * 1)
        # check direction of Zscore
        #%%%%%%%%% STILL NEED TO CHECK ALTERNATIVE ALLELE %%%%%%%%%%%%%%%%
        #upper_Allele = [x.upper() for x in ((merge_GWAS['a1']).values).astype(str)]
        #sign1 = ((ref[chrom_str]['ref'][...] == upper_Allele)*1-0.5)*2
        gwasAllele1 = [x.upper() for x in merge_GWAS['a1'].values.astype(str)]
        gwasAllele2 = [x.upper() for x in merge_GWAS['a2'].values.astype(str)]
        gwasAllele1_c = dna_complement(gwasAllele1)
        gwasAllele2_c = dna_complement(gwasAllele2)
        refAllele1 = ref[chrom_str]['ref'][...].astype(str)
        refAllele2 = ref[chrom_str]['alt'][...].astype(str)
        case1 = ((gwasAllele1 == refAllele1) & (gwasAllele2 == refAllele2))
        case2 = ((gwasAllele2 == refAllele1) & (gwasAllele1 == refAllele2))
        case3 = ((gwasAllele1_c == refAllele1) & (gwasAllele2_c == refAllele2))
        case4 = ((gwasAllele2_c == refAllele1) & (gwasAllele1_c == refAllele2))
        sign1 = np.repeat(np.nan, Nsnp)
        sign1[case1] = +1.0
        sign1[case2] = -1.0
        sign1[case3] = +1.0
        sign1[case4] = -1.0
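        # sign1 reconciles the GWAS effect allele with the reference panel:
        #   case1  (a1,a2) == (ref,alt)           -> +1 (same orientation)
        #   case2  (a1,a2) == (alt,ref)           -> -1 (alleles swapped)
        #   case3  complements match (ref,alt)    -> +1 (strand flip)
        #   case4  complements match (alt,ref)    -> -1 (strand flip + swap)
        # unmatched SNPs keep sign1 = NaN; strand-ambiguous pairs (A/T, G/C)
        # would satisfy two cases at once, hence their exclusion upstream
        # (cf. the NO_strand_ambiguous output file name)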
        # get the sign of the stat or of the OR (for the latter we need to
        # arbitrarily break ties when OR=1.00, done with a tiny random jitter)
        if 'z' in fullGWAS.columns:
            sign3 = np.sign(merge_GWAS.z)
        elif 'OR' in fullGWAS.columns:
            sign3 = np.sign(merge_GWAS.OR - 1 +
                            (np.random.rand(len(merge_GWAS.OR)) - 0.5) * 0.0001)
        else:
            raise ValueError(
                'Both the statistic and OR are missing: ZSCORE cannot be derived')
        if 'pval' not in merge_GWAS:
            raise ValueError('P-value column was not found')
        #ZSCORE = np.sqrt(ss.chi2.ppf(1-merge_GWAS['pval'], 1)) * sign1 * sign3
        ZSCORE = np.sqrt(ss.chi2.isf(merge_GWAS['pval'], 1)) * sign1 * sign3
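        # chi2.isf(pval, 1) is the inverse survival function of a 1-df chi-square,
        # so its square root is |Z|; isf(p) is used rather than ppf(1-p) (the
        # commented line above) because it keeps precision for very small p-values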
        # filter based on sample size =====================================================
        #--- sample size column exists
        if 'n' in fullGWAS.columns:
            myN = merge_GWAS.n
        #--- case and control counts exist
        elif ('ncas' in fullGWAS.columns) & ('ncont' in fullGWAS.columns):
            sumN = merge_GWAS.ncas + merge_GWAS.ncont
            perCase = merge_GWAS.ncas / sumN
            myN = sumN * perCase * (1 - perCase)
        #--- se and freq exist (quantitative trait)
        elif ('se' in fullGWAS.columns) & ('freq' in fullGWAS.columns):
            myN = 1 / (merge_GWAS.freq * (1 - merge_GWAS.freq) * 2 * merge_GWAS.se**2)
        #--- se exists but no freq // use the reference panel frequencies
        elif ('se' in fullGWAS.columns) & ('freq' not in fullGWAS.columns):
            myN = 1 / (ref[chrom_str]['eur_mafs'][...] *
                       (1 - ref[chrom_str]['eur_mafs'][...]) * 2 * merge_GWAS.se**2)
        else:
            raise ValueError(
                'Both sample size and SE(beta) are missing: sample size filtering cannot be applied')
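        # rationale for the inferred weights above (standard GWAS approximations):
        #   case-control:  N_eff = (ncas+ncont) * phi * (1-phi), phi = ncas/(ncas+ncont)
        #   quantitative:  var(beta_hat) ~= 1/(2*N*p*(1-p))  =>  N ~= 1/(2*p*(1-p)*se^2)
        # both are proxies for per-SNP information content rather than exact Ns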
        # replace infinite values by the largest finite one
        if sum(np.isinf(myN)) > 0:
            myN[np.isinf(myN)] = np.max(myN[np.isfinite(myN)])
        myW_thres = np.percentile(myN.dropna(), 90)
        #myW_thres = np.max(myN)
        ss_thres = perSS * myW_thres
        if chrom == 1:
            myN_genome = myN
        else:
            myN_genome = np.concatenate((myN_genome, myN), axis=0)
        toRem = np.where(myN <= ss_thres)
        SNP_rem += len(toRem[0])
        if len(toRem[0]) > 0:
            ZSCORE.iloc[toRem] = np.nan
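        # filtering rule: a SNP is dropped (Z set to NaN) when its inferred sample
        # size falls below perSS (70%) of the study's 90th-percentile sample size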
        # add the created data to the chromosome group
        cg = out[chrom_str]
        cg.create_dataset('z' + '_' + my_study, data=ZSCORE)
        ImpG_output = pd.DataFrame({
            'rsID': chrPos1['snpid'],
            'pos': ref[chrom_str]['position'][...],
            # str(b'A')[2] == 'A': extract the allele letter from the bytes repr
            'A0': [str(i)[2] for i in ref[chrom_str]['ref'][...]],
            'A1': [str(i)[2] for i in ref[chrom_str]['alt'][...]],
            'Z': ZSCORE.values
        }, columns=['rsID', 'pos', 'A0', 'A1', 'Z'])
        ImpG_output.dropna(subset=['Z'], how='any', inplace=True)
        impg_output_file = ImpG_output_Folder + 'z_' + my_study + '_chr_' + str(chrom) + '.txt'
        print('WRITING CHR {} results for {} to: {}'.format(chrom, my_study, impg_output_file))
        ImpG_output.to_csv(impg_output_file, sep='\t', index=False)
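        # the resulting ImpG z-files are tab-separated with header rsID/pos/A0/A1/Z,
        # one file per study and chromosome; a line could read (values illustrative):
        #   rs0000000   123456   A   G   1.85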
    # ##############################################################
    # # SUMMARY + PLOTS
    # print("There were %d SNPs available, of which %d were filtered out because of small sample size" % (
    #     SNP_in, SNP_rem))
    #
    # # the histogram of the #samples per SNP
    # fig = plt.figure()
    # fig.suptitle(GWAS_filename, fontsize=14, fontweight='bold')
    # nn = (pd.DataFrame(myN_genome)).dropna()
    # n, bins, patches = plt.hist(nn, 50, normed=1, facecolor='blue', alpha=0.75)
    #
    # Ymax = np.max(n) * 1.2
    #
    # plt.ylim((0, Ymax))
    # plt.xlabel('Inferred weight (i.e. Sample size)')
    # plt.ylabel('Frequency')
    # plt.grid(True)
    # plt.axvline(x=myW_thres, linewidth=2, linestyle='dashed', color='r')
    # plt.axvline(x=ss_thres, linewidth=2, linestyle='dashed', color='g')
    #
    # Xpos = np.min(nn) + 0.01 * (np.max(nn) - np.min(nn))
    # ax = fig.add_subplot(111)
    #
    # ax.text(Xpos, Ymax * 0.96, 'max(N)=' + str(np.max(nn)[0]),
    #         verticalalignment='top', horizontalalignment='left')
    # ax.text(Xpos, Ymax * 0.92, 'min(N)=' + str(np.min(nn)[0]),
    #         verticalalignment='top', horizontalalignment='left')
    # ax.text(Xpos, Ymax * 0.88, '#SNPs=' + str(len(nn)),
    #         verticalalignment='top', horizontalalignment='left')
    #
    # # plt.show()
    # fig.savefig(pathOUT + GWAS_filename + '_SampleSizeDist.png', dpi=fig.dpi)
    # plt.close(fig)
# CLOSE h5 FILES
out.close()
ref.close()
print('\n')