Commit 2be0f8b7 authored by Hanna JULIENNE

clarified that beta or Z can be specified interchangeably

parent b488e211
@@ -112,7 +112,7 @@ Input
 * n: name of the column storing the sample size by variants (optional, will be inferred from the MAF, genetic effect and standard deviation if absent)
 * ncas: For binary traits, name of the column storing the number of cases by variants (optional)
 * ncont: For binary traits, name of the column storing the number of controls by variants (optional)
-* z: name of the column storing the genetic effect (beta) in the gwas file
+* beta: name of the column storing the genetic effect (beta) in the gwas file
 * OR : For binary traits, Odd ratio when available. Not to be confounded with the genetic effect size or 'beta'.
 * index-type: precise the type of index
 * imputation_quality: (Optional) column containing individual-based imputation quality. Will be used to filter low quality imputation data from GWASs if the option --imputation-quality-treshold is used
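Aside: the sample-size inference mentioned for the optional n column follows the usual quantitative-trait approximation Var(beta) ~ 1/(2*N*MAF*(1-MAF)), the same formula the pipeline script below applies. A minimal sketch, with a hypothetical helper name:

import numpy as np

def infer_sample_size(maf, beta_se):
    # hypothetical helper: N ~ 1 / (2 * MAF * (1 - MAF) * se(beta)**2)
    maf = np.asarray(maf, dtype=float)
    se = np.asarray(beta_se, dtype=float)
    return 1.0 / (2.0 * maf * (1.0 - maf) * se**2)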
@@ -14,8 +14,8 @@ def compute_z_score(mgwas):
     add the corresponding column to the mgwas dataframe
     """
-    if 'z' in mgwas.columns:
-        sign_vect = np.sign(mgwas.z)
+    if 'beta_or_Z' in mgwas.columns:
+        sign_vect = np.sign(mgwas.beta_or_Z)
     else:
         if "OR" in mgwas.columns:
             sign_vect = np.sign(mgwas["OR"] - 1.0 + 10**(-8))
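Since compute_z_score uses only the sign of the effect column (the z-score magnitude is rebuilt from the p-value), beta and Z are indeed interchangeable here. A minimal sketch of that identity, with a hypothetical function name:

import numpy as np
import scipy.stats as ss

def z_from_pval_and_effect(pval, effect):
    # |Z| from the two-sided p-value via the chi2 (1 df) inverse survival
    # function; the direction comes from beta or Z, which share the same sign
    return np.sign(effect) * np.sqrt(ss.chi2.isf(pval, 1))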
@@ -146,7 +146,6 @@ def read_gwas( gwas_internal_link, column_map, imputation_treshold=None):
     fullGWAS = pd.read_csv(gwas_internal_link, delim_whitespace=True,
                            usecols = column_map.values,
                            compression=compression,
-                           #column_dict['label_position'].keys(),
                            names= column_map.index,
                            header=0, na_values= ['', '#N/A', '#N/A', 'N/A','#NA', '-1.#IND', '-1.#QNAN',
                                     '-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A',
"""
Read raw GWAS summary statistics, filter and format
Write clean GWAS
"""
__updated__ = '2018-02-19'
import h5py
import numpy as np
import scipy.stats as ss
import sys
import math
import os
import pandas as pd
import matplotlib.pyplot as plt
perSS = 0.7
netPath = "/mnt/atlas/" # '/home/genstat/ATLAS/'
#netPath = '/pasteur/projets/policy01/'
GWAS_labels = netPath+'PCMA/1._DATA/RAW.GWAS/GWAS_LABELS_MAP.txt'
GWAS_path = netPath+'PCMA/1._DATA/RAW.GWAS/'
REF_filename = netPath+'PCMA/0._REF/1KGENOME/1K_SNP_INFO_EUR_MAF0.05.hdf5'
pathOUT = netPath+'PCMA/1._DATA/RAW.summary/'
outFileName = netPath+'PCMA/1._DATA/ZSCORE_merged_ALL_NO_strand_ambiguous.hdf5'
def_missing = ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN',
'-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'nan', 'na', '.']
out_summary = "summary_GWAS.csv"
ImpG_output_Folder = netPath+ 'PCMA/1._DATA/ImpG_zfiles/'
# FUNCTION: find path for a given GWAS file ================================
def walkfs(startdir, findfile):
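    """Locate findfile under startdir.

    Returns (dircount, filecount, full_path); full_path is None when the
    file is not found.
    """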
    dircount = 0
    filecount = 0
    for root, dirs, files in os.walk(startdir):
        if findfile in files:
            return dircount, filecount + files.index(findfile), os.path.join(root, findfile)
        dircount += 1
        filecount += len(files)
    # nothing found, return None instead of a full path for the file
    return dircount, filecount, None
def dna_complement_base(inputbase):
    if inputbase == 'A':
        return 'T'
    if inputbase == 'T':
        return 'A'
    if inputbase == 'G':
        return 'C'
    if inputbase == 'C':
        return 'G'
    return 'Not ATGC'
def dna_complement(input):
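    """Return the base-by-base complement of a DNA sequence (list of bases)."""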
    return [dna_complement_base(x) for x in input]
##############################################################
# READ PARAM FILES
#---open GWAS dictionary
column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na')
#---Public GWAS
GWAS_table = ['GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt',
'SNP_gwas_mc_merge_nogc.tbl.uniq',
'GIANT_2015_HIP_COMBINED_EUR.txt',
'GIANT_2015_WC_COMBINED_EUR.txt',
'GIANT_2015_WHR_COMBINED_EUR.txt',
'HDL_ONE_Europeans.tbl',
'LDL_ONE_Europeans.tbl',
'TC_ONE_Europeans.tbl',
'TG_ONE_Europeans.tbl',
'GWAS_DBP_recoded.txt',
'GWAS_SBP_recoded.txt',
'GWAS_PP_recoded.txt',
'GWAS_MAP_recoded.txt',
'MAGIC_FastingGlucose.txt',
'MAGIC_ln_FastingInsulin.txt',
'MAGIC_ln_HOMA-B.txt',
'MAGIC_ln_HOMA-IR.txt',
'MAGIC_2hrGlucose_AdjustedForBMI.txt',
'MAGIC_INSULIN_SECRETION_CIR_for_release_HMrel27.txt',
'MAGIC_proinsulin_for_release_HMrel27.txt',
'MAGIC_HbA1C.txt',
'DIAGRAMv3.2012DEC17.txt',
'fa2stu.MAF0.005.pos.out',
'fn2stu.MAF0.005.pos.out',
'ls2stu.MAF0.005.pos.out',
'gabriel_asthma_Reformated.txt',
'EUR.IBD.gwas_filtered.assoc',
'EUR.CD.gwas_filtered.assoc',
'EUR.UC.gwas_filtered.assoc',
'rall.txt',
'pgc.bip.full.2012-04.txt',
'pgc.mdd.full.2012-04.txt',
'IGAP_stage_1.txt',
'Menopause_HapMap_for_website_18112015.txt',
'CARDIoGRAM_GWAS_RESULTS.txt',
'C4D_CAD_DISCOVERY_METAANALYSIS_UPDATE.TXT',
'SSGAC_EduYears_Rietveld2013_publicrelease.txt',
'SSGAC_College_Rietveld2013_publicrelease.txt',
'POAG_allGender_nonGC.txt',
'IOP_allGender_EURonly.txt',
'IMSGC-GWAS.txt',
'RA-EUR-GWAS.txt',
'CKD.txt',
'eGFRcrea_overall.txt',
'eGFRcys_overall.txt']
'''
GWAS_table = ['GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt',
'SNP_gwas_mc_merge_nogc.tbl.uniq',
'GIANT_2015_HIP_COMBINED_EUR.txt',
'GIANT_2015_WC_COMBINED_EUR.txt']
'''
# NEED ADDITIONAL PRE-CLEANING
# 'RA_GWASmeta_European_v2.txt',
# 'pgc.adhd.full.2012-10.txt',
# 'Menarche_Nature2014_GWASMetaResults_17122014.txt',
# Autism multi-traits GWAS
'''
GWAS_table = ['GWAS_case_control_european_no_CHOP.assoc.logistic',
'GWAS_controls_european_AANAT.assoc.linear',
'GWAS_controls_european_ASMT_log.assoc.linear',
'GWAS_controls_european_melatonin_log.assoc.linear',
'GWAS_controls_european_NAS.assoc.linear',
'GWAS_controls_european_serotonin_log.assoc.linear']
outFileName = netPath+'PCMA/1._DATA/ZSCORE_merged_AUTISM.hdf5'
'''
#---get internal GWAS link
Glink = []
for GWAS in range(0, len(GWAS_table)):
    GWAS_filename = GWAS_table[GWAS]
    Glink.append({'filename': GWAS_filename,
                  'internalDataLink': walkfs(GWAS_path, GWAS_filename)[2]})
Glink = pd.DataFrame(Glink, columns=('filename', 'internalDataLink'))
#---get descript GWAS table
sub_dict = column_dict[(column_dict['filename']).isin(GWAS_table)]
sub_dict = pd.merge(sub_dict, Glink, how='inner', on='filename')
summary_phe = sub_dict[['outcome', 'fullName', 'consortia',
                        'type', 'reference', 'linkRef', 'dataLink', 'internalDataLink']]
#cg = out.create_group('summary_phe')
#summary_phe.to_hdf(outFileName, 'summary_table', format='table', dropna=False)
summary_phe.to_csv(out_summary)
# can be read with pd.read_hdf(outFileName,'summary_phe')
#---open hdf5 files
ref = h5py.File(REF_filename, 'r')
out = h5py.File(outFileName, "w")
for chrom in range(1, 23):
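    # mirror the reference panel SNP ids and positions into the output file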
    chrom_str = 'chr%d' % chrom
    cg = out.create_group(chrom_str)
    cg.create_dataset('snp_ids', data=ref[chrom_str]['sids'][...])
    cg.create_dataset('position', data=ref[chrom_str]['position'][...])
##############################################################
# LOOP OVER ALL GWAS
for GWAS in range(0, len(GWAS_table)):
    ##############################################################
    # READ GWAS
    GWAS_filename = GWAS_table[GWAS]
    GWAS_link = walkfs(GWAS_path, GWAS_filename)[2]
    print('\n')
    print('**** PARSING %s ******' % GWAS_filename)
    #---check GWAS labels dictionary
    my_labels = column_dict[column_dict['filename'] == GWAS_filename]
    target_lab = my_labels.values.tolist()[0]
    my_study = my_labels['consortia'].values[0] + \
        '_' + my_labels['outcome'].values[0]
    #---parse header and get column name and order
    with open(GWAS_link) as f:
        count_line = 0
        line = f.readline()
        header = line.split()
    list_col = {}
    list_lab = {}
    for l in range(0, len(target_lab)):
        for h in range(0, len(header)):
            if header[h] == target_lab[l]:
                list_lab[my_labels.columns.tolist()[l]] = h
                list_col[h] = my_labels.columns.tolist()[l]
    #---load selected columns, rename them, and set snpid as index
    order = sorted(list_col.keys())
    fullGWAS = pd.read_csv(GWAS_link, delim_whitespace=True, na_values=def_missing,
                           usecols=list_lab.values(), index_col=0,
                           names=[list_col[i] for i in order], header=0)
    fullGWAS = fullGWAS[~fullGWAS.index.duplicated(keep='first')]
    ##############################################################
    # read REF SNP and MERGE
    SNP_in = 0
    SNP_rem = 0
    for chrom in range(1, 23):
        print('Working on Chromosome %d' % chrom)
        chrom_str = 'chr%d' % chrom
        Nsnp = len(ref[chrom_str]['position'])
        # create alternative SNPids for the SNPid mapping
        # == map based on rsid
        chrPos1 = pd.DataFrame(ref[chrom_str]['sids'][...], columns=['snpid'], dtype=str)
        # == map based on chr:position
        chrPos2 = pd.DataFrame(pd.Series([str(chrom)]*Nsnp).str.cat(
            ref[chrom_str]['position'][...].astype(str), sep=':'), columns=['snpid'])
        # merge the alternative mappings
        merge_GWAS = pd.merge(chrPos1, fullGWAS, how='left',
                              right_index=True, left_on='snpid', sort=False, indicator=True)
        altID_1 = pd.merge(chrPos2, fullGWAS, how='left', right_index=True,
                           left_on='snpid', sort=False, indicator=True)
        sub_S = (altID_1['_merge'] == "both")
        if sub_S.any():
            merge_GWAS[sub_S] = altID_1.iloc[np.where(sub_S)[0]]
        SNP_in += sum((merge_GWAS['_merge'] == "both")*1)
        # check direction of Zscore
        #%%%%%%%%% STILL NEED TO CHECK ALTERNATIVE ALLELE %%%%%%%%%%%%%%%%
        #upper_Allele = [x.upper() for x in merge_GWAS['a1'].values.astype(str)]
        #sign1 = ((ref[chrom_str]['ref'][...] == upper_Allele)*1-0.5)*2
        gwasAllele1 = [x.upper() for x in merge_GWAS['a1'].values.astype(str)]
        gwasAllele2 = [x.upper() for x in merge_GWAS['a2'].values.astype(str)]
        gwasAllele1_c = dna_complement(gwasAllele1)
        gwasAllele2_c = dna_complement(gwasAllele2)
        refAllele1 = ref[chrom_str]['ref'][...].astype(str)
        refAllele2 = ref[chrom_str]['alt'][...].astype(str)
        case1 = ((gwasAllele1 == refAllele1) & (gwasAllele2 == refAllele2))
        case2 = ((gwasAllele2 == refAllele1) & (gwasAllele1 == refAllele2))
        case3 = ((gwasAllele1_c == refAllele1) & (gwasAllele2_c == refAllele2))
        case4 = ((gwasAllele2_c == refAllele1) & (gwasAllele1_c == refAllele2))
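        # case1/case3: GWAS alleles match the reference directly or via strand
        # complement -> keep the effect sign; case2/case4: alleles are swapped
        # -> flip the sign; anything else (unmatched or ambiguous) stays NaN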
        sign1 = np.repeat(np.nan, Nsnp)
        sign1[case1] = np.repeat(+1.0, sum(case1))
        sign1[case2] = np.repeat(-1.0, sum(case2))
        sign1[case3] = np.repeat(+1.0, sum(case3))
        sign1[case4] = np.repeat(-1.0, sum(case4))
        # get sign of the stat or OR (for the latter we need to arbitrarily decide when OR=1.00)
        if 'z' in fullGWAS.columns:
            sign3 = np.sign(merge_GWAS.z)
        elif 'OR' in fullGWAS.columns:
            sign3 = np.sign(merge_GWAS.OR - 1 +
                            (np.random.rand(len(merge_GWAS.OR))-0.5)*0.0001)
        else:
            raise ValueError(
                'Both the statistic and OR are missing: ZSCORE cannot be derived')
        if 'pval' not in merge_GWAS:
            raise ValueError('P-value column was not found')
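        # |Z| = sqrt of the chi2 quantile of the p-value (1 df); the sign
        # combines allele alignment with the reference (sign1) and the
        # direction of the reported effect (sign3)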
        #ZSCORE = np.sqrt(ss.chi2.ppf(1-merge_GWAS['pval'], 1)) * sign1 * sign3
        ZSCORE = np.sqrt(ss.chi2.isf(merge_GWAS['pval'], 1)) * sign1 * sign3
        # filter based on sample size ====================================================
        #---sample size column exists
        if 'n' in fullGWAS.columns:
            myN = merge_GWAS.n
        #---case and control counts exist
        elif ('ncas' in fullGWAS.columns) & ('ncont' in fullGWAS.columns):
            sumN = merge_GWAS.ncas + merge_GWAS.ncont
            perCase = merge_GWAS.ncas / sumN
            myN = sumN * perCase * (1-perCase)
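        # (effective sample size for binary traits: N_eff = N * phi * (1 - phi),
        #  with phi the case fraction)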
        #---se and freq exist and trait is quantitative
        elif ('se' in fullGWAS.columns) & ('freq' in fullGWAS.columns):
            myN = 1/(merge_GWAS.freq * (1-merge_GWAS.freq) * 2 * merge_GWAS.se**2)
        #---se exists but no freq // use ref panel freq
        elif ('se' in fullGWAS.columns) & ('freq' not in fullGWAS.columns):
            myN = 1/(ref[chrom_str]['eur_mafs'][...] *
                     (1-ref[chrom_str]['eur_mafs'][...]) * 2 * merge_GWAS.se**2)
        else:
            raise ValueError(
                'Both sample size and SE(beta) are missing: sample size filtering cannot be applied')
        # replace infinite values by the largest finite one
        if sum(np.isinf(myN)) > 0:
            myN[np.isinf(myN)] = np.max(myN[np.isfinite(myN)])
        myW_thres = np.percentile(myN.dropna(), 90)
        #myW_thres = np.max(myN)
        ss_thres = perSS * myW_thres
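        # SNPs whose inferred sample size falls below perSS (70%) of the
        # 90th-percentile sample size are masked below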
        if chrom == 1:
            myN_genome = myN
        else:
            myN_genome = np.concatenate((myN_genome, myN), axis=0)
        toRem = np.where(myN <= ss_thres)
        SNP_rem += len(toRem[0])
        if len(toRem[0]) > 0:
            ZSCORE.iloc[toRem] = np.nan
        # add the created data to the chromosome group
        cg = out[chrom_str]
        cg.create_dataset('z' + '_' + my_study, data=ZSCORE)
        ImpG_output = pd.DataFrame({
            'rsID': chrPos1['snpid'],
            'pos': ref[chrom_str]['position'][...],
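            # h5py returns bytes: str(b'A') == "b'A'", so index 2 picks the base letter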
            'A0': [str(i)[2] for i in ref[chrom_str]['ref'][...]],
            'A1': [str(i)[2] for i in ref[chrom_str]['alt'][...]],
            'Z': ZSCORE.values
        }, columns=['rsID', 'pos', 'A0', 'A1', 'Z'])
        ImpG_output.dropna(subset=["Z"], how="any", inplace=True)
        impg_output_file = ImpG_output_Folder + 'z_' + my_study + '_chr_' + str(chrom) + ".txt"
        print("WRITING CHR {} results for {} to:".format(chrom, my_study))
        ImpG_output.to_csv(impg_output_file, sep="\t", index=False)
# ##############################################################
# # SUMMARY + PLOTS
# print("There was %d SNPs available, of which %d were filtered out because of small sample size" % (
# SNP_in, SNP_rem))
# # the histogram of the #sample per SNP
# fig = plt.figure()
# fig.suptitle(GWAS_filename, fontsize=14, fontweight='bold')
# nn = (pd.DataFrame(myN_genome)).dropna()
# n, bins, patches = plt.hist(nn, 50, normed=1, facecolor='blue', alpha=0.75)
#
# Ymax = np.max(n)*1.2
#
# plt.ylim((0, Ymax))
# plt.xlabel('Inferred weight (i.e. Sample size)')
# plt.ylabel('Frequency')
# plt.grid(True)
# plt.axvline(x=myW_thres, linewidth=2, linestyle='dashed', color='r')
# plt.axvline(x=ss_thres, linewidth=2, linestyle='dashed', color='g')
#
# Xpos = np.min(nn) + 0.01*(np.max(nn) - np.min(nn))
# ax = fig.add_subplot(111)
#
# ax.text(Xpos, Ymax*0.96, 'max(N)='+str(np.max(nn)
# [0]), verticalalignment='top', horizontalalignment='left')
# ax.text(Xpos, Ymax*0.92, 'min(N)='+str(np.min(nn)
# [0]), verticalalignment='top', horizontalalignment='left')
# ax.text(Xpos, Ymax*0.88, '#SNPs='+str(len(nn)),
# verticalalignment='top', horizontalalignment='left')
#
# # plt.show()
# fig.savefig(pathOUT+GWAS_filename+'_SampleSizeDist.png', dpi=fig.dpi)
# plt.close(fig)
# CLOSE h5 FILES
out.close()
ref.close()
print('\n')