diff --git a/doc/source/index.rst b/doc/source/index.rst index cabc46b363d70e1f4214a9629930b7b790d5525e..b2615b09594fb50caea8c80eb41d5d37417ceeb5 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -108,11 +108,11 @@ Input * a1: effect allele * a2: Other allele * freq: name of the column storing the minor allele frequence in the gwas file -* pval: name of the column storing the pvalue in the gwas file +* pval: name of the column storing the pvalue in the gwas file. This column will be used to derive the Z-score of genetic effect. * n: name of the column storing the sample size by variants (optional, will be inferred from the MAF, genetic effect and standard deviation if absent) * ncas: For binary traits, name of the column storing the number of cases by variants (optional) * ncont: For binary traits, name of the column storing the number of controls by variants (optional) -* z: name of the column storing the genetic effect (beta) in the gwas file +* beta_or_z: name of the column storing the genetic effect (beta) in the gwas file. This column will be used only to retrieve the sign of the genetic effect with respect to the reference and effect allele. * OR : For binary traits, Odd ratio when available. Not to be confounded with the genetic effect size or 'beta'. * index-type: precise the type of index * imputation_quality: (Optional) column containing individual-based imputation quality. Will be used to filter low quality imputation data from GWASs if the option --imputation-quality-treshold is used diff --git a/jass_preprocessing/compute_score.py b/jass_preprocessing/compute_score.py index 9e4ccddc610886ba054214487ac64a0e4b834705..8bca34d95fc91806f04cffe67ef7e2ef7988687f 100644 --- a/jass_preprocessing/compute_score.py +++ b/jass_preprocessing/compute_score.py @@ -14,8 +14,8 @@ def compute_z_score(mgwas): add the corresponding column to the mgwas dataframe """ - if 'z' in mgwas.columns: - sign_vect = np.sign(mgwas.z) + if 'beta_or_Z' in mgwas.columns: + sign_vect = np.sign(mgwas.beta_or_Z) else: if "OR" in mgwas.columns: sign_vect = np.sign(mgwas["OR"] - 1.0 + 10**(-8)) diff --git a/jass_preprocessing/map_gwas.py b/jass_preprocessing/map_gwas.py index 4e25925ae7241d8f50f0c3749ba64fd26b2640c4..5e73858f268e934b36db38818bc1f05d3ebb57d3 100644 --- a/jass_preprocessing/map_gwas.py +++ b/jass_preprocessing/map_gwas.py @@ -146,7 +146,6 @@ def read_gwas( gwas_internal_link, column_map, imputation_treshold=None): fullGWAS = pd.read_csv(gwas_internal_link, delim_whitespace=True, usecols = column_map.values, compression=compression, - #column_dict['label_position'].keys(), names= column_map.index, header=0, na_values= ['', '#N/A', '#N/A', 'N/A','#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A', diff --git a/pyPCMA_1_format_v1.4.py b/pyPCMA_1_format_v1.4.py deleted file mode 100755 index d3baa31ad3a76ea44018d86c6a70edbcf1201424..0000000000000000000000000000000000000000 --- a/pyPCMA_1_format_v1.4.py +++ /dev/null @@ -1,374 +0,0 @@ -""" -Read raw GWAS summary statistics, filter and format -Write clean GWAS -""" -__updated__ = '2018-19-02' - -import h5py -import numpy as np -import scipy.stats as ss -import sys -import math -import os -import pandas as pd -import matplotlib.pyplot as plt - - -perSS = 0.7 -netPath = "/mnt/atlas/" # '/home/genstat/ATLAS/' -#netPath = '/pasteur/projets/policy01/' -GWAS_labels = netPath+'PCMA/1._DATA/RAW.GWAS/GWAS_LABELS_MAP.txt' -GWAS_path = netPath+'PCMA/1._DATA/RAW.GWAS/' -REF_filename = netPath+'PCMA/0._REF/1KGENOME/1K_SNP_INFO_EUR_MAF0.05.hdf5' -pathOUT = netPath+'PCMA/1._DATA/RAW.summary/' - -outFileName = netPath+'PCMA/1._DATA/ZSCORE_merged_ALL_NO_strand_ambiguous.hdf5' -def_missing = ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', - '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'nan', 'na', '.'] -out_summary = "summary_GWAS.csv" -ImpG_output_Folder = netPath+ 'PCMA/1._DATA/ImpG_zfiles/' - -# FUNCTION: find path for a given GWAS file ================================ - - -def walkfs(startdir, findfile): - dircount = 0 - filecount = 0 - for root, dirs, files in os.walk(startdir): - if findfile in files: - return dircount, filecount + files.index(findfile), os.path.join(root, findfile) - dircount += 1 - filecount += len(files) - # nothing found, return None instead of a full path for the file - return dircount, filecount, None - - -def dna_complement_base(inputbase): - if (inputbase == 'A'): - return('T') - if (inputbase == 'T'): - return('A') - if (inputbase == 'G'): - return('C') - if (inputbase == 'C'): - return('G') - return('Not ATGC') - - -def dna_complement(input): - return([dna_complement_base(x) for x in input]) - - - -############################################################## -# READ PARAM FILES -#---open GWAS dictionnary -column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na') - -#---Public GWAS - -GWAS_table = ['GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt', - 'SNP_gwas_mc_merge_nogc.tbl.uniq', - 'GIANT_2015_HIP_COMBINED_EUR.txt', - 'GIANT_2015_WC_COMBINED_EUR.txt', - 'GIANT_2015_WHR_COMBINED_EUR.txt', - 'HDL_ONE_Europeans.tbl', - 'LDL_ONE_Europeans.tbl', - 'TC_ONE_Europeans.tbl', - 'TG_ONE_Europeans.tbl', - 'GWAS_DBP_recoded.txt', - 'GWAS_SBP_recoded.txt', - 'GWAS_PP_recoded.txt', - 'GWAS_MAP_recoded.txt', - 'MAGIC_FastingGlucose.txt', - 'MAGIC_ln_FastingInsulin.txt', - 'MAGIC_ln_HOMA-B.txt', - 'MAGIC_ln_HOMA-IR.txt', - 'MAGIC_2hrGlucose_AdjustedForBMI.txt', - 'MAGIC_INSULIN_SECRETION_CIR_for_release_HMrel27.txt', - 'MAGIC_proinsulin_for_release_HMrel27.txt', - 'MAGIC_HbA1C.txt', - 'DIAGRAMv3.2012DEC17.txt', - 'fa2stu.MAF0.005.pos.out', - 'fn2stu.MAF0.005.pos.out', - 'ls2stu.MAF0.005.pos.out', - 'gabriel_asthma_Reformated.txt', - 'EUR.IBD.gwas_filtered.assoc', - 'EUR.CD.gwas_filtered.assoc', - 'EUR.UC.gwas_filtered.assoc', - 'rall.txt', - 'pgc.bip.full.2012-04.txt', - 'pgc.mdd.full.2012-04.txt', - 'IGAP_stage_1.txt', - 'Menopause_HapMap_for_website_18112015.txt', - 'CARDIoGRAM_GWAS_RESULTS.txt', - 'C4D_CAD_DISCOVERY_METAANALYSIS_UPDATE.TXT', - 'SSGAC_EduYears_Rietveld2013_publicrelease.txt', - 'SSGAC_College_Rietveld2013_publicrelease.txt', - 'POAG_allGender_nonGC.txt', - 'IOP_allGender_EURonly.txt', - 'IMSGC-GWAS.txt', - 'RA-EUR-GWAS.txt', - 'CKD.txt', - 'eGFRcrea_overall.txt', - 'eGFRcys_overall.txt'] - -''' -GWAS_table = ['GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt', - 'SNP_gwas_mc_merge_nogc.tbl.uniq', - 'GIANT_2015_HIP_COMBINED_EUR.txt', - 'GIANT_2015_WC_COMBINED_EUR.txt'] -''' - - -# NEED ADDITIONAL PRE-CLEANING -# 'RA_GWASmeta_European_v2.txt', -# 'pgc.adhd.full.2012-10.txt', -# 'Menarche_Nature2014_GWASMetaResults_17122014.txt', - -# Autism multi-traits GWAS -''' -GWAS_table = ['GWAS_case_control_european_no_CHOP.assoc.logistic', - 'GWAS_controls_european_AANAT.assoc.linear', - 'GWAS_controls_european_ASMT_log.assoc.linear', - 'GWAS_controls_european_melatonin_log.assoc.linear', - 'GWAS_controls_european_NAS.assoc.linear', - 'GWAS_controls_european_serotonin_log.assoc.linear'] -outFileName = netPath+'PCMA/1._DATA/ZSCORE_merged_AUTISM.hdf5' -''' - - -#---get internal GWAS link -Glink = [] - -for GWAS in range(0, len(GWAS_table)): - GWAS_filename = GWAS_table[GWAS] - Glink.append({'filename': GWAS_filename, - 'internalDataLink': walkfs(GWAS_path, GWAS_filename)[2]}) -Glink = pd.DataFrame(Glink, columns=('filename', 'internalDataLink')) - -#---get descript GWAS table -sub_dict = column_dict[(column_dict['filename']).isin(GWAS_table)] -sub_dict = pd.merge(sub_dict, Glink, how='inner', on='filename') - -summary_phe = sub_dict[['outcome', 'fullName', 'consortia', - 'type', 'reference', 'linkRef', 'dataLink', 'internalDataLink']] -#cg = out.create_group('summary_phe') - -#summary_phe.to_hdf(outFileName, 'summary_table', format='table', dropna=False) - - -summary_phe.to_csv(out_summary) -# can be read with pd.read_hdf(outFileName,'summary_phe') - -#---open hdf5 files -ref = h5py.File(REF_filename) -out = h5py.File(outFileName, "w") -for chrom in range(1, 23): - chrom_str = 'chr%d' % chrom - cg = out.create_group(chrom_str) - cg.create_dataset('snp_ids', data=ref[chrom_str]['sids'][...]) - cg.create_dataset('position', data=ref[chrom_str]['position'][...]) - - -############################################################## -# LOOP OVER ALL GWAS -for GWAS in range(0, len(GWAS_table)): - - ############################################################## - # READ GWAS - GWAS_filename = GWAS_table[GWAS] - GWAS_link = walkfs(GWAS_path, GWAS_filename)[2] - print('\n') - print('**** PARSING %s ******' % GWAS_filename) - - #---check GWAS labels dictionnary - my_labels = column_dict[column_dict['filename'] == GWAS_filename] - target_lab = my_labels.values.tolist()[0] - my_study = my_labels['consortia'].values[0] + \ - '_' + my_labels['outcome'].values[0] - - #---parse header and get column name and order - with open(GWAS_link) as f: - count_line = 0 - line = f.readline() - header = line.split() - list_col = {} - list_lab = {} - for l in range(0, len(target_lab)): - for h in range(0, len(header)): - if header[h] == target_lab[l]: - list_lab[my_labels.columns.tolist()[l]] = h - list_col[h] = my_labels.columns.tolist()[l] - - #---load selected columns, rename columns, and set snpid as index - order = sorted(list_col.keys()) - fullGWAS = pd.read_csv(GWAS_link, delim_whitespace=True, na_values=def_missing, - usecols=list_lab.values(), index_col=0, names=[list_col[i] for i in order], header=0) - fullGWAS = fullGWAS[~fullGWAS.index.duplicated(keep='first')] - - ############################################################## - # read REF SNP and MERGE - SNP_in = 0 - SNP_rem = 0 - - for chrom in range(1, 23): - print('Working on Chromosome %d' % chrom) - chrom_str = 'chr%d' % chrom - Nsnp = len(ref[chrom_str]['position']) - - # create alernatives SNPid for the SNPid mapping - # == map based on rsid - chrPos1 = pd.DataFrame(ref[chrom_str]['sids'][...], columns=['snpid'], dtype=str ) - # == map based on chr:position - chrPos2 = pd.DataFrame(pd.Series([str(chrom)]*Nsnp).str.cat( - ref[chrom_str]['position'][...].astype(str), sep=':'), columns=['snpid']) - - - # merge the alternative mappings - merge_GWAS = pd.merge(chrPos1, fullGWAS, how='left', - right_index=True, left_on='snpid', sort=False, indicator=True) - altID_1 = pd.merge(chrPos2, fullGWAS, how='left', right_index=True, - left_on='snpid', sort=False, indicator=True) - sub_S = (altID_1['_merge'] == "both") - - - if(sub_S.any()): - merge_GWAS[sub_S] = altID_1.iloc[np.where(sub_S)[0]] - - - SNP_in += sum((merge_GWAS['_merge'] == "both")*1) - - # check direction of Zscore - #%%%%%%%%%STILL NEED TO CHECK ALTERNATIVE ALLELE%%%%%%%%%%%%%%%% - #upper_Allele = [x.upper() for x in ((merge_GWAS['a1']).values).astype(str)] - #sign1 = ((ref[chrom_str]['ref'].value == upper_Allele)*1-0.5)*2 - gwasAllele1 = [x.upper() for x in ((merge_GWAS['a1']).values).astype(str)] - gwasAllele2 = [x.upper() for x in ((merge_GWAS['a2']).values).astype(str)] - gwasAllele1_c = dna_complement(gwasAllele1) - gwasAllele2_c = dna_complement(gwasAllele2) - - refAllele1 = ref[chrom_str]['ref'].value.astype(str) - refAllele2 = ref[chrom_str]['alt'].value.astype(str) - - case1 = ((gwasAllele1 == refAllele1) & (gwasAllele2 == refAllele2)) - case2 = ((gwasAllele2 == refAllele1) & (gwasAllele1 == refAllele2)) - case3 = ((gwasAllele1_c == refAllele1) & (gwasAllele2_c == refAllele2)) - case4 = ((gwasAllele2_c == refAllele1) & (gwasAllele1_c == refAllele2)) - - sign1 = np.repeat(np.nan, Nsnp) - sign1[case1] = np.repeat(+ 1.0, sum(case1)) - sign1[case2] = np.repeat(- 1.0, sum(case2)) - sign1[case3] = np.repeat(+ 1.0, sum(case3)) - sign1[case4] = np.repeat(- 1.0, sum(case4)) - - # get sign of the stat or OR (for the later need to arbitrarily decide when OR=1.00) - if('z' in fullGWAS.columns): - sign3 = np.sign(merge_GWAS.z) - elif('OR' in fullGWAS.columns): - sign3 = np.sign(merge_GWAS.OR - 1 + - (np.random.rand(len(merge_GWAS.OR))-0.5)*0.0001) - else: - raise ValueError( - 'Both the statistic and OR are missing: ZSCORE cannot be derived') - - if('pval' not in merge_GWAS): - raise ValueError('P-value column was not found') - - #ZSCORE = np.sqrt(ss.chi2.ppf(1-merge_GWAS['pval'],1)) * sign1 * sign3 - ZSCORE = np.sqrt(ss.chi2.isf(merge_GWAS['pval'], 1)) * sign1 * sign3 - - # filter base on sample size ===================================================== - #---sample size does not exists - if 'n' in fullGWAS.columns: - myN = merge_GWAS.n - #--- freq, case-cont N exist - elif(('ncas' in fullGWAS.columns) & ('ncont' in fullGWAS.columns)): - sumN = merge_GWAS.ncas + merge_GWAS.ncont - perCase = merge_GWAS.ncas / sumN - myN = sumN * perCase * (1-perCase) - #---se, freq exist and QT - elif(('se' in fullGWAS.columns) & ('freq' in fullGWAS.columns)): - myN = 1/(merge_GWAS.freq * (1-merge_GWAS.freq)* 2 * merge_GWAS.se**2) - #---se exists but no freq, // use ref panel freq - elif(('se' in fullGWAS.columns) & ('freq' not in fullGWAS.columns)): - myN = 1/(ref[chrom_str]['eur_mafs'][...] * - (1-ref[chrom_str]['eur_mafs'][...])* 2 * merge_GWAS.se**2) - else: - raise ValueError( - 'Both Sample size and SE(beta) are missing: sample size filtering cannot be applied') - # replace infinite value by the max finite one - if(sum(np.isinf(myN)) > 0): - myN[np.isinf(myN)] = np.max(myN[np.isfinite(myN)]) - - myW_thres = np.percentile(myN.dropna(), 90) - #myW_thres = np.max(myN) - ss_thres = perSS * myW_thres - - if chrom == 1: - myN_genome = myN - else: - myN_genome = np.concatenate((myN_genome, myN), axis=0) - - toRem = np.where(myN <= ss_thres) - SNP_rem += len(toRem[0]) - if(len(toRem[0]) > 0): - ZSCORE.iloc[toRem] = np.nan - # add created data to the group - cg = out[chrom_str] - cg.create_dataset('z'+'_'+my_study, data=ZSCORE) - - - ImpG_output = pd.DataFrame({ - 'rsID': chrPos1['snpid'], - 'pos': ref[chrom_str]['position'][...], - 'A0': [str(i)[2] for i in ref[chrom_str]['ref'][...]], - 'A1': [str(i)[2] for i in ref[chrom_str]['alt'][...]], - 'Z': ZSCORE.values - }, columns= ['rsID', 'pos', 'A0', "A1", "Z" ]) - - ImpG_output.dropna(subset=["Z"], how="any", inplace=True) - impg_output_file = ImpG_output_Folder + 'z_'+ my_study +'_chr_'+str(chrom)+".txt" - print("WRITING CHR {} results for {} to:".format(chrom, my_study)) - ImpG_output.to_csv(impg_output_file, sep="\t", index=False) - - # ############################################################## - # # SUMMARY + PLOTS - # print("There was %d SNPs available, of which %d were filtered out because of small sample size" % ( - # SNP_in, SNP_rem)) - #np.sqrt(ss.chi2.isf(merge_GWAS['pval'], 1)) - # # the histogram of the #sample per SNP - # fig = plt.figure() - # fig.suptitle(GWAS_filename, fontsize=14, fontweight='bold') - # nn = (pd.DataFrame(myN_genome)).dropna() - # n, bins, patches = plt.hist(nn, 50, normed=1, facecolor='blue', alpha=0.75) - # - # Ymax = np.max(n)*1.2 - # - # plt.ylim((0, Ymax)) - # plt.xlabel('Infered weight (i.e. Sample size)') - # plt.ylabel('Frequency') - # plt.grid(True) - # plt.axvline(x=myW_thres, linewidth=2, linestyle='dashed', color='r') - # plt.axvline(x=ss_thres, linewidth=2, linestyle='dashed', color='g') - # - # Xpos = np.min(nn) + 0.01*(np.max(nn) - np.min(nn)) - # ax = fig.add_subplot(111) - # - # ax.text(Xpos, Ymax*0.96, 'max(N)='+str(np.max(nn) - # [0]), verticalalignment='top', horizontalalignment='left') - # ax.text(Xpos, Ymax*0.92, 'min(N)='+str(np.min(nn) - # [0]), verticalalignment='top', horizontalalignment='left') - # ax.text(Xpos, Ymax*0.88, '#SNPs='+str(len(nn)), - # verticalalignment='top', horizontalalignment='left') - # - # # plt.show() - # fig.savefig(pathOUT+GWAS_filename+'_SampleSizeDist.png', dpi=fig.dpi) - # plt.close(fig) - - -# CLOSE h5 FILES -out.close() -ref.close() - -print('\n')