diff --git a/doc/source/index.rst b/doc/source/index.rst index cabc46b363d70e1f4214a9629930b7b790d5525e..0e3b5deb34988ceb3362671583f8750495e1769c 100644 --- a/doc/source/index.rst +++ b/doc/source/index.rst @@ -112,7 +112,7 @@ Input * n: name of the column storing the sample size by variants (optional, will be inferred from the MAF, genetic effect and standard deviation if absent) * ncas: For binary traits, name of the column storing the number of cases by variants (optional) * ncont: For binary traits, name of the column storing the number of controls by variants (optional) -* z: name of the column storing the genetic effect (beta) in the gwas file +* beta: name of the column storing the genetic effect (beta) in the gwas file * OR : For binary traits, Odd ratio when available. Not to be confounded with the genetic effect size or 'beta'. * index-type: precise the type of index * imputation_quality: (Optional) column containing individual-based imputation quality. Will be used to filter low quality imputation data from GWASs if the option --imputation-quality-treshold is used diff --git a/jass_preprocessing/compute_score.py b/jass_preprocessing/compute_score.py index 9e4ccddc610886ba054214487ac64a0e4b834705..8bca34d95fc91806f04cffe67ef7e2ef7988687f 100644 --- a/jass_preprocessing/compute_score.py +++ b/jass_preprocessing/compute_score.py @@ -14,8 +14,8 @@ def compute_z_score(mgwas): add the corresponding column to the mgwas dataframe """ - if 'z' in mgwas.columns: - sign_vect = np.sign(mgwas.z) + if 'beta_or_Z' in mgwas.columns: + sign_vect = np.sign(mgwas.beta_or_Z) else: if "OR" in mgwas.columns: sign_vect = np.sign(mgwas["OR"] - 1.0 + 10**(-8)) diff --git a/jass_preprocessing/map_gwas.py b/jass_preprocessing/map_gwas.py index 4e25925ae7241d8f50f0c3749ba64fd26b2640c4..5e73858f268e934b36db38818bc1f05d3ebb57d3 100644 --- a/jass_preprocessing/map_gwas.py +++ b/jass_preprocessing/map_gwas.py @@ -146,7 +146,6 @@ def read_gwas( gwas_internal_link, column_map, imputation_treshold=None): fullGWAS = pd.read_csv(gwas_internal_link, delim_whitespace=True, usecols = column_map.values, compression=compression, - #column_dict['label_position'].keys(), names= column_map.index, header=0, na_values= ['', '#N/A', '#N/A', 'N/A','#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A', diff --git a/pyPCMA_1_format_v1.4.py b/pyPCMA_1_format_v1.4.py deleted file mode 100755 index d3baa31ad3a76ea44018d86c6a70edbcf1201424..0000000000000000000000000000000000000000 --- a/pyPCMA_1_format_v1.4.py +++ /dev/null @@ -1,374 +0,0 @@ -""" -Read raw GWAS summary statistics, filter and format -Write clean GWAS -""" -__updated__ = '2018-19-02' - -import h5py -import numpy as np -import scipy.stats as ss -import sys -import math -import os -import pandas as pd -import matplotlib.pyplot as plt - - -perSS = 0.7 -netPath = "/mnt/atlas/" # '/home/genstat/ATLAS/' -#netPath = '/pasteur/projets/policy01/' -GWAS_labels = netPath+'PCMA/1._DATA/RAW.GWAS/GWAS_LABELS_MAP.txt' -GWAS_path = netPath+'PCMA/1._DATA/RAW.GWAS/' -REF_filename = netPath+'PCMA/0._REF/1KGENOME/1K_SNP_INFO_EUR_MAF0.05.hdf5' -pathOUT = netPath+'PCMA/1._DATA/RAW.summary/' - -outFileName = netPath+'PCMA/1._DATA/ZSCORE_merged_ALL_NO_strand_ambiguous.hdf5' -def_missing = ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', - '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'nan', 'na', '.'] -out_summary = "summary_GWAS.csv" -ImpG_output_Folder = netPath+ 'PCMA/1._DATA/ImpG_zfiles/' - -# FUNCTION: find path for a given GWAS file ================================ - - -def walkfs(startdir, findfile): - dircount = 0 - filecount = 0 - for root, dirs, files in os.walk(startdir): - if findfile in files: - return dircount, filecount + files.index(findfile), os.path.join(root, findfile) - dircount += 1 - filecount += len(files) - # nothing found, return None instead of a full path for the file - return dircount, filecount, None - - -def dna_complement_base(inputbase): - if (inputbase == 'A'): - return('T') - if (inputbase == 'T'): - return('A') - if (inputbase == 'G'): - return('C') - if (inputbase == 'C'): - return('G') - return('Not ATGC') - - -def dna_complement(input): - return([dna_complement_base(x) for x in input]) - - - -############################################################## -# READ PARAM FILES -#---open GWAS dictionnary -column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na') - -#---Public GWAS - -GWAS_table = ['GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt', - 'SNP_gwas_mc_merge_nogc.tbl.uniq', - 'GIANT_2015_HIP_COMBINED_EUR.txt', - 'GIANT_2015_WC_COMBINED_EUR.txt', - 'GIANT_2015_WHR_COMBINED_EUR.txt', - 'HDL_ONE_Europeans.tbl', - 'LDL_ONE_Europeans.tbl', - 'TC_ONE_Europeans.tbl', - 'TG_ONE_Europeans.tbl', - 'GWAS_DBP_recoded.txt', - 'GWAS_SBP_recoded.txt', - 'GWAS_PP_recoded.txt', - 'GWAS_MAP_recoded.txt', - 'MAGIC_FastingGlucose.txt', - 'MAGIC_ln_FastingInsulin.txt', - 'MAGIC_ln_HOMA-B.txt', - 'MAGIC_ln_HOMA-IR.txt', - 'MAGIC_2hrGlucose_AdjustedForBMI.txt', - 'MAGIC_INSULIN_SECRETION_CIR_for_release_HMrel27.txt', - 'MAGIC_proinsulin_for_release_HMrel27.txt', - 'MAGIC_HbA1C.txt', - 'DIAGRAMv3.2012DEC17.txt', - 'fa2stu.MAF0.005.pos.out', - 'fn2stu.MAF0.005.pos.out', - 'ls2stu.MAF0.005.pos.out', - 'gabriel_asthma_Reformated.txt', - 'EUR.IBD.gwas_filtered.assoc', - 'EUR.CD.gwas_filtered.assoc', - 'EUR.UC.gwas_filtered.assoc', - 'rall.txt', - 'pgc.bip.full.2012-04.txt', - 'pgc.mdd.full.2012-04.txt', - 'IGAP_stage_1.txt', - 'Menopause_HapMap_for_website_18112015.txt', - 'CARDIoGRAM_GWAS_RESULTS.txt', - 'C4D_CAD_DISCOVERY_METAANALYSIS_UPDATE.TXT', - 'SSGAC_EduYears_Rietveld2013_publicrelease.txt', - 'SSGAC_College_Rietveld2013_publicrelease.txt', - 'POAG_allGender_nonGC.txt', - 'IOP_allGender_EURonly.txt', - 'IMSGC-GWAS.txt', - 'RA-EUR-GWAS.txt', - 'CKD.txt', - 'eGFRcrea_overall.txt', - 'eGFRcys_overall.txt'] - -''' -GWAS_table = ['GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt', - 'SNP_gwas_mc_merge_nogc.tbl.uniq', - 'GIANT_2015_HIP_COMBINED_EUR.txt', - 'GIANT_2015_WC_COMBINED_EUR.txt'] -''' - - -# NEED ADDITIONAL PRE-CLEANING -# 'RA_GWASmeta_European_v2.txt', -# 'pgc.adhd.full.2012-10.txt', -# 'Menarche_Nature2014_GWASMetaResults_17122014.txt', - -# Autism multi-traits GWAS -''' -GWAS_table = ['GWAS_case_control_european_no_CHOP.assoc.logistic', - 'GWAS_controls_european_AANAT.assoc.linear', - 'GWAS_controls_european_ASMT_log.assoc.linear', - 'GWAS_controls_european_melatonin_log.assoc.linear', - 'GWAS_controls_european_NAS.assoc.linear', - 'GWAS_controls_european_serotonin_log.assoc.linear'] -outFileName = netPath+'PCMA/1._DATA/ZSCORE_merged_AUTISM.hdf5' -''' - - -#---get internal GWAS link -Glink = [] - -for GWAS in range(0, len(GWAS_table)): - GWAS_filename = GWAS_table[GWAS] - Glink.append({'filename': GWAS_filename, - 'internalDataLink': walkfs(GWAS_path, GWAS_filename)[2]}) -Glink = pd.DataFrame(Glink, columns=('filename', 'internalDataLink')) - -#---get descript GWAS table -sub_dict = column_dict[(column_dict['filename']).isin(GWAS_table)] -sub_dict = pd.merge(sub_dict, Glink, how='inner', on='filename') - -summary_phe = sub_dict[['outcome', 'fullName', 'consortia', - 'type', 'reference', 'linkRef', 'dataLink', 'internalDataLink']] -#cg = out.create_group('summary_phe') - -#summary_phe.to_hdf(outFileName, 'summary_table', format='table', dropna=False) - - -summary_phe.to_csv(out_summary) -# can be read with pd.read_hdf(outFileName,'summary_phe') - -#---open hdf5 files -ref = h5py.File(REF_filename) -out = h5py.File(outFileName, "w") -for chrom in range(1, 23): - chrom_str = 'chr%d' % chrom - cg = out.create_group(chrom_str) - cg.create_dataset('snp_ids', data=ref[chrom_str]['sids'][...]) - cg.create_dataset('position', data=ref[chrom_str]['position'][...]) - - -############################################################## -# LOOP OVER ALL GWAS -for GWAS in range(0, len(GWAS_table)): - - ############################################################## - # READ GWAS - GWAS_filename = GWAS_table[GWAS] - GWAS_link = walkfs(GWAS_path, GWAS_filename)[2] - print('\n') - print('**** PARSING %s ******' % GWAS_filename) - - #---check GWAS labels dictionnary - my_labels = column_dict[column_dict['filename'] == GWAS_filename] - target_lab = my_labels.values.tolist()[0] - my_study = my_labels['consortia'].values[0] + \ - '_' + my_labels['outcome'].values[0] - - #---parse header and get column name and order - with open(GWAS_link) as f: - count_line = 0 - line = f.readline() - header = line.split() - list_col = {} - list_lab = {} - for l in range(0, len(target_lab)): - for h in range(0, len(header)): - if header[h] == target_lab[l]: - list_lab[my_labels.columns.tolist()[l]] = h - list_col[h] = my_labels.columns.tolist()[l] - - #---load selected columns, rename columns, and set snpid as index - order = sorted(list_col.keys()) - fullGWAS = pd.read_csv(GWAS_link, delim_whitespace=True, na_values=def_missing, - usecols=list_lab.values(), index_col=0, names=[list_col[i] for i in order], header=0) - fullGWAS = fullGWAS[~fullGWAS.index.duplicated(keep='first')] - - ############################################################## - # read REF SNP and MERGE - SNP_in = 0 - SNP_rem = 0 - - for chrom in range(1, 23): - print('Working on Chromosome %d' % chrom) - chrom_str = 'chr%d' % chrom - Nsnp = len(ref[chrom_str]['position']) - - # create alernatives SNPid for the SNPid mapping - # == map based on rsid - chrPos1 = pd.DataFrame(ref[chrom_str]['sids'][...], columns=['snpid'], dtype=str ) - # == map based on chr:position - chrPos2 = pd.DataFrame(pd.Series([str(chrom)]*Nsnp).str.cat( - ref[chrom_str]['position'][...].astype(str), sep=':'), columns=['snpid']) - - - # merge the alternative mappings - merge_GWAS = pd.merge(chrPos1, fullGWAS, how='left', - right_index=True, left_on='snpid', sort=False, indicator=True) - altID_1 = pd.merge(chrPos2, fullGWAS, how='left', right_index=True, - left_on='snpid', sort=False, indicator=True) - sub_S = (altID_1['_merge'] == "both") - - - if(sub_S.any()): - merge_GWAS[sub_S] = altID_1.iloc[np.where(sub_S)[0]] - - - SNP_in += sum((merge_GWAS['_merge'] == "both")*1) - - # check direction of Zscore - #%%%%%%%%%STILL NEED TO CHECK ALTERNATIVE ALLELE%%%%%%%%%%%%%%%% - #upper_Allele = [x.upper() for x in ((merge_GWAS['a1']).values).astype(str)] - #sign1 = ((ref[chrom_str]['ref'].value == upper_Allele)*1-0.5)*2 - gwasAllele1 = [x.upper() for x in ((merge_GWAS['a1']).values).astype(str)] - gwasAllele2 = [x.upper() for x in ((merge_GWAS['a2']).values).astype(str)] - gwasAllele1_c = dna_complement(gwasAllele1) - gwasAllele2_c = dna_complement(gwasAllele2) - - refAllele1 = ref[chrom_str]['ref'].value.astype(str) - refAllele2 = ref[chrom_str]['alt'].value.astype(str) - - case1 = ((gwasAllele1 == refAllele1) & (gwasAllele2 == refAllele2)) - case2 = ((gwasAllele2 == refAllele1) & (gwasAllele1 == refAllele2)) - case3 = ((gwasAllele1_c == refAllele1) & (gwasAllele2_c == refAllele2)) - case4 = ((gwasAllele2_c == refAllele1) & (gwasAllele1_c == refAllele2)) - - sign1 = np.repeat(np.nan, Nsnp) - sign1[case1] = np.repeat(+ 1.0, sum(case1)) - sign1[case2] = np.repeat(- 1.0, sum(case2)) - sign1[case3] = np.repeat(+ 1.0, sum(case3)) - sign1[case4] = np.repeat(- 1.0, sum(case4)) - - # get sign of the stat or OR (for the later need to arbitrarily decide when OR=1.00) - if('z' in fullGWAS.columns): - sign3 = np.sign(merge_GWAS.z) - elif('OR' in fullGWAS.columns): - sign3 = np.sign(merge_GWAS.OR - 1 + - (np.random.rand(len(merge_GWAS.OR))-0.5)*0.0001) - else: - raise ValueError( - 'Both the statistic and OR are missing: ZSCORE cannot be derived') - - if('pval' not in merge_GWAS): - raise ValueError('P-value column was not found') - - #ZSCORE = np.sqrt(ss.chi2.ppf(1-merge_GWAS['pval'],1)) * sign1 * sign3 - ZSCORE = np.sqrt(ss.chi2.isf(merge_GWAS['pval'], 1)) * sign1 * sign3 - - # filter base on sample size ===================================================== - #---sample size does not exists - if 'n' in fullGWAS.columns: - myN = merge_GWAS.n - #--- freq, case-cont N exist - elif(('ncas' in fullGWAS.columns) & ('ncont' in fullGWAS.columns)): - sumN = merge_GWAS.ncas + merge_GWAS.ncont - perCase = merge_GWAS.ncas / sumN - myN = sumN * perCase * (1-perCase) - #---se, freq exist and QT - elif(('se' in fullGWAS.columns) & ('freq' in fullGWAS.columns)): - myN = 1/(merge_GWAS.freq * (1-merge_GWAS.freq)* 2 * merge_GWAS.se**2) - #---se exists but no freq, // use ref panel freq - elif(('se' in fullGWAS.columns) & ('freq' not in fullGWAS.columns)): - myN = 1/(ref[chrom_str]['eur_mafs'][...] * - (1-ref[chrom_str]['eur_mafs'][...])* 2 * merge_GWAS.se**2) - else: - raise ValueError( - 'Both Sample size and SE(beta) are missing: sample size filtering cannot be applied') - # replace infinite value by the max finite one - if(sum(np.isinf(myN)) > 0): - myN[np.isinf(myN)] = np.max(myN[np.isfinite(myN)]) - - myW_thres = np.percentile(myN.dropna(), 90) - #myW_thres = np.max(myN) - ss_thres = perSS * myW_thres - - if chrom == 1: - myN_genome = myN - else: - myN_genome = np.concatenate((myN_genome, myN), axis=0) - - toRem = np.where(myN <= ss_thres) - SNP_rem += len(toRem[0]) - if(len(toRem[0]) > 0): - ZSCORE.iloc[toRem] = np.nan - # add created data to the group - cg = out[chrom_str] - cg.create_dataset('z'+'_'+my_study, data=ZSCORE) - - - ImpG_output = pd.DataFrame({ - 'rsID': chrPos1['snpid'], - 'pos': ref[chrom_str]['position'][...], - 'A0': [str(i)[2] for i in ref[chrom_str]['ref'][...]], - 'A1': [str(i)[2] for i in ref[chrom_str]['alt'][...]], - 'Z': ZSCORE.values - }, columns= ['rsID', 'pos', 'A0', "A1", "Z" ]) - - ImpG_output.dropna(subset=["Z"], how="any", inplace=True) - impg_output_file = ImpG_output_Folder + 'z_'+ my_study +'_chr_'+str(chrom)+".txt" - print("WRITING CHR {} results for {} to:".format(chrom, my_study)) - ImpG_output.to_csv(impg_output_file, sep="\t", index=False) - - # ############################################################## - # # SUMMARY + PLOTS - # print("There was %d SNPs available, of which %d were filtered out because of small sample size" % ( - # SNP_in, SNP_rem)) - #np.sqrt(ss.chi2.isf(merge_GWAS['pval'], 1)) - # # the histogram of the #sample per SNP - # fig = plt.figure() - # fig.suptitle(GWAS_filename, fontsize=14, fontweight='bold') - # nn = (pd.DataFrame(myN_genome)).dropna() - # n, bins, patches = plt.hist(nn, 50, normed=1, facecolor='blue', alpha=0.75) - # - # Ymax = np.max(n)*1.2 - # - # plt.ylim((0, Ymax)) - # plt.xlabel('Infered weight (i.e. Sample size)') - # plt.ylabel('Frequency') - # plt.grid(True) - # plt.axvline(x=myW_thres, linewidth=2, linestyle='dashed', color='r') - # plt.axvline(x=ss_thres, linewidth=2, linestyle='dashed', color='g') - # - # Xpos = np.min(nn) + 0.01*(np.max(nn) - np.min(nn)) - # ax = fig.add_subplot(111) - # - # ax.text(Xpos, Ymax*0.96, 'max(N)='+str(np.max(nn) - # [0]), verticalalignment='top', horizontalalignment='left') - # ax.text(Xpos, Ymax*0.92, 'min(N)='+str(np.min(nn) - # [0]), verticalalignment='top', horizontalalignment='left') - # ax.text(Xpos, Ymax*0.88, '#SNPs='+str(len(nn)), - # verticalalignment='top', horizontalalignment='left') - # - # # plt.show() - # fig.savefig(pathOUT+GWAS_filename+'_SampleSizeDist.png', dpi=fig.dpi) - # plt.close(fig) - - -# CLOSE h5 FILES -out.close() -ref.close() - -print('\n')