diff --git a/jass_preprocessing/jass_preprocessing/compute_score/__init__.py b/jass_preprocessing/jass_preprocessing/compute_score/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/jass_preprocessing/jass_preprocessing/compute_score/compute.py b/jass_preprocessing/jass_preprocessing/compute_score/compute.py new file mode 100644 index 0000000000000000000000000000000000000000..b8bbd44f1392e7a8e81e6ad07280f7c1bb462b8a --- /dev/null +++ b/jass_preprocessing/jass_preprocessing/compute_score/compute.py @@ -0,0 +1,54 @@ +import pandas as pd +import numpy as np +import jass_preprocessing.dna_utils as dna_u +import warnings +import scipy.stats as ss +import seaborn as sns +import matplotlib.pyplot as plt + +perSS = 0.7 + +def compute_z_score(mgwas): + """ + Compute zscore value and sign1 + add the corresponding column to the mgwas dataframe + """ + mgwas["computed_z"] = np.sqrt(ss.chi2.isf(mgwas['pval'], 1)) * np.sign(mgwas.z) * mgwas["sign_flip"] + return mgwas + +def compute_sample_size(mgwas, diagnostic_folder, trait ): + + if 'n' in mgwas.columns: + myN = mgwas.n + #--- freq, case-cont N exist + elif(('ncas' in mgwas.columns) & ('ncont' in mgwas.columns)): + sumN = mgwas.ncas + mgwas.ncont + perCase = mgwas.ncas / sumN + myN = sumN * perCase * (1-perCase) + #---se, freq exist and QT + elif(('se' in mgwas.columns) & ('freq' in mgwas.columns)): + myN = 1/(mgwas.freq * (1-mgwas.freq)* 2 * mgwas.se**2) + #---se exists but no freq, // use ref panel freq + elif(('se' in mgwas.columns) & ('freq' not in mgwas.columns)): + myN = 1/(mgwas.MAF * (1-mgwas.MAF)* 2 * mgwas.se**2) + else: + raise ValueError( + 'Both Sample size and SE(beta) are missing: sample size filtering cannot be applied') + # replace infinite value by the max finite one + if(np.isinf(myN).any()): + myN[np.isinf(myN)] = np.max(myN[np.isfinite(myN)]) + warnings.warn("Some snp had an infinite sample size") + + myW_thres = np.percentile(myN.dropna(), 90) + ss_thres = perSS * myW_thres + mgwas["computed_N"] = myN + plt.clf() + p1 = sns.distplot(mgwas.computed_N) + p1.axvline(x=ss_thres) + fo = "{0}/Sample_size_distribution_{1}.png".format(diagnostic_folder, trait) + p1.figure.savefig(fo) + # Filter SNP with a too small sample _SampleSize + print("NSNP before sample size filtering: {}".format(mgwas.shape[0])) + mgwas = mgwas.loc[(myN >= ss_thres)] + print("NSNP after sample size filtering: {}".format(mgwas.shape[0])) + return(mgwas) diff --git a/jass_preprocessing/jass_preprocessing/save_output/save_output.py b/jass_preprocessing/jass_preprocessing/save_output/save_output.py index a605323ca30258825ce4695ce33a9449cbaa5fdd..a10e719e0fbe568fbe078cfa2a68c856287404ce 100644 --- a/jass_preprocessing/jass_preprocessing/save_output/save_output.py +++ b/jass_preprocessing/jass_preprocessing/save_output/save_output.py @@ -7,16 +7,19 @@ def save_output_by_chromosome(mgwas, ImpG_output_Folder, my_study): Write the preprocessed Gwas """ mgwas.reset_index(inplace=True) - mgwas = pd.DataFrame({ - 'rsID': mgwas_chr.snp_id, - 'pos': mgwas_chr.pos, - 'A0': mgwas_chr.ref, - 'A1':mgwas_chr.alt, - 'Z': mgwas_chr.computed_z - }, columns= ['rsID', 'pos', 'A0', "A1", "Z" ]) + mgwas.set_index("chr", inplace=True) - mgwas.dropna(subset=["Z"], how="any", inplace=True) - for chrom in mgwas.index.unique: - impg_output_file = ImpG_output_Folder + 'z_'+ my_study +'_chr_'+str(chrom)+".txt" - print("WRITING CHR {} results for {} to:".format(chrom, my_study)) - mgwas.loc[chrom].to_csv(impg_output_file, sep="\t", index=False) + mgwas.dropna(subset=["computed_z"], how="any", inplace=True) + for chrom in mgwas.index.unique(): + + mgwas_chr = pd.DataFrame({ + 'rsID': mgwas.loc[chrom].snp_id, + 'pos': mgwas.loc[chrom].pos, + 'A0': mgwas.loc[chrom].ref, + 'A1':mgwas.loc[chrom].alt, + 'Z': mgwas.loc[chrom].computed_z + }, columns= ['rsID', 'pos', 'A0', "A1", "Z" ]) + + impg_output_file = ImpG_output_Folder + 'z_'+ my_study +'_chr'+str(chrom)+".txt" + print("WRITING CHR {} results for {} to: {}".format(chrom, my_study, ImpG_output_Folder)) + mgwas_chr.sort_values(by="pos").to_csv(impg_output_file, sep="\t", index=False) diff --git a/main_preprocessing.py b/main_preprocessing.py index 091d176f6dcefe6424b73da86d5e8564ef23659b..a0c4637ac529e902948ff555e6eaa6629c47f08e 100644 --- a/main_preprocessing.py +++ b/main_preprocessing.py @@ -32,11 +32,12 @@ def_missing = ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', out_summary = "summary_GWAS.csv" ImpG_output_Folder = netPath+ 'PCMA/1._DATA/preprocessing_test/' -GWAS_labels gwas_map = pd.read_csv(GWAS_labels, sep="\t", index_col=0) -GWAS_table = ["GWAS_SBP_recoded_dummy.txt"]#["GWAS_DBP_recoded.txt","GWAS_MAP_recoded.txt", - #"GWAS_PP_recoded.txt","GWAS_SBP_recoded_dummy.txt"] +GWAS_table = ["GWAS_DBP_recoded.txt","GWAS_MAP_recoded.txt", + "GWAS_PP_recoded.txt","GWAS_SBP_recoded.txt", + "GIANT_2015_HIP_COMBINED_EUR.txt", "MAGIC_HbA1C.txt", + "EUR.CD.gwas_filtered.assoc"] for GWAS_filename in GWAS_table: