diff --git a/jass_preprocessing/jass_preprocessing/compute_score/compute.py b/jass_preprocessing/jass_preprocessing/compute_score/compute.py index 218cde21b15c92427021427060567bba2a771da2..841eba01dea572d3a506b4651ecf8a47cb1fbc1f 100644 --- a/jass_preprocessing/jass_preprocessing/compute_score/compute.py +++ b/jass_preprocessing/jass_preprocessing/compute_score/compute.py @@ -13,7 +13,18 @@ def compute_z_score(mgwas): Compute zscore value and sign1 add the corresponding column to the mgwas dataframe """ - mgwas["computed_z"] = np.sqrt(ss.chi2.isf(mgwas['pval'], 1)) * np.sign(mgwas.z) * mgwas["sign_flip"] + + if 'z' in mgwas.columns: + sign_vect = np.sign(mgwas.z) + else: + if "OR" in mgwas.columns: + sign_vect = np.sign(mgwas["OR"] - 1.0 + 10**(-8)) + else: + raise ValueError( + 'The gwas data frame doesn"t contain effect column') + + mgwas["computed_z"] = np.sqrt(ss.chi2.isf(mgwas['pval'], 1)) * sign_vect * mgwas["sign_flip"] + return mgwas def compute_sample_size(mgwas, diagnostic_folder, trait): diff --git a/jass_preprocessing/jass_preprocessing/map_reference/map_reference.py b/jass_preprocessing/jass_preprocessing/map_reference/map_reference.py index 6bc20ef3f602d5fb7d0a52ca1b5b2a37ffee2d22..26f1ae2829f72cfe6e9ee460e64efd4675ee84f7 100644 --- a/jass_preprocessing/jass_preprocessing/map_reference/map_reference.py +++ b/jass_preprocessing/jass_preprocessing/map_reference/map_reference.py @@ -33,8 +33,11 @@ def map_on_ref_panel(gw_df , ref_panel): ref_panel['key2'] = "chr"+ref_panel.chr.map(str) + ":" +ref_panel.pos.map(str) other_snp = pd.merge(ref_panel, gw_df, how='inner', indicator=True, left_on ='key2', right_index=True) - merge_GWAS.loc[other_snp.index] = other_snp + merge_GWAS = pd.concat([other_snp, merge_GWAS]) + merge_GWAS = merge_GWAS[~merge_GWAS.index.duplicated(keep='first')] + + merge_GWAS.index.rename("snp_id", inplace=True) return(merge_GWAS) diff --git a/main_preprocessing.py b/main_preprocessing.py index 1f6e1626f2aaff88d293cae639512fd0e641fb35..8b719f8e4437a66a0484d466d079153e3d33a570 100644 --- a/main_preprocessing.py +++ b/main_preprocessing.py @@ -14,6 +14,7 @@ import matplotlib.pyplot as plt import jass_preprocessing as jp import pandas as pd import seaborn as sns +import time perSS = 0.7 netPath = "/mnt/atlas/" @@ -33,18 +34,18 @@ def_missing = ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', out_summary = "summary_GWAS.csv" ImpG_output_Folder = netPath+ 'PCMA/1._DATA/preprocessing_test/' gwas_map = pd.read_csv(GWAS_labels, sep="\t", index_col=0) - -GWAS_table = ["EUR.CD.gwas_filtered.assoc"] +gwas_map.index +GWAS_table = gwas_map.index["GIANT_2015_HIP_COMBINED_EUR.txt"]#"EUR.CD.gwas_filtered.assoc"] # "GWAS_DBP_recoded.txt","GWAS_MAP_recoded.txt", #"GWAS_SBP_recoded_dummy.txt" # "GWAS_PP_recoded.txt","GWAS_SBP_recoded.txt", # "GIANT_2015_HIP_COMBINED_EUR.txt", # ] +for GWAS_filename in GWAS_table.index: -for GWAS_filename in GWAS_table: - - tag = "{0}_{1}".format(gwas_map.loc[GWAS_filename, 'consortia'], gwas_map.loc[GWAS_filename, 'outcome']) + tag = "{0}_{1}".format(gwas_map.loc[GWAS_filename, 'consortia'], + gwas_map.loc[GWAS_filename, 'outcome']) print('processing GWAS: {}'.format(tag)) - + start = time.time() gwas = jp.map_gwas.gwas_internal_link(GWAS_table, GWAS_path) column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na') @@ -52,23 +53,26 @@ for GWAS_filename in GWAS_table: mapgw = jp.map_gwas.map_columns_position(GWAS_link, GWAS_labels) gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw) + ref = pd.read_csv(REF_filename, header=None, sep= "\t", names =['chr', "pos", "snp_id", "ref", "alt", "MAF"], index_col="snp_id") - inter_index = ref.index.intersection(gw_df.index) - test_merge = pd.merge(ref.loc[inter_index], gw_df.loc[inter_index], how='inner', - indicator=True, left_index=True, right_index=True) + if gw_df.index.map(str).str.contains("^chr*", case=False).any(): + ref['key2'] = "chr"+ref.chr.map(str) + ":" +ref.pos.map(str) + other_snp = pd.merge(ref, gw_df, how='inner', indicator=True, + left_on ='key2', left_index=False, right_index=True) - print(jp.map_reference.map_on_ref_panel) mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref) mgwas = jp.map_reference.compute_snp_alignement(mgwas) - mgwas = jp.compute_score.compute_z_score(mgwas) + mgwas = jp.compute_score.compute_sample_size(mgwas, diagnostic_folder, tag) + end = time.time() + print("Preprocessing of {0} in {1}s".format(tag, end-start)) + jp.save_output.save_output_by_chromosome(mgwas, ImpG_output_Folder, tag) jp.save_output.save_output(mgwas, ldscore_format, tag) - jp.save_output.save_output mgwas.reset_index(inplace=True) mgwas.sort_values(['chr', "pos"]).head(100)