diff --git a/jass_preprocessing/jass_preprocessing/map_reference/map_reference.py b/jass_preprocessing/jass_preprocessing/map_reference/map_reference.py index 3b40c888a195168779fcd0ce8aa8072de51199b8..1ee293f5ffcdd1a814e68d3baaa712181268e001 100644 --- a/jass_preprocessing/jass_preprocessing/map_reference/map_reference.py +++ b/jass_preprocessing/jass_preprocessing/map_reference/map_reference.py @@ -23,8 +23,12 @@ def map_on_ref_panel(gw_df , ref_panel): ref_panel['key2'] = ref_panel.apply(key2,1) - merge_GWAS = pd.merge(ref_panel, gw_df, how='inner', indicator=True, left_index=True, right_index=True) - other_snp = pd.merge(ref_panel, gw_df, how='inner', indicator=True, left_on ='key2', right_index=True) + inter_index = ref_panel.index.intersection(gw_df.index) + print("SNps {}".format(len(inter_index))) + merge_GWAS = pd.merge(ref_panel.loc[inter_index], gw_df.loc[inter_index], how='inner', indicator=True, left_index=True, right_index=True) + + inter_index = gw_df.index.intersection(ref_panel.index) + other_snp = pd.merge(ref_panel.loc[inter_index], gw_df.loc[inter_index], how='inner', indicator=True, left_on ='key2', right_index=True) merge_GWAS.loc[other_snp.index] = other_snp return(merge_GWAS) diff --git a/main_preprocessing.py b/main_preprocessing.py index 4d94748895ea26b100ca2cd3387ddaffaf859b1d..823318df0011ec46b6e48f451bc0fe38af0029ca 100644 --- a/main_preprocessing.py +++ b/main_preprocessing.py @@ -13,7 +13,7 @@ import pandas as pd import matplotlib.pyplot as plt import jass_preprocessing as jp import pandas as pd - +import seaborn as sns perSS = 0.7 netPath = "/mnt/atlas/" # '/home/genstat/ATLAS/' @@ -27,22 +27,22 @@ outFileName = netPath+'PCMA/1._DATA/ZSCORE_merged_ALL_NO_strand_ambiguous.hdf5' def_missing = ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'nan', 'na', '.'] out_summary = "summary_GWAS.csv" -ImpG_output_Folder = netPath+ 'PCMA/1._DATA/ImpG_zfiles/' - +ImpG_output_Folder = netPath+ 'PCMA/1._DATA/preprocessing_test/' -gwas_map = pd.read_csv(GWAS_labels, sep="\t", index_col=0) +GWAS_labels +gwas_map = pd.read_csv(GWAS_labels, sep="\t", index_col=0, nrows=10) -GWAS_table = ["GWAS_DBP_recoded.txt","GWAS_MAP_recoded.txt", "GWAS_PP_recoded.txt","GWAS_SBP_recoded.txt"] +GWAS_table = ["GWAS_DBP_recoded.txt","GWAS_MAP_recoded.txt", "GWAS_PP_recoded.txt","GWAS_SBP_recoded_dummy.txt"] gwas = jp.map_gwas.gwas_internal_link(GWAS_table, GWAS_path) - +gwas column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na') my_labels = column_dict[column_dict['filename'] == gwas.iloc[0,0]] column_dict[['freq']] # READ GWAS -GWAS_filename = GWAS_table[0] +GWAS_filename = GWAS_table[3] GWAS_link = jp.map_gwas.walkfs(GWAS_path, GWAS_filename)[2] GWAS_link @@ -54,12 +54,20 @@ gw_df.head() ref = pd.read_csv(REF_filename, header=None, sep= "\t", names =['chr', "pos", "snp_id", "ref", "alt", "MAF"], index_col="snp_id") +inter_index = ref.index.intersection(gw_df.index) +test_merge = pd.merge(ref.loc[inter_index], gw_df.loc[inter_index], how='inner', + indicator=True, left_index=True, right_index=True) + + +print(jp.map_reference.map_on_ref_panel) mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref) +mgwas mgwas = jp.map_reference.compute_snp_alignement(mgwas) -mgwas.head() -zscore = np.sqrt(ss.chi2.isf(mgwas['pval'], 1)) * np.sign(mgwas.z) * mgwas["sign_flip"] - +mgwas = jp.compute_score.compute_z_score(mgwas) +mgwas = jp.compute_score.compute_sample_size(mgwas, "/mnt/atlas/PCMA/1._DATA/RAW.GWAS/ICPB_bloodPress/", "test_samp") +mgwas.reset_index(inplace=True) +mgwas.set_index("chr", inplace=True) -np.isinf(ref.head().pos).any() +jp.