diff --git a/jass_preprocessing/jass_preprocessing/compute_score/compute.py b/jass_preprocessing/jass_preprocessing/compute_score/compute.py index b8bbd44f1392e7a8e81e6ad07280f7c1bb462b8a..218cde21b15c92427021427060567bba2a771da2 100644 --- a/jass_preprocessing/jass_preprocessing/compute_score/compute.py +++ b/jass_preprocessing/jass_preprocessing/compute_score/compute.py @@ -16,7 +16,7 @@ def compute_z_score(mgwas): mgwas["computed_z"] = np.sqrt(ss.chi2.isf(mgwas['pval'], 1)) * np.sign(mgwas.z) * mgwas["sign_flip"] return mgwas -def compute_sample_size(mgwas, diagnostic_folder, trait ): +def compute_sample_size(mgwas, diagnostic_folder, trait): if 'n' in mgwas.columns: myN = mgwas.n diff --git a/jass_preprocessing/jass_preprocessing/map_reference/map_reference.py b/jass_preprocessing/jass_preprocessing/map_reference/map_reference.py index 2e89f6fbbc12d9d69fb052f10b433edf4c1b7248..16f12491cd7d68c504316bdc1811bca4debd361e 100644 --- a/jass_preprocessing/jass_preprocessing/map_reference/map_reference.py +++ b/jass_preprocessing/jass_preprocessing/map_reference/map_reference.py @@ -16,7 +16,9 @@ def read_reference(gwas_reference_panel): def map_on_ref_panel(gw_df , ref_panel): """ - Merge Gwas df with the reference panel + Merge Gwas dataframe with the reference panel + Make sure that the same SNPs are in the reference panel and the gwas + """ def key2(x): return ('chr' + str(x["chr"]) + ":" + str(x["pos"])) @@ -35,6 +37,11 @@ def map_on_ref_panel(gw_df , ref_panel): def compute_is_flipped(mgwas): + """ + Check if the reference panel and the GWAS data have the same reference + allele. return a boolean vector. + + """ flipped = pd.DataFrame({"ref_flipped" : (mgwas.ref == mgwas.a2), "alt_flipped" : (mgwas.alt == mgwas.a1)}) flipped_complement = pd.DataFrame({"ref_flippedc" : (mgwas.ref == mgwas.a2c), "alt_flippedc" : (mgwas.alt == mgwas.a1c)}) @@ -44,6 +51,13 @@ def compute_is_flipped(mgwas): return is_flipped def compute_is_aligned(mgwas): + """ + Check if the reference panel and the GWAS data have the same reference + allele. return a boolean vector. + The function should be the complement of "is_flipped" but we still compute + the two function to eventually detect weird cases (more than two alleles for + instance) + """ aligned = pd.DataFrame({"ref_ok" : (mgwas.ref == mgwas.a1), "alt_ok" : (mgwas.alt == mgwas.a2)}) aligned_complement = pd.DataFrame({"ref_ok" : (mgwas.ref == mgwas.a1c), "alt_ok" : (mgwas.alt == mgwas.a2c)}) @@ -53,7 +67,6 @@ def compute_is_aligned(mgwas): return is_aligned def compute_snp_alignement(mgwas): - """ Add a column to mgwas indicating if the reference and coded allele is flipped compared to the reference panel. diff --git a/jass_preprocessing/jass_preprocessing/save_output/save_output.py b/jass_preprocessing/jass_preprocessing/save_output/save_output.py index a10e719e0fbe568fbe078cfa2a68c856287404ce..c00fef4e56f60c445c4b89660e383e0be750a1aa 100644 --- a/jass_preprocessing/jass_preprocessing/save_output/save_output.py +++ b/jass_preprocessing/jass_preprocessing/save_output/save_output.py @@ -4,22 +4,46 @@ import pandas as pd def save_output_by_chromosome(mgwas, ImpG_output_Folder, my_study): """ - Write the preprocessed Gwas + Write the preprocessed Gwas for imputation """ - mgwas.reset_index(inplace=True) + mgwas_copy = mgwas.reset_index(inplace=False) - mgwas.set_index("chr", inplace=True) - mgwas.dropna(subset=["computed_z"], how="any", inplace=True) - for chrom in mgwas.index.unique(): + mgwas_copy.set_index("chr", inplace=True) + mgwas_copy.dropna(subset=["computed_z"], how="any", inplace=True) + for chrom in mgwas_copy.index.unique(): mgwas_chr = pd.DataFrame({ - 'rsID': mgwas.loc[chrom].snp_id, - 'pos': mgwas.loc[chrom].pos, - 'A0': mgwas.loc[chrom].ref, - 'A1':mgwas.loc[chrom].alt, - 'Z': mgwas.loc[chrom].computed_z - }, columns= ['rsID', 'pos', 'A0', "A1", "Z" ]) + 'rsID': mgwas_copy.loc[chrom].snp_id, + 'pos': mgwas_copy.loc[chrom].pos, + 'A0': mgwas_copy.loc[chrom].ref, + 'A1':mgwas_copy.loc[chrom].alt, + 'Z': mgwas_copy.loc[chrom].computed_z, + 'P': mgwas_copy.loc[chrom].pval + }, columns= ['rsID', 'pos', 'A0', "A1", "Z", "P" ]) impg_output_file = ImpG_output_Folder + 'z_'+ my_study +'_chr'+str(chrom)+".txt" print("WRITING CHR {} results for {} to: {}".format(chrom, my_study, ImpG_output_Folder)) mgwas_chr.sort_values(by="pos").to_csv(impg_output_file, sep="\t", index=False) + + +def save_output(mgwas, ImpG_output_Folder, my_study): + """ + Write the preprocessed Gwas for ldscore analysis + """ + mgwas_copy = mgwas.reset_index(inplace=False) + mgwas_copy.dropna(subset=["computed_z"], how="any", inplace=True) + + mgwas_copy = pd.DataFrame({ + 'chrom':mgwas_copy.chr, + 'rsID': mgwas_copy.snp_id, + 'pos': mgwas_copy.pos, + 'A0': mgwas_copy.ref, + 'A1':mgwas_copy.alt, + 'Z': mgwas_copy.computed_z, + 'P': mgwas_copy.pval + }, columns= ['chrom','rsID', 'pos', 'A0', "A1", "Z", "P" ]) + + impg_output_file = ImpG_output_Folder + 'z_'+ my_study +".txt" + print("WRITING results for {} to: {}".format( my_study, ImpG_output_Folder)) + print(mgwas_copy.head()) + mgwas_copy.sort_values(['chrom','pos']).to_csv(impg_output_file, sep="\t", index=False) diff --git a/main_preprocessing.py b/main_preprocessing.py index a0c4637ac529e902948ff555e6eaa6629c47f08e..1f6e1626f2aaff88d293cae639512fd0e641fb35 100644 --- a/main_preprocessing.py +++ b/main_preprocessing.py @@ -21,6 +21,7 @@ GWAS_labels = netPath+'PCMA/1._DATA/RAW.GWAS/GWAS_labels.csv' GWAS_path = netPath+'PCMA/1._DATA/RAW.GWAS/' diagnostic_folder= "/mnt/atlas/PCMA/1._DATA/sample_size_distribution/" +ldscore_format="/mnt/atlas/PCMA/1._DATA/ldscore_data/" REF_filename = netPath+'PCMA/0._REF/1KGENOME/summary_genome_Filter_part2.out' pathOUT = netPath+'PCMA/1._DATA/RAW.summary/' @@ -31,13 +32,13 @@ def_missing = ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', out_summary = "summary_GWAS.csv" ImpG_output_Folder = netPath+ 'PCMA/1._DATA/preprocessing_test/' - gwas_map = pd.read_csv(GWAS_labels, sep="\t", index_col=0) -GWAS_table = ["GWAS_DBP_recoded.txt","GWAS_MAP_recoded.txt", - "GWAS_PP_recoded.txt","GWAS_SBP_recoded.txt", - "GIANT_2015_HIP_COMBINED_EUR.txt", "MAGIC_HbA1C.txt", - "EUR.CD.gwas_filtered.assoc"] +GWAS_table = ["EUR.CD.gwas_filtered.assoc"] +# "GWAS_DBP_recoded.txt","GWAS_MAP_recoded.txt", #"GWAS_SBP_recoded_dummy.txt" +# "GWAS_PP_recoded.txt","GWAS_SBP_recoded.txt", +# "GIANT_2015_HIP_COMBINED_EUR.txt", +# ] for GWAS_filename in GWAS_table: @@ -66,3 +67,8 @@ for GWAS_filename in GWAS_table: mgwas = jp.compute_score.compute_z_score(mgwas) mgwas = jp.compute_score.compute_sample_size(mgwas, diagnostic_folder, tag) jp.save_output.save_output_by_chromosome(mgwas, ImpG_output_Folder, tag) + jp.save_output.save_output(mgwas, ldscore_format, tag) + + jp.save_output.save_output +mgwas.reset_index(inplace=True) +mgwas.sort_values(['chr', "pos"]).head(100)