diff --git a/jass_preprocessing/jass_preprocessing/__init__.py b/jass_preprocessing/jass_preprocessing/__init__.py index 617bb474fa9e0a259367fa9b6c28d5b38684d1d7..dd0779d4736d3bef5c788138c31b9fe4a720240a 100644 --- a/jass_preprocessing/jass_preprocessing/__init__.py +++ b/jass_preprocessing/jass_preprocessing/__init__.py @@ -2,3 +2,4 @@ import jass_preprocessing.map_gwas.map_gwas as map_gwas import jass_preprocessing.dna_utils.dna_utils as dna_utils import jass_preprocessing.map_reference.map_reference as map_reference import jass_preprocessing.compute_score.compute as compute_score +import jass_preprocessing.save_output.save_output as save_output diff --git a/jass_preprocessing/jass_preprocessing/map_reference/__init__.py b/jass_preprocessing/jass_preprocessing/map_reference/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/jass_preprocessing/jass_preprocessing/map_reference/map_reference.py b/jass_preprocessing/jass_preprocessing/map_reference/map_reference.py index 1ee293f5ffcdd1a814e68d3baaa712181268e001..2e89f6fbbc12d9d69fb052f10b433edf4c1b7248 100644 --- a/jass_preprocessing/jass_preprocessing/map_reference/map_reference.py +++ b/jass_preprocessing/jass_preprocessing/map_reference/map_reference.py @@ -57,13 +57,10 @@ def compute_snp_alignement(mgwas): """ Add a column to mgwas indicating if the reference and coded allele is flipped compared to the reference panel. - If it is, the sign of the statistic must be flipped - Args: mgwas: a pandas dataframe of the GWAS data merged with the reference panel - """ mgwas['a1c'] = dna_u.dna_complement(mgwas.a1) diff --git a/jass_preprocessing/jass_preprocessing/save_output/__init__.py b/jass_preprocessing/jass_preprocessing/save_output/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/jass_preprocessing/jass_preprocessing/save_output/save_output.py b/jass_preprocessing/jass_preprocessing/save_output/save_output.py new file mode 100644 index 0000000000000000000000000000000000000000..a605323ca30258825ce4695ce33a9449cbaa5fdd --- /dev/null +++ b/jass_preprocessing/jass_preprocessing/save_output/save_output.py @@ -0,0 +1,22 @@ +import pandas as pd + + + +def save_output_by_chromosome(mgwas, ImpG_output_Folder, my_study): + """ + Write the preprocessed Gwas + """ + mgwas.reset_index(inplace=True) + mgwas = pd.DataFrame({ + 'rsID': mgwas_chr.snp_id, + 'pos': mgwas_chr.pos, + 'A0': mgwas_chr.ref, + 'A1':mgwas_chr.alt, + 'Z': mgwas_chr.computed_z + }, columns= ['rsID', 'pos', 'A0', "A1", "Z" ]) + mgwas.set_index("chr", inplace=True) + mgwas.dropna(subset=["Z"], how="any", inplace=True) + for chrom in mgwas.index.unique: + impg_output_file = ImpG_output_Folder + 'z_'+ my_study +'_chr_'+str(chrom)+".txt" + print("WRITING CHR {} results for {} to:".format(chrom, my_study)) + mgwas.loc[chrom].to_csv(impg_output_file, sep="\t", index=False) diff --git a/jass_preprocessing/setup.py b/jass_preprocessing/setup.py index 9e21834d36159f72690d03aa7aaf42f98ee1ef37..5e11bb38e0bf3c14d3ad782a95c0b918519d46bb 100644 --- a/jass_preprocessing/setup.py +++ b/jass_preprocessing/setup.py @@ -10,6 +10,6 @@ setup(name='jass_preprocessing', #package_dir = {'': 'jass_preprocessing'}, packages= ['jass_preprocessing', "jass_preprocessing.map_gwas", "jass_preprocessing.dna_utils", "jass_preprocessing.map_reference", - "jass_preprocessing.compute_score" + "jass_preprocessing.compute_score", "jass_preprocessing.save_output" ], zip_safe=False) diff --git a/main_preprocessing.py b/main_preprocessing.py index 823318df0011ec46b6e48f451bc0fe38af0029ca..091d176f6dcefe6424b73da86d5e8564ef23659b 100644 --- a/main_preprocessing.py +++ b/main_preprocessing.py @@ -16,58 +16,52 @@ import pandas as pd import seaborn as sns perSS = 0.7 -netPath = "/mnt/atlas/" # '/home/genstat/ATLAS/' -#netPath = '/pasteur/projets/policy01/' +netPath = "/mnt/atlas/" GWAS_labels = netPath+'PCMA/1._DATA/RAW.GWAS/GWAS_labels.csv' GWAS_path = netPath+'PCMA/1._DATA/RAW.GWAS/' + +diagnostic_folder= "/mnt/atlas/PCMA/1._DATA/sample_size_distribution/" REF_filename = netPath+'PCMA/0._REF/1KGENOME/summary_genome_Filter_part2.out' pathOUT = netPath+'PCMA/1._DATA/RAW.summary/' outFileName = netPath+'PCMA/1._DATA/ZSCORE_merged_ALL_NO_strand_ambiguous.hdf5' def_missing = ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', - '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'nan', 'na', '.'] + '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'nan', + 'na', '.'] + out_summary = "summary_GWAS.csv" ImpG_output_Folder = netPath+ 'PCMA/1._DATA/preprocessing_test/' GWAS_labels -gwas_map = pd.read_csv(GWAS_labels, sep="\t", index_col=0, nrows=10) - -GWAS_table = ["GWAS_DBP_recoded.txt","GWAS_MAP_recoded.txt", "GWAS_PP_recoded.txt","GWAS_SBP_recoded_dummy.txt"] - -gwas = jp.map_gwas.gwas_internal_link(GWAS_table, GWAS_path) -gwas -column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na') - +gwas_map = pd.read_csv(GWAS_labels, sep="\t", index_col=0) -my_labels = column_dict[column_dict['filename'] == gwas.iloc[0,0]] -column_dict[['freq']] - # READ GWAS -GWAS_filename = GWAS_table[3] +GWAS_table = ["GWAS_SBP_recoded_dummy.txt"]#["GWAS_DBP_recoded.txt","GWAS_MAP_recoded.txt", + #"GWAS_PP_recoded.txt","GWAS_SBP_recoded_dummy.txt"] -GWAS_link = jp.map_gwas.walkfs(GWAS_path, GWAS_filename)[2] -GWAS_link -mapgw = jp.map_gwas.map_columns_position(GWAS_link, GWAS_labels) +for GWAS_filename in GWAS_table: -gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw) -gw_df.head() + tag = "{0}_{1}".format(gwas_map.loc[GWAS_filename, 'consortia'], gwas_map.loc[GWAS_filename, 'outcome']) + print('processing GWAS: {}'.format(tag)) -ref = pd.read_csv(REF_filename, header=None, sep= "\t", - names =['chr', "pos", "snp_id", "ref", "alt", "MAF"], - index_col="snp_id") -inter_index = ref.index.intersection(gw_df.index) + gwas = jp.map_gwas.gwas_internal_link(GWAS_table, GWAS_path) + column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na') -test_merge = pd.merge(ref.loc[inter_index], gw_df.loc[inter_index], how='inner', - indicator=True, left_index=True, right_index=True) + GWAS_link = jp.map_gwas.walkfs(GWAS_path, GWAS_filename)[2] + mapgw = jp.map_gwas.map_columns_position(GWAS_link, GWAS_labels) + gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw) + ref = pd.read_csv(REF_filename, header=None, sep= "\t", + names =['chr', "pos", "snp_id", "ref", "alt", "MAF"], + index_col="snp_id") -print(jp.map_reference.map_on_ref_panel) -mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref) -mgwas -mgwas = jp.map_reference.compute_snp_alignement(mgwas) + inter_index = ref.index.intersection(gw_df.index) + test_merge = pd.merge(ref.loc[inter_index], gw_df.loc[inter_index], how='inner', + indicator=True, left_index=True, right_index=True) -mgwas = jp.compute_score.compute_z_score(mgwas) -mgwas = jp.compute_score.compute_sample_size(mgwas, "/mnt/atlas/PCMA/1._DATA/RAW.GWAS/ICPB_bloodPress/", "test_samp") -mgwas.reset_index(inplace=True) -mgwas.set_index("chr", inplace=True) + print(jp.map_reference.map_on_ref_panel) + mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref) + mgwas = jp.map_reference.compute_snp_alignement(mgwas) -jp. + mgwas = jp.compute_score.compute_z_score(mgwas) + mgwas = jp.compute_score.compute_sample_size(mgwas, diagnostic_folder, tag) + jp.save_output.save_output_by_chromosome(mgwas, ImpG_output_Folder, tag)