module to save preprocessing results

1a6d0f05 · Hanna JULIENNE · d2f5af12 · 1a6d0f05 · 1a6d0f05 · 1a6d0f05
Commit 1a6d0f05 authored 7 years ago by Hanna JULIENNE
--- a/jass_preprocessing/jass_preprocessing/__init__.py
+++ b/jass_preprocessing/jass_preprocessing/__init__.py
@@ -2,3 +2,4 @@ import jass_preprocessing.map_gwas.map_gwas as map_gwas
 import jass_preprocessing.dna_utils.dna_utils as dna_utils
 import jass_preprocessing.map_reference.map_reference as map_reference
 import jass_preprocessing.compute_score.compute as compute_score
+import jass_preprocessing.save_output.save_output as save_output
--- a/jass_preprocessing/jass_preprocessing/map_reference/__init__.py
+++ b/jass_preprocessing/jass_preprocessing/map_reference/__init__.py
--- a/jass_preprocessing/jass_preprocessing/map_reference/map_reference.py
+++ b/jass_preprocessing/jass_preprocessing/map_reference/map_reference.py
@@ -57,13 +57,10 @@ def compute_snp_alignement(mgwas):
    """
    Add a column to mgwas indicating if the reference and coded
    allele is flipped compared to the reference panel.
-
    If it is, the sign of the statistic must be flipped
-
    Args:
        mgwas: a pandas dataframe of the GWAS data merged
         with the reference panel
-
    """

    mgwas['a1c'] = dna_u.dna_complement(mgwas.a1)

--- a/jass_preprocessing/jass_preprocessing/save_output/__init__.py
+++ b/jass_preprocessing/jass_preprocessing/save_output/__init__.py
--- a/jass_preprocessing/jass_preprocessing/save_output/save_output.py
+++ b/jass_preprocessing/jass_preprocessing/save_output/save_output.py
+import pandas as pd
+
+
+
+def save_output_by_chromosome(mgwas, ImpG_output_Folder, my_study):
+    """
+    Write the preprocessed Gwas
+    """
+    mgwas.reset_index(inplace=True)
+    mgwas = pd.DataFrame({
+                'rsID': mgwas_chr.snp_id,
+                'pos': mgwas_chr.pos,
+                'A0': mgwas_chr.ref,
+                'A1':mgwas_chr.alt,
+                'Z': mgwas_chr.computed_z
+    }, columns= ['rsID', 'pos', 'A0', "A1", "Z" ])
+    mgwas.set_index("chr", inplace=True)
+    mgwas.dropna(subset=["Z"], how="any", inplace=True)
+    for chrom in mgwas.index.unique:
+        impg_output_file = ImpG_output_Folder + 'z_'+ my_study +'_chr_'+str(chrom)+".txt"
+        print("WRITING CHR {} results for {} to:".format(chrom, my_study))
+        mgwas.loc[chrom].to_csv(impg_output_file, sep="\t", index=False)
--- a/jass_preprocessing/setup.py
+++ b/jass_preprocessing/setup.py
@@ -10,6 +10,6 @@ setup(name='jass_preprocessing',
      #package_dir = {'': 'jass_preprocessing'},
      packages= ['jass_preprocessing', "jass_preprocessing.map_gwas",
                "jass_preprocessing.dna_utils", "jass_preprocessing.map_reference",
-                "jass_preprocessing.compute_score"
+                "jass_preprocessing.compute_score", "jass_preprocessing.save_output"
                ],
      zip_safe=False)
--- a/main_preprocessing.py
+++ b/main_preprocessing.py
@@ -16,58 +16,52 @@ import pandas as pd
 import seaborn as sns

 perSS = 0.7
-netPath = "/mnt/atlas/"  # '/home/genstat/ATLAS/'
-#netPath       = '/pasteur/projets/policy01/'
+netPath = "/mnt/atlas/"
 GWAS_labels = netPath+'PCMA/1._DATA/RAW.GWAS/GWAS_labels.csv'
 GWAS_path = netPath+'PCMA/1._DATA/RAW.GWAS/'
+
+diagnostic_folder= "/mnt/atlas/PCMA/1._DATA/sample_size_distribution/"
 REF_filename = netPath+'PCMA/0._REF/1KGENOME/summary_genome_Filter_part2.out'
 pathOUT = netPath+'PCMA/1._DATA/RAW.summary/'

 outFileName = netPath+'PCMA/1._DATA/ZSCORE_merged_ALL_NO_strand_ambiguous.hdf5'
 def_missing = ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN',
-               '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'nan', 'na', '.']
+               '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'nan',
+               'na', '.']
+
 out_summary = "summary_GWAS.csv"
 ImpG_output_Folder = netPath+ 'PCMA/1._DATA/preprocessing_test/'

 GWAS_labels
-gwas_map = pd.read_csv(GWAS_labels, sep="\t", index_col=0, nrows=10)
-
-GWAS_table = ["GWAS_DBP_recoded.txt","GWAS_MAP_recoded.txt", "GWAS_PP_recoded.txt","GWAS_SBP_recoded_dummy.txt"]
-
-gwas = jp.map_gwas.gwas_internal_link(GWAS_table, GWAS_path)
-gwas
-column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na')
-
+gwas_map = pd.read_csv(GWAS_labels, sep="\t", index_col=0)

-my_labels = column_dict[column_dict['filename'] == gwas.iloc[0,0]]
-column_dict[['freq']]
-    # READ GWAS
-GWAS_filename = GWAS_table[3]
+GWAS_table = ["GWAS_SBP_recoded_dummy.txt"]#["GWAS_DBP_recoded.txt","GWAS_MAP_recoded.txt",
+              #"GWAS_PP_recoded.txt","GWAS_SBP_recoded_dummy.txt"]

-GWAS_link = jp.map_gwas.walkfs(GWAS_path, GWAS_filename)[2]
-GWAS_link
-mapgw = jp.map_gwas.map_columns_position(GWAS_link, GWAS_labels)
+for GWAS_filename in GWAS_table:

-gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw)
-gw_df.head()
+    tag = "{0}_{1}".format(gwas_map.loc[GWAS_filename, 'consortia'], gwas_map.loc[GWAS_filename, 'outcome'])
+    print('processing GWAS: {}'.format(tag))

-ref = pd.read_csv(REF_filename, header=None, sep= "\t",
-                  names =['chr', "pos", "snp_id", "ref", "alt", "MAF"],
-                   index_col="snp_id")
-inter_index = ref.index.intersection(gw_df.index)
+    gwas = jp.map_gwas.gwas_internal_link(GWAS_table, GWAS_path)
+    column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na')

-test_merge = pd.merge(ref.loc[inter_index], gw_df.loc[inter_index], how='inner',
-                      indicator=True, left_index=True, right_index=True)
+    GWAS_link = jp.map_gwas.walkfs(GWAS_path, GWAS_filename)[2]
+    mapgw = jp.map_gwas.map_columns_position(GWAS_link, GWAS_labels)

+    gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw)
+    ref = pd.read_csv(REF_filename, header=None, sep= "\t",
+                      names =['chr', "pos", "snp_id", "ref", "alt", "MAF"],
+                       index_col="snp_id")

-print(jp.map_reference.map_on_ref_panel)
-mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref)
-mgwas
-mgwas = jp.map_reference.compute_snp_alignement(mgwas)
+    inter_index = ref.index.intersection(gw_df.index)
+    test_merge = pd.merge(ref.loc[inter_index], gw_df.loc[inter_index], how='inner',
+                          indicator=True, left_index=True, right_index=True)

-mgwas = jp.compute_score.compute_z_score(mgwas)
-mgwas = jp.compute_score.compute_sample_size(mgwas, "/mnt/atlas/PCMA/1._DATA/RAW.GWAS/ICPB_bloodPress/", "test_samp")
-mgwas.reset_index(inplace=True)
-mgwas.set_index("chr", inplace=True)
+    print(jp.map_reference.map_on_ref_panel)
+    mgwas = jp.map_reference.map_on_ref_panel(gw_df, ref)
+    mgwas = jp.map_reference.compute_snp_alignement(mgwas)

-jp.
+    mgwas = jp.compute_score.compute_z_score(mgwas)
+    mgwas = jp.compute_score.compute_sample_size(mgwas, diagnostic_folder, tag)
+    jp.save_output.save_output_by_chromosome(mgwas, ImpG_output_Folder, tag)