add mapping and merge function

a40f95cf · Hanna JULIENNE · d561f244 · a40f95cf · a40f95cf · a40f95cf
Commit a40f95cf authored 7 years ago by Hanna JULIENNE
--- a/jass_preprocessing/jass_preprocessing/__init__.py
+++ b/jass_preprocessing/jass_preprocessing/__init__.py
-import jass_preprocessing.map_gwas
+import jass_preprocessing.map_gwas.map_gwas as map_gwas
-import jass_preprocessing.dna_utils
+import jass_preprocessing.dna_utils.dna_utils as dna_utils
--- a/jass_preprocessing/jass_preprocessing/__pycache__/__init__.cpython-35.pyc
+++ b/jass_preprocessing/jass_preprocessing/__pycache__/__init__.cpython-35.pyc
--- a/jass_preprocessing/jass_preprocessing/map_gwas/map_gwas.py
+++ b/jass_preprocessing/jass_preprocessing/map_gwas/map_gwas.py
 import os
 import sys
 import pandas as pd
+import numpy as np
 def walkfs(startdir, findfile):
    dircount = 0
@@ -16,7 +15,6 @@ def walkfs(startdir, findfile):
    return dircount, filecount, None
 def gwas_internal_link(GWAS_table, GWAS_path):
    """
    Walk the GWAS path to find the GWAS tables
@@ -40,5 +38,59 @@ def convert_missing_values(df):
    nmissing = len(def_missing)
    nan_vec = [np.nan] * nmissing
    return df.replace(def_missing, nan_vec)
+def map_columns_position(gwas_internal_link,  GWAS_labels):
+    """
+    Find column position for each specific Gwas
+    """
+    column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na')
+    gwas_file = gwas_internal_link.split('/')[-1]
+    my_labels = column_dict[column_dict['filename'] == gwas_file]
+    target_lab = my_labels.values.tolist()[0]
+    f = open(gwas_internal_link)
+    count_line = 0
+    line = f.readline()
+    header = line.split()
+    list_col = {}
+    list_lab = {}
+    for l in range(0, len(target_lab)):
+        for h in range(0, len(header)):
+            if header[h] == target_lab[l]:
+                list_lab[my_labels.columns.tolist()[l]] = h
+                list_col[h] = my_labels.columns.tolist()[l]
+    return {'label_position' : list_col, 'position_label': list_lab}
+def read_gwas( gwas_internal_link, column_dict):
+    """
+    Read gwas and apply minimal qc
+    """
+    fullGWAS = pd.read_csv(gwas_internal_link, delim_whitespace=True,
+                               usecols=column_dict['label_position'].keys(), index_col=0, names=column_dict['label_position'].values(), header=0)
+    fullGWAS = fullGWAS[~fullGWAS.index.duplicated(keep='first')]
+    fullGWAS = convert_missing_values(fullGWAS)
+    return fullGWAS
+def map_on_ref_panel(gw_df , ref_panel):
+    """
+    Merge Gwas df with the reference panel
+    """
+    def key2(x):
+        return ('chr' + str(x["chr"]) + ":" + str(x["pos"]))
+    ref_panel['key2'] = ref_panel.apply(key2,1)
+    merge_GWAS = pd.merge(ref_panel, gw_df, how='inner', indicator=True, left_index=True, right_index=True)
+    other_snp = pd.merge(ref_panel, gw_df, how='inner', indicator=True, left_on ='key2', right_index=True)
+    merge_GWAS.loc[other_snp.index] = other_snp
+    return(merge_GWAS)
--- a/jass_preprocessing/setup.py
+++ b/jass_preprocessing/setup.py
-from setuptools import setup
+from setuptools import setup, find_packages
 setup(name='jass_preprocessing',
      version='0.1',
@@ -7,5 +7,7 @@ setup(name='jass_preprocessing',
      author='Hugues Aschard, Vincent Laville, Hanna Julienne',
      author_email='hugues.aschard@pasteur.fr',
      license='MIT',
-      packages=['jass_preprocessing'],
+      #package_dir = {'': 'jass_preprocessing'},
+      packages= ['jass_preprocessing', "jass_preprocessing.map_gwas",
+                "jass_preprocessing.dna_utils"],
      zip_safe=False)
--- a/main_preprocessing.py
+++ b/main_preprocessing.py
+"""
+Read raw GWAS summary statistics, filter and format
+Write clean GWAS
+"""
+__updated__ = '2018-19-02'
+import numpy as np
+import scipy.stats as ss
+import sys
+import math
+import os
+import pandas as pd
+import matplotlib.pyplot as plt
+import jass_preprocessing as jp
+import pandas as pd
+perSS = 0.7
+netPath = "/mnt/atlas/"  # '/home/genstat/ATLAS/'
+#netPath       = '/pasteur/projets/policy01/'
+GWAS_labels = netPath+'PCMA/1._DATA/RAW.GWAS/GWAS_LABELS_MAP.txt'
+GWAS_path = netPath+'PCMA/1._DATA/RAW.GWAS/'
+REF_filename = netPath+'PCMA/0._REF/1KGENOME/summary_genome_Filter_part2.out'
+pathOUT = netPath+'PCMA/1._DATA/RAW.summary/'
+outFileName = netPath+'PCMA/1._DATA/ZSCORE_merged_ALL_NO_strand_ambiguous.hdf5'
+def_missing = ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN',
+               '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'nan', 'na', '.']
+out_summary = "summary_GWAS.csv"
+ImpG_output_Folder = netPath+ 'PCMA/1._DATA/ImpG_zfiles/'
+GWAS_table = ['GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt',
+              'SNP_gwas_mc_merge_nogc.tbl.uniq',
+              'GIANT_2015_HIP_COMBINED_EUR.txt',
+              'GIANT_2015_WC_COMBINED_EUR.txt',
+              'GIANT_2015_WHR_COMBINED_EUR.txt']
+gwas = jp.map_gwas.gwas_internal_link(GWAS_table, GWAS_path)
+column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na')
+column_dict.iloc[:2]
+gwas.iloc[0,1].split('/')[-1]
+column_dict['filename']
+my_labels = column_dict[column_dict['filename'] == gwas.iloc[0,0]]
+target_lab = my_labels.values.tolist()[0]
+len(target_lab)
+    # READ GWAS
+GWAS_filename = GWAS_table[0]
+GWAS_filename
+GWAS_link = jp.map_gwas.walkfs(GWAS_path, GWAS_filename)[2]
+GWAS_link
+mapgw = jp.map_gwas.map_columns_position(GWAS_link, GWAS_labels)
+gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw)
+ref = pd.read_csv(REF_filename, header=None, sep= "\t", names =['chr', "pos", "snp_id", "ref", "alt", "MAF"], index_col="snp_id")
+dir(jp.map_gwas)
+mgwas = jp.map_gwas.map_on_ref_panel(gw_df, ref)