diff --git a/jass_preprocessing/jass_preprocessing/__init__.py b/jass_preprocessing/jass_preprocessing/__init__.py index fcd7f03a80ca7191f10bb338edd3eac51a21fcd2..8ea525bc7d94f21a7f590c143594e58b757b0829 100644 --- a/jass_preprocessing/jass_preprocessing/__init__.py +++ b/jass_preprocessing/jass_preprocessing/__init__.py @@ -1,2 +1,2 @@ -import jass_preprocessing.map_gwas -import jass_preprocessing.dna_utils +import jass_preprocessing.map_gwas.map_gwas as map_gwas +import jass_preprocessing.dna_utils.dna_utils as dna_utils diff --git a/jass_preprocessing/jass_preprocessing/__pycache__/__init__.cpython-35.pyc b/jass_preprocessing/jass_preprocessing/__pycache__/__init__.cpython-35.pyc index 90cd37a5ba622284ffaaf5c15051c02e856fe0dd..6590be3001f7e28dd83d3108b92080eb83379f31 100644 Binary files a/jass_preprocessing/jass_preprocessing/__pycache__/__init__.cpython-35.pyc and b/jass_preprocessing/jass_preprocessing/__pycache__/__init__.cpython-35.pyc differ diff --git a/jass_preprocessing/jass_preprocessing/map_gwas/map_gwas.py b/jass_preprocessing/jass_preprocessing/map_gwas/map_gwas.py index 2becf225ce9c74cb4e01603e06cd6ec9cc7001d6..d9523fe0365f91bdab605856e8bd3ebb13ba15f9 100644 --- a/jass_preprocessing/jass_preprocessing/map_gwas/map_gwas.py +++ b/jass_preprocessing/jass_preprocessing/map_gwas/map_gwas.py @@ -1,8 +1,7 @@ import os import sys import pandas as pd - - +import numpy as np def walkfs(startdir, findfile): dircount = 0 @@ -16,7 +15,6 @@ def walkfs(startdir, findfile): return dircount, filecount, None - def gwas_internal_link(GWAS_table, GWAS_path): """ Walk the GWAS path to find the GWAS tables @@ -40,5 +38,59 @@ def convert_missing_values(df): nmissing = len(def_missing) nan_vec = [np.nan] * nmissing - return df.replace(def_missing, nan_vec) + + +def map_columns_position(gwas_internal_link, GWAS_labels): + """ + Find column position for each specific Gwas + """ + column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na') + gwas_file = gwas_internal_link.split('/')[-1] + my_labels = column_dict[column_dict['filename'] == gwas_file] + target_lab = my_labels.values.tolist()[0] + f = open(gwas_internal_link) + + count_line = 0 + line = f.readline() + header = line.split() + list_col = {} + list_lab = {} + for l in range(0, len(target_lab)): + for h in range(0, len(header)): + + if header[h] == target_lab[l]: + list_lab[my_labels.columns.tolist()[l]] = h + list_col[h] = my_labels.columns.tolist()[l] + return {'label_position' : list_col, 'position_label': list_lab} + + +def read_gwas( gwas_internal_link, column_dict): + """ + Read gwas and apply minimal qc + """ + + fullGWAS = pd.read_csv(gwas_internal_link, delim_whitespace=True, + usecols=column_dict['label_position'].keys(), index_col=0, names=column_dict['label_position'].values(), header=0) + + fullGWAS = fullGWAS[~fullGWAS.index.duplicated(keep='first')] + fullGWAS = convert_missing_values(fullGWAS) + + return fullGWAS + + + +def map_on_ref_panel(gw_df , ref_panel): + """ + Merge Gwas df with the reference panel + """ + def key2(x): + return ('chr' + str(x["chr"]) + ":" + str(x["pos"])) + + ref_panel['key2'] = ref_panel.apply(key2,1) + + merge_GWAS = pd.merge(ref_panel, gw_df, how='inner', indicator=True, left_index=True, right_index=True) + other_snp = pd.merge(ref_panel, gw_df, how='inner', indicator=True, left_on ='key2', right_index=True) + + merge_GWAS.loc[other_snp.index] = other_snp + return(merge_GWAS) diff --git a/jass_preprocessing/setup.py b/jass_preprocessing/setup.py index 1aaf7ec39b0e2e8a6157d63479cd51b79d78186d..23e156cc244df1296261925e4181c5c5aaeaa315 100644 --- a/jass_preprocessing/setup.py +++ b/jass_preprocessing/setup.py @@ -1,4 +1,4 @@ -from setuptools import setup +from setuptools import setup, find_packages setup(name='jass_preprocessing', version='0.1', @@ -7,5 +7,7 @@ setup(name='jass_preprocessing', author='Hugues Aschard, Vincent Laville, Hanna Julienne', author_email='hugues.aschard@pasteur.fr', license='MIT', - packages=['jass_preprocessing'], + #package_dir = {'': 'jass_preprocessing'}, + packages= ['jass_preprocessing', "jass_preprocessing.map_gwas", + "jass_preprocessing.dna_utils"], zip_safe=False) diff --git a/main_preprocessing.py b/main_preprocessing.py new file mode 100644 index 0000000000000000000000000000000000000000..bb213c0f6782edfb6b21fe8580471dac9638683f --- /dev/null +++ b/main_preprocessing.py @@ -0,0 +1,65 @@ +""" +Read raw GWAS summary statistics, filter and format +Write clean GWAS +""" +__updated__ = '2018-19-02' + +import numpy as np +import scipy.stats as ss +import sys +import math +import os +import pandas as pd +import matplotlib.pyplot as plt +import jass_preprocessing as jp +import pandas as pd + + +perSS = 0.7 +netPath = "/mnt/atlas/" # '/home/genstat/ATLAS/' +#netPath = '/pasteur/projets/policy01/' +GWAS_labels = netPath+'PCMA/1._DATA/RAW.GWAS/GWAS_LABELS_MAP.txt' +GWAS_path = netPath+'PCMA/1._DATA/RAW.GWAS/' +REF_filename = netPath+'PCMA/0._REF/1KGENOME/summary_genome_Filter_part2.out' +pathOUT = netPath+'PCMA/1._DATA/RAW.summary/' + +outFileName = netPath+'PCMA/1._DATA/ZSCORE_merged_ALL_NO_strand_ambiguous.hdf5' +def_missing = ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', + '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'nan', 'na', '.'] +out_summary = "summary_GWAS.csv" +ImpG_output_Folder = netPath+ 'PCMA/1._DATA/ImpG_zfiles/' + + +GWAS_table = ['GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt', + 'SNP_gwas_mc_merge_nogc.tbl.uniq', + 'GIANT_2015_HIP_COMBINED_EUR.txt', + 'GIANT_2015_WC_COMBINED_EUR.txt', + 'GIANT_2015_WHR_COMBINED_EUR.txt'] + + +gwas = jp.map_gwas.gwas_internal_link(GWAS_table, GWAS_path) + + +column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na') + +column_dict.iloc[:2] +gwas.iloc[0,1].split('/')[-1] + + +column_dict['filename'] +my_labels = column_dict[column_dict['filename'] == gwas.iloc[0,0]] +target_lab = my_labels.values.tolist()[0] +len(target_lab) + + # READ GWAS +GWAS_filename = GWAS_table[0] +GWAS_filename + +GWAS_link = jp.map_gwas.walkfs(GWAS_path, GWAS_filename)[2] +GWAS_link +mapgw = jp.map_gwas.map_columns_position(GWAS_link, GWAS_labels) +gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw) +ref = pd.read_csv(REF_filename, header=None, sep= "\t", names =['chr', "pos", "snp_id", "ref", "alt", "MAF"], index_col="snp_id") + +dir(jp.map_gwas) +mgwas = jp.map_gwas.map_on_ref_panel(gw_df, ref)