Skip to content
Snippets Groups Projects
Commit a40f95cf authored by Hanna  JULIENNE's avatar Hanna JULIENNE
Browse files

add mapping and merge function

parent d561f244
Branches
Tags
No related merge requests found
import jass_preprocessing.map_gwas import jass_preprocessing.map_gwas.map_gwas as map_gwas
import jass_preprocessing.dna_utils import jass_preprocessing.dna_utils.dna_utils as dna_utils
No preview for this file type
import os import os
import sys import sys
import pandas as pd import pandas as pd
import numpy as np
def walkfs(startdir, findfile): def walkfs(startdir, findfile):
dircount = 0 dircount = 0
...@@ -16,7 +15,6 @@ def walkfs(startdir, findfile): ...@@ -16,7 +15,6 @@ def walkfs(startdir, findfile):
return dircount, filecount, None return dircount, filecount, None
def gwas_internal_link(GWAS_table, GWAS_path): def gwas_internal_link(GWAS_table, GWAS_path):
""" """
Walk the GWAS path to find the GWAS tables Walk the GWAS path to find the GWAS tables
...@@ -40,5 +38,59 @@ def convert_missing_values(df): ...@@ -40,5 +38,59 @@ def convert_missing_values(df):
nmissing = len(def_missing) nmissing = len(def_missing)
nan_vec = [np.nan] * nmissing nan_vec = [np.nan] * nmissing
return df.replace(def_missing, nan_vec) return df.replace(def_missing, nan_vec)
def map_columns_position(gwas_internal_link, GWAS_labels):
"""
Find column position for each specific Gwas
"""
column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na')
gwas_file = gwas_internal_link.split('/')[-1]
my_labels = column_dict[column_dict['filename'] == gwas_file]
target_lab = my_labels.values.tolist()[0]
f = open(gwas_internal_link)
count_line = 0
line = f.readline()
header = line.split()
list_col = {}
list_lab = {}
for l in range(0, len(target_lab)):
for h in range(0, len(header)):
if header[h] == target_lab[l]:
list_lab[my_labels.columns.tolist()[l]] = h
list_col[h] = my_labels.columns.tolist()[l]
return {'label_position' : list_col, 'position_label': list_lab}
def read_gwas( gwas_internal_link, column_dict):
"""
Read gwas and apply minimal qc
"""
fullGWAS = pd.read_csv(gwas_internal_link, delim_whitespace=True,
usecols=column_dict['label_position'].keys(), index_col=0, names=column_dict['label_position'].values(), header=0)
fullGWAS = fullGWAS[~fullGWAS.index.duplicated(keep='first')]
fullGWAS = convert_missing_values(fullGWAS)
return fullGWAS
def map_on_ref_panel(gw_df , ref_panel):
"""
Merge Gwas df with the reference panel
"""
def key2(x):
return ('chr' + str(x["chr"]) + ":" + str(x["pos"]))
ref_panel['key2'] = ref_panel.apply(key2,1)
merge_GWAS = pd.merge(ref_panel, gw_df, how='inner', indicator=True, left_index=True, right_index=True)
other_snp = pd.merge(ref_panel, gw_df, how='inner', indicator=True, left_on ='key2', right_index=True)
merge_GWAS.loc[other_snp.index] = other_snp
return(merge_GWAS)
from setuptools import setup from setuptools import setup, find_packages
setup(name='jass_preprocessing', setup(name='jass_preprocessing',
version='0.1', version='0.1',
...@@ -7,5 +7,7 @@ setup(name='jass_preprocessing', ...@@ -7,5 +7,7 @@ setup(name='jass_preprocessing',
author='Hugues Aschard, Vincent Laville, Hanna Julienne', author='Hugues Aschard, Vincent Laville, Hanna Julienne',
author_email='hugues.aschard@pasteur.fr', author_email='hugues.aschard@pasteur.fr',
license='MIT', license='MIT',
packages=['jass_preprocessing'], #package_dir = {'': 'jass_preprocessing'},
packages= ['jass_preprocessing', "jass_preprocessing.map_gwas",
"jass_preprocessing.dna_utils"],
zip_safe=False) zip_safe=False)
"""
Read raw GWAS summary statistics, filter and format
Write clean GWAS
"""
__updated__ = '2018-19-02'
import numpy as np
import scipy.stats as ss
import sys
import math
import os
import pandas as pd
import matplotlib.pyplot as plt
import jass_preprocessing as jp
import pandas as pd
perSS = 0.7
netPath = "/mnt/atlas/" # '/home/genstat/ATLAS/'
#netPath = '/pasteur/projets/policy01/'
GWAS_labels = netPath+'PCMA/1._DATA/RAW.GWAS/GWAS_LABELS_MAP.txt'
GWAS_path = netPath+'PCMA/1._DATA/RAW.GWAS/'
REF_filename = netPath+'PCMA/0._REF/1KGENOME/summary_genome_Filter_part2.out'
pathOUT = netPath+'PCMA/1._DATA/RAW.summary/'
outFileName = netPath+'PCMA/1._DATA/ZSCORE_merged_ALL_NO_strand_ambiguous.hdf5'
def_missing = ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN',
'-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'nan', 'na', '.']
out_summary = "summary_GWAS.csv"
ImpG_output_Folder = netPath+ 'PCMA/1._DATA/ImpG_zfiles/'
GWAS_table = ['GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt',
'SNP_gwas_mc_merge_nogc.tbl.uniq',
'GIANT_2015_HIP_COMBINED_EUR.txt',
'GIANT_2015_WC_COMBINED_EUR.txt',
'GIANT_2015_WHR_COMBINED_EUR.txt']
gwas = jp.map_gwas.gwas_internal_link(GWAS_table, GWAS_path)
column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na')
column_dict.iloc[:2]
gwas.iloc[0,1].split('/')[-1]
column_dict['filename']
my_labels = column_dict[column_dict['filename'] == gwas.iloc[0,0]]
target_lab = my_labels.values.tolist()[0]
len(target_lab)
# READ GWAS
GWAS_filename = GWAS_table[0]
GWAS_filename
GWAS_link = jp.map_gwas.walkfs(GWAS_path, GWAS_filename)[2]
GWAS_link
mapgw = jp.map_gwas.map_columns_position(GWAS_link, GWAS_labels)
gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw)
ref = pd.read_csv(REF_filename, header=None, sep= "\t", names =['chr', "pos", "snp_id", "ref", "alt", "MAF"], index_col="snp_id")
dir(jp.map_gwas)
mgwas = jp.map_gwas.map_on_ref_panel(gw_df, ref)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment