Commit a40f95cf authored by Hanna Julienne

add mapping and merge function

parent d561f244
import jass_preprocessing.map_gwas
import jass_preprocessing.dna_utils
import jass_preprocessing.map_gwas.map_gwas as map_gwas
import jass_preprocessing.dna_utils.dna_utils as dna_utils
import os
import sys
import pandas as pd
import numpy as np
def walkfs(startdir, findfile):
    dircount = 0
@@ -16,7 +15,6 @@ def walkfs(startdir, findfile):
    return dircount, filecount, None


def gwas_internal_link(GWAS_table, GWAS_path):
    """
    Walk the GWAS path to find the GWAS tables
@@ -40,5 +38,59 @@ def convert_missing_values(df):
    nmissing = len(def_missing)
    nan_vec = [np.nan] * nmissing
    return df.replace(def_missing, nan_vec)
def map_columns_position(gwas_internal_link, GWAS_labels):
    """
    Find the column position of each field for a specific GWAS file
    """
    column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na')
    gwas_file = gwas_internal_link.split('/')[-1]
    my_labels = column_dict[column_dict['filename'] == gwas_file]
    target_lab = my_labels.values.tolist()[0]
    # read only the header line of the GWAS file
    with open(gwas_internal_link) as f:
        header = f.readline().split()
    list_col = {}
    list_lab = {}
    # match each expected study-specific label to its position in the header
    for l in range(0, len(target_lab)):
        for h in range(0, len(header)):
            if header[h] == target_lab[l]:
                list_lab[my_labels.columns.tolist()[l]] = h
                list_col[h] = my_labels.columns.tolist()[l]
    return {'label_position': list_col, 'position_label': list_lab}
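# Illustration (hypothetical names): if GWAS_labels maps the canonical fields
# 'snpid' and 'z' to the study's header names 'MarkerName' and 'Zscore', and those
# headers sit in columns 0 and 5 of the GWAS file, the function returns
# {'label_position': {0: 'snpid', 5: 'z'}, 'position_label': {'snpid': 0, 'z': 5}}.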
def read_gwas(gwas_internal_link, column_dict):
    """
    Read a GWAS file and apply minimal QC
    """
    # keep only the mapped columns, rename them to their canonical labels
    # and index the table by the SNP identifier column
    fullGWAS = pd.read_csv(gwas_internal_link, delim_whitespace=True,
                           usecols=column_dict['label_position'].keys(),
                           index_col=0,
                           names=column_dict['label_position'].values(),
                           header=0)
    fullGWAS = fullGWAS[~fullGWAS.index.duplicated(keep='first')]
    fullGWAS = convert_missing_values(fullGWAS)
    return fullGWAS
def map_on_ref_panel(gw_df, ref_panel):
    """
    Merge a GWAS dataframe with the reference panel
    """
    def key2(x):
        # build a 'chr<chr>:<pos>' key for each reference-panel SNP
        return ('chr' + str(x["chr"]) + ":" + str(x["pos"]))

    ref_panel['key2'] = ref_panel.apply(key2, 1)
    merge_GWAS = pd.merge(ref_panel, gw_df, how='inner', indicator=True,
                          left_index=True, right_index=True)
    other_snp = pd.merge(ref_panel, gw_df, how='inner', indicator=True,
                         left_on='key2', right_index=True)
    merge_GWAS.loc[other_snp.index] = other_snp
    return merge_GWAS
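# Note: the second merge exists because some GWAS files identify SNPs by a
# 'chr<chr>:<pos>' key rather than an rsID; those rows are matched through
# ref_panel['key2'] and then written over the corresponding rows of merge_GWAS.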
-from setuptools import setup
+from setuptools import setup, find_packages
 setup(name='jass_preprocessing',
       version='0.1',
@@ -7,5 +7,7 @@ setup(name='jass_preprocessing',
       author='Hugues Aschard, Vincent Laville, Hanna Julienne',
       author_email='hugues.aschard@pasteur.fr',
       license='MIT',
-      packages=['jass_preprocessing'],
+      #package_dir = {'': 'jass_preprocessing'},
+      packages= ['jass_preprocessing', "jass_preprocessing.map_gwas",
+                 "jass_preprocessing.dna_utils"],
       zip_safe=False)
"""
Read raw GWAS summary statistics, filter and format
Write clean GWAS
"""
__updated__ = '2018-02-19'
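# Pipeline sketched below: locate the raw GWAS files under GWAS_path, map their
# column names through GWAS_labels, read and QC each study, then merge the result
# onto the 1000 Genomes reference panel (REF_filename).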
import numpy as np
import scipy.stats as ss
import sys
import math
import os
import pandas as pd
import matplotlib.pyplot as plt
import jass_preprocessing as jp
perSS = 0.7
netPath = "/mnt/atlas/" # '/home/genstat/ATLAS/'
#netPath = '/pasteur/projets/policy01/'
GWAS_labels = netPath+'PCMA/1._DATA/RAW.GWAS/GWAS_LABELS_MAP.txt'
GWAS_path = netPath+'PCMA/1._DATA/RAW.GWAS/'
REF_filename = netPath+'PCMA/0._REF/1KGENOME/summary_genome_Filter_part2.out'
pathOUT = netPath+'PCMA/1._DATA/RAW.summary/'
outFileName = netPath+'PCMA/1._DATA/ZSCORE_merged_ALL_NO_strand_ambiguous.hdf5'
def_missing = ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN',
               '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'nan', 'na', '.']
out_summary = "summary_GWAS.csv"
ImpG_output_Folder = netPath+ 'PCMA/1._DATA/ImpG_zfiles/'
GWAS_table = ['GIANT_HEIGHT_Wood_et_al_2014_publicrelease_HapMapCeuFreq.txt',
              'SNP_gwas_mc_merge_nogc.tbl.uniq',
              'GIANT_2015_HIP_COMBINED_EUR.txt',
              'GIANT_2015_WC_COMBINED_EUR.txt',
              'GIANT_2015_WHR_COMBINED_EUR.txt']
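# Each filename above is expected to appear in the 'filename' column of GWAS_labels,
# so that map_columns_position() can recover the column mapping for that study.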
gwas = jp.map_gwas.gwas_internal_link(GWAS_table, GWAS_path)
# quick interactive checks of the labels file and of the first GWAS entry
column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na')
column_dict.iloc[:2]
gwas.iloc[0,1].split('/')[-1]
column_dict['filename']
my_labels = column_dict[column_dict['filename'] == gwas.iloc[0,0]]
target_lab = my_labels.values.tolist()[0]
len(target_lab)
# READ GWAS
GWAS_filename = GWAS_table[0]
GWAS_filename
GWAS_link = jp.map_gwas.walkfs(GWAS_path, GWAS_filename)[2]
GWAS_link
mapgw = jp.map_gwas.map_columns_position(GWAS_link, GWAS_labels)
gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw)
ref = pd.read_csv(REF_filename, header=None, sep="\t",
                  names=['chr', 'pos', 'snp_id', 'ref', 'alt', 'MAF'],
                  index_col='snp_id')
dir(jp.map_gwas)
mgwas = jp.map_gwas.map_on_ref_panel(gw_df, ref)
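# mgwas now holds the reference-panel SNPs matched to the height GWAS, first by rsID
# and then by the 'chr<chr>:<pos>' key; the '_merge' column comes from indicator=True
# in pd.merge.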