"""
Map GWAS
A set of functions to find GWAS files in subfolder and
to map columns
"""
import os
import sys
import pandas as pd
import numpy as np
import gzip
import re
def walkfs(startdir, findfile):
    """
    Go through the folder and its subfolders to find the specified file

    Args:
        startdir (str): path of the folder to explore
        findfile (str): name of the file to find

    Return:
        a tuple (dircount, filecount, path):
        dircount is the number of directories fully visited before the
        search stopped, filecount the number of files seen before the
        match (or in total when nothing matches) and path the full path
        of the first match, or None when the file was not found
    """
    dircount = 0
    filecount = 0
    for root, _dirs, files in os.walk(startdir):
        if findfile in files:
            # Stop at the first directory (in os.walk order) holding the file
            return (dircount,
                    filecount + files.index(findfile),
                    os.path.join(root, findfile))
        dircount += 1
        filecount += len(files)
    # nothing found, return None instead of a full path for the file
    return dircount, filecount, None
def gwas_internal_link(GWAS_table, GWAS_path):
    """
    Walk the GWAS path to find the GWAS tables

    Args:
        GWAS_table (iterable of str): names of the GWAS files to locate
            (e.g. a list or a pandas Series; only the values are used,
            never the index labels)
        GWAS_path (str): path of the folder to explore (subfolders included)

    Return:
        a pandas dataframe with one column for the filename
        and one column containing the complete path to the file
        (missing when the file was not found under GWAS_path)
    """
    # Iterate over the values rather than indexing by position, so that a
    # Series with a non-default index is handled correctly.
    filenames = list(GWAS_table)

    # Walk the tree a single time and stop as soon as every requested file
    # has been located; the first match in os.walk order wins.
    pending = set(filenames)
    located = {}
    for root, _dirs, files in os.walk(GWAS_path):
        if not pending:
            break
        for name in pending.intersection(files):
            located[name] = os.path.join(root, name)
        pending.difference_update(located)

    rows = [{'filename': name, 'internalDataLink': located.get(name)}
            for name in filenames]
    return pd.DataFrame(rows, columns=['filename', 'internalDataLink'])
def convert_missing_values(df):
    """
    Convert all missing value strings to a single standard marker

    Note that the marker written by this function is the string "NA",
    not np.nan.

    Args:
        df (pandas dataframe): GWAS data as a dataframe

    Return:
        a new pandas dataframe in which every recognised missing value
        string has been replaced by the string "NA"; the input dataframe
        is left untouched
    """
    def_missing = ['', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN',
                   '-NaN', '-nan', '1.#IND', '1.#QNAN', 'NA', 'NULL',
                   'NaN', 'nan', 'na', '.']
    # A scalar replacement value is applied to every entry of the list
    return df.replace(def_missing, "NA")
def map_columns_position(gwas_internal_link, column_dict):
    """
    Find column position for each specific Gwas

    Args:
        gwas_internal_link (str): filename of the GWAS data (with path);
            a name ending in ".gz" is read as a gzip-compressed file
        column_dict (pandas Series): index holds our standard labels and
            values hold the corresponding column names in the GWAS file

    Return:
        pandas Series with the (0-based, integer) column positions as
        values and the standard labels as index, sorted by position;
        labels whose column is absent from the file header are dropped
    """
    print(gwas_internal_link)
    # Our standard labels:
    reference_label = column_dict.index.tolist()
    print(reference_label)
    # labels in the GWAS files
    target_lab = column_dict.values.tolist()

    # Only a real ".gz" suffix means gzip (a bare "gz" ending does not)
    if gwas_internal_link.endswith('.gz'):
        handle = gzip.open(gwas_internal_link, 'rt', encoding='utf-8')
    else:
        handle = open(gwas_internal_link)
    # The context manager closes the file even if reading fails
    with handle:
        line = handle.readline()

    # Position of the first occurrence of each header name, so that a
    # duplicated column name still maps to a single integer position.
    first_position = {}
    for position, name in enumerate(line.split()):
        first_position.setdefault(name, position)

    label_position = [first_position.get(label, np.nan)
                      for label in target_lab]
    mapgw = pd.Series(label_position, index=reference_label)
    mapgw = mapgw.loc[~mapgw.isna()].astype(int)
    return mapgw.sort_values()
def read_gwas(gwas_internal_link, column_map):
    """
    Read gwas raw data, fetch columns thanks to position stored in
    column_map and rename columns according to column_map.index

    Args:
        gwas_internal_link (str): filename of the GWAS data (with path);
            a name ending in ".gz" is read as a gzip-compressed file
        column_map (pandas Series): Series containing the position of column in
            the raw data, indexed by the standard column names
            (must include "snpid")

    Return:
        a pandas dataframe indexed by "snpid", with missing values all
        equal to np.nan and only the first row kept for each duplicated
        snpid
    """
    print("Reading file:")
    print(gwas_internal_link)
    # Only a real ".gz" suffix means gzip (a bare "gz" ending does not)
    compression = 'gzip' if gwas_internal_link.endswith('.gz') else None
    print(column_map.values)
    print(column_map.index)

    missing_markers = ['', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN',
                       '-NaN', '-nan', '1.#IND', '1.#QNAN', 'NA', 'NULL',
                       'NaN', 'nan', 'na', '.', '-']
    column_types = {"snpid": str, "a1": str, "a2": str, "freq": float,
                    "z": float, "se": float, "pval": float}

    # sep=r'\s+' is the supported spelling of the deprecated
    # delim_whitespace=True option.
    fullGWAS = pd.read_csv(gwas_internal_link,
                           sep=r'\s+',
                           usecols=column_map.values,
                           compression=compression,
                           names=column_map.index,
                           header=0,
                           na_values=missing_markers,
                           dtype=column_types)
    print(fullGWAS.head())
    fullGWAS = fullGWAS.set_index("snpid")
    print(fullGWAS.head())
    # Keep only the first occurrence of each SNP id
    return fullGWAS[~fullGWAS.index.duplicated(keep='first')]