Commit b8b082b3 authored by Hanna JULIENNE
Browse files

Add support for gzip-compressed (.gz) GWAS files

parent 845dfd26
Pipeline #17788 passed with stages
in 47 seconds
......@@ -10,6 +10,9 @@ import os
import sys
import pandas as pd
import numpy as np
import gzip
import re
def walkfs(startdir, findfile):
"""
......@@ -96,15 +99,23 @@ def map_columns_position(gwas_internal_link, GWAS_labels):
reference_label = column_dict.columns.tolist()
# labels in the GWAS files
target_lab = pd.Index(my_labels.values.tolist())
is_gzipped = re.search(r".gz$", gwas_internal_link)
if is_gzipped:
f = gzip.open(gwas_internal_link)
line = f.readline()
line = line.decode('utf-8')
else:
f = open(gwas_internal_link)
count_line = 0
line = f.readline()
count_line = 0
header = pd.Index(line.split())
def get_position(I,x):
try:
return I.get_loc(x)
except KeyError:
return np.nan
label_position = [get_position(header,i) for i in target_lab]
mapgw = pd.Series(label_position, index=reference_label)
......@@ -128,9 +139,18 @@ def read_gwas( gwas_internal_link, column_map):
"""
print("Reading file:")
print(gwas_internal_link)
is_gzipped = re.search(r".gz$", gwas_internal_link)
if is_gzipped:
compression = 'gzip'
else:
compression = None
print(column_map.values)
print(column_map.index)
fullGWAS = pd.read_csv(gwas_internal_link, delim_whitespace=True,
usecols = column_map.values, #column_dict['label_position'].keys(),
usecols = column_map.values,
compression=compression,
#column_dict['label_position'].keys(),
names= column_map.index,
index_col=0,
header=0, na_values= ['', '#N/A', '#N/A', 'N/A','#NA', '-1.#IND', '-1.#QNAN',
......
......@@ -15,11 +15,11 @@ def save_output_by_chromosome(mgwas, ImpG_output_Folder, my_study):
mgwas_chr = pd.DataFrame({
'rsID': mgwas_copy.loc[chrom].snp_id,
'pos': mgwas_copy.loc[chrom].pos,
'A1': mgwas_copy.loc[chrom].ref,
'A2':mgwas_copy.loc[chrom].alt,
'A0': mgwas_copy.loc[chrom].ref,
'A1':mgwas_copy.loc[chrom].alt,
'Z': mgwas_copy.loc[chrom].computed_z,
'P': mgwas_copy.loc[chrom].pval
}, columns= ['rsID', 'pos', 'A1', "A2", "Z", "P" ])
}, columns= ['rsID', 'pos', 'A0', "A1", "Z", "P" ])
impg_output_file = ImpG_output_Folder + 'z_'+ my_study +'_chr'+str(chrom)+".txt"
print("WRITING CHR {} results for {} to: {}".format(chrom, my_study, ImpG_output_Folder))
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment