diff --git a/jass_preprocessing/map_gwas.py b/jass_preprocessing/map_gwas.py index 17ac79d4899b3ce32f3ddb4039128df9f127cad6..7c3b6e01f84513b0c169bd56a81da9769e0987a9 100644 --- a/jass_preprocessing/map_gwas.py +++ b/jass_preprocessing/map_gwas.py @@ -10,6 +10,9 @@ import os import sys import pandas as pd import numpy as np +import gzip +import re + def walkfs(startdir, findfile): """ @@ -96,15 +99,23 @@ def map_columns_position(gwas_internal_link, GWAS_labels): reference_label = column_dict.columns.tolist() # labels in the GWAS files target_lab = pd.Index(my_labels.values.tolist()) - f = open(gwas_internal_link) + is_gzipped = re.search(r".gz$", gwas_internal_link) + if is_gzipped: + f = gzip.open(gwas_internal_link) + line = f.readline() + line = line.decode('utf-8') + else: + f = open(gwas_internal_link) + line = f.readline() count_line = 0 - line = f.readline() + header = pd.Index(line.split()) def get_position(I,x): try: return I.get_loc(x) except KeyError: return np.nan + label_position = [get_position(header,i) for i in target_lab] mapgw = pd.Series(label_position, index=reference_label) @@ -128,9 +139,18 @@ def read_gwas( gwas_internal_link, column_map): """ print("Reading file:") print(gwas_internal_link) - + is_gzipped = re.search(r".gz$", gwas_internal_link) + if is_gzipped: + compression = 'gzip' + else: + compression = None + + print(column_map.values) + print(column_map.index) fullGWAS = pd.read_csv(gwas_internal_link, delim_whitespace=True, - usecols = column_map.values, #column_dict['label_position'].keys(), + usecols = column_map.values, + compression=compression, + #column_dict['label_position'].keys(), names= column_map.index, index_col=0, header=0, na_values= ['', '#N/A', '#N/A', 'N/A','#NA', '-1.#IND', '-1.#QNAN', diff --git a/jass_preprocessing/save_output.py b/jass_preprocessing/save_output.py index daf240c56d1119b7bc58b8f0b3c1857f54493d40..c00fef4e56f60c445c4b89660e383e0be750a1aa 100644 --- a/jass_preprocessing/save_output.py +++ b/jass_preprocessing/save_output.py @@ -15,11 +15,11 @@ def save_output_by_chromosome(mgwas, ImpG_output_Folder, my_study): mgwas_chr = pd.DataFrame({ 'rsID': mgwas_copy.loc[chrom].snp_id, 'pos': mgwas_copy.loc[chrom].pos, - 'A1': mgwas_copy.loc[chrom].ref, - 'A2':mgwas_copy.loc[chrom].alt, + 'A0': mgwas_copy.loc[chrom].ref, + 'A1':mgwas_copy.loc[chrom].alt, 'Z': mgwas_copy.loc[chrom].computed_z, 'P': mgwas_copy.loc[chrom].pval - }, columns= ['rsID', 'pos', 'A1', "A2", "Z", "P" ]) + }, columns= ['rsID', 'pos', 'A0', "A1", "Z", "P" ]) impg_output_file = ImpG_output_Folder + 'z_'+ my_study +'_chr'+str(chrom)+".txt" print("WRITING CHR {} results for {} to: {}".format(chrom, my_study, ImpG_output_Folder))