Add support for gzipped (.gz) GWAS files

b8b082b3 · Hanna JULIENNE · 845dfd26 · b8b082b3 · b8b082b3
Commit b8b082b3 authored 5 years ago by Hanna JULIENNE
--- a/jass_preprocessing/map_gwas.py
+++ b/jass_preprocessing/map_gwas.py
@@ -10,6 +10,9 @@ import os
 import sys
 import pandas as pd
 import numpy as np
+import gzip
+import re
+

 def walkfs(startdir, findfile):
    """
@@ -96,15 +99,23 @@ def map_columns_position(gwas_internal_link,  GWAS_labels):
    reference_label = column_dict.columns.tolist()
    # labels in the GWAS files
    target_lab = pd.Index(my_labels.values.tolist())
-    f = open(gwas_internal_link)
+    is_gzipped = re.search(r".gz$", gwas_internal_link)
+    if is_gzipped:
+        f = gzip.open(gwas_internal_link)
+        line = f.readline()
+        line = line.decode('utf-8')
+    else:
+        f = open(gwas_internal_link)
+        line = f.readline()
    count_line = 0
-    line = f.readline()
+
    header = pd.Index(line.split())
    def get_position(I,x):
        try:
            return I.get_loc(x)
        except KeyError:
            return np.nan
+
    label_position = [get_position(header,i) for i in target_lab]

    mapgw = pd.Series(label_position, index=reference_label)
@@ -128,9 +139,18 @@ def read_gwas( gwas_internal_link, column_map):
    """
    print("Reading file:")
    print(gwas_internal_link)
-
+    is_gzipped = re.search(r".gz$", gwas_internal_link)
+    if is_gzipped:
+        compression = 'gzip'
+    else:
+        compression = None
+
+    print(column_map.values)
+    print(column_map.index)
    fullGWAS = pd.read_csv(gwas_internal_link, delim_whitespace=True,
-                               usecols = column_map.values, #column_dict['label_position'].keys(),
+                               usecols = column_map.values,
+                               compression=compression,
+                                #column_dict['label_position'].keys(),
                               names= column_map.index,
                                index_col=0,
                                 header=0, na_values= ['', '#N/A', '#N/A', 'N/A','#NA', '-1.#IND', '-1.#QNAN',

--- a/jass_preprocessing/save_output.py
+++ b/jass_preprocessing/save_output.py
@@ -15,11 +15,11 @@ def save_output_by_chromosome(mgwas, ImpG_output_Folder, my_study):
        mgwas_chr = pd.DataFrame({
                        'rsID': mgwas_copy.loc[chrom].snp_id,
                        'pos': mgwas_copy.loc[chrom].pos,
-                        'A1': mgwas_copy.loc[chrom].ref,
-                        'A2':mgwas_copy.loc[chrom].alt,
+                        'A0': mgwas_copy.loc[chrom].ref,
+                        'A1':mgwas_copy.loc[chrom].alt,
                        'Z': mgwas_copy.loc[chrom].computed_z,
                        'P': mgwas_copy.loc[chrom].pval
-            }, columns= ['rsID', 'pos', 'A1', "A2", "Z", "P" ])
+            }, columns= ['rsID', 'pos', 'A0', "A1", "Z", "P" ])

        impg_output_file = ImpG_output_Folder + 'z_'+ my_study +'_chr'+str(chrom)+".txt"
        print("WRITING CHR {} results for {} to: {}".format(chrom, my_study, ImpG_output_Folder))