raising an error when column names are not unique in summary statistics headers

a27a77c6 · Hanna JULIENNE · 6c87d176 · a27a77c6 · a27a77c6
Commit a27a77c6 authored 7 months ago by Hanna JULIENNE
--- a/jass_preprocessing/__main__.py
+++ b/jass_preprocessing/__main__.py
@@ -71,7 +71,7 @@ def add_preprocessing_argument():
    parser = argparse.ArgumentParser()
    parser.add_argument('--gwas-info', required=True, help= "Path to the file describing the format of the individual GWASs files with correct header")
    #parser.add_argument('--gwas-filename', required=True, help= "Name of the raw GWAS file to standardize")
-    parser.add_argument('--ref-path', required=True, help= "reference panel location (used to determine which snp to impute)")
+    parser.add_argument('--ref-path', required=True, help= "reference panel location (notably used to harmonize reference and alternative allele accross SNPs")
    parser.add_argument('--input-folder', required=True, help= "Path to the folder containing the Raw GWASs summary statistic files, must end by '/'")
    parser.add_argument('--diagnostic-folder', required=True, help= "Path to the reporting information on the PreProcessing such as the SNPs sample size distribution")

--- a/jass_preprocessing/map_gwas.py
+++ b/jass_preprocessing/map_gwas.py
@@ -88,9 +88,8 @@ def map_columns_position(gwas_internal_link,  column_dict):
    print(gwas_internal_link)
    gwas_file = gwas_internal_link.split('/')[-1]
    #Our standard labels:
    reference_label = column_dict.index.tolist()
-    print(reference_label)
    # labels in the GWAS files
    target_lab = pd.Index(column_dict.values.tolist())
    is_gzipped = re.search(r".gz$", gwas_internal_link)
@@ -106,12 +105,15 @@ def map_columns_position(gwas_internal_link,  column_dict):
    header = pd.Index(line.split())
    def get_position(I,x):
        try:
-            return I.get_loc(x)
+            position_in_header = I.get_loc(x)
+            if isinstance(position_in_header, int):
+                return position_in_header
+            else:
+                raise IndexError("{0} is a not corresponding to an unique column in {1}. Check that column names are unique in the header of {1} Summary Statistics".format(x, gwas_file))
        except KeyError:
            return np.nan
    label_position = [get_position(header,i) for i in target_lab]
    mapgw = pd.Series(label_position, index=reference_label)
    mapgw = mapgw.loc[~mapgw.isna()].astype(int)
    mapgw.sort_values(inplace=True)