From a27a77c61196288a160ea65489bfeeb89559d9d2 Mon Sep 17 00:00:00 2001
From: hjulienn <hanna.julienne@pasteur.fr>
Date: Thu, 7 Nov 2024 11:30:06 +0100
Subject: [PATCH] raising an error when column names are not unique in summary
 statistics headers

---
 jass_preprocessing/__main__.py |  2 +-
 jass_preprocessing/map_gwas.py | 10 ++++++----
 2 files changed, 7 insertions(+), 5 deletions(-)

diff --git a/jass_preprocessing/__main__.py b/jass_preprocessing/__main__.py
index 870babb..9743f78 100644
--- a/jass_preprocessing/__main__.py
+++ b/jass_preprocessing/__main__.py
@@ -71,7 +71,7 @@ def add_preprocessing_argument():
     parser = argparse.ArgumentParser()
     parser.add_argument('--gwas-info', required=True, help= "Path to the file describing the format of the individual GWASs files with correct header")
     #parser.add_argument('--gwas-filename', required=True, help= "Name of the raw GWAS file to standardize")
-    parser.add_argument('--ref-path', required=True, help= "reference panel location (used to determine which snp to impute)")
+    parser.add_argument('--ref-path', required=True, help= "reference panel location (notably used to harmonize reference and alternative alleles across SNPs)")
     parser.add_argument('--input-folder', required=True, help= "Path to the folder containing the Raw GWASs summary statistic files, must end by '/'")
     parser.add_argument('--diagnostic-folder', required=True, help= "Path to the reporting information on the PreProcessing such as the SNPs sample size distribution")
 
diff --git a/jass_preprocessing/map_gwas.py b/jass_preprocessing/map_gwas.py
index 5e73858..c9739b6 100644
--- a/jass_preprocessing/map_gwas.py
+++ b/jass_preprocessing/map_gwas.py
@@ -88,9 +88,8 @@ def map_columns_position(gwas_internal_link,  column_dict):
     print(gwas_internal_link)
     gwas_file = gwas_internal_link.split('/')[-1]
     #Our standart labels:
-
     reference_label = column_dict.index.tolist()
-    print(reference_label)
+    
     # labels in the GWAS files
     target_lab = pd.Index(column_dict.values.tolist())
     is_gzipped = re.search(r".gz$", gwas_internal_link)
@@ -106,12 +105,15 @@ def map_columns_position(gwas_internal_link,  column_dict):
     header = pd.Index(line.split())
     def get_position(I,x):
         try:
-            return I.get_loc(x)
+            position_in_header = I.get_loc(x)
+            if isinstance(position_in_header, int):
+                return position_in_header
+            else:
+                raise IndexError("{0} does not correspond to a unique column in {1}. Check that column names are unique in the header of {1} Summary Statistics".format(x, gwas_file))
         except KeyError:
             return np.nan
 
     label_position = [get_position(header,i) for i in target_lab]
-
     mapgw = pd.Series(label_position, index=reference_label)
     mapgw = mapgw.loc[~mapgw.isna()].astype(int)
     mapgw.sort_values(inplace=True)
-- 
GitLab