From a27a77c61196288a160ea65489bfeeb89559d9d2 Mon Sep 17 00:00:00 2001 From: hjulienn <hanna.julienne@pasteur.fr> Date: Thu, 7 Nov 2024 11:30:06 +0100 Subject: [PATCH] raising an error when column names are not unique in summary statistics headers --- jass_preprocessing/__main__.py | 2 +- jass_preprocessing/map_gwas.py | 10 ++++++---- 2 files changed, 7 insertions(+), 5 deletions(-) diff --git a/jass_preprocessing/__main__.py b/jass_preprocessing/__main__.py index 870babb..9743f78 100644 --- a/jass_preprocessing/__main__.py +++ b/jass_preprocessing/__main__.py @@ -71,7 +71,7 @@ def add_preprocessing_argument(): parser = argparse.ArgumentParser() parser.add_argument('--gwas-info', required=True, help= "Path to the file describing the format of the individual GWASs files with correct header") #parser.add_argument('--gwas-filename', required=True, help= "Name of the raw GWAS file to standardize") - parser.add_argument('--ref-path', required=True, help= "reference panel location (used to determine which snp to impute)") + parser.add_argument('--ref-path', required=True, help= "reference panel location (notably used to harmonize reference and alternative allele accross SNPs") parser.add_argument('--input-folder', required=True, help= "Path to the folder containing the Raw GWASs summary statistic files, must end by '/'") parser.add_argument('--diagnostic-folder', required=True, help= "Path to the reporting information on the PreProcessing such as the SNPs sample size distribution") diff --git a/jass_preprocessing/map_gwas.py b/jass_preprocessing/map_gwas.py index 5e73858..c9739b6 100644 --- a/jass_preprocessing/map_gwas.py +++ b/jass_preprocessing/map_gwas.py @@ -88,9 +88,8 @@ def map_columns_position(gwas_internal_link, column_dict): print(gwas_internal_link) gwas_file = gwas_internal_link.split('/')[-1] #Our standart labels: - reference_label = column_dict.index.tolist() - print(reference_label) + # labels in the GWAS files target_lab = pd.Index(column_dict.values.tolist()) is_gzipped = re.search(r".gz$", gwas_internal_link) @@ -106,12 +105,15 @@ def map_columns_position(gwas_internal_link, column_dict): header = pd.Index(line.split()) def get_position(I,x): try: - return I.get_loc(x) + position_in_header = I.get_loc(x) + if isinstance(position_in_header, int): + return position_in_header + else: + raise IndexError("{0} is a not corresponding to an unique column in {1}. Check that column names are unique in the header of {1} Summary Statistics".format(x, gwas_file)) except KeyError: return np.nan label_position = [get_position(header,i) for i in target_lab] - mapgw = pd.Series(label_position, index=reference_label) mapgw = mapgw.loc[~mapgw.isna()].astype(int) mapgw.sort_values(inplace=True) -- GitLab