diff --git a/jass_preprocessing/__main__.py b/jass_preprocessing/__main__.py index 870babb0d007a64c39e007cf946eb0d5ca1f92ec..9743f78a73d4fa81ab5d1652b4aa610bf81b7805 100644 --- a/jass_preprocessing/__main__.py +++ b/jass_preprocessing/__main__.py @@ -71,7 +71,7 @@ def add_preprocessing_argument(): parser = argparse.ArgumentParser() parser.add_argument('--gwas-info', required=True, help= "Path to the file describing the format of the individual GWASs files with correct header") #parser.add_argument('--gwas-filename', required=True, help= "Name of the raw GWAS file to standardize") - parser.add_argument('--ref-path', required=True, help= "reference panel location (used to determine which snp to impute)") + parser.add_argument('--ref-path', required=True, help= "reference panel location (notably used to harmonize reference and alternative allele accross SNPs") parser.add_argument('--input-folder', required=True, help= "Path to the folder containing the Raw GWASs summary statistic files, must end by '/'") parser.add_argument('--diagnostic-folder', required=True, help= "Path to the reporting information on the PreProcessing such as the SNPs sample size distribution") diff --git a/jass_preprocessing/map_gwas.py b/jass_preprocessing/map_gwas.py index 5e73858f268e934b36db38818bc1f05d3ebb57d3..c9739b6c208a5c479287d6c10f7f75d19b0bdfcb 100644 --- a/jass_preprocessing/map_gwas.py +++ b/jass_preprocessing/map_gwas.py @@ -88,9 +88,8 @@ def map_columns_position(gwas_internal_link, column_dict): print(gwas_internal_link) gwas_file = gwas_internal_link.split('/')[-1] #Our standart labels: - reference_label = column_dict.index.tolist() - print(reference_label) + # labels in the GWAS files target_lab = pd.Index(column_dict.values.tolist()) is_gzipped = re.search(r".gz$", gwas_internal_link) @@ -106,12 +105,15 @@ def map_columns_position(gwas_internal_link, column_dict): header = pd.Index(line.split()) def get_position(I,x): try: - return I.get_loc(x) + position_in_header = I.get_loc(x) + if isinstance(position_in_header, int): + return position_in_header + else: + raise IndexError("{0} is a not corresponding to an unique column in {1}. Check that column names are unique in the header of {1} Summary Statistics".format(x, gwas_file)) except KeyError: return np.nan label_position = [get_position(header,i) for i in target_lab] - mapgw = pd.Series(label_position, index=reference_label) mapgw = mapgw.loc[~mapgw.isna()].astype(int) mapgw.sort_values(inplace=True)