Skip to content
Snippets Groups Projects
Commit a27a77c6 authored by Hanna JULIENNE
Browse files

raising an error when column names are not unique in summary statistics headers

parent 6c87d176
No related branches found
No related tags found
No related merge requests found
Pipeline #143040 passed
...@@ -71,7 +71,7 @@ def add_preprocessing_argument(): ...@@ -71,7 +71,7 @@ def add_preprocessing_argument():
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('--gwas-info', required=True, help= "Path to the file describing the format of the individual GWASs files with correct header") parser.add_argument('--gwas-info', required=True, help= "Path to the file describing the format of the individual GWASs files with correct header")
#parser.add_argument('--gwas-filename', required=True, help= "Name of the raw GWAS file to standardize") #parser.add_argument('--gwas-filename', required=True, help= "Name of the raw GWAS file to standardize")
parser.add_argument('--ref-path', required=True, help= "reference panel location (used to determine which snp to impute)") parser.add_argument('--ref-path', required=True, help= "reference panel location (notably used to harmonize reference and alternative allele accross SNPs")
parser.add_argument('--input-folder', required=True, help= "Path to the folder containing the Raw GWASs summary statistic files, must end by '/'") parser.add_argument('--input-folder', required=True, help= "Path to the folder containing the Raw GWASs summary statistic files, must end by '/'")
parser.add_argument('--diagnostic-folder', required=True, help= "Path to the reporting information on the PreProcessing such as the SNPs sample size distribution") parser.add_argument('--diagnostic-folder', required=True, help= "Path to the reporting information on the PreProcessing such as the SNPs sample size distribution")
......
...@@ -88,9 +88,8 @@ def map_columns_position(gwas_internal_link, column_dict): ...@@ -88,9 +88,8 @@ def map_columns_position(gwas_internal_link, column_dict):
print(gwas_internal_link) print(gwas_internal_link)
gwas_file = gwas_internal_link.split('/')[-1] gwas_file = gwas_internal_link.split('/')[-1]
#Our standart labels: #Our standart labels:
reference_label = column_dict.index.tolist() reference_label = column_dict.index.tolist()
print(reference_label)
# labels in the GWAS files # labels in the GWAS files
target_lab = pd.Index(column_dict.values.tolist()) target_lab = pd.Index(column_dict.values.tolist())
is_gzipped = re.search(r".gz$", gwas_internal_link) is_gzipped = re.search(r".gz$", gwas_internal_link)
...@@ -106,12 +105,15 @@ def map_columns_position(gwas_internal_link, column_dict): ...@@ -106,12 +105,15 @@ def map_columns_position(gwas_internal_link, column_dict):
header = pd.Index(line.split()) header = pd.Index(line.split())
def get_position(I,x): def get_position(I,x):
try: try:
return I.get_loc(x) position_in_header = I.get_loc(x)
if isinstance(position_in_header, int):
return position_in_header
else:
raise IndexError("{0} is a not corresponding to an unique column in {1}. Check that column names are unique in the header of {1} Summary Statistics".format(x, gwas_file))
except KeyError: except KeyError:
return np.nan return np.nan
label_position = [get_position(header,i) for i in target_lab] label_position = [get_position(header,i) for i in target_lab]
mapgw = pd.Series(label_position, index=reference_label) mapgw = pd.Series(label_position, index=reference_label)
mapgw = mapgw.loc[~mapgw.isna()].astype(int) mapgw = mapgw.loc[~mapgw.isna()].astype(int)
mapgw.sort_values(inplace=True) mapgw.sort_values(inplace=True)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment