diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index 4fd855e119d3aa3d79da7409d515e4af2e793439..595e9d1c281fdd750af9db26db3aa93a2f7b3b49 100644 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -14,7 +14,7 @@ pages: - yum install -y make - pip3 install sphinx - pip3 install sphinxcontrib-bibtex sphinx_rtd_theme sphinx-argparse - - pip3 install -r jass_preprocessing/requirements.txt + - pip3 install -r requirements.txt - cd doc - sphinx-apidoc -f -o ./source/_autosummary/ ../jass_preprocessing/ - make html diff --git a/jass_preprocessing/__main__.py b/jass_preprocessing/__main__.py index ed42e4bd26e41b984a4bdbd1e0fcf6dc97cdf508..482b83efdc3a13ae45223c1eadbd12beb1b2c96b 100644 --- a/jass_preprocessing/__main__.py +++ b/jass_preprocessing/__main__.py @@ -22,24 +22,20 @@ import argparse #| ImpG_output_Folder | main ouput folder | netPath+ 'PCMA/1._DATA/preprocessing_test/' | - def launch_preprocessing(args): """ Preprocessing GWAS dataset """ - gwas_map = pd.read_csv(args.gwas_info, sep="\t", index_col=0) - print(gwas_map.head()) + gwas_map = pd.read_csv(args.gwas_info, sep="\t") + gwas_map.set_index("filename", inplace=True) for gwas_filename in gwas_map.index: - print(gwas_filename) - print(gwas_map.columns) - tag = "{0}_{1}".format(gwas_map.loc[gwas_filename, 'consortia'], - gwas_map.loc[gwas_filename, 'outcome']) + tag = "{0}_{1}".format(gwas_map.loc[gwas_filename, 'Consortium'], + gwas_map.loc[gwas_filename, 'Outcome']) print('processing GWAS: {}'.format(tag)) start = time.time() GWAS_link = jp.map_gwas.walkfs(args.input_folder, gwas_filename)[2] - mapgw = jp.map_gwas.map_columns_position(GWAS_link, args.gwas_info) gw_df = jp.map_gwas.read_gwas(GWAS_link, mapgw) diff --git a/jass_preprocessing/map_gwas.py b/jass_preprocessing/map_gwas.py index f3e8313c377be8da888d58552d121e5545980ef2..83f01c5d9ed3ac230b88c7b92a0512c851c5d477 100644 --- a/jass_preprocessing/map_gwas.py +++ b/jass_preprocessing/map_gwas.py @@ -84,23 +84,22 @@ def map_columns_position(gwas_internal_link, GWAS_labels): Return: pandas Series with column position and column names as index """ - column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na', index_col=0) - gwas_file = gwas_internal_link.split('/')[-1] + column_dict = pd.read_csv(GWAS_labels, sep='\t', na_values='na') + + column_dict.set_index("filename", inplace=True) + gwas_file = gwas_internal_link.split('/')[-1] my_labels = column_dict.loc[gwas_file] #Our standart labels: reference_label = column_dict.columns.tolist() # labels in the GWAS files target_lab = pd.Index(my_labels.values.tolist()) - f = open(gwas_internal_link) count_line = 0 line = f.readline() - print(line) header = pd.Index(line.split()) - def get_position(I,x): try: return I.get_loc(x) @@ -108,7 +107,6 @@ def map_columns_position(gwas_internal_link, GWAS_labels): return np.nan label_position = [get_position(header,i) for i in target_lab] - mapgw = pd.Series(label_position, index=reference_label) mapgw = mapgw.loc[~mapgw.isna()].astype(int) mapgw.sort_values(inplace=True) @@ -137,8 +135,7 @@ def read_gwas( gwas_internal_link, column_map): index_col=0, header=0, na_values= ['', '#N/A', '#N/A', 'N/A', '#NA', '-1.#IND', '-1.#QNAN', - '-NaN', - '-nan', '1.#IND', '1.#QNAN', 'N/A', + '-NaN', '-nan', '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'nan', 'na', '.']) diff --git a/jass_preprocessing/requirements.txt b/jass_preprocessing/requirements.txt deleted file mode 100644 index cec313db9034d9e3fa5d010e2b813c1f93ad98bf..0000000000000000000000000000000000000000 --- a/jass_preprocessing/requirements.txt +++ /dev/null @@ -1,5 +0,0 @@ -scipy -pandas -numpy -seaborn -matplotlib