diff --git a/jass_preprocessing/jass_preprocessing/__init__.py b/jass_preprocessing/jass_preprocessing/__init__.py index 8669fc771e2bee31c822c9d69beb0f3e14bba93a..fcd7f03a80ca7191f10bb338edd3eac51a21fcd2 100644 --- a/jass_preprocessing/jass_preprocessing/__init__.py +++ b/jass_preprocessing/jass_preprocessing/__init__.py @@ -1,2 +1,2 @@ -import jass_preprocessing.map_gwas.map_gwas -import jass_preprocessing.dna_utils.dna_utils +import jass_preprocessing.map_gwas +import jass_preprocessing.dna_utils diff --git a/jass_preprocessing/jass_preprocessing/dna_utils/__init__.py b/jass_preprocessing/jass_preprocessing/dna_utils/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/jass_preprocessing/jass_preprocessing/map_gwas/__init__.py b/jass_preprocessing/jass_preprocessing/map_gwas/__init__.py new file mode 100644 index 0000000000000000000000000000000000000000..e69de29bb2d1d6434b8b29ae775ad8c2e48c5391 diff --git a/jass_preprocessing/jass_preprocessing/map_gwas/__pycache__/map_gwas.cpython-35.pyc b/jass_preprocessing/jass_preprocessing/map_gwas/__pycache__/map_gwas.cpython-35.pyc deleted file mode 100644 index a080559f1ba2ed9e2ae37219fc43f4397bf6141d..0000000000000000000000000000000000000000 Binary files a/jass_preprocessing/jass_preprocessing/map_gwas/__pycache__/map_gwas.cpython-35.pyc and /dev/null differ diff --git a/jass_preprocessing/jass_preprocessing/map_gwas/map_gwas.py b/jass_preprocessing/jass_preprocessing/map_gwas/map_gwas.py index aef00bb1ea9bec4f6e56f0bf280751e31f49c6d1..2becf225ce9c74cb4e01603e06cd6ec9cc7001d6 100644 --- a/jass_preprocessing/jass_preprocessing/map_gwas/map_gwas.py +++ b/jass_preprocessing/jass_preprocessing/map_gwas/map_gwas.py @@ -1,6 +1,6 @@ import os import sys - +import pandas as pd @@ -29,3 +29,16 @@ def gwas_internal_link(GWAS_table, GWAS_path): 'internalDataLink': walkfs(GWAS_path, GWAS_filename)[2]}) Glink = pd.DataFrame(Glink, columns=('filename', 
def convert_missing_values(df):
    """
    Convert all missing-value marker strings to a standard np.nan value.

    A cell is replaced only when its whole value equals one of the markers
    (no substring or regex matching), so e.g. "NA12878" or "0.5" are left
    untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Table whose cells may contain textual missing-value markers.

    Returns
    -------
    pandas.DataFrame
        A new frame with every marker cell set to np.nan; `df` itself is
        not modified.
    """
    # Imported locally so the function does not depend on the module having
    # a top-level `import numpy as np`.
    import numpy as np

    # Mirrors the markers pandas treats as NA when parsing text files, plus
    # the lowercase 'na' and the lone '.' placeholders. '#N/A N/A' is a
    # single marker (it had previously been split into two duplicate
    # entries, '#N/A' and 'N/A').
    def_missing = [
        '', '#N/A', '#N/A N/A', '#NA', '-1.#IND', '-1.#QNAN', '-NaN', '-nan',
        '1.#IND', '1.#QNAN', 'N/A', 'NA', 'NULL', 'NaN', 'nan', 'na', '.',
    ]

    # A scalar replacement is applied to every listed value, so there is no
    # need to build a parallel list of NaNs.
    return df.replace(def_missing, np.nan)