diff --git a/jass/__main__.py b/jass/__main__.py index 968205dae217c8cdf25511acab49873388d9ebf3..8779f1fef58bf9be7bf0858a84edc7fbf8740aaf 100644 --- a/jass/__main__.py +++ b/jass/__main__.py @@ -5,7 +5,7 @@ import os import shutil import sys import argparse -from datetime import timedelta, datetime +from datetime import timedelta, datetime, date from json import JSONDecodeError import uvicorn @@ -231,6 +231,7 @@ def w_create_inittable(args): regions_map_path = absolute_path_of_the_file(args.regions_map_path) description_file_path = absolute_path_of_the_file(args.description_file_path) + init_table_metadata_path = absolute_path_of_the_file(args.init_table_metadata_path) init_table_path = absolute_path_of_the_file(args.init_table_path, True) create_inittable_file( @@ -240,6 +241,7 @@ def w_create_inittable(args): init_table_path, init_covariance_path, init_genetic_covariance_path, + init_table_metadata_path=init_table_metadata_path, ) @@ -438,6 +440,13 @@ def get_parser(): default=None, help="path to the genetic covariance file to import. Used only for display on Jass web application", ) + parser_create_it.add_argument( + "--init-table-metadata-path", + required=False, + default=None, + help="path to metadata file to attache to the initial data file", + ) + parser_create_it.set_defaults(func=w_create_inittable) # ------- create-worktable ------- diff --git a/jass/models/inittable.py b/jass/models/inittable.py index a7cd1b5e16f70139d89f22ada36a185a78144715..72785ee3b1e56305b23ad9dbf420b5f3595223d5 100644 --- a/jass/models/inittable.py +++ b/jass/models/inittable.py @@ -9,8 +9,6 @@ import re import glob import logging from pandas import HDFStore, DataFrame, read_csv, concat, options, read_hdf -import pandas as pd -# create (or open) an hdf5 file and opens in append mode import numpy as np import tables import warnings @@ -29,17 +27,36 @@ class InitMeta(object): def get_inittable_meta(file_name): init_store = HDFStore(file_name, mode='r') nb_snps = init_store.get_storer("SumStatTab").nrows - name=f"Name of {file_name.split('/')[-1]}" - desc=f"Description {file_name.split('/')[-1]}" + metadata = dict( + title=f"Filename: {file_name.split('/')[-1]}", + description="No description", + ancestry="??", + assembly="????", + ) + try: + df = init_store.get('METADATA') + for i in range(len(df)): + metadata[df.iloc[i, 0]] = df.iloc[i, 1] + except KeyError: + pass init_store.close() nb_phenotypes = read_hdf(file_name, "PhenoList").shape[0] + return dict( nb_snps=int(nb_snps), nb_phenotypes=int(nb_phenotypes), - name=name, - desc=desc, + name=metadata['title'], + desc=metadata['description'], + **dict( + (k, metadata[k]) + for k in set(metadata.keys()) if k not in { + 'title', + 'description', + } + ), ) + def get_gwasname(file_name): return "_".join(os.path.basename(file_name).split("_")[0:3]) @@ -51,7 +68,6 @@ def check_if_SNP_unique(z_gwas_chrom): ) raise IOError(msg) - def get_gwas_dict(input_data_path): gwas_dict = {} # retrieve all files corresponding to glob patterns @@ -219,6 +235,7 @@ def create_inittable_file( init_table_path: str, init_covariance_path=None, init_genetic_covariance_path=None, + init_table_metadata_path=None, ): # Read region file regions = read_csv(regions_map_path, sep="\s+", memory_map=True) @@ -266,6 +283,11 @@ def create_inittable_file( GEN_COV = genetic_covariance.loc[pheno_select, pheno_select] hdf_init.put("GEN_COV", GEN_COV, format="table", data_columns=True) + # Read metadata from file and store it + if init_table_metadata_path is not None: + metadata = read_csv(init_table_metadata_path, sep='\t', quotechar='"', index_col=False, memory_map=True) + hdf_init.put("METADATA", metadata, format="table", data_columns=True) + which_cols = [ "Region", "CHR", diff --git a/jass/test/data_real/initTable.hdf5 b/jass/test/data_real/initTable.hdf5 index c0991ad580a4cea014ba80f654607884366a1d70..821bdcc09b2740e389cec2d5049da4fb1b2d3523 100644 Binary files a/jass/test/data_real/initTable.hdf5 and b/jass/test/data_real/initTable.hdf5 differ diff --git a/jass/test/data_real/metadata.txt b/jass/test/data_real/metadata.txt new file mode 100644 index 0000000000000000000000000000000000000000..1811a8b6bc5642d02aceb32ec4ea41a2cdfbf020 --- /dev/null +++ b/jass/test/data_real/metadata.txt @@ -0,0 +1,6 @@ +information content +title Small subset of Curated GWAS data +description "lorem ipsum" +ancestry UNK +assembly hg99 +foo bar \ No newline at end of file diff --git a/jass/test/data_real/summary.csv b/jass/test/data_real/summary.csv old mode 100755 new mode 100644 diff --git a/jass/test/data_test1/initTable.hdf5 b/jass/test/data_test1/initTable.hdf5 index 904ed8c837cac132e7e8795377c2dd559e40ee4a..43518edff657babee693ad6a63b453d2fbe434cb 100644 Binary files a/jass/test/data_test1/initTable.hdf5 and b/jass/test/data_test1/initTable.hdf5 differ diff --git a/jass/test/data_test1/metadata.txt b/jass/test/data_test1/metadata.txt new file mode 100644 index 0000000000000000000000000000000000000000..8eb75e7afde9d6be08b89e2cf8892384a384af5a --- /dev/null +++ b/jass/test/data_test1/metadata.txt @@ -0,0 +1,5 @@ +information content +title Mock dataset with disney +description "lorem ipsum" +ancestry DIS +assembly dSNY diff --git a/jass/test/data_test2/initTable.hdf5 b/jass/test/data_test2/initTable.hdf5 index 035a9cc84f3df0ba479ce53a8ebd55b6f7953833..b7dc5d7db470bf3819f4617cd3b9e92e1a2c1f0d 100644 Binary files a/jass/test/data_test2/initTable.hdf5 and b/jass/test/data_test2/initTable.hdf5 differ diff --git a/jass/test/data_test2/metadata.txt b/jass/test/data_test2/metadata.txt new file mode 100644 index 0000000000000000000000000000000000000000..e70e8c5f23fb4ce9142e833a6da916d228e9f001 --- /dev/null +++ b/jass/test/data_test2/metadata.txt @@ -0,0 +1,5 @@ +information content +title Mock dataset with car +description "lorem ipsum" +ancestry CAR +assembly car1 diff --git a/jass/test/test_server.py b/jass/test/test_server.py index 024202f4c4d35470b0a22118e7300871197b47e1..6234855e187645ceb519bf53df82f1f5b5dcbcc6 100644 --- a/jass/test/test_server.py +++ b/jass/test/test_server.py @@ -68,7 +68,15 @@ class TestDefaultController(JassWebClientTestCase): respT1 = json.loads(response.content.decode("utf-8")) self.assertNotEqual(respT1, respMain) - self.assertSetEqual(set(respMain.keys()), {'nb_phenotypes', 'nb_snps', 'name', 'desc'}) + for key in { + 'nb_phenotypes', + 'nb_snps', + 'name', + 'desc', + 'ancestry', + 'assembly', + }: + self.assertIn(key , respMain) def test_get_tables(self): response = self.testing_client.get("/api/tables") diff --git a/jass/test/update_test_hdf5_files.sh b/jass/test/update_test_hdf5_files.sh index ed467befff155a6bebedfbd10b9d2c26e1e81887..35e5729cc5c41ea7910035759d8d9edbda9d2e35 100755 --- a/jass/test/update_test_hdf5_files.sh +++ b/jass/test/update_test_hdf5_files.sh @@ -23,7 +23,7 @@ for DATA_DIR in $DATA_DIRS; do fi echo "Creating inittable" - jass create-inittable --input-data-path "./${DATA_DIR}/z*.txt" --init-covariance-path "./${DATA_DIR}/COV.csv" --init-genetic-covariance-path ${GEN_COV} --regions-map-path "./${DATA_DIR}/regions.txt" --description-file-path "./${DATA_DIR}/summary.csv" --init-table-path "./${DATA_DIR}/initTable.hdf5" + jass create-inittable --input-data-path "./${DATA_DIR}/z*.txt" --init-covariance-path "./${DATA_DIR}/COV.csv" --init-genetic-covariance-path ${GEN_COV} --regions-map-path "./${DATA_DIR}/regions.txt" --description-file-path "./${DATA_DIR}/summary.csv" --init-table-metadata-path "./${DATA_DIR}/metadata.txt" --init-table-path "./${DATA_DIR}/initTable.hdf5" echo "Creating worktable" jass create-project-data --init-table-path "${DATA_DIR}/initTable.hdf5" --phenotype ${TRAITS} --worktable-path ./${DATA_DIR}/worktable.hdf5 diff --git a/requirements.txt b/requirements.txt index 8055a80aa0b16f65bb6142269481eb87df40ce2a..eca93137eabad0c9c1d363794dff515723c22b0e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -17,3 +17,5 @@ httpx uvicorn[standard] typing_extensions; python_version < '3.8' requests +h5py +wheel \ No newline at end of file diff --git a/scripts/hdf5_add_attributes.py b/scripts/hdf5_add_attributes.py new file mode 100644 index 0000000000000000000000000000000000000000..bb3067b37a932fa0fe11209c028e43939d73a3c5 --- /dev/null +++ b/scripts/hdf5_add_attributes.py @@ -0,0 +1,102 @@ +import argparse +import csv +import json +import tempfile + +from pandas import HDFStore, read_csv + +from jass.models.inittable import get_inittable_meta + + +def set_metadata_from_file(*, hdf5_file, init_table_metadata_path): + global init_store, metadata + init_store = HDFStore(hdf5_file) + metadata = read_csv(init_table_metadata_path, sep='\t', quotechar='"', index_col=False, memory_map=True) + init_store.put("METADATA", metadata, format="table", data_columns=True) + init_store.close() + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument( + "--init-table-path", default=None, help="path to the inittable to edit", required=True, dest="hdf5_file" + ) + meta_arg = parser.add_argument( + "--init-table-metadata-path", + required=False, + default=None, + help="path to metadata file to attache to the inittable. Note that all previous metadata are purged.", + ) + mutex_grp = parser.add_mutually_exclusive_group() + mutex_grp._group_actions.append(meta_arg) + mutex_grp.add_argument( + "--clean-metadata", + action="store_true", + default=False, + help="Remove all information in metadata before adding new one", + ) + mutex_grp = parser.add_mutually_exclusive_group() + mutex_grp._group_actions.append(meta_arg) + mutex_grp.add_argument( + "--title", + help="title to append to the metadata", + default=None, + required=False, + ) + mutex_grp = parser.add_mutually_exclusive_group() + mutex_grp._group_actions.append(meta_arg) + mutex_grp.add_argument( + "--description", + help="description to append to the metadata", + default=None, + required=False, + ) + mutex_grp = parser.add_mutually_exclusive_group() + mutex_grp._group_actions.append(meta_arg) + mutex_grp.add_argument( + "--ancestry", + help="ancestry to append to the metadata", + default=None, + required=False, + ) + mutex_grp = parser.add_mutually_exclusive_group() + mutex_grp._group_actions.append(meta_arg) + mutex_grp.add_argument( + "--assembly", + help="assembly to append to the metadata", + default=None, + required=False, + ) + args = parser.parse_args() + + if args.init_table_metadata_path: + set_metadata_from_file(hdf5_file=args.hdf5_file, init_table_metadata_path=args.init_table_metadata_path) + else: + init_store = HDFStore(args.hdf5_file, mode='r') + if args.clean_metadata: + metadata = dict() + else: + try: + df = init_store.get('METADATA') + metadata = dict((df.iloc[i, 0], df.iloc[i, 1]) for i in range(len(df))) + except KeyError: + metadata = dict() + init_store.close() + for k in [ + 'title', + 'description', + 'ancestry', + 'assembly', + ]: + if getattr(args, k): + metadata[k] = getattr(args, k) + + with tempfile.NamedTemporaryFile(suffix=".csv") as f: + with open(f.name, 'w', newline='') as csvfile: + csvwriter = csv.writer(csvfile, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL) + csvwriter.writerow(["information", "content"]) + for item in metadata.items(): + csvwriter.writerow(item) + set_metadata_from_file(hdf5_file=args.hdf5_file, init_table_metadata_path=f.name) + + print("Resulting metadata is:", json.dumps(get_inittable_meta(args.hdf5_file), indent=4))