diff --git a/jass/__main__.py b/jass/__main__.py index 3efd212c8661561cece809034b9a5f2d34a0bf9f..8779f1fef58bf9be7bf0858a84edc7fbf8740aaf 100644 --- a/jass/__main__.py +++ b/jass/__main__.py @@ -12,7 +12,7 @@ import uvicorn from jass.config import config from jass.models.phenotype import get_available_phenotypes -from jass.models.inittable import create_inittable_file, add_gene_annotation, add_inittable_meta +from jass.models.inittable import create_inittable_file, add_gene_annotation from jass.models.worktable import create_worktable_file from jass.models.project import get_projects_last_access, load_project from jass.models.plots import ( @@ -231,6 +231,7 @@ def w_create_inittable(args): regions_map_path = absolute_path_of_the_file(args.regions_map_path) description_file_path = absolute_path_of_the_file(args.description_file_path) + init_table_metadata_path = absolute_path_of_the_file(args.init_table_metadata_path) init_table_path = absolute_path_of_the_file(args.init_table_path, True) create_inittable_file( @@ -240,9 +241,9 @@ def w_create_inittable(args): init_table_path, init_covariance_path, init_genetic_covariance_path, + init_table_metadata_path=init_table_metadata_path, ) - add_inittable_meta(init_table_path, args.title, args.description_inittable) - + def w_plot_manhattan(args): worktable_path = absolute_path_of_the_file(args.worktable_path) @@ -440,17 +441,10 @@ def get_parser(): help="path to the genetic covariance file to import. 
def get_inittable_meta(file_name):
    """Summarise an init table HDF5 file.

    Reads the row count of ``SumStatTab``, the number of rows of
    ``PhenoList`` and, when present, the two-column ``METADATA`` table
    (first column: field name, second column: value).

    Args:
        file_name: path to the init table HDF5 file.

    Returns:
        dict: ``nb_snps``, ``nb_phenotypes``, ``name`` (the metadata
        ``title``) and ``desc`` (the metadata ``description``), followed by
        every other metadata field in insertion order. ``ancestry`` and
        ``assembly`` are always present, with placeholder values when the
        file does not define them.

    Raises:
        KeyError: if ``SumStatTab`` or ``PhenoList`` is missing from the file.
    """
    # Defaults used when the file carries no (or only partial) metadata.
    metadata = {
        "title": f"Filename: {os.path.basename(file_name)}",
        "description": "No description",
        "ancestry": "??",
        "assembly": "????",
    }

    # Single read-only open; the context manager closes the store even when
    # one of the reads raises.
    with HDFStore(file_name, mode="r") as init_store:
        nb_snps = init_store.get_storer("SumStatTab").nrows
        nb_phenotypes = init_store.get("PhenoList").shape[0]
        try:
            stored = init_store.get("METADATA")
        except KeyError:
            # Init tables created without a metadata file have no such table.
            stored = None

    if stored is not None:
        fields = stored.iloc[:, 0].tolist()
        contents = stored.iloc[:, 1].tolist()
        metadata.update(zip((str(field) for field in fields), contents))

    summary = {
        "nb_snps": int(nb_snps),
        "nb_phenotypes": int(nb_phenotypes),
        "name": metadata.pop("title"),
        "desc": metadata.pop("description"),
    }
    # A metadata row named like one of the computed keys (e.g. "name") must
    # neither crash the call nor shadow the computed value.
    for field, content in metadata.items():
        summary.setdefault(field, content)
    return summary
96edbc7a1ae7f15228d6656a5238d72372fe4976..43518edff657babee693ad6a63b453d2fbe434cb 100644 Binary files a/jass/test/data_test1/initTable.hdf5 and b/jass/test/data_test1/initTable.hdf5 differ diff --git a/jass/test/data_test1/metadata.txt b/jass/test/data_test1/metadata.txt new file mode 100644 index 0000000000000000000000000000000000000000..8eb75e7afde9d6be08b89e2cf8892384a384af5a --- /dev/null +++ b/jass/test/data_test1/metadata.txt @@ -0,0 +1,5 @@ +information content +title Mock dataset with disney +description "lorem ipsum" +ancestry DIS +assembly dSNY diff --git a/jass/test/data_test2/initTable.hdf5 b/jass/test/data_test2/initTable.hdf5 index 11d1d35d79470d801dd232691621e0ee54c091d1..b7dc5d7db470bf3819f4617cd3b9e92e1a2c1f0d 100644 Binary files a/jass/test/data_test2/initTable.hdf5 and b/jass/test/data_test2/initTable.hdf5 differ diff --git a/jass/test/data_test2/metadata.txt b/jass/test/data_test2/metadata.txt new file mode 100644 index 0000000000000000000000000000000000000000..e70e8c5f23fb4ce9142e833a6da916d228e9f001 --- /dev/null +++ b/jass/test/data_test2/metadata.txt @@ -0,0 +1,5 @@ +information content +title Mock dataset with car +description "lorem ipsum" +ancestry CAR +assembly car1 diff --git a/jass/test/update_test_hdf5_files.sh b/jass/test/update_test_hdf5_files.sh index ed467befff155a6bebedfbd10b9d2c26e1e81887..35e5729cc5c41ea7910035759d8d9edbda9d2e35 100755 --- a/jass/test/update_test_hdf5_files.sh +++ b/jass/test/update_test_hdf5_files.sh @@ -23,7 +23,7 @@ for DATA_DIR in $DATA_DIRS; do fi echo "Creating inittable" - jass create-inittable --input-data-path "./${DATA_DIR}/z*.txt" --init-covariance-path "./${DATA_DIR}/COV.csv" --init-genetic-covariance-path ${GEN_COV} --regions-map-path "./${DATA_DIR}/regions.txt" --description-file-path "./${DATA_DIR}/summary.csv" --init-table-path "./${DATA_DIR}/initTable.hdf5" + jass create-inittable --input-data-path "./${DATA_DIR}/z*.txt" --init-covariance-path "./${DATA_DIR}/COV.csv" 
--init-genetic-covariance-path ${GEN_COV} --regions-map-path "./${DATA_DIR}/regions.txt" --description-file-path "./${DATA_DIR}/summary.csv" --init-table-metadata-path "./${DATA_DIR}/metadata.txt" --init-table-path "./${DATA_DIR}/initTable.hdf5" echo "Creating worktable" jass create-project-data --init-table-path "${DATA_DIR}/initTable.hdf5" --phenotype ${TRAITS} --worktable-path ./${DATA_DIR}/worktable.hdf5 diff --git a/scripts/hdf5_add_attributes.py b/scripts/hdf5_add_attributes.py index 8b5ab65729e6a37de281310a0bcb2d3783daccfa..bb3067b37a932fa0fe11209c028e43939d73a3c5 100644 --- a/scripts/hdf5_add_attributes.py +++ b/scripts/hdf5_add_attributes.py @@ -1,23 +1,102 @@ +import argparse +import csv +import json +import tempfile -from jass.models.inittable import get_inittable_meta, add_inittable_meta -# we need python package h5py to read/write .hdf5 file +from pandas import HDFStore, read_csv +from jass.models.inittable import get_inittable_meta -if __name__ == "__main__": - - title = 'Curated GWAS summary statistics on African ancestry on 19 blood count traits and glycemic traits (hg38)' - des = 'Genome wide curated summary statistics on 19 blood count traits and glycemic traits' \ - 'File format is the inittable format intended to be used with the Joint Analysis of Summary Statistics (JASS), which allows to perform multi-trait GWAS:' \ - 'https://gitlab.pasteur.fr/statistical-genetics/jass' \ - 'GWAS of hematological traits originate from Chen et al paper and were downloaded from the GWAS Catalog (https://www.ebi.ac.uk/gwas/publications/32888493#study_panel). GWAS of glycemic traits come from the (18) study downloadable from GWAS Catalog (https://www.ebi.ac.uk/gwas/publications/34059833).' 
# Metadata fields that can be set one by one from the command line.
_METADATA_FIELDS = ("title", "description", "ancestry", "assembly")


def set_metadata_from_file(*, hdf5_file, init_table_metadata_path):
    """Replace the ``METADATA`` table of an init table with a file's content.

    The file is parsed as tab-separated values, with ``"`` as quote character
    and its first line as header, and stored under the ``METADATA`` key.
    Any ``METADATA`` table already in the store is replaced; when the file
    holds no data row the existing table is removed instead.

    Args:
        hdf5_file: path to the init table HDF5 file to edit.
        init_table_metadata_path: path to the tab-separated metadata file.
    """
    # Parse before touching the store so an unreadable metadata file leaves
    # the HDF5 file untouched.
    metadata = read_csv(init_table_metadata_path, sep='\t', quotechar='"', index_col=False, memory_map=True)
    with HDFStore(hdf5_file) as init_store:
        if metadata.empty:
            # put(format="table") silently skips empty frames, which would
            # leave the previous metadata in place: purge it explicitly.
            if "METADATA" in init_store:
                init_store.remove("METADATA")
        else:
            init_store.put("METADATA", metadata, format="table", data_columns=True)


def _build_parser():
    """Build the command-line parser of the metadata editing script."""
    parser = argparse.ArgumentParser()
    parser.add_argument(
        "--init-table-path", default=None, help="path to the inittable to edit", required=True, dest="hdf5_file"
    )
    parser.add_argument(
        "--init-table-metadata-path",
        required=False,
        default=None,
        help="path to metadata file to attache to the inittable. Note that all previous metadata are purged.",
    )
    parser.add_argument(
        "--clean-metadata",
        action="store_true",
        default=False,
        help="Remove all information in metadata before adding new one",
    )
    for field in _METADATA_FIELDS:
        parser.add_argument(
            f"--{field}",
            help=f"{field} to append to the metadata",
            default=None,
            required=False,
        )
    return parser


def _reject_conflicting_options(parser, args):
    """Exit with a usage error when a metadata file is mixed with field options.

    ``--init-table-metadata-path`` replaces the whole metadata table, so it
    cannot be combined with ``--clean-metadata`` or any per-field option.
    The check is explicit rather than relying on argparse private attributes.
    """
    if args.init_table_metadata_path is None:
        return
    conflicting = ["clean-metadata"] if args.clean_metadata else []
    conflicting.extend(field for field in _METADATA_FIELDS if getattr(args, field) is not None)
    if conflicting:
        parser.error(f"argument --{conflicting[0]}: not allowed with argument --init-table-metadata-path")


def main(argv=None):
    """Edit the metadata of an init table, then print the resulting metadata.

    Args:
        argv: argument list to parse; ``None`` means ``sys.argv[1:]``.
    """
    import os

    parser = _build_parser()
    args = parser.parse_args(argv)
    _reject_conflicting_options(parser, args)

    if args.init_table_metadata_path:
        set_metadata_from_file(hdf5_file=args.hdf5_file, init_table_metadata_path=args.init_table_metadata_path)
    else:
        # Opened read-only even with --clean-metadata so that a wrong
        # init table path fails here instead of creating a new file later.
        with HDFStore(args.hdf5_file, mode='r') as init_store:
            if args.clean_metadata:
                metadata = {}
            else:
                try:
                    stored = init_store.get('METADATA')
                except KeyError:
                    # No metadata stored yet: start from scratch.
                    metadata = {}
                else:
                    metadata = dict(zip(stored.iloc[:, 0].tolist(), stored.iloc[:, 1].tolist()))

        for field in _METADATA_FIELDS:
            value = getattr(args, field)
            if value:  # unset and empty options leave the stored value as is
                metadata[field] = value

        # Round-trip through a TSV file so values are parsed exactly as they
        # would be from a user-provided metadata file. A temporary directory
        # is used because reopening an open NamedTemporaryFile by name is not
        # portable.
        with tempfile.TemporaryDirectory() as tmp_dir:
            tsv_path = os.path.join(tmp_dir, "metadata.csv")
            with open(tsv_path, 'w', newline='', encoding='utf-8') as csvfile:
                csvwriter = csv.writer(csvfile, delimiter='\t', quotechar='"', quoting=csv.QUOTE_MINIMAL)
                csvwriter.writerow(["information", "content"])
                csvwriter.writerows(metadata.items())
            set_metadata_from_file(hdf5_file=args.hdf5_file, init_table_metadata_path=tsv_path)

    print("Resulting metadata is:", json.dumps(get_inittable_meta(args.hdf5_file), indent=4))


if __name__ == "__main__":
    main()