diff --git a/doc/source/generating_joint_analysis.rst b/doc/source/generating_joint_analysis.rst index ecceb8b2a37cd0c829b7dd92ea5fe06faaabd1bc..3d8a5833fd26948df698d603fb6ff2cf20167d59 100644 --- a/doc/source/generating_joint_analysis.rst +++ b/doc/source/generating_joint_analysis.rst @@ -35,16 +35,22 @@ By default, all the traits will have the same weight. If the user wishes to, she/he can specify a vector of weight by using the --custom-loadings option. -Access HDFStore components --------------------------- +Extract HDFStore components to tsv files +---------------------------------------- -To access each table of the HDFStore you can use `pandas read_hdf functions <https://pandas.pydata.org/docs/reference/api/pandas.read_hdf.html>`_ : +Each table of the HDFStore is accessible through the command line tool ``jass extract-tsv`` (see command line reference for complete details). + +.. code-block:: shell + + jass extract-tsv --hdf5-table-path ./initTable.hdf5 --tsv-path './test_extract.tsv' --table-key SumStatTab + +Alternatively, you can directly use the `pandas read_hdf function <https://pandas.pydata.org/docs/reference/api/pandas.read_hdf.html>`_ : For instance if you want to access the Regions table : .. code-block:: python - pd.read_hdf("WK_test.hdf5", "Regions") + pd.read_hdf("WK_test.hdf5", "Regions") Note that is you wish that the SumStatTab table to be saved as a csv file you can provide the command lines with the --csv-file-path option and a csv will be generated as well. Outputting a csv while lengthen execution @@ -54,5 +60,7 @@ Command Line example -------------------- See command line usage for details + .. 
def w_extract_tsv(args):
    """Export one table of an HDF5 store to a tab-separated file.

    Reads three attributes from ``args``:

    * ``hdf5_table_path`` -- path of the HDF5 file to read,
    * ``tsv_path`` -- path of the tab-separated file to write,
    * ``table_key`` -- key of the table to export.

    Any key other than ``"SumStatTab"`` is loaded with a single
    ``pd.read_hdf`` call and written with ``DataFrame.to_csv``.

    ``"SumStatTab"`` is exported in slices of 50 region ids, each slice
    being selected with a ``where`` clause on ``Region`` (presumably to
    bound memory use on large tables).  The lowest and highest region ids
    are taken from the index of the ``"Regions"`` table of the same file.
    The column header is written once, with the first slice.

    Raises:
        ValueError: if ``"SumStatTab"`` is requested and the ``"Regions"``
            table has no rows, so no region bounds can be derived.
    """
    hdf5_path = args.hdf5_table_path
    tsv_path = args.tsv_path
    table_key = args.table_key

    if table_key != "SumStatTab":
        pd.read_hdf(hdf5_path, table_key).to_csv(tsv_path, sep="\t")
        return

    region_ids = pd.read_hdf(hdf5_path, "Regions").index.tolist()
    if not region_ids:
        # min()/max() would fail with an opaque message; say what is wrong.
        raise ValueError(
            "Cannot extract 'SumStatTab' from {0}: the 'Regions' table is "
            "empty".format(hdf5_path)
        )
    first_region = min(region_ids)
    last_region = max(region_ids)
    slice_size = 50

    # One handle for the whole export.  newline="" is what pandas asks for
    # when to_csv receives a file object (otherwise line endings are
    # translated twice on Windows), and UTF-8 matches what to_csv uses by
    # default in the path-based branch above.
    with open(tsv_path, "w", newline="", encoding="utf-8") as tsv_file:
        # Stopping at last_region + 1 still covers the last region id while
        # avoiding a trailing query that can only return an empty slice.
        for lower in range(first_region, last_region + 1, slice_size):
            chunk = pd.read_hdf(
                hdf5_path,
                "SumStatTab",
                where="Region >= {0} and Region < {1}".format(
                    lower, lower + slice_size
                ),
            )
            # Only the first slice carries the column header.
            chunk.to_csv(tsv_file, sep="\t", header=(lower == first_region))
Will work for the worktables and the inittable.\nWARNING: can strongly increase storage space needed" + ) + parser_create_mp.add_argument( + "--hdf5-table-path", + required=True, + help="path to the worktable file containing the data", + ) + parser_create_mp.add_argument( + "--tsv-path", required=True, help="path to the tsv table" + ) + parser_create_mp.add_argument( + "--table-key", + help="Existing key are 'SumStatTab' : The results of the joint analysis by SNPs - 'PhenoList' : the meta data of analysed GWAS - 'COV' : The H0 covariance used to perform joint analysis - 'GENCOV' (If present in the initTable): The genetic covariance as computed by the LDscore. Uniquely for the worktable: 'Regions' : Results of the joint analysis summarised by LD regions (Notably Lead SNPs by regions) - 'summaryTable': a double entry table summarizing the number of significant regions by test (univariate vs joint test)", + ) + parser_create_mp.set_defaults(func=w_extract_tsv) + + + # ------- add-gene-annotation ------- parser_create_mp = subparsers.add_parser(