diff --git a/doc/source/generating_joint_analysis.rst b/doc/source/generating_joint_analysis.rst index ecceb8b2a37cd0c829b7dd92ea5fe06faaabd1bc..3369d6d353222733d891f0cd7ed4aac229dc9bce 100644 --- a/doc/source/generating_joint_analysis.rst +++ b/doc/source/generating_joint_analysis.rst @@ -38,8 +38,14 @@ If the user wishes to, she/he can specify a vector of weight by using the --cust Access HDFStore components -------------------------- -To access each table of the HDFStore you can use `pandas read_hdf functions <https://pandas.pydata.org/docs/reference/api/pandas.read_hdf.html>`_ : +Each table of the HDFStore is accessible through the command line tool `jass extract-tsv` (see command line reference for complete details). +.. code-block:: shell + + jass extract-tsv --hdf5-table-path ./initTable.hdf5 --tsv-path './test_extract.tsv' --table-key SumStatTab + +Alternately, you can use directly `pandas read_hdf functions <https://pandas.pydata.org/docs/reference/api/pandas.read_hdf.html>`_ : + For instance if you want to access the Regions table : .. code-block:: python diff --git a/jass/__main__.py b/jass/__main__.py index 4733ea6055c7ec4d1047c2539a2cf5b9f17f5524..968205dae217c8cdf25511acab49873388d9ebf3 100644 --- a/jass/__main__.py +++ b/jass/__main__.py @@ -7,7 +7,6 @@ import sys import argparse from datetime import timedelta, datetime from json import JSONDecodeError - import uvicorn from jass.config import config @@ -22,7 +21,7 @@ from jass.models.plots import ( create_local_plot, create_qq_plot, ) - +from pandas import read_hdf def absolute_path_of_the_file(fileName, output_file=False): """ @@ -93,6 +92,39 @@ def w_list_phenotypes(args): print([phenotype.id for phenotype in phenotypes]) +def w_extract_tsv(args): + hdf5_path = args.hdf5_table_path + csv_path = args.tsv_path + table_key = args.table_key + if table_key!="SumStatTab": + table = read_hdf(hdf5_path, table_key) + table.to_csv(csv_path, sep="\t") + else: + append=0 + regions = read_hdf(hdf5_path, "Regions") + if 'Region' in regions.columns: + regions = regions.Region.tolist() + else: + regions = regions.index.tolist() + end_reg = max(regions) + start_reg = min(regions) + slice_size = 50 + for binf in range(start_reg, (end_reg+slice_size), slice_size): + sum_stat_tab = read_hdf( + hdf5_path, + "SumStatTab", + where="Region >= {0} and Region < {1}".format(binf, binf+slice_size) + ) + if append==0: + with open(csv_path, "w") as f: + sum_stat_tab.to_csv(f, sep="\t") + append=1 + else: + with open(csv_path, "a") as f: + sum_stat_tab.to_csv(f, header=0, sep="\t") + + + def compute_worktable(args): csv_file_path = args.csv_file_path @@ -557,6 +589,23 @@ def get_parser(): ) parser_create_mp.set_defaults(func=w_gene_annotation) + # ------- extract-csv -------# + parser_create_mp = subparsers.add_parser( + "extract-tsv", help="Extract a table from a hdf5 repository to the tsv format. Will work for the worktables and the inittable.\nWARNING: can strongly increase storage space needed" + ) + parser_create_mp.add_argument( + "--hdf5-table-path", + required=True, + help="path to the worktable file containing the data", + ) + parser_create_mp.add_argument( + "--tsv-path", required=True, help="path to the tsv table" + ) + parser_create_mp.add_argument( + "--table-key", + help="Existing key are 'SumStatTab' : The results of the joint analysis by SNPs - 'PhenoList' : the meta data of analysed GWAS - 'COV' : The H0 covariance used to perform joint analysis - 'GENCOV' (If present in the initTable): The genetic covariance as computed by the LDscore. Uniquely for the worktable: 'Regions' : Results of the joint analysis summarised by LD regions (Notably Lead SNPs by regions) - 'summaryTable': a double entry table summarizing the number of significant regions by test (univariate vs joint test)", + ) + parser_create_mp.set_defaults(func=w_extract_tsv) return parser diff --git a/jass/models/inittable.py b/jass/models/inittable.py index 112451269aff8077524150c37b2c96f2ac74f638..632f486f86c9224e1c296e1dbcc4753aa4a60a59 100644 --- a/jass/models/inittable.py +++ b/jass/models/inittable.py @@ -30,12 +30,11 @@ def get_inittable_meta(file_name): nb_snps = init_store.get_storer("SumStatTab").nrows init_store.close() nb_phenotypes = read_hdf(file_name, "PhenoList").shape[0] - return {"nb_snps":int(nb_snps), "nb_phenotypes":int(nb_phenotypes)} + return {"nb_snps":np.int64(nb_snps), "nb_phenotypes":np.int64(nb_phenotypes)} def get_gwasname(file_name): return "_".join(os.path.basename(file_name).split("_")[0:3]) - def check_if_SNP_unique(z_gwas_chrom): tg = z_gwas_chrom.groupby(["snp_ids"]).count()["CHR"].reset_index() if len(tg.loc[tg["CHR"] > 1, "CHR"]) > 0: @@ -115,7 +114,6 @@ def create_pheno_summary(description): ] ] pheno_summary_data.index = pheno_summary_data["ID"] - return pheno_summary_data @@ -149,7 +147,7 @@ def format_chr_gwas(gwas_file_chri, chrom, study_name, regions_bychr): left = region_row["start"] right = region_row["stop"] ind = (z_gwas["position"] >= left) & (z_gwas["position"] <= right) - (z_gwas.loc[ind, "Region"]) = np.int(region_index + 1) + (z_gwas.loc[ind, "Region"]) = np.int64(region_index + 1) (z_gwas.loc[ind, "MiddlePosition"]) = (left + right) / 2 return z_gwas @@ -418,7 +416,7 @@ def add_gene_annotation( elements = ligne.split("\t") if elements[0].startswith("NC_"): decode_chr = elements[0].strip("NC_").split(".") - chr = int(decode_chr[0]) + chr = np.int64(decode_chr[0]) if chr <= 22: if elements[2] == "gene": gene_chr.append(chr) @@ -435,8 +433,8 @@ def add_gene_annotation( elif elements[2] == "exon": TMP__exon_chr.append(chr) - TMP__exon_start.append(int(elements[3])) - TMP__exon_end.append(int(elements[4])) + TMP__exon_start.append(np.int64(elements[3])) + TMP__exon_end.append(np.int64(elements[4])) TMP__exon_direction.append(elements[6]) decode_id_1 = elements[8].split(";") decode_id_2 = decode_id_1[0].split("=") diff --git a/jass/models/plots.py b/jass/models/plots.py index 8a95ebebbf7ef21f88a3c1a0b1fa674830bf4d49..c5985ccf65912bc38e57fedd4eba02c9478de409 100644 --- a/jass/models/plots.py +++ b/jass/models/plots.py @@ -1,4 +1,4 @@ -# -*- coding: utf-8 -*- +# -*- coding: utf-8 -*-t """ This software allows to plot and store graphs which can be displayed on the web interface. @@ -32,17 +32,15 @@ def replaceZeroes(df): df.values[df.values == 0] = min_nonzero return df - - def create_global_plot(work_file_path: str, global_plot_path: str): """ create_global_plot generate genome-wide manhattan plot for a given set of phenotypes """ + regions = read_hdf(work_file_path, "Regions") chr_length = regions.groupby('CHR').max().position - N_reg= regions.shape[0] - + N_reg= regions.Region.max() maxy = 0 fig = plt.figure(figsize=(30, 12)) @@ -53,8 +51,8 @@ def create_global_plot(work_file_path: str, global_plot_path: str): '#4287f5', 'orangered' ] - binf=1 - bsup= chunk_size + binf=regions.Region.iloc[0] + bsup= binf+chunk_size chr_considered= regions.CHR.unique() length_chr = regions.groupby("CHR").MiddlePosition.max() / 10**6 length_chr.loc[0] = 0 @@ -63,18 +61,15 @@ def create_global_plot(work_file_path: str, global_plot_path: str): lab_pos = length_chr.loc[chr_considered]/2 pos_shift = length_chr.cumsum() pos_shift.index = pos_shift.index +1 - pos_shift.loc[1] = 0 - lab_pos = lab_pos + [pos_shift.loc[i] for i in chr_considered] + pos_shift.loc[chr_considered[0]] = 0 + lab_pos = lab_pos + [pos_shift.loc[i] for i in chr_considered] while binf < N_reg: - df = read_hdf(work_file_path, "SumStatTab", columns=["CHR","position", 'JASS_PVAL', "Region"], where = "Region >= {0} and Region < {1}".format(binf, bsup)) binf+= chunk_size bsup= binf+ chunk_size - df["plot_position"] = pos_shift.loc[df.CHR].values + df.position/ 10**6 df["-log10(Joint p-value)"] = -np.log10(df.JASS_PVAL) - max_reg = np.max(df["-log10(Joint p-value)"]) if maxy < max_reg: maxy = max_reg @@ -106,7 +101,7 @@ def create_local_plot(work_file_path: str, local_plot_path: str): colors = ['blue', 'red'] df = read_hdf(work_file_path, "SumStatTab") - chr = int(df["CHR"].iloc[0]) + chr = np.int64(df["CHR"].iloc[0]) Num_color = chr % 2 df['JASS_PVAL'] = replaceZeroes(df['JASS_PVAL']) diff --git a/jass/models/worktable.py b/jass/models/worktable.py index 05546e14b047e58d3abe6ec6e47d3f7e93be8467..a59f63a5da48ac2ace0cac83102cd24856d30735 100644 --- a/jass/models/worktable.py +++ b/jass/models/worktable.py @@ -60,7 +60,7 @@ def signif(x, digit): if x == 0: return 0 - return round(x, digit - int(math.floor(math.log10(abs(x)))) - 1) + return round(x, digit - np.int64(math.floor(math.log10(abs(x)))) - 1) def choose_stat_function( @@ -363,7 +363,7 @@ def create_worktable_file( "create_worktable_file: the required argument chromosome is not a number" ) else: - num_Chr = int(chromosome) + num_Chr = np.int64(chromosome) if (pos_Start is None) and (pos_End is None): chromosome_full = True @@ -584,9 +584,9 @@ def create_worktable_file( for pattern in frequent_pattern: for ligne, indice in enumerate(dico_index_x[pattern]): - Retour_omnibus[int(indice)] = ( + Retour_omnibus[np.int64(indice)] = ( Retour_omnibus_bypattern[pattern] - )[int(ligne)] + )[np.int64(ligne)] sum_stat_tab["JASS_PVAL"] = Retour_omnibus @@ -651,7 +651,7 @@ def create_worktable_file( print("CALLING PROGRESS FUNCTION") # the chunk index starts at zero and we take into account the 2 plot stages JASS_progress = round((chunk + 1) * 100 / (Nchunk + 2)) - callback_report(int(JASS_progress)) + callback_report(np.int64(JASS_progress)) hdf_work.close() @@ -735,8 +735,8 @@ def store_pattern(dico_z, dico_index_x, *colonne): store_pattern Reorders z-values by pattern and store their index in the original dataframe """ - Index = int(colonne[0]) - Codage = int(colonne[1]) + Index = np.int64(colonne[0]) + Codage = np.int64(colonne[1]) if not (Codage in dico_z): dico_z[Codage] = [] @@ -838,8 +838,8 @@ def get_worktable_summary(project_hdf_path: str): lines = {} for index, row in summary_dataframe.iterrows(): lines[index] = { - "MinUnivSignif": int(row["MinUnivSignif"]), - "MinUnivNotSignif": int(row["MinUnivNotSignif"]), + "MinUnivSignif": np.int64(row["MinUnivSignif"]), + "MinUnivNotSignif": np.int64(row["MinUnivNotSignif"]), } return lines diff --git a/requirements.txt b/requirements.txt index cbdbddcbb29c31ca7932c7c96e6fa0f50da3e0d4..bfb4e8bc162f14a8b16a6ad6d335d9d094a68a81 100644 --- a/requirements.txt +++ b/requirements.txt @@ -10,8 +10,10 @@ scipy matplotlib seaborn celery +importlib-metadata==4.13.0 pydantic fastapi +httpx uvicorn[standard] typing_extensions; python_version < '3.8' requests