Skip to content
Snippets Groups Projects
Commit f29b1e77 authored by Bryan BRANCOTTE's avatar Bryan BRANCOTTE
Browse files

Merge branch 'predict_gain' into newmain-gain

parents 7467a216 835782b8
No related branches found
No related tags found
3 merge requests!98Draft: Newmaster,!97Newmain gain,!96Draft: Newmain
include README.md
include setup.cfg
include jass/swagger/swagger.yaml
recursive-include jass/static *
recursive-include celery_files *
recursive-include jass/models/data/ *.tsv
recursive-include jass/static *
\ No newline at end of file
0
k_coef_mv 0.07740334977380119
log10_avg_distance_cor_coef_mv -0.6999110771883902
log10_mean_gencov_coef_mv 0.746794584985343
avg_Neff_coef_mv 0.07289261717080556
avg_h2_coef_mv -0.516496395500929
avg_perc_h2_diff_region_coef_mv 0.15727591593399
This diff is collapsed.
GRP1,z_GIANT_HIP z_GLG_HDL z_GLG_LDL z_MAGIC_2HGLU-ADJBMI
GRP2,z_SPIRO-UKB_FVC z_SPIRO-UKB_FEV1 z_TAGC_ASTHMA
traits k avg_distance_cor mean_gencov avg_Neff avg_h2 avg_perc_h2_diff_region log10_mean_gencov log10_avg_distance_cor gain
1 ['z_SPIRO-UKB_FVC', 'z_SPIRO-UKB_FEV1', 'z_TAGC_ASTHMA'] 0.1 0.1731946683845993 0.0637 0.3843393026739591 0.2785193310634847 0.7976315890930669 0.8139196701681637 0.8013809378674498 0.06428524764535551
0 ['z_GIANT_HIP', 'z_GLG_HDL', 'z_GLG_LDL', 'z_MAGIC_2HGLU-ADJBMI'] 0.2 0.14899001074867035 0.01535 0.12076877719858631 0.22628198390356655 0.9055326131023057 0.6573854616675169 0.7879956172999502 -0.010766494024690904
minimum_value maximum_value
k 2.0 12.0
log10_avg_distance_cor -4.675617219570908 0.20864138105896807
log10_mean_gencov -4.4093921991254446 -0.46117501106209624
avg_Neff 6730.5 697828.0
avg_h2 0.014033707225812 0.4361454950334251
avg_perc_h2_diff_region 0.0906544694784672 0.9831222899777692
Compute JASS power gain from the genetic architecture of traits
===============================================================
In a recent study :cite:`suzuki2023trait`, we explore how the genetic architecture of the set of traits (heritability, genetic covariance, heritability undetected by the univariate test, ...) can be predictive of statistical power gain of the multi-trait test.
We implement an additional command line tool to give access our predictive model (the **jass predict-gain** command).
This command allows the to score swiftly a large number of traits combinations and to focus on set of traits the most promising for multi-trait testing.
To work the inittable provided to the **jass predict-gain** command must contain the genetic covariance between traits.
.. code-block:: shell
jass predict-gain --inittable-path inittable_curated_111_traits_20-03-2024.hdf5 --combination_path ./combination_example.tsv --gain-path predicted_gain.tsv
The second argument (--combination_path) is a path to a file containing the set of traits to be scored.
.. csv-table:: Set of traits
:widths: 20, 70
:header-rows: 1
GRP1,z_GIANT_HIP z_GLG_HDL z_GLG_LDL z_MAGIC_2HGLU-ADJBMI
GRP2,z_SPIRO-UKB_FVC z_SPIRO-UKB_FEV1 z_TAGC_ASTHMA
When executed the command will created a report at --gain-path
.. csv-table:: Predicted gain
:header-rows: 1
traits,k,avg_distance_cor,mean_gencov,avg_Neff,avg_h2,avg_perc_h2_diff_region,log10_mean_gencov,log10_avg_distance_cor,gain
['z_SPIRO-UKB_FVC'; 'z_SPIRO-UKB_FEV1'; 'z_TAGC_ASTHMA'],0.1,0.1731946683845993,0.0637,0.3843393026739591,0.2785193310634847,0.7976315890930669,0.8139196701681637,0.8013809378674498,0.06428524764535551
['z_GIANT_HIP'; 'z_GLG_HDL'; 'z_GLG_LDL'; 'z_MAGIC_2HGLU-ADJBMI'],0.2,0.14899001074867035,0.01535,0.12076877719858631,0.22628198390356655,0.9055326131023057,0.6573854616675169,0.7879956172999502,-0.010766494024690904
The last column provide the predicted gain ("the higher the more promising"). Note that extrapoling on new data might give lesser performances than reported in :cite:`suzuki2023trait`.
.. bibliography:: reference.bib
\ No newline at end of file
......@@ -14,6 +14,7 @@ JASS documentation
install
data_import
generating_joint_analysis
get_predicted_gain
command_line_usage
web_usage
web_admin
......
......@@ -20,6 +20,13 @@
publisher={Oxford University Press}
}
@article{suzuki2023trait,
title={Trait selection strategy in multi-trait GWAS: Boosting SNPs discoverability},
author={Suzuki, Yuka and M{\'e}nager, Herv{\'e} and Brancotte, Bryan and Vernet, Rapha{\"e}l and Nerin, Cyril and Boetto, Christophe and Auvergne, Antoine and Linhard, Christophe and Torchet, Rachel and Lechat, Pierre and others},
journal={bioRxiv},
year={2023},
publisher={Cold Spring Harbor Laboratory Preprints}
}
@article{Pasaniuc2014,
......
......@@ -21,6 +21,8 @@ from jass.models.plots import (
create_local_plot,
create_qq_plot,
)
from jass.models.gain import create_features, compute_gain
from pandas import read_hdf
def absolute_path_of_the_file(fileName, output_file=False):
......@@ -279,6 +281,15 @@ def w_gene_annotation(args):
gene_data_path, initTable_path, df_gene_csv_path, df_exon_csv_path
)
def w_compute_gain(args):
inittable_path = absolute_path_of_the_file(args.inittable_path)
combi_path = absolute_path_of_the_file(args.combination_path)
combi_path_with_gain = absolute_path_of_the_file(args.gain_path, True)
features = create_features(inittable_path, combi_path)
compute_gain(features, combi_path_with_gain)
def get_parser():
parser = argparse.ArgumentParser(prog="jass")
......@@ -619,6 +630,27 @@ def get_parser():
help="Existing key are 'SumStatTab' : The results of the joint analysis by SNPs - 'PhenoList' : the meta data of analysed GWAS - 'COV' : The H0 covariance used to perform joint analysis - 'GENCOV' (If present in the initTable): The genetic covariance as computed by the LDscore. Uniquely for the worktable: 'Regions' : Results of the joint analysis summarised by LD regions (Notably Lead SNPs by regions) - 'summaryTable': a double entry table summarizing the number of significant regions by test (univariate vs joint test)",
)
parser_create_mp.set_defaults(func=w_extract_tsv)
# ------- compute predicted gain -------#
parser_create_mp = subparsers.add_parser(
"predict-gain", help="Predict gain based on the genetic architecture of the set of multi-trait. To function, this command need the inittable to contain genetic covariance store under the key 'GEN_COV in the inittable'"
)
parser_create_mp.add_argument(
"--inittable-path",
required=True,
help="Path to the inittable",
)
parser_create_mp.add_argument(
"--combination-path",
required=True,
help="Path to the file storing combination to be scored",
)
parser_create_mp.add_argument(
"--gain-path", required=True, help="path to save predicted gain"
)
parser_create_mp.set_defaults(func=w_compute_gain)
return parser
......
0
k_coef_mv 0.07740334977380119
log10_avg_distance_cor_coef_mv -0.6999110771883902
log10_mean_gencov_coef_mv 0.746794584985343
avg_Neff_coef_mv 0.07289261717080556
avg_h2_coef_mv -0.516496395500929
avg_perc_h2_diff_region_coef_mv 0.15727591593399
minimum_value maximum_value
k 2.0 12.0
log10_avg_distance_cor -4.675617219570908 0.20864138105896807
log10_mean_gencov -4.4093921991254446 -0.46117501106209624
avg_Neff 6730.5 697828.0
avg_h2 0.014033707225812 0.4361454950334251
avg_perc_h2_diff_region 0.0906544694784672 0.9831222899777692
import pkg_resources
import pandas as pd
import numpy as np
import os
# data issued from https://doi.org/10.1101/2023.10.27.564319
stream = pkg_resources.resource_stream(__name__, 'data/range_feature_gain_prediction.tsv')
X_range = pd.read_csv(stream, sep="\t", index_col=0)
stream = pkg_resources.resource_stream(__name__, 'data/coef_mean_model.tsv')
model_coefficients = pd.read_csv(stream, sep="\t", index_col=0)
# Scale according to observed
def scale_feature(X, feature_name):
X_std = (X - X_range.loc[feature_name, "minimum_value"]) / ( X_range.loc[feature_name, "maximum_value"] - X_range.loc[feature_name, "minimum_value"])
return X_std
def preprocess_feature(df_combinations):
# transformation of features
df_combinations['log10_mean_gencov'] = np.log10(df_combinations.mean_gencov)
df_combinations['log10_avg_distance_cor'] = np.log10(df_combinations.avg_distance_cor)
for f in ["k", "log10_avg_distance_cor", "log10_mean_gencov", "avg_Neff", "avg_h2", "avg_perc_h2_diff_region"]:
df_combinations[f] = scale_feature(df_combinations[f], f)
return df_combinations
def compute_gain(df_combinations, path_output):
preprocess_feature(df_combinations)
df_combinations["gain"] = df_combinations[["k", "log10_avg_distance_cor", "log10_mean_gencov", "avg_Neff", "avg_h2", "avg_perc_h2_diff_region"]].dot(model_coefficients["0"].values)
df_combinations.sort_values(by="gain", ascending=False).to_csv(path_output, sep="\t")
# cov to cor
def cov2cor(c):
"""
Return a correlation matrix given a covariance matrix.
: c = covariance matrix
"""
D = 1 / np.sqrt(np.diag(c)) # takes the inverse of sqrt of diag.
return D * c * D
def compute_detected_undected_h2(inittable_path):
phenoL = pd.read_hdf(inittable_path, "PhenoList")
gen_cov = pd.read_hdf(inittable_path, "GEN_COV")
region = pd.read_hdf(inittable_path, "Regions")
combi_c = list(phenoL.index)
reg_start=0
reg_end=50
chunk_size=50
Nchunk = region.shape[0] // chunk_size + 1
start_value = 0
zscore_threshold = 5.452
h2_GW = np.zeros(len(combi_c))
for chunk in range(start_value, Nchunk):
print(chunk)
binf = chunk * chunk_size
bsup = (chunk + 1) * chunk_size
init_extract = pd.read_hdf(inittable_path, "SumStatTab", where= "Region >= {0} and Region < {1}".format(reg_start, reg_end))
init_extract[combi_c] = init_extract[combi_c].abs()
max_zscore = init_extract[["Region"] + combi_c].groupby("Region").max()
Neff_term = np.ones(max_zscore.shape)
Neff_term = Neff_term* (1/phenoL.loc[combi_c, "Effective_sample_size"].values)
beta_2 = max_zscore.mask((max_zscore < zscore_threshold))
beta_2 = beta_2 * np.sqrt(Neff_term)
beta_2 = beta_2.mask( (beta_2 > 0.019))
h2_GW += (beta_2**2).sum()
h2 = np.diag(gen_cov.loc[combi_c, combi_c])
undetected_h2 = ((h2 - h2_GW) / h2)
phenoL["h2"] = h2
phenoL["h2_GW"] = h2_GW
phenoL["undetected_h2_perc"] = undetected_h2
return phenoL
def add_h2_to_pheno_description(inittable_path):
phenoL_before = pd.read_hdf(inittable_path, "PhenoList")
if "avg_perc_h2_diff_region" in phenoL_before.columns:
phenoL = compute_detected_undected_h2(inittable_path)
phenoL.to_hdf(inittable_path, key="table")
def compute_mean_cov(cov, combi_c):
rows, cols = np.indices(cov.loc[combi_c, combi_c].shape)
mean_gencov = cov.loc[combi_c, combi_c].where(rows != cols).stack().abs().mean()
return mean_gencov
def compute_diff_cor(res_cov, gen_cov, combi_c):
res_cor = cov2cor(res_cov.loc[combi_c, combi_c])
gen_cor = cov2cor(gen_cov.loc[combi_c, combi_c])
rows, cols = np.indices(res_cor.loc[combi_c, combi_c].shape)
off_gencor = res_cor.where(rows != cols).stack()
off_rescor = gen_cor.where(rows != cols).stack()
return (off_gencor - off_rescor).abs().mean()
def compute_mean_undetected_h2(phenoL, combi_c):
mean_h2 = np.mean(phenoL.loc[combi_c, "undetected_h2_perc"])
return mean_h2
def compute_mean_h2(phenoL, combi_c):
mean_h2 = np.mean(phenoL.loc[combi_c, "h2"])
return mean_h2
def compute_mean_Neff(phenoL, combi_c):
mean_neff = np.mean(phenoL.loc[combi_c, "Effective_sample_size"])
return mean_neff
#beta_2_GW = beta_2_GW * (1/phenoL.loc[combi_c, "Effective_sample_size"].values)
def create_features(inittable_path, combi_file):
add_h2_to_pheno_description(inittable_path)
phenoL = pd.read_hdf(inittable_path, "PhenoList")
gen_cov = pd.read_hdf(inittable_path, "GEN_COV")
res_cov = pd.read_hdf(inittable_path, "COV")
combi = pd.read_csv(combi_file, sep=",", index_col=0, names=["combi"])
combi = list(combi.combi.str.split(" "))
D = {'traits':[] ,'k':[], 'avg_distance_cor': [], 'mean_gencov': [],
'avg_Neff':[], 'avg_h2':[], 'avg_perc_h2_diff_region':[]}
for c in combi:
D['traits'].append(str(c))
D["k"].append(len(c))
D["avg_distance_cor"].append(compute_diff_cor(res_cov, gen_cov, c))
D["mean_gencov"].append(compute_mean_cov(gen_cov, c))
D["avg_Neff"].append(compute_mean_Neff(phenoL, c))
D["avg_h2"].append(compute_mean_h2(phenoL, c))
D["avg_perc_h2_diff_region"].append(compute_mean_undetected_h2(phenoL, c))
return pd.DataFrame.from_dict(D)
......@@ -31,7 +31,7 @@ setup(
install_requires=REQUIRES,
license="MIT",
keywords=["GWAS", "Data analysis", "summary statistics"],
package_data={'jass': ['swagger/swagger.yaml']},
package_data={'jass': ['swagger/swagger.yaml', "data/*.tsv"]},
include_package_data=True,
long_description=readme,
long_description_content_type="text/markdown",
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment