command line tool to compute gain

fe5f676a · Hanna JULIENNE · 18f81184 · fe5f676a · fe5f676a · fe5f676a
Commit fe5f676a authored 1 year ago by Hanna JULIENNE
--- a/data/coef_mean_model.tsv
+++ b/data/coef_mean_model.tsv
+	0
+k_coef_mv	0.07740334977380119
+log10_avg_distance_cor_coef_mv	-0.6999110771883902
+log10_mean_gencov_coef_mv	0.746794584985343
+avg_Neff_coef_mv	0.07289261717080556
+avg_h2_mixer_coef_mv	-0.516496395500929
+avg_perc_h2_diff_region_coef_mv	0.15727591593399
--- a/data/combi_sample.tsv
+++ b/data/combi_sample.tsv
--- a/data/range_feature_gain_prediction.tsv
+++ b/data/range_feature_gain_prediction.tsv
+	minimum_value	maximum_value
+k	2.0	12.0
+log10_avg_distance_cor	-4.675617219570908	0.20864138105896807
+log10_mean_gencov	-4.4093921991254446	-0.46117501106209624
+avg_Neff	6730.5	697828.0
+avg_h2_mixer	0.014033707225812	0.4361454950334251
+avg_perc_h2_diff_region	0.0906544694784672	0.9831222899777692
--- a/jass/__main__.py
+++ b/jass/__main__.py
@@ -21,6 +21,8 @@ from jass.models.plots import (
    create_local_plot,
    create_qq_plot,
 )
+from jass.models.gain import compute_gain
+
 from pandas import read_hdf

 def absolute_path_of_the_file(fileName, output_file=False):
@@ -279,6 +281,14 @@ def w_gene_annotation(args):
        gene_data_path, initTable_path, df_gene_csv_path, df_exon_csv_path
    )

+def w_compute_gain(args):
+    combi_path = absolute_path_of_the_file(args.combination_path)
+    combi_path_with_gain = absolute_path_of_the_file(args.gain_path, True)
+
+    compute_gain(
+        combi_path, combi_path_with_gain
+    )
+

 def get_parser():
    parser = argparse.ArgumentParser(prog="jass")
@@ -619,6 +629,22 @@ def get_parser():
        help="Existing key are 'SumStatTab' : The results of the joint analysis by SNPs - 'PhenoList' : the meta data of analysed GWAS - 'COV' : The H0 covariance used to perform joint analysis - 'GENCOV' (If present in the initTable): The genetic covariance as computed by the LDscore. Uniquely for the worktable: 'Regions' : Results of the joint analysis summarised by LD regions (Notably Lead SNPs by regions) - 'summaryTable': a double entry table summarizing the number of significant regions by test (univariate vs joint test)",
    )
    parser_create_mp.set_defaults(func=w_extract_tsv)
+    
+    # ------- compute predicted gain -------#
+    parser_create_mp = subparsers.add_parser(
+        "predict-gain", help="predict gain based on the genetic architecture of the set of multi-trait"
+    )
+    parser_create_mp.add_argument(
+        "--combination-path",
+        required=True,
+        help="path to the worktable file containing the data",
+    )
+    parser_create_mp.add_argument(
+        "--gain-path", required=True, help="path to save predicted gain"
+    )
+
+    parser_create_mp.set_defaults(func=w_compute_gain)
+    
    return parser



--- a/jass/models/gain.py
+++ b/jass/models/gain.py
+from pathlib import Path
+
+import numpy as np
+import pandas as pd
+
+X_range = pd.read_csv("./data/range_feature_gain_prediction.tsv", sep="\t", index_col=0)
+model_coefficients =  pd.read_csv("./data/coef_mean_model.tsv", sep="\t", index_col=0)
+
+# Scale according to observed 
+def scale_feature(X, feature_name):
+    X_std = (X - X_range.loc[feature_name, "minimum_value"]) / ( X_range.loc[feature_name, "maximum_value"] -  X_range.loc[feature_name, "minimum_value"])
+    return X_std
+
+def preprocess_feature(df_combinations):
+    # transformation of features
+ 
+    df_combinations['log10_mean_gencov'] = np.log10(df_combinations.mean_gencov)
+    df_combinations['log10_avg_distance_cor'] = np.log10(df_combinations.avg_distance_cor)
+    for f in ["k", "log10_avg_distance_cor", "log10_mean_gencov", "avg_Neff", "avg_h2_mixer", "avg_perc_h2_diff_region"]:
+        df_combinations[f] = scale_feature(df_combinations[f], f)
+    return df_combinations
+
+
+def compute_gain(path_combi, path_output):
+
+    df_combinations = pd.read_csv(path_combi)
+
+    preprocess_feature(df_combinations)
+    df_combinations["gain"] = df_combinations[["k", "log10_avg_distance_cor", "log10_mean_gencov", "avg_Neff", "avg_h2_mixer", "avg_perc_h2_diff_region"]].dot(model_coefficients["0"].values)
+    df_combinations.sort_values(by="gain", ascending=False).to_csv(path_output, sep="\t")