Commit 3901cc4d authored by Hanna  JULIENNE's avatar Hanna JULIENNE
Browse files

improvement performance report

parent 98125ff6
......@@ -8,6 +8,7 @@
from raiss.pipes import save_chromosome_imputation
import multiprocessing
import itertools
from joblib import Parallel, delayed
import pandas as pd
import numpy as np
......@@ -151,7 +152,9 @@ def z_amplitude_effect(zscore_folder, masked_folder, output_folder, ref_folder,
def grid_search(zscore_folder, masked_folder, output_folder,
ref_folder, ld_folder, gwas, chrom="chr22",
eigen_ratio_grid = [0.5, 0.1, 0.01], window_size= 500000,
eigen_ratio_grid = [0.5, 0.1, 0.01],
ld_threshold_grid = [0,4, 10,20],
window_size= 500000,
buffer_size=125000, l2_regularization=0.1, R2_threshold=0.6,
N_to_mask=5000,ref_panel_preffix="",ref_panel_suffix=".eur.1pct.bim", ld_type="plink",
stratifying_vector=None, stratifying_bins=None, LD_threshold=4):
......@@ -172,6 +175,7 @@ def grid_search(zscore_folder, masked_folder, output_folder,
gwas (str): gwas identifier in the following format : 'CONSORTIA_TRAIT'
chrom (str): chromosome in the format "chr.."
eigen_ratio_grid (list): list of eigen_ratio to test (must be between 0 and 1)
ld_threshold_grid (list): list of minimum LD thresholds to test (must be >= 0)
window_size, buffer_size, l2_regularization, R2_threshold : imputation parameter (see raiss command line documentation)
N_to_mask (int): Number of SNPs masked in the initial dataset to compute the correlation between true value and imputed value
ref_panel_suffix (str): suffix
......@@ -190,39 +194,45 @@ def grid_search(zscore_folder, masked_folder, output_folder,
z_masked.to_csv(z_masked_file, sep="\t")
masked_SNP = res_masked[1]
def run_imputation(cond):
tag = "_{}".format(cond)
def run_imputation(param):
cond = param[0]
min_ld = param[1]
tag = "_{0}_{1}".format(cond, min_ld)
save_chromosome_imputation(gwas, chrom, window_size, buffer_size,
l2_regularization, cond, masked_folder,
ref_folder, ld_folder, output_folder,
R2_threshold, tag, ref_panel_preffix,
ref_panel_suffix, ld_type, minimum_ld = LD_threshold)
ref_panel_suffix, ld_type, minimum_ld = min_ld)
n_cpu = multiprocessing.cpu_count()
Parallel(n_jobs=n_cpu)(delayed(run_imputation)(rd) for rd in eigen_ratio_grid)
param_grid = itertools.product(eigen_ratio_grid, ld_threshold_grid)
R2_serie = pd.DataFrame({'N_SNP':np.nan, 'fraction_imputed': np.nan, 'cor':np.nan, 'mean_absolute_error':np.nan, 'median_absolute_error':np.nan,
'min_absolute_error':np.nan,'max_absolute_error':np.nan, "SNP_max_error":np.nan}, index = eigen_ratio_grid)
for rd in eigen_ratio_grid:
z_output = "{0}/z_{1}_{2}_{3}.txt".format(output_folder, gwas, chrom, rd)
dat_imp = pd.read_csv(z_output, sep="\t", index_col=0)
print(rd)
try:
res = imputation_performance(dat_orig, dat_imp, masked_SNP)
except KeyError: # If KeyError none of the masked_SNP are in the imputed dataframe
print(e)
res = np.nan
R2_serie.loc[rd, 'N_SNP'] = res["N_SNP"]
R2_serie.loc[rd, 'cor'] = res["cor"]
R2_serie.loc[rd, 'mean_absolute_error'] = res["mean_absolute_error"]
R2_serie.loc[rd, 'fraction_imputed'] = res["fraction_imputed"]
R2_serie.loc[rd, 'median_absolute_error'] = res["median_absolute_error"]
R2_serie.loc[rd, 'min_absolute_error'] = res["min_absolute_error"]
R2_serie.loc[rd, 'max_absolute_error'] = res["max_absolute_error"]
R2_serie.loc[rd, 'SNP_max_error'] = res["SNP_max_error"]
print(len(masked_SNP))
print("Result for rd {0} = cor: {1}, fraction_imputed: {2}".format(rd, res["cor"], res["fraction_imputed"]))
'min_absolute_error':np.nan,'max_absolute_error':np.nan, "SNP_max_error":np.nan}, index = pd.MultiIndex.from_tuples(param_grid, names=["eigen_ratio", "min_ld"]))
param_grid = itertools.product(eigen_ratio_grid, ld_threshold_grid)
Parallel(n_jobs=n_cpu)(delayed(run_imputation)(param) for param in param_grid)
for min_ld in ld_threshold_grid:
for rd in eigen_ratio_grid:
z_output = "{0}/z_{1}_{2}_{3}_{4}.txt".format(output_folder, gwas, chrom, rd, min_ld)
dat_imp = pd.read_csv(z_output, sep="\t", index_col=0)
print(rd)
try:
res = imputation_performance(dat_orig, dat_imp, masked_SNP)
except KeyError: # If KeyError none of the masked_SNP are in the imputed dataframe
print(e)
res = np.nan
ind_loop = (rd, min_ld)
R2_serie.loc[ind_loop, 'N_SNP'] = res["N_SNP"]
R2_serie.loc[ind_loop, 'cor'] = res["cor"]
R2_serie.loc[ind_loop, 'mean_absolute_error'] = res["mean_absolute_error"]
R2_serie.loc[ind_loop, 'fraction_imputed'] = res["fraction_imputed"]
R2_serie.loc[ind_loop, 'median_absolute_error'] = res["median_absolute_error"]
R2_serie.loc[ind_loop, 'min_absolute_error'] = res["min_absolute_error"]
R2_serie.loc[ind_loop, 'max_absolute_error'] = res["max_absolute_error"]
R2_serie.loc[ind_loop, 'SNP_max_error'] = res["SNP_max_error"]
print(len(masked_SNP))
print("Result for ind_loop {0} = cor: {1}, fraction_imputed: {2}".format(ind_loop, res["cor"], res["fraction_imputed"]))
return(R2_serie)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment