Commit df0a48a6 authored by Hanna JULIENNE

Reporting more information in the performance evaluation module

parent e59f1f07
Pipeline #9931 passed with stages in 1 minute and 14 seconds
This source diff could not be displayed because it is too large.
# coding: utf-8
# Direct imputation in the presence of causal SNPs
import numpy as np
import pandas as pd
import seaborn as sns
import scipy
import raiss


def exponential_decay(nsnp):
    exp_decay = np.zeros((nsnp, nsnp))
    dec = np.exp(-np.linspace(0, 4, nsnp))
    for i in range(nsnp):
        exp_decay[i] = np.roll(dec, i)
    tr = np.triu(exp_decay, k=1)
    tr = tr + np.transpose(tr)
    np.fill_diagonal(tr, 1)
    return tr
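
# Note (editorial addition for clarity): the matrix returned above is symmetric, has ones on the
# diagonal and off-diagonal entries decaying as exp(-4*|i-j|/(nsnp-1)) with the index distance |i-j|.
# For instance (values rounded):
#   exponential_decay(3)
#   array([[1.   , 0.135, 0.018],
#          [0.135, 1.   , 0.135],
#          [0.018, 0.135, 1.   ]])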


# Create a vector of causal effects with a single causal SNP in the middle of the region
def generate_one_causal(amplitude, LD_cor):
    nsnp = LD_cor.shape[0]
    beta = np.random.normal(size=nsnp)
    beta[nsnp // 2] = amplitude
    Zscore = beta.dot(LD_cor) / nsnp
    return {"causal": beta, "Zscore": Zscore}


def generate_two_causal(amplitude, LD_cor):
    nsnp = LD_cor.shape[0]
    beta = np.random.normal(size=nsnp)
    beta[nsnp // 4] = amplitude
    beta[3 * (nsnp // 4)] = amplitude
    Zscore = beta.dot(LD_cor) / nsnp
    return {"causal": beta, "Zscore": Zscore}


def get_cor(rd, Zscore, LD_cor, ids_known, ids_masked):
    imputed = raiss.stat_models.raiss_model(Zscore[ids_known],
                                            pd.DataFrame(LD_cor[ids_known, :][:, ids_known]),
                                            LD_cor[ids_masked, :][:, ids_known], rcond=rd)
    cor_perf = np.corrcoef(imputed['mu'], Zscore[ids_masked])[0, 1]
    return cor_perf


# Search the rcond value giving the best imputation of the masked SNPs
def best_rd(zscore, LD_cor, ids_known, ids_masked):
    rd_list = np.linspace(0.001, 0.75, 100)
    cor_perf = pd.Series(index=rd_list)
    for rd in rd_list:
        cor_perf[rd] = get_cor(rd, zscore, LD_cor, ids_known, ids_masked)
    return {"correlation": cor_perf.max(), "rcond": cor_perf.idxmax()}


if __name__ == "__main__":
    nsnp = 100
    # Build a synthetic LD matrix: random correlations damped by an exponential decay with distance
    LD = np.random.uniform(low=0.4, high=1.0, size=(nsnp, nsnp))
    M_dec = exponential_decay(nsnp)
    np.fill_diagonal(LD, 1)
    LD = (LD + LD.transpose()) / 2
    LD_cor = np.multiply(LD, M_dec)

    # Mask 40 SNPs at random
    n_masked = 40
    ids_masked = np.random.choice(nsnp, n_masked, replace=False)
    ids_known = np.setdiff1d(np.arange(nsnp), ids_masked)

    # Measure imputation performance for increasing causal effect amplitudes
    amplitude_effect = pd.Series(index=np.arange(5, 101, 5))
    for amp in amplitude_effect.index:
        print('search of best rd for amplitude {}'.format(amp))
        Zscore = generate_two_causal(amp, LD_cor)["Zscore"]
        res = best_rd(Zscore, LD_cor, ids_known, ids_masked)
        print("best rd : {}".format(res['rcond']))
        print("Correlation : {}".format(res['correlation']))
        amplitude_effect.loc[amp] = res['correlation']
    amplitude_effect.to_csv("./results/amplitude_effect_two_SNP.csv")
@@ -14,3 +14,4 @@ import raiss.stat_models as model
import raiss.windows
import raiss.filter_format_output
from raiss.imputation_launcher import ImputationLauncher
import raiss.imputation_R2
@@ -2,6 +2,7 @@ import argparse
import pandas as pd
from raiss.filter_format_output import filter_output
from raiss.imputation_launcher import ImputationLauncher
from raiss.pipes import save_chromosome_imputation
def launch_chromosome_imputation(args):
@@ -13,26 +14,7 @@ def launch_chromosome_imputation(args):
        args (dict): Argument parsed from the command line see the
            __main__.add_chromosome_imputation_argument(parser) function.
    """
print("Imputation of {0} gwas for chromosome {1}".format(args.gwas, args.chrom))
# Imputer settings
imputer = ImputationLauncher( window_size=int(args.window_size), buf=int(args.buffer_size),
lamb= float(args.l2_regularization), pinv_rcond = float(args.eigen_threshold))
# Reading of inputs
z_file = "{0}/z_{1}_{2}.txt".format(args.zscore_folder, args.gwas, args.chrom)
zscore = pd.read_csv(z_file,index_col=0, sep="\t")
ref_panel_file = args.ref_folder + "/"+ args.chrom +".eur.1pct.bim"
ref_panel = pd.read_csv(ref_panel_file, sep="\t", names=['chr', "nothing", 'pos', 'Ref_all', 'alt_all'], index_col = 1)
# imputation
imputed_zscore = imputer.chromosome_imputation(args.chrom, zscore, ref_panel, args.ld_folder)
print("Imputation DONE")
#Formatting and filtering
# and Saving results
z_fo = "{0}/z_{1}_{2}.txt".format(args.output_folder, args.gwas, args.chrom)
filter_output(imputed_zscore, z_fo, float(args.R2_threshold))
print("Save imputation done at {0}".format(z_fo))
    save_chromosome_imputation(args.gwas, args.chrom, args.window_size, args.buffer_size,
                               args.l2_regularization, args.eigen_threshold, args.zscore_folder,
                               args.ref_folder, args.ld_folder, args.output_folder, args.R2_threshold)
def add_chromosome_imputation_argument():
@@ -56,8 +38,6 @@ def add_chromosome_imputation_argument():
    return(parser)
def main():
    # prog='impute_jass')
    parser = add_chromosome_imputation_argument()
    args = parser.parse_args()
    args.func(args)
......
@@ -18,6 +18,6 @@ def filter_output(zscores, fout, R2_threshold = 0.8):
    zscores.reset_index(inplace = True)
    chr_fo = zscores[['index', 'pos', 'A0', 'A1', 'Z', 'Var', "ld_score"]]
    chr_fo.columns = ['snp_ids', 'position', 'Ref_allele', 'Alt_allele', 'z_score', 'Var', "ld_score"]
    chr_fo.columns = ['rsID', 'pos', 'A0', 'A1', 'Z', 'Var', "ld_score"]
    chr_fo.loc[chr_fo.Var < (1 - R2_threshold)].to_csv(fout, sep="\t", index=False)
from raiss.pipes import save_chromosome_imputation
import multiprocessing
from joblib import Parallel, delayed
import pandas as pd
import numpy as np
"""
Function to generate test dataset with missing SNPs, impute missing SNPs
and compare precision with original dataset.
In other word, this module provides function to empirically measure imputation R2 on independant dataset.
"""


def generated_test_data(zscore, N_to_mask=5000, condition=None):
    """
    Mask N_to_mask SNPs in the zscore dataframe and return the dataframe restricted
    to the remaining (known) SNPs together with the ids of the masked SNPs.

    Args:
        zscore (pandas DataFrame): Z-score DataFrame
        N_to_mask (int): number of SNPs to mask.
        condition (None or pandas boolean Series): if None, SNPs are masked at random.
            If a boolean Series is passed, masked SNPs are drawn at random among the
            SNPs with a True value.
    """
    if not isinstance(condition, pd.Series):
        masked = np.random.choice(zscore.index, N_to_mask, replace=False)
    else:
        try:
            masked = np.random.choice(zscore.index[condition], N_to_mask, replace=False)
        except ValueError as ve:
            print("Couldn't sample {0} SNPs".format(N_to_mask))
            print("raise {0}".format(ve))
            raise
    known = zscore.index.difference(masked)
    return (zscore.loc[known], masked)
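
# Usage sketch (editorial addition; "zscore" is a hypothetical dataframe with a "Z" column):
#   known_z, masked_ids = generated_test_data(zscore, N_to_mask=1000,
#                                             condition=(zscore.Z.abs() > 2))
# known_z can be written to disk and re-imputed, while masked_ids is kept aside to
# compare the imputed values against the original Z scores.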


def imputation_performance(zscore_initial, zscore_imputed, masked):
    """
    Compare the original and the imputed Z scores on the masked SNPs.

    Args:
        zscore_initial (pandas DataFrame): Z scores before masking
        zscore_imputed (pandas DataFrame): Z scores after imputation
        masked: ids of the SNPs that were masked before imputation
    """
    try:
        fraction_imputed = 1.0 - zscore_imputed.loc[masked, "Z"].isnull().mean()
        cor = zscore_initial.loc[masked, "Z"].corr(zscore_imputed.loc[masked, "Z"].fillna(0))
        cor_on_imputed = zscore_initial.loc[masked, "Z"].corr(zscore_imputed.loc[masked, "Z"])
        return {'fraction_imputed': fraction_imputed, 'cor': cor, 'cor_on_imputed': cor_on_imputed}
    except KeyError:  # if KeyError, none of the masked SNPs are in the imputed dataframe
        return {'fraction_imputed': np.nan, 'cor': np.nan, 'cor_on_imputed': np.nan}
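
# Usage sketch (editorial addition; file names are hypothetical):
#   dat_orig = pd.read_csv("z_GWAS1_chr22.txt", sep="\t", index_col=0)
#   dat_imp = pd.read_csv("z_GWAS1_chr22_imputed.txt", sep="\t", index_col=0)
#   perf = imputation_performance(dat_orig, dat_imp, masked_ids)
# perf["cor"] treats masked SNPs that were not imputed as 0, whereas
# perf["cor_on_imputed"] is restricted to the masked SNPs that were actually imputed.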


def z_amplitude_effect(zscore_folder, masked_folder, output_folder, ref_folder, ld_folder, gwas,
                       z_treshold=[0, 1.0, 2.0, 3.0, 4.0, 5.0], window_size=500000, buffer_size=125000,
                       eigen_ratio=0.1, chrom="chr22", l2_regularization=0.1, R2_threshold=0.6):
    """
    Compute the imputation performance on SNPs of different amplitudes.
    The procedure is the following:
        * for each threshold z in **z_treshold**:
            * mask half of the SNPs with a Z score above z in the input dataset
            * impute the masked SNPs
            * compute the performance on the masked SNPs

    Args:
        zscore_folder (str): path toward the input data folder
        masked_folder (str): path toward the folder where the dataframes with masked SNPs are saved
        output_folder (str): path toward the folder where the imputed dataframes are saved
        ref_folder (str): path toward the reference panel folder
        ld_folder (str): path toward the linkage disequilibrium matrices
        gwas (str): gwas identifier in the following format: 'CONSORTIA_TRAIT'
        chrom (str): chromosome in the format "chr.."
        z_treshold (list): list of Z-score thresholds; SNPs above a threshold are candidates for masking
        eigen_ratio (float): rcond parameter (must be between 0 and 1)
        window_size, buffer_size, l2_regularization, R2_threshold: imputation parameters (see the raiss command line documentation)
    """
z_file = "{0}/z_{1}_{2}.txt".format(zscore_folder, gwas, chrom)
zscore = pd.read_csv(z_file, index_col=0, sep="\t")
z_output = "{0}/z_{1}_{2}.txt".format(output_folder, gwas, chrom)
dat_orig = pd.read_csv(z_file, sep="\t", index_col=0)
def run_imputation(z, i):
print("Z score threshold : {}".format(z))
condition = (dat_orig.Z > z)
print("Number of SNPs verifying condition : {}".format(condition.sum()))
N_to_mask = np.int(np.floor(condition.sum()*0.5))
res_masked = generated_test_data(dat_orig, N_to_mask= N_to_mask, condition = condition)
z_masked = res_masked[0]
z_masked_file = "{0}/z_{1}_{2}_{3}.txt".format(masked_folder, gwas, i, chrom)
z_masked.to_csv(z_masked_file, sep="\t")
masked_SNP = res_masked[1]
tag = "_{}".format(z)
gwas_proc = "{0}_{1}".format(gwas, i)
save_chromosome_imputation(gwas_proc, chrom, window_size, buffer_size, l2_regularization, eigen_ratio, masked_folder, ref_folder, ld_folder, output_folder, R2_threshold, tag)
z_output = "{0}/z_{1}_{2}{3}.txt".format(output_folder, gwas_proc, chrom, tag)
dat_imp = pd.read_csv(z_output, sep="\t", index_col=0)
R2 = imputation_performance(dat_orig, dat_imp, masked_SNP)
print("R2 for {0} Zscore z_treshold: {1}".format(z, R2))
return(R2)
n_cpu = multiprocessing.cpu_count()
R2_val = Parallel(n_jobs=n_cpu)(delayed(run_imputation)(z, i) for i,z in enumerate(z_treshold))
R2_serie =pd.DataFrame(R2_val, columns = ['fraction_imputed','cor','cor_on_imputed'], index = z_treshold)
return(R2_serie)
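
# Usage sketch (editorial addition; folder paths and the gwas name are hypothetical):
#   r2_by_threshold = z_amplitude_effect("data/zscores", "data/masked", "data/imputed",
#                                        "data/ref_panel", "data/ld", "CONSORTIA_TRAIT",
#                                        chrom="chr22")
# The returned dataframe is indexed by the Z-score thresholds and reports, for each threshold,
# the fraction of masked SNPs that were imputed and the correlations computed by
# imputation_performance.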


def grid_search(zscore_folder, masked_folder, output_folder, ref_folder, ld_folder, gwas,
                chrom="chr22", eigen_ratio_grid=[0.5, 0.1, 0.01], window_size=500000,
                buffer_size=125000, l2_regularization=0.1, R2_threshold=0.6):
    """
    Compute the imputation performance for several eigen ratios.
    The procedure is the following:
        * mask SNPs in the input dataset
        * for each eigen ratio in **eigen_ratio_grid**:
            * impute the masked SNPs
            * compute the performance on the masked SNPs

    Args:
        zscore_folder (str): path toward the input data folder
        masked_folder (str): path toward the folder where the dataframe with masked SNPs is saved
        output_folder (str): path toward the folder where the imputed dataframes are saved
        ref_folder (str): path toward the reference panel folder
        ld_folder (str): path toward the linkage disequilibrium matrices
        gwas (str): gwas identifier in the following format: 'CONSORTIA_TRAIT'
        chrom (str): chromosome in the format "chr.."
        eigen_ratio_grid (list): list of eigen ratios to test (must be between 0 and 1)
        window_size, buffer_size, l2_regularization, R2_threshold: imputation parameters (see the raiss command line documentation)
    """
z_file = "{0}/z_{1}_{2}.txt".format(zscore_folder, gwas, chrom)
z_output = "{0}/z_{1}_{2}.txt".format(output_folder, gwas, chrom)
dat_orig = pd.read_csv(z_file, sep="\t", index_col=0)
res_masked = generated_test_data(dat_orig)
z_masked = res_masked[0]
z_masked_file = "{0}/z_{1}_{2}.txt".format(masked_folder, gwas, chrom)
z_masked.to_csv(z_masked_file, sep="\t")
masked_SNP = res_masked[1]
def run_imputation(cond):
tag = "_{}".format(cond)
save_chromosome_imputation(gwas, chrom, window_size, buffer_size, l2_regularization, cond, masked_folder, ref_folder, ld_folder, output_folder, R2_threshold, tag)
n_cpu = multiprocessing.cpu_count()
Parallel(n_jobs=n_cpu)(delayed(run_imputation)(rd) for rd in eigen_ratio_grid)
R2_serie = pd.DataFrame({"cor":np.nan, "cor_on_imputed":np.nan,
"fraction_imputed":np.nan}, index = eigen_ratio_grid)
for rd in eigen_ratio_grid:
z_output = "{0}/z_{1}_{2}_{3}.txt".format(output_folder, gwas, chrom, rd)
dat_imp = pd.read_csv(z_output, sep="\t", index_col=0)
try:
res = imputation_performance(dat_orig, dat_imp, masked_SNP)
except KeyError: # If KeyError none of the masked_SNP are in the imputed dataframe
res = np.nan
R2_serie.loc[rd, 'cor'] = res["cor"]
R2_serie.loc[rd, 'cor_on_imputed'] = res["cor_on_imputed"]
R2_serie.loc[rd, 'fraction_imputed'] = res["fraction_imputed"]
print(len(masked_SNP))
print("Result for rd {0} = cor: {1}, cor_on_imputed: {2}, fraction_imputed: {3}".format(rd, res["cor"], res["cor_on_imputed"],res["fraction_imputed"] ))
return(R2_serie)
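
# Usage sketch (editorial addition; folder paths and the gwas name are hypothetical):
#   r2_by_rcond = grid_search("data/zscores", "data/masked", "data/imputed",
#                             "data/ref_panel", "data/ld", "CONSORTIA_TRAIT",
#                             chrom="chr22", eigen_ratio_grid=[0.5, 0.1, 0.01])
# The eigen ratio offering the best trade-off between "cor" and "fraction_imputed" can then
# be passed as the eigen-threshold (rcond) parameter of the imputation.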
@@ -8,7 +8,7 @@ on the genome
import glob
import pandas as pd
from .windows import prepare_zscore_for_imputation, impg_like_imputation, realigned_zfiles_on_panel
from raiss.windows import prepare_zscore_for_imputation, impg_like_imputation, realigned_zfiles_on_panel
class ImputationLauncher(object):
"""
......
"""
module to manage the creation of files to save the results of imputation
"""
import argparse
import pandas as pd
from raiss.filter_format_output import filter_output
from raiss.imputation_launcher import ImputationLauncher


def save_chromosome_imputation(gwas, chrom, window_size, buffer_size, l2_regularization,
                               eigen_threshold, zscore_folder, ref_folder, ld_folder,
                               output_folder, R2_threshold, tag=""):
    """
    Run the imputation for one chromosome and save the filtered results to disk.

    Args:
        gwas (str): gwas identifier in the following format: 'CONSORTIA_TRAIT'
        chrom (str): chromosome in the format "chr.."
        window_size, buffer_size, l2_regularization, R2_threshold: imputation parameters (see the raiss command line documentation)
        eigen_threshold (float): eigen value ratio threshold (must be between 0 and 1)
        zscore_folder (str): path toward the input data folder
        ref_folder (str): path toward the reference panel folder
        ld_folder (str): path toward the linkage disequilibrium matrices
        output_folder (str): path toward the folder where the imputed dataframe is saved
        tag (str): suffix appended to the output file name
    """
    print("Imputation of {0} gwas for chromosome {1}".format(gwas, chrom))
    # Imputer settings
    imputer = ImputationLauncher(window_size=int(window_size), buf=int(buffer_size),
                                 lamb=float(l2_regularization), pinv_rcond=float(eigen_threshold))
    # Reading of inputs
    z_file = "{0}/z_{1}_{2}.txt".format(zscore_folder, gwas, chrom)
    zscore = pd.read_csv(z_file, index_col=0, sep="\t")
    ref_panel_file = ref_folder + "/" + chrom + ".eur.1pct.bim"
    ref_panel = pd.read_csv(ref_panel_file, sep="\t", names=['chr', "nothing", 'pos', 'Ref_all', 'alt_all'], index_col=1)
    # Imputation
    imputed_zscore = imputer.chromosome_imputation(chrom, zscore, ref_panel, ld_folder)
    print("Imputation DONE")
    # Formatting, filtering and saving of the results
    z_fo = "{0}/z_{1}_{2}{3}.txt".format(output_folder, gwas, chrom, tag)
    filter_output(imputed_zscore, z_fo, float(R2_threshold))
    print("Save imputation done at {0}".format(z_fo))
@@ -47,11 +47,9 @@ def compute_var(sig_i_t, sig_t_inv, lamb, batch=True):
    if batch:
        var = (1 + lamb) - np.einsum('ij,jk,ki->i', sig_i_t, sig_t_inv, sig_i_t.transpose())
        ld_score = (sig_i_t**2).sum(1)
    else:
        var = (1 + lamb) - np.dot(sig_i_t, np.dot(sig_t_inv, sig_i_t.transpose()))
        ld_score = (sig_i_t**2).sum()
    return var, ld_score
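
# Note (editorial addition for clarity): in batch mode the einsum computes, for each SNP i to
# impute, var_i = (1 + lamb) - sig_i_t[i, :] @ sig_t_inv @ sig_i_t[i, :], i.e. the residual
# variance of Z_i given the typed Z scores (up to the lamb regularization), while ld_score_i
# is the sum of squared correlations between SNP i and the typed SNPs.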
def check_inversion(sig_t, sig_t_inv):
@@ -70,15 +68,14 @@ def var_in_boundaries(var,lamb):
    return var
def impg_model(zt, sig_t, sig_i_t, lamb=0.01, rcond=0.01, batch=True):
def raiss_model(zt, sig_t, sig_i_t, lamb=0.01, rcond=0.01, batch=True):
"""
Compute the variance
Args:
zt (np.array): the vector of known Z scores
sig_t (np.ndarray) : the matrix of known Linkage desiquilibrium
correlation
sig_i_t (np.ndarray): inverse of the correlation matrix of known
matrix
sig_i_t (np.ndarray): correlation matrix of known matrix
lamb (float): regularization term added to the diagonal of the sig_t matrix
rcond (float): threshold to filter eigenvector with a eigenvalue under rcond
make inversion biased but much more numerically robust
@@ -94,10 +91,10 @@ def impg_model(zt, sig_t, sig_i_t, lamb=0.01, rcond=0.01, batch=True):
    condition_number = np.linalg.cond(sig_t)
    correct_inversion = check_inversion(sig_t, sig_t_inv)
    var, ld_score = compute_var(sig_i_t, sig_t_inv, lamb, batch)
    mu = compute_mu(sig_i_t, sig_t_inv, zt)
    if np.any(mu > 30):
        print("ABERANT SNP SNiP")
    var_norm = var_in_boundaries(var, lamb)
    R2 = ((1 + lamb) - var_norm)
    mu = mu / np.sqrt(R2)
    return({"var": var, "mu": mu, "ld_score": ld_score, "condition_number": condition_number, "correct_inversion": correct_inversion})
@@ -5,8 +5,8 @@ implement the imputation window is sliding along the genome:
- centered_window: A sliding window centered on the Snp to impute
"""
from .stat_models import impg_model
from .ld_matrix import generate_sparse_matrix
from raiss.stat_models import raiss_model
from raiss.ld_matrix import generate_sparse_matrix
import pandas as pd
import numpy as np
@@ -124,7 +124,7 @@ def empty_imputed_dataframe():
def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, lamb,
                         rcond, unknowns=pd.Series([])):
"""
Each missing Snp is imputed by known snps found in a window
Each missing Snp is imputed by known snps found in a window
Argument.
Args:
ld_file (str): Linkage desiquilibrium matrix files
@@ -160,7 +160,7 @@ def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, lamb,
        zt = zscore.loc[known, 'Z']
        if(len(known) > 0):
            imp = impg_model(zt, sig_t, sig_i_t, lamb=lamb, rcond=rcond, batch=True)
            imp = raiss_model(zt, sig_t, sig_i_t, lamb=lamb, rcond=rcond, batch=True)
            batch_df = format_result_df(imp, ref_panel, known, unknowns)
            # keep only snp in the core window (and not in the buffer)
......