Commit 64be0d3f authored by hjulienn's avatar hjulienn
Browse files

add stratification for performance test

parent fee74822
Pipeline #48317 failed with stage
in 1 minute and 4 seconds
......@@ -14,7 +14,7 @@ import numpy as np
def generated_test_data(zscore, N_to_mask=5000, condition=None, stratifying_vector=None, stratifying_bins=None):
    """
    Mask N_to_mask SNPs in the dataframe zscore and return the dataframe
    without the masked SNPs together with the labels of the masked SNPs.

    The SNPs to mask are drawn without replacement, using the first
    applicable strategy:

    1. ``condition`` is a pandas Series: draw among the SNPs where it is True.
    2. ``stratifying_vector`` is a pandas Series: draw the same number of
       SNPs (``N_to_mask // number_of_bins``) from each bin delimited by
       ``stratifying_bins``.
    3. Otherwise: draw uniformly over the whole index.

    Args:
        zscore (pandas DataFrame): data whose index labels identify the SNPs.
        N_to_mask (int): Number of SNPs to mask.
        condition (None or pandas boolean Series): If None, SNPs will be
            masked randomly. If a pandas boolean Series is passed, masked
            SNPs will be randomly chosen among the SNPs which have a True
            value.
        stratifying_vector (None or pandas Series): continuous vector with
            one value per SNP, used to stratify the sampling. It is used
            positionally, so it is assumed to be in the same order as
            ``zscore.index`` (NOTE(review): confirm with callers).
        stratifying_bins (None or sequence): boundary values of the bins, as
            understood by ``numpy.digitize``. ``k`` boundaries define
            ``k - 1`` bins; values outside the boundaries are never sampled.

    Returns:
        tuple: ``(zscore.loc[known], masked)`` where ``masked`` is a numpy
        array of the masked SNP labels and ``known`` are the remaining ones.
        In the stratified case, when ``N_to_mask`` is not a multiple of the
        number of bins, fewer than ``N_to_mask`` SNPs are masked.

    Raises:
        ValueError: if the requested number of SNPs cannot be sampled (for
            instance a pool smaller than the requested sample), or if
            ``stratifying_vector`` is given without at least two values in
            ``stratifying_bins``.
    """
    stratified = (not isinstance(condition, pd.Series)
                  and isinstance(stratifying_vector, pd.Series))
    if stratified and (stratifying_bins is None or len(stratifying_bins) < 2):
        raise ValueError("stratifying_bins must contain at least two boundary values")

    try:
        if isinstance(condition, pd.Series):
            masked = np.random.choice(zscore.index[condition], N_to_mask, replace=False)
        elif stratified:
            # digitize labels the i-th bin (1-based) with i; 0 and
            # len(stratifying_bins) flag out-of-range values.
            binned = np.digitize(stratifying_vector, stratifying_bins)
            # The number of bins comes from the bin edges, not from the
            # number of SNPs in the stratifying vector.
            N_bins = len(stratifying_bins) - 1
            N_per_bin = N_to_mask // N_bins
            masked = []
            for bin_id in range(1, N_bins + 1):
                masked.extend(np.random.choice(zscore.index[binned == bin_id], N_per_bin, replace=False))
            masked = np.array(masked)
        else:
            masked = np.random.choice(zscore.index, N_to_mask, replace=False)
    except ValueError as ve:
        print("Couldn't sample {0} SNPs".format(N_to_mask))
        print("raise {0}".format(ve))
        # Propagate: continuing would only fail later on an undefined `masked`.
        raise

    known = zscore.index.difference(masked)
    return (zscore.loc[known], masked)
def imputation_performance(zscore_initial, zscore_imputed, masked):
......@@ -117,7 +131,8 @@ def grid_search(zscore_folder, masked_folder, output_folder,
ref_folder, ld_folder, gwas, chrom="chr22",
eigen_ratio_grid = [0.5, 0.1, 0.01], window_size= 500000,
buffer_size=125000, l2_regularization=0.1, R2_threshold=0.6,
N_to_mask=5000,ref_panel_suffix=".eur.1pct.bim", ld_type="plink"):
N_to_mask=5000,ref_panel_suffix=".eur.1pct.bim", ld_type="plink",
stratifying_vector=None, stratifying_bins=None):
"""
Compute the imputation performance for several eigen ratios.
The procedure is the following:
......@@ -137,13 +152,17 @@ def grid_search(zscore_folder, masked_folder, output_folder,
eigen_ratio_grid (list): list of eigen_ratio to test (must be between 0 and 1)
window_size, buffer_size, l2_regularization, R2_threshold : imputation parameter (see raiss command line documentation)
N_to_mask (int): Number of SNPs masked in the initial dataset to compute the correlation between true value and imputed value
ref_panel_suffix (str): suffix
ld_type (str): The type of file where the LD is stored should be 'plink' or 'scipy'
stratifying_vector (pd.Series) : a continuous vector containing one value per SNP, used to stratify the sampling of SNPs to mask
stratifying_bins (list) : a vector specifying the boundary values to form the bins
"""
z_file = "{0}/z_{1}_{2}.txt".format(zscore_folder, gwas, chrom)
z_output = "{0}/z_{1}_{2}.txt".format(output_folder, gwas, chrom)
dat_orig = pd.read_csv(z_file, sep="\t", index_col=0)
res_masked = generated_test_data(dat_orig, N_to_mask)
res_masked = generated_test_data(dat_orig, N_to_mask, stratifying_vector=stratifying_vector, stratifying_bins=stratifying_bins)
z_masked = res_masked[0]
z_masked_file = "{0}/z_{1}_{2}.txt".format(masked_folder, gwas, chrom)
z_masked.to_csv(z_masked_file, sep="\t")
......@@ -164,6 +183,8 @@ def grid_search(zscore_folder, masked_folder, output_folder,
for rd in eigen_ratio_grid:
z_output = "{0}/z_{1}_{2}_{3}.txt".format(output_folder, gwas, chrom, rd)
dat_imp = pd.read_csv(z_output, sep="\t", index_col=0)
print(rd)
print(dat_imp)
try:
res = imputation_performance(dat_orig, dat_imp, masked_SNP)
except KeyError: # If KeyError none of the masked_SNP are in the imputed dataframe
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment