Commit a814b83a authored by hjulienne's avatar hjulienne

updated imputation_R2

parent 61f134aa
Pipeline #42904 failed with stage
in 9 seconds
......@@ -14,7 +14,10 @@ def launch_chromosome_imputation(args):
args (dict): Argument parsed from the command line see the
__main__.add_chromosome_imputation_argument(parser) function.
"""
save_chromosome_imputation(args.gwas, args.chrom, args.window_size, args.buffer_size, args.l2_regularization, args.eigen_threshold, args.zscore_folder, args.ref_folder, args.ld_folder, args.output_folder, args.R2_threshold, ref_panel_suffix=args.ref_panel_suffix)
save_chromosome_imputation(args.gwas, args.chrom, args.window_size,
args.buffer_size, args.l2_regularization, args.eigen_threshold,
args.zscore_folder, args.ref_folder, args.ld_folder, args.output_folder,
args.R2_threshold, ref_panel_suffix=args.ref_panel_suffix, ld_type=args.ld_type)
def add_chromosome_imputation_argument():
......@@ -33,6 +36,7 @@ def add_chromosome_imputation_argument():
parser.add_argument('--eigen-threshold', help= "threshold under which eigen vectors are removed for the computation of the pseudo inverse", default = 0.1)
parser.add_argument('--R2-threshold', help= "R square (imputation quality) threshold bellow which SNPs are filtered from the output", default = 0.6)
parser.add_argument("--ld-type", help= "Ld can be supplied as plink command --ld-snp-list output files (see raiss.ld_matrix.launch_plink_ld to compute these data using plink) or as a couple of a scipy sparse matrix (.npz )and an .csv containing SNPs index", default="plink")
parser.add_argument('--ref-panel-suffix', help= "end of the suffix for the reference panel files", default = ".eur.1pct.bim")
parser.set_defaults(func=launch_chromosome_imputation)
......
......@@ -45,13 +45,15 @@ def imputation_performance(zscore_initial, zscore_imputed, masked):
masked : SNPs ids which have been masked by imputation
"""
try:
masked = zscore_imputed.index.intersection(masked)
fraction_imputed = 1.0-zscore_imputed.loc[masked, "Z"].isnull().mean()
#cor = zscore_initial.loc[masked, "Z"].corr(zscore_imputed.loc[masked, "Z"].fillna(0))
cor = zscore_initial.loc[masked, "Z"].corr(zscore_imputed.loc[masked, "Z"])
MAE = (zscore_initial.loc[masked, "Z"] - zscore_imputed.loc[masked, "Z"]).dropna().abs().mean()
return {'N_SNP':len(masked),'fraction_imputed':fraction_imputed, 'cor':cor, 'mean_absolute_error':MAE}
except KeyError: # If KeyError none of the masked_SNP are in the imputed dataframe
except KeyError:
print(e) # If KeyError none of the masked_SNP are in the imputed dataframe
res = np.nan
return {'N_SNP':np.nan, 'fraction_imputed': np.nan, 'cor':np.nan, 'mean_absolute_error':np.nan}
......@@ -111,7 +113,11 @@ def z_amplitude_effect(zscore_folder, masked_folder, output_folder, ref_folder,
return(R2_serie)
def grid_search(zscore_folder, masked_folder, output_folder, ref_folder, ld_folder, gwas, chrom="chr22", eigen_ratio_grid = [0.5, 0.1, 0.01], window_size= 500000, buffer_size=125000, l2_regularization=0.1, R2_threshold=0.6, N_to_mask=5000):
def grid_search(zscore_folder, masked_folder, output_folder,
ref_folder, ld_folder, gwas, chrom="chr22",
eigen_ratio_grid = [0.5, 0.1, 0.01], window_size= 500000,
buffer_size=125000, l2_regularization=0.1, R2_threshold=0.6,
N_to_mask=5000,ref_panel_suffix=".eur.1pct.bim", ld_type="plink"):
"""
Compute the imputation performance for several eigen ratioself.
The procedure is the following:
......@@ -145,7 +151,10 @@ def grid_search(zscore_folder, masked_folder, output_folder, ref_folder, ld_fold
def run_imputation(cond):
tag = "_{}".format(cond)
save_chromosome_imputation(gwas, chrom, window_size, buffer_size, l2_regularization, cond, masked_folder, ref_folder, ld_folder, output_folder, R2_threshold, tag)
save_chromosome_imputation(gwas, chrom, window_size, buffer_size,
l2_regularization, cond, masked_folder,
ref_folder, ld_folder, output_folder,
R2_threshold, tag, ref_panel_suffix, ld_type)
n_cpu = multiprocessing.cpu_count()
Parallel(n_jobs=n_cpu)(delayed(run_imputation)(rd) for rd in eigen_ratio_grid)
......@@ -158,6 +167,7 @@ def grid_search(zscore_folder, masked_folder, output_folder, ref_folder, ld_fold
try:
res = imputation_performance(dat_orig, dat_imp, masked_SNP)
except KeyError: # If KeyError none of the masked_SNP are in the imputed dataframe
print(e)
res = np.nan
R2_serie.loc[rd, 'cor'] = res["cor"]
......
......@@ -15,7 +15,7 @@ class ImputationLauncher(object):
Class to perform imputation of snp from summary statistic
"""
def __init__(self, window_size=10000, buf=2500,
lamb= 0.01, pinv_rcond = 0.01):
lamb= 0.01, pinv_rcond = 0.01, ld_type="plink"):
"""
Initialise the imputation object. Fix the windows size, the buffer size
and the kind of imputation employed
......@@ -34,7 +34,7 @@ class ImputationLauncher(object):
self.buffer = buf
self.lamb = lamb
self.rcond = pinv_rcond
self.ld_type = ld_type
def chromosome_imputation(self, chrom, zscore, ref_panel, ld_folder):
"""
......@@ -44,24 +44,32 @@ class ImputationLauncher(object):
Args:
chrom (str): chromosome "chr*"
zscore (pandas dataframe): known zscore
ref_panel (str): path of the folder of reference panel
ref_panel (DataFrame): chromosome reference panel dataframe
ld_folder (str): path of the folder containing linkage desiquilibrium matrices
Returns:
pandas dataframe: Imputed zscore dataframe
"""
pattern = "{0}/{1}_*.ld".format(ld_folder, chrom)
def get_file_name(x):
return x.split("/")[-1].split(".")[0]
pattern = "{0}/{1}_*".format(ld_folder, chrom)
zscore = prepare_zscore_for_imputation(ref_panel, zscore)
zscore_results = zscore.copy(deep=True)
def imputer(ld_file):
return impg_like_imputation(ld_file, ref_panel, zscore,
self.window_size, self.buffer,
self.lamb, self.rcond)
self.lamb, self.rcond, self.ld_type)
for ld_file in glob.glob(pattern):
ld_file_list = set(map(get_file_name, glob.glob(pattern)))
for ld_file in ld_file_list:
print("processing Region: {0}".format(ld_file))
ld_batch = imputer(ld_file)
ld_path = "{0}/{1}".format(ld_folder, ld_file)
ld_batch = imputer(ld_path)
zscore_results = pd.concat([zscore_results, ld_batch])
zscore_results.sort_values(by="pos", inplace=True)
......
......@@ -46,13 +46,15 @@ def launch_plink_ld(startpos, endpos, chr, reffile, folder):
ld_results.loc[ld_results.R**2 > 0.4].to_csv('{0}.ld'.format(fo), index=False, sep="\t")
def generate_sparse_matrix(plink_ld, ref_chr_df, ld_file_type="plink"):
    """
    Load a linkage-disequilibrium matrix from disk.

    Dispatches on ld_file_type: "plink" reads the correlation file
    generated by ld_matrix.launch_plink_ld; "scipy" reads a couple of
    files, a scipy sparse matrix (.npz) and a .csv containing the SNP
    index.

    Args:
        plink_ld (str): path prefix of the LD matrix file(s)
        ref_chr_df (pandas.DataFrame): reference panel dataframe for the
            chromosome, indexed by SNP id
        ld_file_type (str): "plink" or "scipy"

    Returns:
        pandas.DataFrame: linkage disequilibrium matrix

    Raises:
        ValueError: if ld_file_type is neither "plink" nor "scipy"
    """
    if ld_file_type == "plink":
        return load_plink_ld(plink_ld, ref_chr_df)
    elif ld_file_type == "scipy":
        return load_sparse_matrix(plink_ld, ref_chr_df)
    else:
        raise ValueError("ld files must be plink or scipy sparse matrix")
def load_plink_ld(plink_ld, ref_chr_df):
    """
    Read a plink correlation file (see ld_matrix.launch_plink_ld) and
    build a symmetric LD matrix ordered by genomic position.

    Args:
        plink_ld (str): path prefix of the plink output (".ld" is appended)
        ref_chr_df (pandas.DataFrame): reference panel dataframe for the
            chromosome, indexed by SNP id, with a "pos" column

    Returns:
        pandas.DataFrame: symmetric LD matrix (sparse float values,
        fill value 0) restricted to SNPs present in the reference panel
    """
    ld_table = pd.read_csv(plink_ld + ".ld", sep=r"\s+")
    # plink reports each SNP pair once, so the pivoted frame is one-sided
    mat_ld = ld_table.pivot(index='SNP_A', columns='SNP_B', values='R')
    un_index = mat_ld.index.union(mat_ld.columns)
    mat_ld = mat_ld.reindex(index=un_index, columns=un_index)
    mat_ld.fillna(0, inplace=True)
    # symmetrize, keeping for each pair the entry of largest magnitude
    sym = np.where(np.abs(mat_ld.values) > np.abs(mat_ld.values.transpose()),
                   mat_ld.values, mat_ld.values.transpose())
    mat_ld = pd.DataFrame(sym, index=mat_ld.index, columns=mat_ld.columns)
    # keep only SNPs known to the reference panel, ordered by position
    # (was ref_chr_df.loc[int_index.index]: pd.Index has no .index attribute)
    int_index = ref_chr_df.index.intersection(mat_ld.index)
    re_index = ref_chr_df.loc[int_index].sort_values(by="pos").index
    mat_ld = mat_ld.loc[re_index, re_index]
    # DataFrame.to_sparse() was removed in pandas 1.0; SparseDtype is the
    # modern equivalent for a mostly-zero matrix
    return mat_ld.astype(pd.SparseDtype(float, 0.0))
def load_sparse_matrix(path_sparse_LD, ref_chr_df):
    """
    Load an LD matrix stored as a scipy sparse matrix (.npz) together
    with its SNP index (.csv, one SNP id per row, no header).

    Args:
        path_sparse_LD (str): path prefix; ".npz" and ".csv" are appended
        ref_chr_df (pandas.DataFrame): reference panel dataframe for the
            chromosome, indexed by SNP id

    Returns:
        pandas.DataFrame: symmetric LD matrix restricted to the SNPs
        present in the reference panel
    """
    fi_ld = path_sparse_LD + ".npz"
    fi_index = path_sparse_LD + ".csv"
    # toarray() yields a plain ndarray (todense() returns np.matrix)
    Sp_M = sc.sparse.load_npz(fi_ld).toarray()
    snp_index = pd.read_csv(fi_index, header=None)
    # symmetrize, keeping for each pair the entry of largest magnitude
    sym = np.where(np.abs(Sp_M) > np.abs(Sp_M.transpose()), Sp_M, Sp_M.transpose())
    mat_ld = pd.DataFrame(sym, index=snp_index.iloc[:, 0], columns=snp_index.iloc[:, 0])
    # keep only SNPs known to the reference panel
    valid_id = ref_chr_df.index.intersection(mat_ld.index)
    return mat_ld.loc[valid_id, valid_id]
def generate_genome_matrices(region_files, reffolder, folder_output, suffix = ""):
"""
go through region files and compute LD matrix for each transform and
......
......@@ -7,7 +7,10 @@ import pandas as pd
from raiss.filter_format_output import filter_output
from raiss.imputation_launcher import ImputationLauncher
def save_chromosome_imputation(gwas, chrom, window_size, buffer_size, l2_regularization, eigen_threshold, zscore_folder, ref_folder, ld_folder, output_folder, R2_threshold, tag="", ref_panel_suffix=".eur.1pct.bim"):
def save_chromosome_imputation(gwas, chrom, window_size, buffer_size,
l2_regularization, eigen_threshold, zscore_folder,
ref_folder, ld_folder, output_folder, R2_threshold,
tag="", ref_panel_suffix=".eur.1pct.bim", ld_type="plink"):
"""
module to manage the creation of files to save the results of imputation
Args:
......@@ -25,7 +28,7 @@ def save_chromosome_imputation(gwas, chrom, window_size, buffer_size, l2_regular
print("Imputation of {0} gwas for chromosome {1}".format(gwas, chrom))
# Imputer settings
imputer = ImputationLauncher( window_size=int(window_size), buf=int(buffer_size), lamb= float(l2_regularization), pinv_rcond = float(eigen_threshold))
imputer = ImputationLauncher( window_size=int(window_size), buf=int(buffer_size), lamb= float(l2_regularization), pinv_rcond = float(eigen_threshold), ld_type = ld_type)
# Reading of inputs
z_file = "{0}/z_{1}_{2}.txt".format(zscore_folder, gwas, chrom)
zscore = pd.read_csv(z_file, index_col=0, sep="\t")
......
......@@ -17,7 +17,8 @@ def parse_region_position(ld_file):
Argument :
ld_file : A ld file generated by jass_impute
"""
(chrom, startpos, endpos ) = ld_file.split("/")[-1].split(".")[0].split('_')
(chrom, startpos, endpos ) = ld_file.split("/")[-1].split('_')
#.split(".")[0]
return (chrom, startpos, endpos)
def realigned_zfiles_on_panel(ref_panel, zscore):
......@@ -122,7 +123,7 @@ def empty_imputed_dataframe():
def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, lamb,
rcond, unknowns=pd.Series([])):
rcond, ld_file_type = "plink"):# unknowns=pd.Series([])):
"""
Each missing Snp is imputed by known snps found in a window
Argument.
......@@ -132,7 +133,7 @@ def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, lamb,
snps
"""
(chrom, start_ld_block, end_ld_block) = parse_region_position(ld_file)
LD_mat = generate_sparse_matrix(ld_file, ref_panel)
LD_mat = generate_sparse_matrix(ld_file, ref_panel, ld_file_type)
Nwindows, window_resize = compute_window_and_size(start_ld_block, end_ld_block, window_size)
all_unknowns = ref_panel.loc[ref_panel.index.difference(zscore.index)]
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment