Commit 74ab312b authored by Hanna JULIENNE
Browse files

script for masking different SNPs in studies

parent 00a02591
# coding: utf-8
# Direct imputation with the presence of causal SNPs
import numpy as np
import pandas as pd
import scipy
import os
import sys
import re
from Impute_simulated_signal import *
from sklearn.preprocessing import StandardScaler
def format_result_df(imp, id_masked, Z_known, known):
    """Stack imputed and observed SNP statistics into one table indexed by 'pos'.

    Parameters
    ----------
    imp : mapping
        Imputation output; the keys 'mu', 'var', 'ld_score',
        'condition_number' and 'correct_inversion' are read.
    id_masked : sequence
        Positions of the imputed (masked) SNPs; becomes 'pos' for those rows.
    Z_known : sequence
        Z-scores of the SNPs that were not masked.
    known : sequence
        Positions of the SNPs that were not masked.

    Returns
    -------
    pandas.DataFrame
        Imputed rows first, then the known rows, indexed by 'pos'. For the
        known rows every imputation-quality column holds the placeholder -1.
        'Nsnp_to_impute' is ``len(known)`` on every row.
    """
    fields = ['pos', "Z", "Var", "ld_score", "condition_number",
              "correct_inversion", "Nsnp_to_impute"]
    n_known = len(known)

    # Rows for the SNPs whose Z-score was imputed.
    imputed_part = pd.DataFrame(
        {
            'pos': id_masked,
            "Z": imp['mu'],
            "Var": imp["var"],
            "ld_score": imp["ld_score"],
            "condition_number": imp['condition_number'],
            "correct_inversion": imp["correct_inversion"],
            "Nsnp_to_impute": n_known,
        },
        columns=fields,
    )

    # Rows for the observed SNPs: start from -1 everywhere, then fill in the
    # three columns that carry real information.
    observed_values = dict.fromkeys(fields, -1)
    observed_values['pos'] = known
    observed_values["Z"] = Z_known
    observed_values["Nsnp_to_impute"] = n_known
    observed_part = pd.DataFrame(observed_values, columns=fields)

    stacked = pd.concat([imputed_part, observed_part])
    return stacked.set_index("pos")
if __name__ == '__main__':
    # NOTE(review): the nesting below was reconstructed from the data flow
    # (the source had lost its indentation) -- confirm against the repository.

    # Command line: <meta data csv> <input Z-score file prefix> <output file prefix>
    print(sys.argv)
    meta_data_file = sys.argv[1]
    Zscore_prefix = sys.argv[2]
    Imputed_prefix = sys.argv[3]

    # LD matrix as a plain array; SNPs are addressed by row/column position.
    LD_cor = pd.read_csv("/mnt/zeus/GGS/PROJECT_imputation_covidhg/hgcovid_imputation/data/processed/Simulated/Genotypes/LD_matrix3.csv", sep="\t", index_col=0)
    LD_cor = LD_cor.values
    n_snp = LD_cor.shape[0]
    all_ids = np.arange(n_snp)

    # import sample size each study
    Ssize = pd.read_csv(meta_data_file, sep=",", index_col=1)

    # Masking design: number of SNPs hidden in every study, and number of
    # additional SNPs hidden independently in each study.
    n_masked_all = 20
    n_masked_study = 5
    # NOTE(review): the candidate pool is hard-coded to positions 0..98 rather
    # than derived from n_snp -- confirm this is intended.
    maskable_ids = np.arange(0, 99, 1)

    for tag in ["null", "one_causal", 'two_opposite', 'two_causal']:
        Zscores = pd.read_csv("../../data/processed/Simulated/Zscores/Zscore_{0}{1}.csv".format(Zscore_prefix, tag), sep=",", index_col=0)
        meta_z = Zscores["Zscore_all_SNPs"]

        ids_masked_inall = np.random.choice(maskable_ids, n_masked_all, replace=False)
        ids_known = np.setdiff1d(all_ids, ids_masked_inall)
        # best_rd comes from Impute_simulated_signal; only its "rcond" entry
        # is used below.
        opt_rs = best_rd(meta_z.values, LD_cor, ids_known, ids_masked_inall)
        print(meta_z.head())

        list_study = []
        for study in [col for col in Zscores.columns[2:] if col.startswith("Z_")]:
            print(study)
            print(Ssize.loc[study[2:]])
            Zscore = Zscores[study]

            # Masked set for this study = shared masked SNPs + study-specific ones.
            ids_masked_study = np.random.choice(maskable_ids, n_masked_study, replace=False)
            ids_masked = np.union1d(ids_masked_inall, ids_masked_study)
            ids_known = np.setdiff1d(all_ids, ids_masked)

            # The ids are positions into LD_cor, so select Z-scores by position
            # too (.iloc) instead of relying on label lookup.
            Zs = raiss.stat_models.raiss_model(Zscore.iloc[ids_known], pd.DataFrame(LD_cor[ids_known, :][:, ids_known]), LD_cor[ids_masked, :][:, ids_known], rcond=opt_rs["rcond"])
            Df = format_result_df(Zs, ids_masked, Zscore.values[ids_known], ids_known)
            Df.columns = [col + "_" + study[2:] for col in Df.columns]
            # Keep Z, Var, ld_score and Nsnp_to_impute.
            list_study.append(Df.iloc[:, [0, 1, 2, 5]])

        # SNPs observed in every study are the rows whose Var_* columns all
        # carry the -1 placeholder written by format_result_df.
        z_res = pd.concat([Zscores[["Beta_all_SNPs", "Zscore_all_SNPs"]]] + list_study, axis=1)
        var_columns = [col for col in z_res.columns if col.startswith("Var")]
        complete_ids = np.where((z_res[var_columns] == -1).all(1))[0]
        ids_masked = np.setdiff1d(all_ids, complete_ids)

        if len(complete_ids) > 5:
            Zs = raiss.stat_models.raiss_model(meta_z.iloc[complete_ids], pd.DataFrame(LD_cor[complete_ids, :][:, complete_ids]), LD_cor[ids_masked, :][:, complete_ids], rcond=opt_rs["rcond"])
            # Fix: report the meta-analysis Z-scores that the imputation was
            # computed from, not the Z-scores of the last study left over in
            # `Zscore` by the loop above.
            Df = format_result_df(Zs, ids_masked, meta_z.values[complete_ids], complete_ids)
            Df.columns = [col + "_impfrom_meta_analysis" for col in Df.columns]
            list_study.append(Df.iloc[:, [0, 1, 2, 5]])

        pd.concat([Zscores[["Beta_all_SNPs", "Zscore_all_SNPs"]]] + list_study, axis=1).to_csv("../../data/processed/Simulated/Imputation_strategy/Imputation_strategy_{0}{1}.csv".format(Imputed_prefix, tag))
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment