Commit 8f919ee4 authored by Hanna JULIENNE

add an option to set a prefix for reference panel file names

parent a3d2b269
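
This commit threads a new ref_panel_prefix option through the CLI and down to save_chromosome_imputation, where it is prepended to the chromosome name when the reference panel path is built. A minimal sketch of the resulting path composition (the folder and prefix values are illustrative, not from the commit):

# Sketch: how the new prefix composes the reference panel file name
# (mirrors the ref_panel_file line in save_chromosome_imputation below).
ref_folder = "/data/ref_panel"        # placeholder folder
ref_panel_prefix = "1000G_"           # hypothetical --ref-panel-prefix value
chrom = "chr22"
ref_panel_suffix = ".eur.1pct.bim"    # existing --ref-panel-suffix default
ref_panel_file = ref_folder + "/" + ref_panel_prefix + chrom + ref_panel_suffix
print(ref_panel_file)                 # /data/ref_panel/1000G_chr22.eur.1pct.bim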
@@ -28,9 +28,7 @@ for( i in 1:(nvar-1))
}
CC <- cov2cor(cov_mat)
CC
p11_range
p11
## Simulate 10000 x-y pairs, and check that they have the specified
## correlation structure
@@ -17,7 +17,8 @@ def launch_chromosome_imputation(args):
save_chromosome_imputation(args.gwas, args.chrom, args.window_size,
args.buffer_size, args.l2_regularization, args.eigen_threshold,
args.zscore_folder, args.ref_folder, args.ld_folder, args.output_folder,
args.R2_threshold, ref_panel_suffix=args.ref_panel_suffix,
args.R2_threshold, ref_panel_prefix=args.ref_panel_prefix,
ref_panel_suffix=args.ref_panel_suffix,
ld_type=args.ld_type, minimum_ld=args.minimum_ld)
def add_chromosome_imputation_argument():
@@ -38,7 +39,8 @@ def add_chromosome_imputation_argument():
parser.add_argument('--R2-threshold', help="R squared (imputation quality) threshold below which SNPs are filtered from the output", default=0.6)
parser.add_argument("--ld-type", help= "Ld can be supplied as plink command --ld-snp-list output files (see raiss.ld_matrix.launch_plink_ld to compute these data using plink) or as a couple of a scipy sparse matrix (.npz )and an .csv containing SNPs index", default="plink")
parser.add_argument('--ref-panel-suffix', help= "end of the suffix for the reference panel files", default = ".bim")
parser.add_argument('--ref-panel-prefix', help= "prefix for the reference panel files", default = "")
parser.add_argument('--ref-panel-suffix', help= "suffix for the reference panel files", default = ".bim")
parser.add_argument('--minimum-ld', help="this parameter ensures that there are enough typed SNPs around the imputed SNP to perform a high-accuracy imputation", default=4)
parser.set_defaults(func=launch_chromosome_imputation)
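
Assuming the sub-command is wired as above (set_defaults binds launch_chromosome_imputation), the two file-name flags can be exercised in isolation; this is a reduced sketch, not the full RAISS parser:

import argparse

# Reduced sketch: only the two file-name options from this diff are wired up.
parser = argparse.ArgumentParser()
parser.add_argument('--ref-panel-prefix', help="prefix for the reference panel files", default="")
parser.add_argument('--ref-panel-suffix', help="suffix for the reference panel files", default=".bim")
args = parser.parse_args(['--ref-panel-prefix', '1000G_'])
print(args.ref_panel_prefix + "chr22" + args.ref_panel_suffix)  # 1000G_chr22.bim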
@@ -20,6 +20,23 @@ def filter_output(zscores, fout, R2_threshold = 0.6, minimum_ld = 5):
chr_fo = zscores[['index', 'pos', 'A0', 'A1', 'Z', 'Var', "ld_score"]]
chr_fo["imputation_R2"] = 1-chr_fo["Var"]
chr_fo.columns = ['rsID','pos','A0','A1','Z', 'Var', "ld_score", "imputation_R2"]
print(chr_fo.shape)
print(chr_fo.loc[(chr_fo.imputation_R2 > R2_threshold) & (chr_fo.ld_score > float(minimum_ld))].shape)
chr_fo.loc[(chr_fo.imputation_R2 > R2_threshold) & (chr_fo.ld_score > float(minimum_ld))].to_csv(fout, sep="\t", index=False)
NSNPs_bf_filt = chr_fo.shape[0]
NSNPs_initial = (chr_fo.imputation_R2==2.0).sum()
NSNPs_imputed = (chr_fo.imputation_R2!=2.0).sum()
NSNPs_ld_filt = (chr_fo.ld_score < float(minimum_ld)).sum()
NSNPs_R2_filt = (chr_fo.imputation_R2 < R2_threshold).sum()
chr_fo = chr_fo.loc[(chr_fo.imputation_R2 > R2_threshold) & (chr_fo.ld_score > float(minimum_ld))]
NSNPs_af_filt = chr_fo.shape[0]
print("IMPUTATION REPORT")
print("Number of SNPs:")
print("before filter: {}".format(NSNPs_bf_filt))
print("not imputed: {}".format(NSNPs_initial))
print("imputed: {}".format(NSNPs_imputed))
print("filtered because of ld: {}".format(NSNPs_ld_filt))
print("filtered because of R2: {}".format(NSNPs_R2_filt))
print("before filter: {}".format(NSNPs_af_filt))
chr_fo.to_csv(fout, sep="\t", index=False)
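
The report hinges on imputation_R2 = 1 - Var: the (imputation_R2 == 2.0) test suggests typed (non-imputed) SNPs carry the sentinel Var == -1, so they pass the R2 filter automatically. A self-contained sketch of the filtering step (the sentinel convention is an assumption inferred from that test):

import pandas as pd

# Toy frame: one typed SNP (Var == -1, so imputation_R2 == 2.0),
# one well-imputed SNP, one poorly imputed SNP.
chr_fo = pd.DataFrame({"imputation_R2": [2.0, 0.9, 0.3],
                       "ld_score": [10.0, 6.0, 2.0]})
R2_threshold, minimum_ld = 0.6, 4
kept = chr_fo.loc[(chr_fo.imputation_R2 > R2_threshold)
                  & (chr_fo.ld_score > float(minimum_ld))]
print(kept.shape[0])  # 2: the typed SNP and the well-imputed one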
@@ -79,7 +79,10 @@ def imputation_performance(zscore_initial, zscore_imputed, masked):
res = np.nan
return {'N_SNP':np.nan, 'fraction_imputed': np.nan, 'cor':np.nan, 'mean_absolute_error':np.nan}
def z_amplitude_effect(zscore_folder, masked_folder, output_folder, ref_folder, ld_folder, gwas, z_treshold = [0, 1.0, 2.0, 3.0, 4.0, 5], window_size= 500000, buffer_size=125000, eigen_ratio = 0.1, chrom="chr22", l2_regularization=0.1, R2_threshold=0.6, ratio_to_mask=0.5):
def z_amplitude_effect(zscore_folder, masked_folder, output_folder, ref_folder,
ld_folder, gwas, ref_panel_prefix="", ref_panel_suffix=".eur.1pct.bim",
z_treshold=[0, 1.0, 2.0, 3.0, 4.0, 5], window_size=500000, buffer_size=125000,
eigen_ratio=0.1, chrom="chr22", l2_regularization=0.1, R2_threshold=0.6, ratio_to_mask=0.5):
"""
Compute the imputation performance on SNPs of different amplitudes.
The procedure is as follows:
@@ -123,7 +126,7 @@ def z_amplitude_effect(zscore_folder, masked_folder, output_folder, ref_folder,
masked_SNP = res_masked[1]
tag = "_{}".format(z)
gwas_proc = "{0}_{1}".format(gwas, i)
save_chromosome_imputation(gwas_proc, chrom, window_size, buffer_size, l2_regularization, eigen_ratio, masked_folder, ref_folder, ld_folder, output_folder, R2_threshold, tag)
save_chromosome_imputation(gwas_proc, chrom, window_size, buffer_size, l2_regularization, eigen_ratio, masked_folder, ref_folder, ld_folder, output_folder, R2_threshold, tag, ref_panel_prefix=ref_panel_prefix, ref_panel_suffix=ref_panel_suffix)
z_output = "{0}/z_{1}_{2}{3}.txt".format(output_folder, gwas_proc, chrom, tag)
dat_imp = pd.read_csv(z_output, sep="\t", index_col=0)
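
A hypothetical call showing how the new prefix and suffix arguments are passed through (all folders and the GWAS name are placeholders):

z_amplitude_effect(zscore_folder="data/zscores", masked_folder="data/masked",
                   output_folder="results", ref_folder="data/ref_panel",
                   ld_folder="data/ld", gwas="MyGWAS", chrom="chr22",
                   ref_panel_prefix="", ref_panel_suffix=".eur.1pct.bim")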
@@ -141,7 +144,7 @@ def grid_search(zscore_folder, masked_folder, output_folder,
ref_folder, ld_folder, gwas, chrom="chr22",
eigen_ratio_grid = [0.5, 0.1, 0.01], window_size= 500000,
buffer_size=125000, l2_regularization=0.1, R2_threshold=0.6,
N_to_mask=5000,ref_panel_suffix=".eur.1pct.bim", ld_type="plink",
N_to_mask=5000, ref_panel_prefix="", ref_panel_suffix=".eur.1pct.bim", ld_type="plink",
stratifying_vector=None, stratifying_bins=None, LD_threshold=4):
"""
Compute the imputation performance for several eigen ratios.
@@ -183,7 +186,8 @@ def grid_search(zscore_folder, masked_folder, output_folder,
save_chromosome_imputation(gwas, chrom, window_size, buffer_size,
l2_regularization, cond, masked_folder,
ref_folder, ld_folder, output_folder,
R2_threshold, tag, ref_panel_suffix, ld_type, minimum_ld = LD_threshold)
R2_threshold, tag, ref_panel_prefix,
ref_panel_suffix, ld_type, minimum_ld = LD_threshold)
n_cpu = multiprocessing.cpu_count()
Parallel(n_jobs=n_cpu)(delayed(run_imputation)(rd) for rd in eigen_ratio_grid)
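
grid_search fans the eigen-ratio grid out over all available cores with joblib; a hedged usage sketch (paths and the GWAS name are placeholders):

grid_search(zscore_folder="data/zscores", masked_folder="data/masked",
            output_folder="results", ref_folder="data/ref_panel",
            ld_folder="data/ld", gwas="MyGWAS", chrom="chr22",
            eigen_ratio_grid=[0.5, 0.1, 0.01],
            ref_panel_prefix="", ref_panel_suffix=".eur.1pct.bim")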
@@ -81,11 +81,10 @@ def load_plink_ld(plink_ld, ref_chr_df):
mat_ld = pd.DataFrame(sym, index=mat_ld.index, columns=mat_ld.columns)
int_index = ref_chr_df.index.intersection(mat_ld.index)
print(int_index)
re_index = ref_chr_df.loc[int_index].sort_values(by="pos").index
mat_ld = mat_ld.loc[re_index, re_index]
print(mat_ld.iloc[1:5, 1:5])
return mat_ld
def load_sparse_matrix(path_sparse_LD, ref_chr_df):
@@ -94,18 +93,14 @@ def load_sparse_matrix(path_sparse_LD, ref_chr_df):
"""
fi_ld = path_sparse_LD + ".npz"
fi_index = path_sparse_LD + ".csv"
print((fi_ld, fi_index))
Sp_M = sc.sparse.load_npz(fi_ld)
snp_index = pd.read_csv(fi_index, header=None)
Sp_M = Sp_M.todense()
print(Sp_M.shape)
print(snp_index.shape)
mat_ld = (np.where(np.abs(Sp_M) > np.abs(Sp_M.transpose()), Sp_M, Sp_M.transpose()))
mat_ld = pd.DataFrame(mat_ld, index= snp_index.iloc[:,0], columns=snp_index.iloc[:,0])
valid_id = ref_chr_df.index.intersection(mat_ld.index)
print(mat_ld.iloc[:5,:5])
return(mat_ld.loc[valid_id, valid_id])
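
load_sparse_matrix expects an .npz/.csv pair sharing one path stem; the sketch below writes such a pair in the layout read above (the stem and rsIDs are illustrative):

import numpy as np
import pandas as pd
import scipy.sparse as sp

# Write an LD matrix in the (.npz, .csv) pair layout that
# load_sparse_matrix reads; the stem "ld_chr22" and the rsIDs are examples.
ld = sp.csr_matrix(np.array([[1.0, 0.4], [0.0, 1.0]]))
sp.save_npz("ld_chr22.npz", ld)
pd.Series(["rs111", "rs222"]).to_csv("ld_chr22.csv", index=False, header=False)
# load_sparse_matrix("ld_chr22", ref_chr_df) can then reassemble it.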
@@ -123,13 +118,13 @@ def generate_genome_matrices(region_files, reffolder, folder_output, prefix, suf
suffix (str): part of file name in reffolder after the chromosome number (without extension)
"""
regions = pd.read_csv(region_files,sep=separa)
for reg in regions.iterrows():
# input reference panel file
fi_ref = "{0}/{1}{2}{3}".format(reffolder,prefix, reg[1]['chr'], suffix)
#print(fi_ref)
chr_int = re.search('([0-9X]{1,2})', str(reg[1]['chr'])).group()
# Compute the LD correlation with LD
launch_plink_ld(reg[1]['start'], reg[1]['stop'], chr_int, fi_ref, folder_output)
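
The loop expects one LD block per row in a table with chr, start, and stop columns; a sketch of such a region file (column names come from the loop above, coordinates are illustrative):

import pandas as pd

regions = pd.DataFrame({"chr": ["chr22", "chr22"],
                        "start": [16000000, 17000000],
                        "stop": [17000000, 18000000]})
regions.to_csv("regions_chr22.tsv", sep="\t", index=False)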
@@ -10,7 +10,7 @@ from raiss.imputation_launcher import ImputationLauncher
def save_chromosome_imputation(gwas, chrom, window_size, buffer_size,
l2_regularization, eigen_threshold, zscore_folder,
ref_folder, ld_folder, output_folder, R2_threshold,
tag="", ref_panel_suffix=".eur.1pct.bim", ld_type="plink", minimum_ld=4):
tag="", ref_panel_prefix="", ref_panel_suffix=".eur.1pct.bim", ld_type="plink", minimum_ld=4):
"""
Manage the creation of the files used to save the imputation results.
Args:
@@ -32,7 +32,7 @@ def save_chromosome_imputation(gwas, chrom, window_size, buffer_size,
# Reading of inputs
z_file = "{0}/z_{1}_{2}.txt".format(zscore_folder, gwas, chrom)
zscore = pd.read_csv(z_file, index_col=0, sep="\t")
ref_panel_file = ref_folder + "/"+ chrom + ref_panel_suffix
ref_panel_file = ref_folder + "/"+ ref_panel_prefix + chrom + ref_panel_suffix
ref_panel = pd.read_csv(ref_panel_file, sep="\t", names=['chr', "nothing", 'pos', 'Ref_all', 'alt_all'], index_col = 1)
# imputation
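
With the new keyword in place, the panel path resolves to ref_folder + "/" + ref_panel_prefix + chrom + ref_panel_suffix; a hypothetical invocation (all folders are placeholders):

save_chromosome_imputation(gwas="MyGWAS", chrom="chr22",
                           window_size=500000, buffer_size=125000,
                           l2_regularization=0.1, eigen_threshold=0.1,
                           zscore_folder="data/zscores", ref_folder="data/ref_panel",
                           ld_folder="data/ld", output_folder="results",
                           R2_threshold=0.6, ref_panel_prefix="1000G_",
                           ref_panel_suffix=".eur.1pct.bim")
# reads data/ref_panel/1000G_chr22.eur.1pct.bim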
@@ -184,8 +184,6 @@ def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, lamb,
end_core_window = int(start_ld_block) + (i+1)*window_resize
in_core_window = in_region(batch_df.pos, start_core_window, end_core_window)
# keep only SNP with non negligible explained variance
#snp_well_predicted = (batch_df.Var < 0.9)
batch_df_filt = batch_df.loc[(in_core_window), zscore_results.columns]
zscore_results = pd.concat([zscore_results, batch_df_filt])
except (ValueError, KeyError, TypeError) as e: