Commit 01ecc3b2 authored by hjulienn
Browse files

stratification function

parent 4abc5ace
Pipeline #48353 passed with stages
in 1 minute and 9 seconds
......@@ -22,22 +22,29 @@ def generated_test_data(zscore, N_to_mask=5000, condition=None, stratifying_vec
condition (None or pandas boolean Series): If None, SNPs will be masked randomly. If a pandas boolean Series is passed, masked SNPs will be randomly chosen among the SNPs which have a True value.
N_to_mask (int): Number of SNPs to mask.
"""
print(condition)
print(isinstance(condition, pd.Series))
try:
if isinstance(condition, pd.Series)==True:
masked = np.random.choice(zscore.index[condition], N_to_mask, replace=False)
else:
print("Stratifying vector?")
inter_id = zscore.index.intersection(stratifying_vector.index).drop_duplicates(keep='first')
print(inter_id[1:10])
stratifying_vector = stratifying_vector.loc[inter_id]
if isinstance(stratifying_vector, pd.Series)==True:
masked = []
binned = np.digitize(stratifying_vector, stratifying_bins)
N_bins = len(stratifying_vector)-1
N_bins = len(stratifying_bins)-1
print(N_bins)
print(np.unique(binned))
print(inter_id[(binned==(1))])
print(N_to_mask // N_bins)
for i in range(N_bins):
print(i)
print(np.where(binned==(i+1)))
masked = masked + list(np.random.choice(inter_id[(binned==(i+1))], N_to_mask // N_bins, replace=False))
masked = np.array(masked)
print(masked)
else:
masked = np.random.choice(zscore.index, N_to_mask, replace=False)
except ValueError as ve:
......@@ -187,7 +194,6 @@ def grid_search(zscore_folder, masked_folder, output_folder,
z_output = "{0}/z_{1}_{2}_{3}.txt".format(output_folder, gwas, chrom, rd)
dat_imp = pd.read_csv(z_output, sep="\t", index_col=0)
print(rd)
print(dat_imp)
try:
res = imputation_performance(dat_orig, dat_imp, masked_SNP)
except KeyError: # If KeyError none of the masked_SNP are in the imputed dataframe
......
......@@ -35,16 +35,10 @@ def realigned_zfiles_on_panel(ref_panel, zscore):
zscore.drop_duplicates(keep='first', inplace=True)
ref_panel.drop_duplicates(keep='first', inplace=True)
inter_id = zscore.index.intersection(ref_panel.index).drop_duplicates(keep='first')
print(inter_id)
zscore = zscore.loc[inter_id]
sub_ref_panel = ref_panel.loc[inter_id]
print(inter_id)
print(sub_ref_panel)
print(sub_ref_panel.shape)
print(zscore)
print(zscore.shape)
zscore.sort_index(inplace=True)
sub_ref_panel.sort_index(inplace=True)
......@@ -189,7 +183,6 @@ def impg_like_imputation(ld_file, ref_panel, zscore, window_size, buffer, lamb,
# keep only SNP with non negligible explained variance
snp_well_predicted = (batch_df.Var < 0.9)
print(batch_df)
batch_df_filt = batch_df.loc[(in_core_window & snp_well_predicted), zscore_results.columns]
zscore_results = pd.concat([zscore_results, batch_df_filt])
except (ValueError, KeyError, TypeError) as e:
......
Supports Markdown
0% Try again or attach a new file.
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment