Commit 691042d3 authored by Hanna  JULIENNE's avatar Hanna JULIENNE
Browse files

fixed position error due to liftover

parent e2a0b1ae
......@@ -123,6 +123,7 @@ def compute_snp_alignement(mgwas):
def sorted_alleles(x):
    """Return the elements of ``x`` sorted and concatenated into one string.

    For a two-letter allele string this gives an order-independent form,
    e.g. ``"TA"`` and ``"AT"`` both become ``"AT"``.

    Parameters
    ----------
    x : iterable of str
        Typically a string of allele letters.

    Returns
    -------
    str
        The elements of ``x`` in ascending order, joined with no separator.
    """
    return "".join(sorted(x))
if __name__ == '__main__':
signif_signal = pd.read_csv("/pasteur/zeus/projets/p02/GGS_WKD/PROJECT_imputation_covidhg/hgcovid_imputation/data/external/result_df6_B2_compare.tsv", sep="\t")
......@@ -136,7 +137,7 @@ if __name__ == '__main__':
Zscores_col = [zscore for zscore in eur_filled_out.columns if re.search("_Z$", zscore)]
loci_id=14
for loci_id in Loci_dict.index:
try :
print("PROCESSIN LOCI ")
......@@ -145,19 +146,31 @@ if __name__ == '__main__':
ld_file = Loci_dict.loc[loci_id,'LD_matrix']
ref_panel = pd.read_csv( "/pasteur/zeus/projets/p02/GGS_WKD/PROJECT_imputation_covidhg/hgcovid_imputation/data/raw/ref_panel/ref_panel_chr{}.bim".format(int(Loci_dict.loc[loci_id,'CHR'])), sep="\t", names=['chr',"nothing", 'pos', 'Ref_all', 'alt_all'], index_col=1)
LD_matrix = raiss.ld_matrix.load_sparse_matrix("/pasteur/zeus/projets/p02/GGS_WKD/PROJECT_imputation_covidhg/hgcovid_imputation/data/raw/LD_matrices/nfe/{}".format(ld_file), ref_panel)
ref_panel.reset_index(inplace=True)
ref_panel = ref_panel.loc[~(ref_panel.Ref_all+ref_panel.alt_all).isin(["AT", "TA", 'CG','GC'])]
ref_panel["positional_index"] = ref_panel.chr.apply(str)+ref_panel.pos.apply(str)+(ref_panel.Ref_all+ref_panel.alt_all).apply(sorted_alleles)
ref_panel.set_index("positional_index", inplace=True)
ref_panel.shape
eur_filled_out.shape
eur_filled_out.loc[(eur_filled_out["loc"] == 14).values].shape
eur_filled_out.loc[(eur_filled_out["loc"] == 14).values].index.difference(ref_panel.index)
ref_panel.pos[ref_panel.pos > 61455328]
mgwas = pd.merge(ref_panel, eur_filled_out, left_index=True, right_index =True)
mgwas = compute_snp_alignement(mgwas)
mgwas.shape
col_to_flip = [zscore for zscore in mgwas.columns if re.search("_Z$|_beta$", zscore)]
mgwas.loc[mgwas.sign_flip==-1,col_to_flip] = -mgwas.loc[mgwas.sign_flip==-1,col_to_flip]
loci = mgwas.loc[(mgwas['loc']==loci_id)]
loci.set_index("index", inplace=True)
loci = loci.loc[loci.index.intersection(LD_matrix.index)]
loci.shape
to_mask_globally = np.random.choice(loci.index, int(loci.shape[0]/10))
known = loci.index.difference(to_mask_globally)
......@@ -165,14 +178,16 @@ if __name__ == '__main__':
print('SNP masked : {0}, SNP known : {1}, SNP IMPUTED : {2}'.format(len(to_mask_globally),len(known),len(unknown)))
print("PROCESS GLOBAL MASKING")
for study in Zscores_col:
#print(study)
print(study)
Zscore = loci[['#CHR', "POS", "Ref_all", "Ref_all", study]]
Zscore.columns = ['rsID', "pos", "A0", "A1", "Z"]
print(Zscore.head())
Z_masked = loci[study].copy(deep=True)
Z_masked.loc[to_mask_globally] = np.nan
print(Z_masked.head())
imp = raiss.stat_models.raiss_model(Zscore.loc[known, "Z"], LD_matrix.loc[known,known], LD_matrix.loc[unknown,known],rcond=0.000001 )
Z_imputed = format_result_df(imp, unknown, Z_masked.loc[known], known)
......@@ -187,7 +202,7 @@ if __name__ == '__main__':
loci = loci.loc[loci.index.intersection(LD_matrix.index)]
print("PROCESS RANDOM MASKING")
for study in Zscores_col:
#print(study)
print(study)
to_mask_in_study = np.random.choice(loci.index, int(loci.shape[0]/10))
Zscore = loci[['#CHR', "POS", "Ref_all", "Ref_all", study]]
Zscore.columns = ['rsID', "pos", "A0", "A1", "Z"]
......
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment