Commit e2a0b1ae authored by Hanna  JULIENNE's avatar Hanna JULIENNE
Browse files

imputation test on real data + loci drawing

parent fd9057ca
......@@ -31,12 +31,16 @@ Project Organization
│   │   └
| |__ 1 generate_genotype.R : generate genotype with a specified LD structure
| |__ 2 generate_Zscores.py
| | Generate estimated Zscore by simulating a continuous phenotypes from genotype and
| | Generate estimated Zscore by simulating a continuous
phenotypes from genotype and
| normal noise. Then estimate beta and Zscore with a linear regression. Cohort sizes
| are generated based on hgcovid cohort sizes
| _ 3 generate_Zscores_small_cohort.py
Variant of the preceding script to simulate Z-scores using only small cohorts. Hypothesis to test:
Can we recover a proper signal in the meta analysis using very noisy input
| Variant of the precedent script to simulate Z-scores using
| only small cohorts. Hypothesis to test:
| Can we recover a proper signal in the meta analysis using
| very noisy input
|_ 4 retrieve_european_cohort.py : Filter to retrieve european study with enough SNP available
│ │
│   ├── features
......@@ -50,6 +54,10 @@ Project Organization
│   │   └── Imputation_strategy_simulation.py
impute Zscores using simulated data : mask 50 SNPs on
200, reimpute then save results assuming different sample size (based on hgcovid consortium or always 100)
|__ Impution_real_data.py
│ │
│   └── visualization <- Scripts to create exploratory and results oriented visualizations
│   └── Draw_LD.R : Draw the LD matrix used to simulate Data
......
......@@ -40,8 +40,6 @@ def format_result_df(imp, id_masked, Z_known ,known):
return batch_df
if __name__ == '__main__':
# import sample size each study
print(sys.argv)
......
import pandas as pd
import importlib.util
import raiss
import re
import numpy as np
def dna_complement_base(inputbase):
    """Return the Watson-Crick complement of a single nucleotide.

    Any base outside {A, T, G, C} yields the sentinel string 'Not ATGC'.
    """
    complement_of = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
    return complement_of.get(inputbase, 'Not ATGC')
def dna_complement(input):
    """Return the list of complement bases for the sequence *input*.

    Unknown bases map to the sentinel 'Not ATGC', mirroring
    dna_complement_base (inlined here for self-containment).
    """
    complement_of = {'A': 'T', 'T': 'A', 'G': 'C', 'C': 'G'}
    return [complement_of.get(base, 'Not ATGC') for base in input]
def format_result_df(imp, id_masked, Z_known, known):
    """Assemble imputed and observed Z-scores into one dataframe indexed by position.

    Args:
        imp: dict-like imputation output carrying 'mu', 'var', 'ld_score',
            'condition_number' and 'correct_inversion' entries for the
            imputed SNPs.
        id_masked: positions of the imputed (masked) SNPs.
        Z_known: observed Z-scores of the known SNPs.
        known: positions of the known SNPs.

    Returns:
        pandas.DataFrame indexed by 'pos', one row per SNP. Known SNPs carry
        sentinel diagnostics (Var=0, ld_score=-1, condition_number=-1,
        correct_inversion=-1).
    """
    columns = ['pos', "Z", "Var", "ld_score", "condition_number",
               "correct_inversion", "Nsnp_to_impute"]
    # Rows for the SNPs whose Z-score was imputed.
    imputed_part = pd.DataFrame({
        'pos': id_masked,
        "Z": imp['mu'],
        "Var": imp["var"],
        "ld_score": imp["ld_score"],
        "condition_number": imp['condition_number'],
        "correct_inversion": imp["correct_inversion"],
        "Nsnp_to_impute": len(known),
    }, columns=columns)
    # Rows for the SNPs kept as observed; diagnostics use sentinel values.
    observed_part = pd.DataFrame({
        'pos': known,
        "Z": Z_known,
        "Var": 0,
        "ld_score": -1,
        "condition_number": -1,
        "correct_inversion": -1,
        "Nsnp_to_impute": len(known),
    }, columns=columns)
    combined = pd.concat([imputed_part, observed_part])
    return combined.set_index("pos")
def compute_is_flipped(mgwas):
    """
    Check whether each SNP's alleles in the GWAS data are swapped relative
    to the reference panel, either directly or on the complementary strand.

    Args:
        mgwas (pandas dataframe): GWAS study dataframe merged with the
            reference panel (columns Ref_all/alt_all from the panel and
            REF/ALT plus their complements REFc/ALTc from the GWAS side).

    Return:
        pandas boolean Series, True where the SNP is flipped.
    """
    # Direct swap: panel reference equals GWAS alternative and vice versa.
    swapped_direct = (mgwas.Ref_all == mgwas.ALT) & (mgwas.alt_all == mgwas.REF)
    # Swap on the complementary strand.
    swapped_complement = (mgwas.Ref_all == mgwas.ALTc) & (mgwas.alt_all == mgwas.REFc)
    return swapped_direct | swapped_complement
def compute_is_aligned(mgwas):
    """
    Check whether each SNP's alleles in the GWAS data match the reference
    panel, either directly or on the complementary strand.

    This should be the complement of compute_is_flipped, but both are
    computed independently so weird cases (e.g. more than two alleles)
    can be detected downstream.

    Return:
        pandas boolean Series, True where the SNP is aligned.
    """
    # Direct match of both alleles.
    match_direct = (mgwas.Ref_all == mgwas.REF) & (mgwas.alt_all == mgwas.ALT)
    # Match against the complementary-strand alleles.
    match_complement = (mgwas.Ref_all == mgwas.REFc) & (mgwas.alt_all == mgwas.ALTc)
    return match_direct | match_complement
def compute_snp_alignement(mgwas):
    """
    Add a 'sign_flip' column to mgwas indicating whether the reference and
    coded alleles are flipped compared to the reference panel. If flipped
    (sign_flip == -1), the sign of the statistics must be inverted.

    Args:
        mgwas: pandas dataframe of the GWAS data merged with the reference
            panel; must carry Ref_all/alt_all (panel) and REF/ALT (GWAS).

    Returns:
        mgwas with upper-cased REF/ALT, complement columns REFc/ALTc,
        un-alignable rows removed, and the new sign_flip column
        (+1 aligned, -1 flipped, NaN otherwise).

    Raises:
        ValueError: if some SNP is simultaneously aligned and flipped
            (ambiguous variant).
    """
    # Local import: the module top-level does not import warnings, which
    # made the original warn() calls raise NameError.
    import warnings

    # Ensure alleles are upper case before any comparison.
    mgwas['REF'] = mgwas.REF.str.upper()
    mgwas['ALT'] = mgwas.ALT.str.upper()
    mgwas['REFc'] = dna_complement(mgwas.REF)
    mgwas['ALTc'] = dna_complement(mgwas.ALT)
    is_aligned = compute_is_aligned(mgwas)
    is_flipped = compute_is_flipped(mgwas)
    # SNPs that are neither aligned nor flipped cannot be used at all.
    # Work on raw numpy masks: the Series returned by the helpers keep
    # mgwas's index, and boolean .loc would misalign after filtering.
    not_aligned_at_all = ~(is_aligned.values | is_flipped.values)
    if not_aligned_at_all.any():
        warnings.warn('Some snps are not aligned at all with the reference panel\nand will be filtered:\n{}'.format(mgwas.index[not_aligned_at_all]))
        mgwas = mgwas.loc[~not_aligned_at_all]
        warnings.warn('Number of unaligned SNPs: {}'.format(int(not_aligned_at_all.sum())))
        # Keep the masks in sync with the filtered dataframe.
        is_aligned = is_aligned.loc[~not_aligned_at_all]
        is_flipped = is_flipped.loc[~not_aligned_at_all]
    # A SNP both aligned and flipped is ambiguous and indicates bad data.
    both_aligned_and_flipped = is_aligned.values & is_flipped.values
    if both_aligned_and_flipped.any():
        raise ValueError('Some snps are both aligned and flipped:\n{}'.format(mgwas.index[both_aligned_and_flipped]))
    mgwas['sign_flip'] = np.nan
    mgwas.loc[is_aligned.values, "sign_flip"] = 1
    mgwas.loc[is_flipped.values, "sign_flip"] = -1
    return(mgwas)
def sorted_alleles(x):
    """Return the characters of *x* sorted and concatenated.

    Used to build an allele-order-insensitive positional key
    (e.g. 'TA' and 'AT' both map to 'AT').
    """
    ordered = sorted(x)
    return "".join(ordered)
if __name__ == '__main__':
    # NOTE(review): every path below is hard-coded to the Pasteur cluster;
    # the script is not portable as-is.
    # Significant-signal reference table; loaded but apparently not used
    # below — presumably kept for interactive inspection. TODO confirm.
    signif_signal = pd.read_csv("/pasteur/zeus/projets/p02/GGS_WKD/PROJECT_imputation_covidhg/hgcovid_imputation/data/external/result_df6_B2_compare.tsv", sep="\t")
    # European-cohort Z-scores (hg37 coordinates), one "*_Z" column per study.
    eur_filled_out = pd.read_csv("/pasteur/zeus/projets/p02/GGS_WKD/PROJECT_imputation_covidhg/hgcovid_imputation/data/interim/Zscores_hg37/Z_scores_EUR.tsv", sep="\t")
    # Locus -> LD-matrix file mapping; keep only loci with an LD matrix.
    Loci_dict = pd.read_csv("/pasteur/zeus/projets/p02/GGS_WKD/PROJECT_imputation_covidhg/hgcovid_imputation/data/external/Loci_matrix_dict_2.csv", sep=",", index_col=1)
    Loci_dict = Loci_dict.loc[~Loci_dict.LD_matrix.isna()]
    # Allele-order-insensitive positional key: chr + position + sorted alleles.
    eur_filled_out["positional_index"] = eur_filled_out["#CHR"].apply(str)+eur_filled_out.hg37_pos.apply(str)+(eur_filled_out.REF+eur_filled_out.ALT).apply(sorted_alleles)
    eur_filled_out.set_index("positional_index", inplace=True)
    # Study Z-score columns are recognised by the "_Z" suffix.
    Zscores_col = [zscore for zscore in eur_filled_out.columns if re.search("_Z$", zscore)]
    for loci_id in Loci_dict.index:
        try :
            # NOTE(review): "PROCESSIN" is a typo in the log string (left as-is).
            print("PROCESSIN LOCI ")
            print(Loci_dict.loc[loci_id])
            ld_file = Loci_dict.loc[loci_id,'LD_matrix']
            # Reference panel .bim file (plink format) for the locus chromosome.
            ref_panel = pd.read_csv( "/pasteur/zeus/projets/p02/GGS_WKD/PROJECT_imputation_covidhg/hgcovid_imputation/data/raw/ref_panel/ref_panel_chr{}.bim".format(int(Loci_dict.loc[loci_id,'CHR'])), sep="\t", names=['chr',"nothing", 'pos', 'Ref_all', 'alt_all'], index_col=1)
            # Sparse LD matrix restricted to the reference-panel SNPs.
            LD_matrix = raiss.ld_matrix.load_sparse_matrix("/pasteur/zeus/projets/p02/GGS_WKD/PROJECT_imputation_covidhg/hgcovid_imputation/data/raw/LD_matrices/nfe/{}".format(ld_file), ref_panel)
            ref_panel.reset_index(inplace=True)
            # Drop strand-ambiguous (palindromic) variants A/T and C/G.
            ref_panel = ref_panel.loc[~(ref_panel.Ref_all+ref_panel.alt_all).isin(["AT", "TA", 'CG','GC'])]
            # Same positional key as the GWAS table, then merge on it.
            ref_panel["positional_index"] = ref_panel.chr.apply(str)+ref_panel.pos.apply(str)+(ref_panel.Ref_all+ref_panel.alt_all).apply(sorted_alleles)
            ref_panel.set_index("positional_index", inplace=True)
            mgwas = pd.merge(ref_panel, eur_filled_out, left_index=True, right_index =True)
            # Align alleles on the panel and flip the sign of Z/beta columns
            # for SNPs whose alleles are swapped relative to the panel.
            mgwas = compute_snp_alignement(mgwas)
            col_to_flip = [zscore for zscore in mgwas.columns if re.search("_Z$|_beta$", zscore)]
            mgwas.loc[mgwas.sign_flip==-1,col_to_flip] = -mgwas.loc[mgwas.sign_flip==-1,col_to_flip]
            # Restrict to the current locus and to SNPs present in the LD matrix.
            loci = mgwas.loc[(mgwas['loc']==loci_id)]
            loci.set_index("index", inplace=True)
            loci = loci.loc[loci.index.intersection(LD_matrix.index)]
            # Global masking: the SAME ~10% of SNPs are hidden for every study.
            to_mask_globally = np.random.choice(loci.index, int(loci.shape[0]/10))
            known = loci.index.difference(to_mask_globally)
            unknown = LD_matrix.index.difference(known)
            print('SNP masked : {0}, SNP known : {1}, SNP IMPUTED : {2}'.format(len(to_mask_globally),len(known),len(unknown)))
            print("PROCESS GLOBAL MASKING")
            for study in Zscores_col:
                #print(study)
                # NOTE(review): "Ref_all" is selected twice (becomes both A0
                # and A1); the second presumably should be "alt_all" — TODO
                # confirm. Only the "Z" column is used by the model below.
                Zscore = loci[['#CHR', "POS", "Ref_all", "Ref_all", study]]
                Zscore.columns = ['rsID', "pos", "A0", "A1", "Z"]
                Z_masked = loci[study].copy(deep=True)
                Z_masked.loc[to_mask_globally] = np.nan
                # Impute the unknown SNPs from the known ones with RAISS.
                imp = raiss.stat_models.raiss_model(Zscore.loc[known, "Z"], LD_matrix.loc[known,known], LD_matrix.loc[unknown,known],rcond=0.000001 )
                Z_imputed = format_result_df(imp, unknown, Z_masked.loc[known], known)
                # Store per-study imputation results alongside the originals.
                loci["{}_imputed".format(study)] = Z_imputed.Z.loc[loci.index]
                loci["{}_Var_imputed".format(study)] = Z_imputed.Var.loc[loci.index]
                loci["{}_LD_imputed".format(study)] = Z_imputed.ld_score.loc[loci.index]
            loci.to_csv("/pasteur/zeus/projets/p02/GGS_WKD/PROJECT_imputation_covidhg/hgcovid_imputation/data/processed/Real_data/global_masking/loci_{}.tsv".format(loci_id), sep="\t")
            # Random masking: a FRESH ~10% mask is drawn for each study.
            loci = mgwas.loc[(mgwas['loc']==loci_id)]
            loci.set_index("index", inplace=True)
            loci = loci.loc[loci.index.intersection(LD_matrix.index)]
            print("PROCESS RANDOM MASKING")
            for study in Zscores_col:
                #print(study)
                to_mask_in_study = np.random.choice(loci.index, int(loci.shape[0]/10))
                # NOTE(review): same duplicated "Ref_all" selection as above.
                Zscore = loci[['#CHR', "POS", "Ref_all", "Ref_all", study]]
                Zscore.columns = ['rsID', "pos", "A0", "A1", "Z"]
                Z_masked = loci[study].copy(deep=True)
                Z_masked.loc[to_mask_in_study] = np.nan
                known = loci.index.difference(to_mask_in_study)
                unknown = LD_matrix.index.difference(known)
                imp = raiss.stat_models.raiss_model(Zscore.loc[known, "Z"], LD_matrix.loc[known,known], LD_matrix.loc[unknown,known],rcond=0.000001 )
                Z_imputed = format_result_df(imp, unknown, Z_masked.loc[known], known)
                loci["{}_imputed".format(study)] = Z_imputed.Z.loc[loci.index]
                loci["{}_Var_imputed".format(study)] = Z_imputed.Var.loc[loci.index]
                loci["{}_LD_imputed".format(study)] = Z_imputed.ld_score.loc[loci.index]
            loci.to_csv("/pasteur/zeus/projets/p02/GGS_WKD/PROJECT_imputation_covidhg/hgcovid_imputation/data/processed/Real_data/random_masking/loci_{}.tsv".format(loci_id), sep="\t")
        except Exception as e:
            # Best-effort per-locus processing: report the failure and move on.
            print("Loci {} not processed with:".format(loci_id))
            print(e)
Supports Markdown
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment