results from simulation

11560560 · Hanna JULIENNE · f76f8cd3 · 11560560 · f76f8cd3 · 11560560
Commit 11560560 authored 3 years ago by Hanna JULIENNE
--- a/README.md
+++ b/README.md
@@ -45,6 +45,7 @@ Project Organization
    │   │
    │   ├── features       
    │   │   └── add_hg_37_pos.py : perform liftover to hg37 (ad GnomAD is in hg37)
+    |   |   |__ retrieve_filled_out.py : Retrieve european studies with more than 80% of filled out SNPS
    │   │
    │   ├── models         
    │   │   │   
@@ -55,9 +56,7 @@ Project Organization
                impute Zscores using simulated data : mask 50 SNPs on  
                200,     reimpute then save results assuming different sample size (based on hgcovid consortium or always 100)
            |__ Impution_real_data.py
-
-
-
+            |__ Imputation_test_real_data.py : mask 10% of SNPs around significant loci and impute them back
    │   │
    │   └── visualization  <- Scripts to create exploratory and results oriented visualizations
    │       └── Draw_LD.R : Draw the LD matrix used to simulate Data
@@ -65,6 +64,7 @@ Project Organization
    |           (intrinsic variability due to sample size)
    |       |__ Draw_signal_variability.R : draw imputed signal
    |           ( variability due to sample size + imputation error)
+            |_ Imputation_strategy_real_data.R : Compare meta_analysis based on original Z-scores vs Imputed Z-scores
    │
    └── tox.ini            <- tox file with settings for running tox; see tox.readthedocs.io


--- a/reports/figures/meta_analysis_perf.png
+++ b/reports/figures/meta_analysis_perf.png
--- a/reports/tables/Meta_analysis_imputation_performance.csv
+++ b/reports/tables/Meta_analysis_imputation_performance.csv
 "","correlation","Max_L1_error","Mean_L1_error"
-"meta_Z_imputed",0.995683217500374,1.22529552037174,0.0562121582752216
-"meta_Z_imputed_R2weight",0.990440000978169,1.59426643449452,0.454110616671281
-"meta_Z_imputed_high_samp",0.989021574001197,3.62404793904812,1.07203953928316
-"meta_Z_imputed_valid",0.996024141348819,1.22529552037174,0.05602862121342
-"meta_Z_imputed_valid_R2weight",0.996024141348819,1.22529552037174,0.05602862121342
-"meta_Z_imputed_valid_high_samp",0.995289648693123,3.36583792058164,1.0635503576084
+"meta_Z_imputed",0.997715664577698,2.10996809407546,0.0675865516980447
+"meta_Z_imputed_R2weight",0.995827954783326,5.18573176225036,0.430368355995472
+"meta_Z_imputed_high_samp",0.995382502246784,14.0988026398761,1.05536810725203
+"meta_Z_imputed_valid",0.998172604572373,2.10996809407546,0.0660996762793101
+"meta_Z_imputed_valid_R2weight",0.998172604572373,2.10996809407546,0.0660996762793099
+"meta_Z_imputed_valid_high_samp",0.997893748220566,14.0588918701583,1.09929561392649
--- a/reports/tables/random_masking_real_data_error.csv
+++ b/reports/tables/random_masking_real_data_error.csv
@@ -356,3 +356,37 @@
 "355","BoSCO_EUR_Z",0.985476354338373,0.0956705476111556,0.807414380574432,26
 "356","Amsterdam_UMC_COVID_study_group_EUR_Z",0.968965449672844,0.128762104108945,0.798285013504903,26
 "357","meta_Z",0.976898019792069,0.113298175037217,1.0469342369506,26
+"358","HOSTAGE_EUR_Z",0.991328644870294,0.141343612759392,0.160127095950146,27
+"359","GHS_Freeze_145_EUR_Z",0.99766767575791,0.0749037098892356,0.228020291817981,27
+"360","UKBB_EUR_Z",0.997094759585677,0.0515461004095718,0.160799166711033,27
+"361","GENCOVID_EUR_Z",0.909612515827131,0.136975421410257,0.435332338970176,27
+"362","BelCovid_EUR_Z",0.996923577061899,0.0584577391022102,0.0744064871095851,27
+"363","23ANDME_EUR_Z",0.998858805300257,0.0639462153774435,0.100111024149858,27
+"364","SweCovid_EUR_Z",0.984107078760595,0.122312852774997,0.288453118204419,27
+"365","BQC19_EUR_Z",0.991219808057431,0.0947491992326761,0.351113880427233,27
+"366","idipaz24genetics_EUR_Z",0.978086957636132,0.149899608436772,0.441550907259962,27
+"367","ANCESTRY_Freeze_Four_EUR_Z",0.986168329804986,0.0925800491471221,0.316496190672388,27
+"368","EstBB_EUR_Z",0.987555385124586,0.0286432287004613,0.0394833486854393,27
+"369","Generation_Scotland_EUR_Z",0.984925142802746,0.0716150287562398,0.142388724433706,27
+"370","DECODE_EUR_Z",0.999718201250732,0.0512618437741581,0.0795751464786553,27
+"371","MVP_EUR_Z",0.999129826194626,0.0443966500674167,0.130271603592105,27
+"372","BoSCO_EUR_Z",0.998222783557483,0.0359005277769725,0.0521612337125028,27
+"373","Amsterdam_UMC_COVID_study_group_EUR_Z",0.911884325479305,0.0970165938865812,0.21003961291722,27
+"374","meta_Z",0.996714601848827,0.054356164774863,0.116558759040066,27
+"375","HOSTAGE_EUR_Z",0.996746629790268,0.103698167140976,0.193617866467096,28
+"376","GHS_Freeze_145_EUR_Z",0.986706077331538,0.192115033844255,0.419428737445626,28
+"377","UKBB_EUR_Z",0.999728018960211,0.0208145007407018,0.0297093426862298,28
+"378","GENCOVID_EUR_Z",0.999505981199744,0.0673015356139549,0.108717238573582,28
+"379","BelCovid_EUR_Z",0.999706986330761,0.0776637753057837,0.117534253598823,28
+"380","23ANDME_EUR_Z",0.996021892955152,0.103335926547659,0.275330429767461,28
+"381","SweCovid_EUR_Z",0.997928662876803,0.0362859206230599,0.104399152792451,28
+"382","BQC19_EUR_Z",0.996417131989554,0.0504240096598093,0.106532510727776,28
+"383","idipaz24genetics_EUR_Z",0.991619880579089,0.064726198305787,0.159112901032828,28
+"384","ANCESTRY_Freeze_Four_EUR_Z",0.984404222546479,0.0682552263548015,0.101730447077309,28
+"385","EstBB_EUR_Z",0.999916106361675,0.0159565059447764,0.029138947868637,28
+"386","Generation_Scotland_EUR_Z",0.993150020263495,0.0692212245736399,0.166621314818111,28
+"387","DECODE_EUR_Z",0.999905560185049,0.0384715385041048,0.0738413955579982,28
+"388","MVP_EUR_Z",0.999939041634757,0.0395354830763295,0.0699736515063329,28
+"389","BoSCO_EUR_Z",0.953628275945103,0.127214268108066,0.380480757814329,28
+"390","Amsterdam_UMC_COVID_study_group_EUR_Z",0.98314126972152,0.186044808227531,0.385032944988245,28
+"391","meta_Z",0.993455976919658,0.10198685402926,0.23546573006364,28
--- a/src/visualization/Draw_Imputation_quality.R
+++ b/src/visualization/Draw_Imputation_quality.R
@@ -4,7 +4,8 @@ library(cowplot)


 setwd("/mnt/zeus/GGS/PROJECT_imputation_covidhg/hgcovid_imputation/src/visualization")
-
+cohort=""
+tag="one_causal"
 for(cohort in c("", "_small_cohort")){
  N_eff = fread(paste0("../../data/external/meta_data/N_effective",cohort,".csv"))
  N_eff = as.data.frame(N_eff)
@@ -13,27 +14,36 @@ for(cohort in c("", "_small_cohort")){

  for( tag in c("null","one_causal", 'two_opposite', 'two_causal')){
    imp_file = paste0("../../data/processed/Simulated/Imputed/Imputed",cohort,"_",tag,".csv")
+    Zscores_file = paste0("../../data/processed/Simulated/Zscores/Zscore",cohort,"_",tag,".csv")
    Imputation = fread(imp_file)
+    Zscores = fread(Zscores_file)
+    Imputation$V1 = as.character(Imputation$V1)
+    Zscores$V1 = as.character(Zscores$V1)
+    setkey(Zscores, V1)
+    setkey(Imputation, V1)
+    ID = intersect(Zscores$V1, Imputation$V1)
+    Zscores = Zscores[ID,]
+

-    cor_ref = data.frame(correlation = cor(Imputation[,which(grepl("Z_",names(Imputation))), with=FALSE])[,1])
+    cor_ref = data.frame(correlation = diag(cor(Imputation[ID,grep("Z_",names(Zscores), value=TRUE), with=FALSE],
+     Zscores[Imputation$V1, grep("Z_",names(Zscores), value=TRUE),with=FALSE])))
    cor_ref["N"] = N_eff[row.names(cor_ref), "N_effective"]
-    head(cor_ref)
-    p_cor = ggplot(cor_ref, aes(x=N, y=correlation)) + geom_line() + geom_point() + theme_minimal()

+    p_cor = ggplot(cor_ref, aes(x=N, y=correlation)) + geom_line() + geom_point() + theme_minimal() +ylim(c(0,1))
+    p_cor
    beta_hat = melt(Imputation[,c(1,which(grepl("Beta_",names(Imputation)))), with=FALSE], id.vars=1)

    psig = ggplot(beta_hat, aes(x=V1, y=abs(value), group=variable)) + geom_line(alpha=0.2) + geom_line(data=beta_hat[variable=="Beta_ref",], color="midnightblue",lwd=1.1)
    psig =psig + theme_minimal() + xlab("snp") + ylab("beta")

-    beta_hat_prec = melt(Imputation[,c("V1","Beta_ref",paste0("Beta_",N_eff[N_eff$N_effective > 50,"study"])), with=FALSE], id.vars=1)
+    beta_hat_prec = melt(Imputation[,c("V1","Beta_ref",paste0("Beta_",N_eff[N_eff$study!="","study"])), with=FALSE], id.vars=1)
    psig_prec = ggplot(beta_hat_prec, aes(x=V1, y=abs(value), group=variable)) + geom_line(alpha=0.2) + geom_line(data=beta_hat_prec[variable=="Beta_ref",], color="midnightblue",lwd=1.1)
    psig_prec = psig_prec + theme_minimal() + xlab("snp") + ylab("beta")

    Beta_scatter = melt(Imputation[,c(1,which(grepl("Beta_",names(Imputation)))), with=FALSE], id.vars=c(1,2))
    ggplot(Beta_scatter, aes(x=Beta_ref, y=value))+geom_point()

-
-    Beta_scatter_prec = melt(Imputation[,c("V1","Beta_ref",paste0("Beta_",N_eff[N_eff$N_effective > 50,"study"])), with=FALSE], id.vars=c(1,2))
+    Beta_scatter_prec = melt(Imputation[,c("V1","Beta_ref",paste0("Beta_",N_eff[N_eff$study!="","study"])), with=FALSE], id.vars=c(1,2))
    Beta_scatter_prec = ggplot(Beta_scatter_prec, aes(x=Beta_ref, y=value, color=variable))+geom_point() + scale_colour_hue() + theme_minimal()+theme(legend.pos="none") + xlab('Beta') + ylab("imputed Beta")

    panel = plot_grid(p_cor, psig_prec, Beta_scatter_prec, labels=c("A", "B", "C"), nrow=1)

--- a/src/visualization/Draw_loci_real_data.R
+++ b/src/visualization/Draw_loci_real_data.R
@@ -4,7 +4,7 @@ library(ggplot2)

 setwd("/pasteur/zeus/projets/p02/GGS_WKD/PROJECT_imputation_covidhg/hgcovid_imputation/src/visualization/")
 loci_id = 2
-masking_type = "global_masking"
+masking_type = "random_masking"

 sample_size = fread('/pasteur/zeus/projets/p02/GGS_WKD/PROJECT_imputation_covidhg/hgcovid_imputation/data/external/meta_data/N_effective.csv')
 correlation_list = list()

--- a/src/visualization/Imputation_strategy_real_data.R
+++ b/src/visualization/Imputation_strategy_real_data.R
@@ -31,8 +31,6 @@ z_col_var =grep("_Z_Var_imputed$", names(loci_all), value=TRUE)

 sapply(Z_cols, get_study)
 Z_score = loci_all[,Z_cols, with=FALSE]
-
-
 Z_score_var = loci_all[,z_col_var, with=FALSE]
 Z_score_imputed = loci_all[,Z_col_imputed, with=FALSE]

@@ -44,7 +42,6 @@ compute_meta_analysis <- function(x, min_samp_size=0){
    study = get_study(nm)
    if((!is.na(x[nm]) & !is.na(sample_size[study, N_effective])))
    {
-
      if((sample_size[study, N_effective>min_samp_size])){
        meta_denominator = meta_denominator + x[nm] * sample_size[study, N_effective^0.5]
        meta_numerator = meta_numerator + sample_size[study, N_effective]