diff --git a/README.md b/README.md index f3b863caf32530723e56587b121b98175edcc661..5baf3d54a5b05888e869c3b9189476251344c32e 100644 --- a/README.md +++ b/README.md @@ -45,6 +45,7 @@ Project Organization │ │ │  ├── features │  │  └── add_hg_37_pos.py : perform liftover to hg37 (ad GnomAD is in hg37) + | |  |__ retrieve_filled_out.py : Retrieve european studies with more than 80% of filled out SNPS │ │ │  ├── models │ │ │ @@ -55,9 +56,7 @@ Project Organization impute Zscores using simulated data : mask 50 SNPs on 200, reimpute then save results assuming different sample size (based on hgcovid consortium or always 100) |__ Impution_real_data.py - - - + |__ Imputation_test_real_data.py : mask 10% of SNPs around significant loci and impute them back │ │ │  └── visualization <- Scripts to create exploratory and results oriented visualizations │  └── Draw_LD.R : Draw the LD matrix used to simulate Data @@ -65,6 +64,7 @@ Project Organization | (intrinsic variability due to sample size) | |__ Draw_signal_variability.R : draw imputed signal | ( variability due to sample size + imputation error) + |_ Imputation_strategy_real_data.R : Compare meta_analysis based on original Z-scores vs Imputed Z-scores │ └── tox.ini <- tox file with settings for running tox; see tox.readthedocs.io diff --git a/reports/figures/meta_analysis_perf.png b/reports/figures/meta_analysis_perf.png index 18fd66c3b67072e0284be3e3c840287cfa7af730..ac168eb5f20df2ccca6e05e8d92f948752795dfe 100644 Binary files a/reports/figures/meta_analysis_perf.png and b/reports/figures/meta_analysis_perf.png differ diff --git a/reports/tables/Meta_analysis_imputation_performance.csv b/reports/tables/Meta_analysis_imputation_performance.csv index 7f0ced45cf50b5b234b1fa313f8b735121a6ff20..4f0f361754a5cf792f9cf572771b916ce49f7f44 100644 --- a/reports/tables/Meta_analysis_imputation_performance.csv +++ b/reports/tables/Meta_analysis_imputation_performance.csv @@ -1,7 +1,7 @@ "","correlation","Max_L1_error","Mean_L1_error" -"meta_Z_imputed",0.995683217500374,1.22529552037174,0.0562121582752216 -"meta_Z_imputed_R2weight",0.990440000978169,1.59426643449452,0.454110616671281 -"meta_Z_imputed_high_samp",0.989021574001197,3.62404793904812,1.07203953928316 -"meta_Z_imputed_valid",0.996024141348819,1.22529552037174,0.05602862121342 -"meta_Z_imputed_valid_R2weight",0.996024141348819,1.22529552037174,0.05602862121342 -"meta_Z_imputed_valid_high_samp",0.995289648693123,3.36583792058164,1.0635503576084 +"meta_Z_imputed",0.997715664577698,2.10996809407546,0.0675865516980447 +"meta_Z_imputed_R2weight",0.995827954783326,5.18573176225036,0.430368355995472 +"meta_Z_imputed_high_samp",0.995382502246784,14.0988026398761,1.05536810725203 +"meta_Z_imputed_valid",0.998172604572373,2.10996809407546,0.0660996762793101 +"meta_Z_imputed_valid_R2weight",0.998172604572373,2.10996809407546,0.0660996762793099 +"meta_Z_imputed_valid_high_samp",0.997893748220566,14.0588918701583,1.09929561392649 diff --git a/reports/tables/random_masking_real_data_error.csv b/reports/tables/random_masking_real_data_error.csv index 35281fd2e5ab96d2a7ecca96e86c27d7f4fa50ef..bd0747458c797b0ee63e70aff7b7e51f6fd7c79d 100644 --- a/reports/tables/random_masking_real_data_error.csv +++ b/reports/tables/random_masking_real_data_error.csv @@ -356,3 +356,37 @@ "355","BoSCO_EUR_Z",0.985476354338373,0.0956705476111556,0.807414380574432,26 "356","Amsterdam_UMC_COVID_study_group_EUR_Z",0.968965449672844,0.128762104108945,0.798285013504903,26 "357","meta_Z",0.976898019792069,0.113298175037217,1.0469342369506,26 +"358","HOSTAGE_EUR_Z",0.991328644870294,0.141343612759392,0.160127095950146,27 +"359","GHS_Freeze_145_EUR_Z",0.99766767575791,0.0749037098892356,0.228020291817981,27 +"360","UKBB_EUR_Z",0.997094759585677,0.0515461004095718,0.160799166711033,27 +"361","GENCOVID_EUR_Z",0.909612515827131,0.136975421410257,0.435332338970176,27 +"362","BelCovid_EUR_Z",0.996923577061899,0.0584577391022102,0.0744064871095851,27 +"363","23ANDME_EUR_Z",0.998858805300257,0.0639462153774435,0.100111024149858,27 +"364","SweCovid_EUR_Z",0.984107078760595,0.122312852774997,0.288453118204419,27 +"365","BQC19_EUR_Z",0.991219808057431,0.0947491992326761,0.351113880427233,27 +"366","idipaz24genetics_EUR_Z",0.978086957636132,0.149899608436772,0.441550907259962,27 +"367","ANCESTRY_Freeze_Four_EUR_Z",0.986168329804986,0.0925800491471221,0.316496190672388,27 +"368","EstBB_EUR_Z",0.987555385124586,0.0286432287004613,0.0394833486854393,27 +"369","Generation_Scotland_EUR_Z",0.984925142802746,0.0716150287562398,0.142388724433706,27 +"370","DECODE_EUR_Z",0.999718201250732,0.0512618437741581,0.0795751464786553,27 +"371","MVP_EUR_Z",0.999129826194626,0.0443966500674167,0.130271603592105,27 +"372","BoSCO_EUR_Z",0.998222783557483,0.0359005277769725,0.0521612337125028,27 +"373","Amsterdam_UMC_COVID_study_group_EUR_Z",0.911884325479305,0.0970165938865812,0.21003961291722,27 +"374","meta_Z",0.996714601848827,0.054356164774863,0.116558759040066,27 +"375","HOSTAGE_EUR_Z",0.996746629790268,0.103698167140976,0.193617866467096,28 +"376","GHS_Freeze_145_EUR_Z",0.986706077331538,0.192115033844255,0.419428737445626,28 +"377","UKBB_EUR_Z",0.999728018960211,0.0208145007407018,0.0297093426862298,28 +"378","GENCOVID_EUR_Z",0.999505981199744,0.0673015356139549,0.108717238573582,28 +"379","BelCovid_EUR_Z",0.999706986330761,0.0776637753057837,0.117534253598823,28 +"380","23ANDME_EUR_Z",0.996021892955152,0.103335926547659,0.275330429767461,28 +"381","SweCovid_EUR_Z",0.997928662876803,0.0362859206230599,0.104399152792451,28 +"382","BQC19_EUR_Z",0.996417131989554,0.0504240096598093,0.106532510727776,28 +"383","idipaz24genetics_EUR_Z",0.991619880579089,0.064726198305787,0.159112901032828,28 +"384","ANCESTRY_Freeze_Four_EUR_Z",0.984404222546479,0.0682552263548015,0.101730447077309,28 +"385","EstBB_EUR_Z",0.999916106361675,0.0159565059447764,0.029138947868637,28 +"386","Generation_Scotland_EUR_Z",0.993150020263495,0.0692212245736399,0.166621314818111,28 +"387","DECODE_EUR_Z",0.999905560185049,0.0384715385041048,0.0738413955579982,28 +"388","MVP_EUR_Z",0.999939041634757,0.0395354830763295,0.0699736515063329,28 +"389","BoSCO_EUR_Z",0.953628275945103,0.127214268108066,0.380480757814329,28 +"390","Amsterdam_UMC_COVID_study_group_EUR_Z",0.98314126972152,0.186044808227531,0.385032944988245,28 +"391","meta_Z",0.993455976919658,0.10198685402926,0.23546573006364,28 diff --git a/src/visualization/Draw_Imputation_quality.R b/src/visualization/Draw_Imputation_quality.R index a6719690ff538e222fd0f5c16369af7c465908cc..793c823873a44589ca06e59283a5d16ab989f027 100644 --- a/src/visualization/Draw_Imputation_quality.R +++ b/src/visualization/Draw_Imputation_quality.R @@ -4,7 +4,8 @@ library(cowplot) setwd("/mnt/zeus/GGS/PROJECT_imputation_covidhg/hgcovid_imputation/src/visualization") - +cohort="" +tag="one_causal" for(cohort in c("", "_small_cohort")){ N_eff = fread(paste0("../../data/external/meta_data/N_effective",cohort,".csv")) N_eff = as.data.frame(N_eff) @@ -13,27 +14,36 @@ for(cohort in c("", "_small_cohort")){ for( tag in c("null","one_causal", 'two_opposite', 'two_causal')){ imp_file = paste0("../../data/processed/Simulated/Imputed/Imputed",cohort,"_",tag,".csv") + Zscores_file = paste0("../../data/processed/Simulated/Zscores/Zscore",cohort,"_",tag,".csv") Imputation = fread(imp_file) + Zscores = fread(Zscores_file) + Imputation$V1 = as.character(Imputation$V1) + Zscores$V1 = as.character(Zscores$V1) + setkey(Zscores, V1) + setkey(Imputation, V1) + ID = intersect(Zscores$V1, Imputation$V1) + Zscores = Zscores[ID,] + - cor_ref = data.frame(correlation = cor(Imputation[,which(grepl("Z_",names(Imputation))), with=FALSE])[,1]) + cor_ref = data.frame(correlation = diag(cor(Imputation[ID,grep("Z_",names(Zscores), value=TRUE), with=FALSE], + Zscores[Imputation$V1, grep("Z_",names(Zscores), value=TRUE),with=FALSE]))) cor_ref["N"] = N_eff[row.names(cor_ref), "N_effective"] - head(cor_ref) - p_cor = ggplot(cor_ref, aes(x=N, y=correlation)) + geom_line() + geom_point() + theme_minimal() + p_cor = ggplot(cor_ref, aes(x=N, y=correlation)) + geom_line() + geom_point() + theme_minimal() +ylim(c(0,1)) + p_cor beta_hat = melt(Imputation[,c(1,which(grepl("Beta_",names(Imputation)))), with=FALSE], id.vars=1) psig = ggplot(beta_hat, aes(x=V1, y=abs(value), group=variable)) + geom_line(alpha=0.2) + geom_line(data=beta_hat[variable=="Beta_ref",], color="midnightblue",lwd=1.1) psig =psig + theme_minimal() + xlab("snp") + ylab("beta") - beta_hat_prec = melt(Imputation[,c("V1","Beta_ref",paste0("Beta_",N_eff[N_eff$N_effective > 50,"study"])), with=FALSE], id.vars=1) + beta_hat_prec = melt(Imputation[,c("V1","Beta_ref",paste0("Beta_",N_eff[N_eff$study!="","study"])), with=FALSE], id.vars=1) psig_prec = ggplot(beta_hat_prec, aes(x=V1, y=abs(value), group=variable)) + geom_line(alpha=0.2) + geom_line(data=beta_hat_prec[variable=="Beta_ref",], color="midnightblue",lwd=1.1) psig_prec = psig_prec + theme_minimal() + xlab("snp") + ylab("beta") Beta_scatter = melt(Imputation[,c(1,which(grepl("Beta_",names(Imputation)))), with=FALSE], id.vars=c(1,2)) ggplot(Beta_scatter, aes(x=Beta_ref, y=value))+geom_point() - - Beta_scatter_prec = melt(Imputation[,c("V1","Beta_ref",paste0("Beta_",N_eff[N_eff$N_effective > 50,"study"])), with=FALSE], id.vars=c(1,2)) + Beta_scatter_prec = melt(Imputation[,c("V1","Beta_ref",paste0("Beta_",N_eff[N_eff$study!="","study"])), with=FALSE], id.vars=c(1,2)) Beta_scatter_prec = ggplot(Beta_scatter_prec, aes(x=Beta_ref, y=value, color=variable))+geom_point() + scale_colour_hue() + theme_minimal()+theme(legend.pos="none") + xlab('Beta') + ylab("imputed Beta") panel = plot_grid(p_cor, psig_prec, Beta_scatter_prec, labels=c("A", "B", "C"), nrow=1) diff --git a/src/visualization/Draw_loci_real_data.R b/src/visualization/Draw_loci_real_data.R index c803a0eb088bbb2da5a3f7b7e9e03cc14964e833..987ff0bfe71585575efda01d096952c82b5b0a77 100644 --- a/src/visualization/Draw_loci_real_data.R +++ b/src/visualization/Draw_loci_real_data.R @@ -4,7 +4,7 @@ library(ggplot2) setwd("/pasteur/zeus/projets/p02/GGS_WKD/PROJECT_imputation_covidhg/hgcovid_imputation/src/visualization/") loci_id = 2 -masking_type = "global_masking" +masking_type = "random_masking" sample_size = fread('/pasteur/zeus/projets/p02/GGS_WKD/PROJECT_imputation_covidhg/hgcovid_imputation/data/external/meta_data/N_effective.csv') correlation_list = list() diff --git a/src/visualization/Imputation_strategy_real_data.R b/src/visualization/Imputation_strategy_real_data.R index 1fca1001881aecc54b6395482df2db895fd7ee5c..4714332e251ae651855fdb9a19dfbb95e09e7bf2 100644 --- a/src/visualization/Imputation_strategy_real_data.R +++ b/src/visualization/Imputation_strategy_real_data.R @@ -31,8 +31,6 @@ z_col_var =grep("_Z_Var_imputed$", names(loci_all), value=TRUE) sapply(Z_cols, get_study) Z_score = loci_all[,Z_cols, with=FALSE] - - Z_score_var = loci_all[,z_col_var, with=FALSE] Z_score_imputed = loci_all[,Z_col_imputed, with=FALSE] @@ -44,7 +42,6 @@ compute_meta_analysis <- function(x, min_samp_size=0){ study = get_study(nm) if((!is.na(x[nm]) & !is.na(sample_size[study, N_effective]))) { - if((sample_size[study, N_effective>min_samp_size])){ meta_denominator = meta_denominator + x[nm] * sample_size[study, N_effective^0.5] meta_numerator = meta_numerator + sample_size[study, N_effective]