diff --git a/02_OBSERVATIONAL/OBSERVATIONAL_00_R_environment.R b/02_OBSERVATIONAL/OBSERVATIONAL_00_R_environment.R index d1987f2bf24a6bcbeaf63493254ab581041d7534..4ebfe7e8d93403202765fd3287541bf2aa0a31a5 100644 --- a/02_OBSERVATIONAL/OBSERVATIONAL_00_R_environment.R +++ b/02_OBSERVATIONAL/OBSERVATIONAL_00_R_environment.R @@ -91,3 +91,18 @@ SEED <- 12345 # - FALSE: curation rule will not be enforced (default) CURATE_DATA_RECALCULATE_AGEY_ETHIOPIA <- TRUE CURATE_DATA_RECALCULATE_AGEY_MADAGASCAR <- FALSE + + + + + +###################################################################### +### ACCEPTABLE DIFFERENCES +###################################################################### +## Global flag defining the *largest* acceptable difference, in years, +## for the reported age between observational and inventory studies. +ACCEPTABLE_DIFF_AGEY_OBS_MINUS_INV_MIN <- 3 + +## Global flag defining the *smallest* acceptable difference, in years, +## for the reported age between observational and inventory studies. +ACCEPTABLE_DIFF_AGEY_OBS_MINUS_INV_MAX <- 0 diff --git a/02_OBSERVATIONAL/OBSERVATIONAL_03_QC_02_merging_observational_and_inventory_data_integrity.R b/02_OBSERVATIONAL/OBSERVATIONAL_03_QC_02_merging_observational_and_inventory_data_integrity.R index 7c713b5ec79e2c9d261e726451f6cf3b2c003bce..99ea1aa7ef5c1c9ef6e4573fa22e37cf8480ce6f 100644 --- a/02_OBSERVATIONAL/OBSERVATIONAL_03_QC_02_merging_observational_and_inventory_data_integrity.R +++ b/02_OBSERVATIONAL/OBSERVATIONAL_03_QC_02_merging_observational_and_inventory_data_integrity.R @@ -24,7 +24,7 @@ if (!exists("DATA_EXTRACT_IS_RECENT_OBS") || !DATA_EXTRACT_IS_RECENT_OBS) { cat("Databases are outdated and will be dumped again.\n") source("./01_INVENTORY/INVENTORY_02_list_all_inventory_participants.R") source("./01_INVENTORY/INVENTORY_03_list_all_inventory_houses.R") - source("./02_OBSERVATIONAL/OBSERVATIONAL_01_dump_REDCap_database.R") + source("./02_OBSERVATIONAL/OBSERVATIONAL_03_QC_01_curated_data_integrity.R") } @@ -53,8 +53,14 @@ dat_observational_curated %>% count(country, record_id_is_censusid, .drop = FALSE) -tmp = dat_observational_curated %>% - select(censusid, consent, sex, agey) %>% +## Check for consistency of demographics across INVENTORY and OBSERVATIONAL data +## NOTE: This is restricted to data for which NO QC FLAG is raised. +dat_observational_inventory_demographics_fulljoin <- dat_observational_curated_QC %>% + filter(!QC_any_rule_is_FALSE) %>% + select(censusid, consent, + sex, + dob_yn, agey) %>% + # REDCap labels were translated in Madagascar. # Handle it here, before it's maybe handled before at the curation stage? mutate(consent = plyr::mapvalues(x = consent, @@ -62,12 +68,17 @@ tmp = dat_observational_curated %>% to = c("Yes", "No")), sex = plyr::mapvalues(x = sex, from = c("Féminin", "Masculin"), - to = c("Female", "Male"))) %>% + to = c("Female", "Male")), + dob_yn = plyr::mapvalues(x = dob_yn, + from = c("Oui", "Non"), + to = c("Yes", "No"))) %>% full_join(inventory_list_p %>% select(censusid, sex, agey), by = join_by(censusid == censusid), suffix = c(".obs", ".inv")) %>% + filter(consent == "Yes") %>% + separate_wider_regex(cols = censusid, patterns = c(country = "^(?:E|M)", "-", @@ -78,7 +89,166 @@ tmp = dat_observational_curated %>% "-", nested_hhid = "\\d{2}", "-", - nested_subjid = "\\d{2}"), too_few = "debug") + nested_subjid = "\\d{2}")) %>% + + # Compare agey and sex columns from INVENTORY and OBSERVATIONAL + mutate(sex_obs_inv_match = (sex.obs == sex.inv), + agey_obs_inv_match = ((agey.obs - agey.inv) <= ACCEPTABLE_DIFF_AGEY_OBS_MINUS_INV_MIN & + (agey.obs - agey.inv) >= ACCEPTABLE_DIFF_AGEY_OBS_MINUS_INV_MAX)) + + + + + +###################################################################### +### DEMOGRAPHICS CONSISTENCY: SEX / PLOT +###################################################################### +## Contingency table +OBSERVATIONAL_OUT_03_QC_02_PLOT_SEX_01 <- ggplot(dat_observational_inventory_demographics_fulljoin %>% + count(country, + sex.obs, sex.inv, sex_obs_inv_match), + aes(x = sex.inv, y = sex.obs)) + + geom_tile(aes(fill = sex_obs_inv_match), + color = "#000000FF", + show.legend = FALSE) + + geom_text(aes(label = n)) + + + labs(x = "Sex as reported at INVENTORY phase", + y = "Sex as reported at OBSERVATIONAL phase") + + + scale_fill_manual(values = c("TRUE" = "#00FF007F", + "FALSE" = "#FF00007F"), + na.value = "#D3D3D37F", + guide = "none") + + + facet_wrap(~country) + + theme_bw() + + theme(legend.position = "bottom") + +print(OBSERVATIONAL_OUT_03_QC_02_PLOT_SEX_01) + + + + + +###################################################################### +### DEMOGRAPHICS CONSISTENCY: AGE / PLOT +###################################################################### +## Scatterplot +OBSERVATIONAL_OUT_03_QC_02_PLOT_AGEY_01 <- ggplot(dat_observational_inventory_demographics_fulljoin, + aes(x = agey.inv, y = agey.obs)) + + geom_point(aes(color = agey_obs_inv_match)) + + + labs(x = "Age as reported at INVENTORY phase", + y = "Age as reported at OBSERVATIONAL phase", + color = "Data consistency") + + + scale_color_manual(values = c("TRUE" = "#00FF007F", + "FALSE" = "#FF00007F"), + labels = c("TRUE" = "OK", + "FALSE" = "NOK"), + na.value = "#D3D3D37F") + + + facet_wrap(~country) + + theme_bw() + + theme(legend.position = "bottom") + +print(OBSERVATIONAL_OUT_03_QC_02_PLOT_AGEY_01) + +## Histogram +OBSERVATIONAL_OUT_03_QC_02_PLOT_AGEY_02 <- ggplot(dat_observational_inventory_demographics_fulljoin, + aes(x = agey.obs - agey.inv)) + + geom_histogram(aes(fill = agey_obs_inv_match), binwidth = 1) + + geom_vline(xintercept = ACCEPTABLE_DIFF_AGEY_OBS_MINUS_INV_MAX, + lty = "dashed", + alpha = 0.50) + + geom_vline(xintercept = ACCEPTABLE_DIFF_AGEY_OBS_MINUS_INV_MIN, + lty = "dashed", + alpha = 0.50) + + + labs(x = "Age as reported at INVENTORY phase", + y = "Age as reported at OBSERVATIONAL phase", + fill = "Data consistency") + + + scale_fill_manual(values = c("TRUE" = "#00FF007F", + "FALSE" = "#FF00007F"), + labels = c("TRUE" = "OK", + "FALSE" = "NOK"), + na.value = "#D3D3D37F") + + + facet_grid(dob_yn~country, + scales = "free", + labeller = "label_both") + + theme_bw() + + theme(legend.position = "bottom") + +print(OBSERVATIONAL_OUT_03_QC_02_PLOT_AGEY_02) + + + + + +###################################################################### +### WRITE DATA AND PLOTS TO OUTPUT DIRECTORY +###################################################################### +## Name of output filess +# Name of output data table file +OBSERVATIONAL_OUT_03_QC_02_FILENAME <- paste0("OBSERVATIONAL_OUT_03_QC_02", + "_country-", + paste(unique(dat_observational_inventory_demographics_fulljoin$country), collapse = "-"), + "_timestamp-", + strftime(Sys.time(), format = "%Y%m%d_%H%M%S"), + ".pdf") + +# Name of output graph files +OBSERVATIONAL_OUT_03_QC_02_PLOTNAME_SEX_01 <- paste0("OBSERVATIONAL_OUT_03_QC_02_SEX_PLOT_01", + "_country-", + paste(unique(dat_observational_inventory_demographics_fulljoin$country), collapse = "-"), + "_timestamp-", + strftime(Sys.time(), format = "%Y%m%d_%H%M%S"), + ".pdf") +OBSERVATIONAL_OUT_03_QC_02_PLOTNAME_AGEY_01 <- paste0("OBSERVATIONAL_OUT_03_QC_02_AGEY_PLOT_01", + "_country-", + paste(unique(dat_observational_inventory_demographics_fulljoin$country), collapse = "-"), + "_timestamp-", + strftime(Sys.time(), format = "%Y%m%d_%H%M%S"), + ".pdf") +OBSERVATIONAL_OUT_03_QC_02_PLOTNAME_AGEY_02 <- paste0("OBSERVATIONAL_OUT_03_QC_02_AGEY_PLOT_02", + "_country-", + paste(unique(dat_observational_inventory_demographics_fulljoin$country), collapse = "-"), + "_timestamp-", + strftime(Sys.time(), format = "%Y%m%d_%H%M%S"), + ".pdf") + +## Write to output files +# The graphs +ggsave(filename = OBSERVATIONAL_OUT_03_QC_02_PLOTNAME_SEX_01, + path = "02_OBSERVATIONAL/outputs/", + plot = OBSERVATIONAL_OUT_03_QC_02_PLOT_SEX_01, + width = 8, + height = 8) + +ggsave(filename = OBSERVATIONAL_OUT_03_QC_02_PLOTNAME_AGEY_01, + path = "02_OBSERVATIONAL/outputs/", + plot = OBSERVATIONAL_OUT_03_QC_02_PLOT_AGEY_01, + width = 8, + height = 8) + +ggsave(filename = OBSERVATIONAL_OUT_03_QC_02_PLOTNAME_AGEY_02, + path = "02_OBSERVATIONAL/outputs/", + plot = OBSERVATIONAL_OUT_03_QC_02_PLOT_AGEY_02, + width = 8, + height = 8) + +# The data +write.table(dat_observational_inventory_demographics_fulljoin, + file = paste0("./02_OBSERVATIONAL/outputs/", + OBSERVATIONAL_OUT_03_QC_02_FILENAME), + sep = ",", + dec = ".", + quote = TRUE, + col.names = TRUE, + row.names = FALSE) @@ -87,7 +257,9 @@ tmp = dat_observational_curated %>% ###################################################################### ### CLEANUP ###################################################################### -rm() +rm(OBSERVATIONAL_OUT_03_QC_02_FILENAME, + OBSERVATIONAL_OUT_03_QC_02_PLOTNAME_SEX_01, + OBSERVATIONAL_OUT_03_QC_02_PLOTNAME_AGEY_01, OBSERVATIONAL_OUT_03_QC_02_PLOTNAME_AGEY_02)