Skip to content
Snippets Groups Projects
Commit 6b75b2df authored by Thomas  OBADIA's avatar Thomas OBADIA :speech_balloon:
Browse files

Implement QC checks based on merging OBS and INV datasets together to assess...

Implement QC checks based on merging OBS and INV datasets together to assess data consistency for demographics.
parent ba9c6548
No related branches found
No related tags found
No related merge requests found
......@@ -91,3 +91,18 @@ SEED <- 12345
# - FALSE: curation rule will not be enforced (default)
CURATE_DATA_RECALCULATE_AGEY_ETHIOPIA <- TRUE
CURATE_DATA_RECALCULATE_AGEY_MADAGASCAR <- FALSE
######################################################################
### ACCEPTABLE DIFFERENCES
######################################################################
## Global flag defining the *largest* acceptable difference, in years,
## for the reported age between observational and inventory studies.
ACCEPTABLE_DIFF_AGEY_OBS_MINUS_INV_MIN <- 3
## Global flag defining the *smallest* acceptable difference, in years,
## for the reported age between observational and inventory studies.
ACCEPTABLE_DIFF_AGEY_OBS_MINUS_INV_MAX <- 0
......@@ -24,7 +24,7 @@ if (!exists("DATA_EXTRACT_IS_RECENT_OBS") || !DATA_EXTRACT_IS_RECENT_OBS) {
cat("Databases are outdated and will be dumped again.\n")
source("./01_INVENTORY/INVENTORY_02_list_all_inventory_participants.R")
source("./01_INVENTORY/INVENTORY_03_list_all_inventory_houses.R")
source("./02_OBSERVATIONAL/OBSERVATIONAL_01_dump_REDCap_database.R")
source("./02_OBSERVATIONAL/OBSERVATIONAL_03_QC_01_curated_data_integrity.R")
}
......@@ -53,8 +53,14 @@ dat_observational_curated %>%
count(country, record_id_is_censusid,
.drop = FALSE)
tmp = dat_observational_curated %>%
select(censusid, consent, sex, agey) %>%
## Check for consistency of demographics across INVENTORY and OBSERVATIONAL data
## NOTE: This is restricted to data for which NO QC FLAG is raised.
dat_observational_inventory_demographics_fulljoin <- dat_observational_curated_QC %>%
filter(!QC_any_rule_is_FALSE) %>%
select(censusid, consent,
sex,
dob_yn, agey) %>%
# REDCap labels were translated in Madagascar.
# Handle it here, before it's maybe handled before at the curation stage?
mutate(consent = plyr::mapvalues(x = consent,
......@@ -62,12 +68,17 @@ tmp = dat_observational_curated %>%
to = c("Yes", "No")),
sex = plyr::mapvalues(x = sex,
from = c("Féminin", "Masculin"),
to = c("Female", "Male"))) %>%
to = c("Female", "Male")),
dob_yn = plyr::mapvalues(x = dob_yn,
from = c("Oui", "Non"),
to = c("Yes", "No"))) %>%
full_join(inventory_list_p %>%
select(censusid, sex, agey),
by = join_by(censusid == censusid),
suffix = c(".obs", ".inv")) %>%
filter(consent == "Yes") %>%
separate_wider_regex(cols = censusid,
patterns = c(country = "^(?:E|M)",
"-",
......@@ -78,7 +89,166 @@ tmp = dat_observational_curated %>%
"-",
nested_hhid = "\\d{2}",
"-",
nested_subjid = "\\d{2}"), too_few = "debug")
nested_subjid = "\\d{2}")) %>%
# Compare agey and sex columns from INVENTORY and OBSERVATIONAL
mutate(sex_obs_inv_match = (sex.obs == sex.inv),
agey_obs_inv_match = ((agey.obs - agey.inv) <= ACCEPTABLE_DIFF_AGEY_OBS_MINUS_INV_MIN &
(agey.obs - agey.inv) >= ACCEPTABLE_DIFF_AGEY_OBS_MINUS_INV_MAX))
######################################################################
### DEMOGRAPHICS CONSISTENCY: SEX / PLOT
######################################################################
## Contingency table
OBSERVATIONAL_OUT_03_QC_02_PLOT_SEX_01 <- ggplot(dat_observational_inventory_demographics_fulljoin %>%
count(country,
sex.obs, sex.inv, sex_obs_inv_match),
aes(x = sex.inv, y = sex.obs)) +
geom_tile(aes(fill = sex_obs_inv_match),
color = "#000000FF",
show.legend = FALSE) +
geom_text(aes(label = n)) +
labs(x = "Sex as reported at INVENTORY phase",
y = "Sex as reported at OBSERVATIONAL phase") +
scale_fill_manual(values = c("TRUE" = "#00FF007F",
"FALSE" = "#FF00007F"),
na.value = "#D3D3D37F",
guide = "none") +
facet_wrap(~country) +
theme_bw() +
theme(legend.position = "bottom")
print(OBSERVATIONAL_OUT_03_QC_02_PLOT_SEX_01)
######################################################################
### DEMOGRAPHICS CONSISTENCY: AGE / PLOT
######################################################################
## Scatterplot
OBSERVATIONAL_OUT_03_QC_02_PLOT_AGEY_01 <- ggplot(dat_observational_inventory_demographics_fulljoin,
aes(x = agey.inv, y = agey.obs)) +
geom_point(aes(color = agey_obs_inv_match)) +
labs(x = "Age as reported at INVENTORY phase",
y = "Age as reported at OBSERVATIONAL phase",
color = "Data consistency") +
scale_color_manual(values = c("TRUE" = "#00FF007F",
"FALSE" = "#FF00007F"),
labels = c("TRUE" = "OK",
"FALSE" = "NOK"),
na.value = "#D3D3D37F") +
facet_wrap(~country) +
theme_bw() +
theme(legend.position = "bottom")
print(OBSERVATIONAL_OUT_03_QC_02_PLOT_AGEY_01)
## Histogram
OBSERVATIONAL_OUT_03_QC_02_PLOT_AGEY_02 <- ggplot(dat_observational_inventory_demographics_fulljoin,
aes(x = agey.obs - agey.inv)) +
geom_histogram(aes(fill = agey_obs_inv_match), binwidth = 1) +
geom_vline(xintercept = ACCEPTABLE_DIFF_AGEY_OBS_MINUS_INV_MAX,
lty = "dashed",
alpha = 0.50) +
geom_vline(xintercept = ACCEPTABLE_DIFF_AGEY_OBS_MINUS_INV_MIN,
lty = "dashed",
alpha = 0.50) +
labs(x = "Age as reported at INVENTORY phase",
y = "Age as reported at OBSERVATIONAL phase",
fill = "Data consistency") +
scale_fill_manual(values = c("TRUE" = "#00FF007F",
"FALSE" = "#FF00007F"),
labels = c("TRUE" = "OK",
"FALSE" = "NOK"),
na.value = "#D3D3D37F") +
facet_grid(dob_yn~country,
scales = "free",
labeller = "label_both") +
theme_bw() +
theme(legend.position = "bottom")
print(OBSERVATIONAL_OUT_03_QC_02_PLOT_AGEY_02)
######################################################################
### WRITE DATA AND PLOTS TO OUTPUT DIRECTORY
######################################################################
## Name of output filess
# Name of output data table file
OBSERVATIONAL_OUT_03_QC_02_FILENAME <- paste0("OBSERVATIONAL_OUT_03_QC_02",
"_country-",
paste(unique(dat_observational_inventory_demographics_fulljoin$country), collapse = "-"),
"_timestamp-",
strftime(Sys.time(), format = "%Y%m%d_%H%M%S"),
".pdf")
# Name of output graph files
OBSERVATIONAL_OUT_03_QC_02_PLOTNAME_SEX_01 <- paste0("OBSERVATIONAL_OUT_03_QC_02_SEX_PLOT_01",
"_country-",
paste(unique(dat_observational_inventory_demographics_fulljoin$country), collapse = "-"),
"_timestamp-",
strftime(Sys.time(), format = "%Y%m%d_%H%M%S"),
".pdf")
OBSERVATIONAL_OUT_03_QC_02_PLOTNAME_AGEY_01 <- paste0("OBSERVATIONAL_OUT_03_QC_02_AGEY_PLOT_01",
"_country-",
paste(unique(dat_observational_inventory_demographics_fulljoin$country), collapse = "-"),
"_timestamp-",
strftime(Sys.time(), format = "%Y%m%d_%H%M%S"),
".pdf")
OBSERVATIONAL_OUT_03_QC_02_PLOTNAME_AGEY_02 <- paste0("OBSERVATIONAL_OUT_03_QC_02_AGEY_PLOT_02",
"_country-",
paste(unique(dat_observational_inventory_demographics_fulljoin$country), collapse = "-"),
"_timestamp-",
strftime(Sys.time(), format = "%Y%m%d_%H%M%S"),
".pdf")
## Write to output files
# The graphs
ggsave(filename = OBSERVATIONAL_OUT_03_QC_02_PLOTNAME_SEX_01,
path = "02_OBSERVATIONAL/outputs/",
plot = OBSERVATIONAL_OUT_03_QC_02_PLOT_SEX_01,
width = 8,
height = 8)
ggsave(filename = OBSERVATIONAL_OUT_03_QC_02_PLOTNAME_AGEY_01,
path = "02_OBSERVATIONAL/outputs/",
plot = OBSERVATIONAL_OUT_03_QC_02_PLOT_AGEY_01,
width = 8,
height = 8)
ggsave(filename = OBSERVATIONAL_OUT_03_QC_02_PLOTNAME_AGEY_02,
path = "02_OBSERVATIONAL/outputs/",
plot = OBSERVATIONAL_OUT_03_QC_02_PLOT_AGEY_02,
width = 8,
height = 8)
# The data
write.table(dat_observational_inventory_demographics_fulljoin,
file = paste0("./02_OBSERVATIONAL/outputs/",
OBSERVATIONAL_OUT_03_QC_02_FILENAME),
sep = ",",
dec = ".",
quote = TRUE,
col.names = TRUE,
row.names = FALSE)
......@@ -87,7 +257,9 @@ tmp = dat_observational_curated %>%
######################################################################
### CLEANUP
######################################################################
rm()
rm(OBSERVATIONAL_OUT_03_QC_02_FILENAME,
OBSERVATIONAL_OUT_03_QC_02_PLOTNAME_SEX_01,
OBSERVATIONAL_OUT_03_QC_02_PLOTNAME_AGEY_01, OBSERVATIONAL_OUT_03_QC_02_PLOTNAME_AGEY_02)
......
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment