Skip to content
Snippets Groups Projects
Commit 00bff6e8 authored by Thomas  OBADIA's avatar Thomas OBADIA :speech_balloon:
Browse files

Prepare QC rules checking uniqueness and validity of all IDs across the...

Prepare QC rules checking uniqueness and validity of all IDs across the collected data in the observational study. More QC rules will likely come in the future.
parent 6a921e36
No related branches found
No related tags found
No related merge requests found
## OBSERVATIONAL_03_QC_curated_data.R ## OBSERVATIONAL_03_QC_01_curated_data_integrity.R
## Date : 2024/12/04 ## Date : 2024/12/04
## Author : Thomas Obadia ## Author : Thomas Obadia
## ##
...@@ -17,4 +17,270 @@ ...@@ -17,4 +17,270 @@
###################################################################### ######################################################################
### SOURCE THE DATA ### SOURCE THE DATA
###################################################################### ######################################################################
source("./02_OBSERVATIONAL/OBSERVATIONAL_02_curate_REDCap_raw_data.R") source("./02_OBSERVATIONAL/OBSERVATIONAL_02_curate_REDCap_raw_data.R")
\ No newline at end of file
######################################################################
### QC RULES
######################################################################
## List of variables involved in QC investigations
## Note that some variables are repeated, because they are used for
## multiple QC rules.
QC_VARS <- c(
# Unique REDCap identifier
"record_id",
# ScreeningID
"screeningid",
# CensusID
"censusid",
# HouseID
"censusid", "hid",
# SubjectID
"uniqueid", "uniqueid2",
# Lab SampleID
"lb_samplecode", "uniqueid", "uniqueid2"
)
## Calculate QC rules triggers
dat_observational_curated_QC <- dat_observational_curated %>%
## ScreeningID
# Validity of ScreeningID
# (Need to account for ill-formed record_id in Madagascar as it did
# not use auto numbering)
mutate(QC_screeningid_is_valid = (grepl(pattern = "^(?:E|M)-SCR-(?:0000[1-9]|000[1-9][0-9]|00[1-9][0-9]{2}|0[1-9][0-9]{3}|[1-9][0-9]{4})$",
x = screeningid) |
grepl(pattern = "^(?:E|M)-SCR-(?:E|M)-(?:0[1-9]|[1-2][0-9]|3[0-6])-H(?:00[1-9]|0[1-9][0-9]|1[0-9]{2}|200)-0[1-5]-(?:0[1-9]|1[0-5])$",
x = screeningid))) %>%
# ScreeningID is unique
group_by(screeningid) %>%
mutate(QC_screeningid_is_unique = case_when(is.na(screeningid) ~ TRUE,
!is.na(screeningid) ~ (n() == 1),
.default = NA)) %>%
ungroup() %>%
## CensusID
# Validity of CensusID
mutate(QC_censusid_is_valid = grepl(pattern = "^(?:E|M)-(0[1-9]|[1-2][0-9]|3[0-6])-H(00[1-9]|0[1-9][0-9]|1[0-9]{2}|200)-0[1-5]-(0[1-9]|1[0-5])$",
x = censusid)) %>%
# CensusID is unique
group_by(censusid) %>%
mutate(QC_censusid_is_unique = case_when(is.na(censusid) ~ TRUE,
!is.na(censusid) ~ (n() == 1),
.default = NA)) %>%
ungroup() %>%
## HouseID
# HouseID is valid
mutate(QC_hid_is_valid = case_when(is.na(hid) ~ TRUE,
!is.na(hid) ~ grepl(pattern = "H?(00[1-9]|0[1-9][0-9]|1[0-9]{2}|200)",
x = hid),
.default = NA)) %>%
# HouseID nested in censusid is identical to hid
separate_wider_regex(col = censusid,
patterns = c(".*",
"H",
censusid_hid = "\\d{3}",
".*"),
cols_remove = FALSE) %>%
mutate(QC_censusid_hid_is_identical_to_hid = case_when(is.na(censusid_hid) & is.na(hid) ~ TRUE,
is.na(censusid_hid) & !is.na(hid) ~ FALSE,
!is.na(censusid_hid) & is.na(hid) ~ FALSE,
!is.na(censusid_hid) & !is.na(hid) ~ (censusid_hid == hid),
.default = NA)) %>%
select(-censusid_hid) %>%
## SubjectID
# Unique SubjectID matches across the two corresponding fields
mutate(QC_uniqueid_is_identical_to_uniqueid2 = case_when(is.na(uniqueid) & is.na(uniqueid2) ~ TRUE,
is.na(uniqueid) & !is.na(uniqueid2) ~ FALSE,
!is.na(uniqueid) & is.na(uniqueid2) ~ FALSE,
!is.na(uniqueid) & !is.na(uniqueid2) ~ (uniqueid == uniqueid2),
.default = NA)) %>%
# Unique SubjectID is valid
mutate(QC_uniqueid_is_valid = case_when(is.na(uniqueid) ~ TRUE,
!is.na(uniqueid) ~ grepl(pattern = "^(?:E|M)-(?:0[1-9]|[1-2][0-9]|3[0-6])-(?:00[1-9]|0[1-9][0-9]|[1-2][0-9]{2}|300)$",
x = uniqueid),
.default = NA)) %>%
mutate(QC_uniqueid2_is_valid = case_when(is.na(uniqueid2) ~ TRUE,
!is.na(uniqueid2) ~ grepl(pattern = "^(?:E|M)-(?:0[1-9]|[1-2][0-9]|3[0-6])-(?:00[1-9]|0[1-9][0-9]|[1-2][0-9]{2}|300)$",
x = uniqueid2),
.default = NA)) %>%
# Unique SubjectID is actually unique
group_by(uniqueid) %>%
mutate(QC_uniqueid_is_unique = case_when(is.na(uniqueid) ~ TRUE,
!is.na(uniqueid) ~ (n() == 1),
.default = NA)) %>%
ungroup() %>%
group_by(uniqueid2) %>%
mutate(QC_uniqueid2_is_unique = case_when(is.na(uniqueid2) ~ TRUE,
!is.na(uniqueid2) ~ (n() == 1),
.default = NA)) %>%
ungroup() %>%
## Lam SampleID
# Lab SampleID is valid
mutate(QC_lb_samplecode_is_unique = case_when(is.na(lb_samplecode) ~ TRUE,
!is.na(lb_samplecode) ~ grepl(pattern = "^(?:E|M)-(?:0[1-9]|[1-2][0-9]|3[0-6])-(?:00[1-9]|0[1-9][0-9]|[1-2][0-9]{2}|300)-OB$",
x = lb_samplecode),
.default = NA)) %>%
# Lab SampleID is unique
group_by(lb_samplecode) %>%
mutate(QC_lb_samplecode_is_unique = case_when(is.na(lb_samplecode) ~ TRUE,
!is.na(lb_samplecode) ~ (n() == 1),
.default = NA)) %>%
ungroup() %>%
# Lab SampleID matches SubjectID
mutate(QC_lb_samplecode_matches_uniqueid = case_when(is.na(lb_samplecode) & is.na(uniqueid) ~ TRUE,
is.na(lb_samplecode) & !is.na(uniqueid) ~ FALSE,
!is.na(lb_samplecode) & is.na(uniqueid) ~ FALSE,
!is.na(lb_samplecode) & !is.na(uniqueid) ~ gsub(pattern = "-OB",
replacement = "",
x = lb_samplecode) == uniqueid,
.default = NA),
QC_lb_samplecode_matches_uniqueid2 = case_when(is.na(lb_samplecode) & is.na(uniqueid2) ~ TRUE,
is.na(lb_samplecode) & !is.na(uniqueid2) ~ FALSE,
!is.na(lb_samplecode) & is.na(uniqueid2) ~ FALSE,
!is.na(lb_samplecode) & !is.na(uniqueid2) ~ gsub(pattern = "-OB",
replacement = "",
x = lb_samplecode) == uniqueid2,
.default = NA)) %>%
## General flag for any QC rule triggered
mutate(QC_any_rule_is_FALSE = !if_all(starts_with("QC_"))) %>%
select(record_id,
QC_any_rule_is_FALSE, starts_with("QC_"),
everything())
## Summary of queries
# Full list
observational_curated_QC_summary_full <- dat_observational_curated_QC %>%
select(country, starts_with("QC_")) %>%
pivot_longer(cols = starts_with("QC_")) %>%
count(country, name, value) %>%
pivot_wider(names_from = name,
values_from = n,
values_fill = 0)
# Restricted to participants consenting to study
observational_curated_QC_summary_consentonly <- dat_observational_curated_QC %>%
filter(consent %in% c("Yes", "Oui")) %>%
select(country, starts_with("QC_")) %>%
pivot_longer(cols = starts_with("QC_")) %>%
count(country, name, value) %>%
pivot_wider(names_from = name,
values_from = n,
values_fill = 0)
######################################################################
### WRITE QC DATA TO OUTPUT DIRECTORY
######################################################################
## Name of output files
# SUMMARY files contain *summary counts* of QC rules
OBSERVATIONAL_OUT_03_QC_01_FILENAME_SUMMARY_FULL <- paste0("OBSERVATIONAL_OUT_03_QC_01_SUMMARY_FULL",
"_country-",
paste(unique(dat_observational_curated_QC$country), collapse = "-"),
"_timestamp-",
strftime(Sys.time(), format = "%Y%m%d_%H%M%S"),
".csv")
OBSERVATIONAL_OUT_03_QC_01_FILENAME_SUMMARY_CONSENTONLY <- paste0("OBSERVATIONAL_OUT_03_QC_01_SUMMARY_CONSENTONLY",
"_country-",
paste(unique(dat_observational_curated_QC$country), collapse = "-"),
"_timestamp-",
strftime(Sys.time(), format = "%Y%m%d_%H%M%S"),
".csv")
# CONTENT files contain the corresponding *participant-level database content*
OBSERVATIONAL_OUT_03_QC_01_FILENAME_CONTENT_FULL <- paste0("OBSERVATIONAL_OUT_03_QC_01_CONTENT_FULL",
"_country-",
paste(unique(dat_observational_curated_QC$country), collapse = "-"),
"_timestamp-",
strftime(Sys.time(), format = "%Y%m%d_%H%M%S"),
".csv")
OBSERVATIONAL_OUT_03_QC_01_FILENAME_CONTENT_CONSENTONLY <- paste0("OBSERVATIONAL_OUT_03_QC_01_CONTENT_CONSENTONLY",
"_country-",
paste(unique(dat_observational_curated_QC$country), collapse = "-"),
"_timestamp-",
strftime(Sys.time(), format = "%Y%m%d_%H%M%S"),
".csv")
## Write to output files
# Summary tables
write.table(observational_curated_QC_summary_full,
file = paste0("./02_OBSERVATIONAL/outputs/",
OBSERVATIONAL_OUT_03_QC_01_FILENAME_SUMMARY_FULL),
sep = ",",
dec = ".",
quote = TRUE,
col.names = TRUE,
row.names = FALSE)
write.table(observational_curated_QC_summary_consentonly,
file = paste0("./02_OBSERVATIONAL/outputs/",
OBSERVATIONAL_OUT_03_QC_01_FILENAME_SUMMARY_CONSENTONLY),
sep = ",",
dec = ".",
quote = TRUE,
col.names = TRUE,
row.names = FALSE)
# Content tables
write.table(dat_observational_curated_QC,
file = paste0("./02_OBSERVATIONAL/outputs/",
OBSERVATIONAL_OUT_03_QC_01_FILENAME_CONTENT_FULL),
sep = ",",
dec = ".",
quote = TRUE,
col.names = TRUE,
row.names = FALSE)
write.table(dat_observational_curated_QC %>%
filter(consent %in% c("Yes", "Oui")) %>%
select(record_id, all_of(QC_VARS),
QC_any_rule_is_FALSE, starts_with("QC_")),
file = paste0("./02_OBSERVATIONAL/outputs/",
OBSERVATIONAL_OUT_03_QC_01_FILENAME_CONTENT_CONSENTONLY),
sep = ",",
dec = ".",
quote = TRUE,
col.names = TRUE,
row.names = FALSE)
######################################################################
### CLEANUP
######################################################################
rm(QC_VARS,
OBSERVATIONAL_OUT_03_QC_01_FILENAME_SUMMARY_FULL,
OBSERVATIONAL_OUT_03_QC_01_FILENAME_SUMMARY_CONSENTONLY,
OBSERVATIONAL_OUT_03_QC_01_FILENAME_CONTENT_FULL,
OBSERVATIONAL_OUT_03_QC_01_FILENAME_CONTENT_CONSENTONLY)
######################################################################
### UPDATE DATA_EXTRACT_IS_RECENT_OBS
######################################################################
DATA_EXTRACT_IS_RECENT_OBS <- as.logical(difftime(time1 = Sys.Date(),
time2 = as.Date(ifelse(exists("DATA_EXTRACT_TS_OBS"), DATA_EXTRACT_TS_OBS, DATA_EXTRACT_TS_DEFAULT)),
units = "days") <= DATA_EXTRACT_EXPIRY_TIME_D)
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment