diff --git a/02_OBSERVATIONAL/OBSERVATIONAL_03_QC_01_curated_data_integrity.R b/02_OBSERVATIONAL/OBSERVATIONAL_03_QC_01_curated_data_integrity.R index f3b7fa1ccb5550dd0b419602b30626e0bf7acadc..a45d4ee6d45105a86a3af17cf1d43d8075e64e71 100644 --- a/02_OBSERVATIONAL/OBSERVATIONAL_03_QC_01_curated_data_integrity.R +++ b/02_OBSERVATIONAL/OBSERVATIONAL_03_QC_01_curated_data_integrity.R @@ -1,4 +1,4 @@ -## OBSERVATIONAL_03_QC_curated_data.R +## OBSERVATIONAL_03_QC_01_curated_data_integrity.R ## Date : 2024/12/04 ## Author : Thomas Obadia ## @@ -17,4 +17,270 @@ ###################################################################### ### SOURCE THE DATA ###################################################################### -source("./02_OBSERVATIONAL/OBSERVATIONAL_02_curate_REDCap_raw_data.R") \ No newline at end of file +source("./02_OBSERVATIONAL/OBSERVATIONAL_02_curate_REDCap_raw_data.R") + + + + + +###################################################################### +### QC RULES +###################################################################### +## List of variables involved in QC investigations +## Note that some variables are repeated, because they are used for +## multiple QC rules. +QC_VARS <- c( + # Unique REDCap identifier + "record_id", + + # ScreeningID + "screeningid", + + # CensusID + "censusid", + + # HouseID + "censusid", "hid", + + # SubjectID + "uniqueid", "uniqueid2", + + # Lab SampleID + "lb_samplecode", "uniqueid", "uniqueid2" +) + +## Calculate QC rules triggers +dat_observational_curated_QC <- dat_observational_curated %>% + ## ScreeningID + # Validity of ScreeningID + # (Need to account for ill-formed record_id in Madagascar as it did + # not use auto numbering) + mutate(QC_screeningid_is_valid = (grepl(pattern = "^(?:E|M)-SCR-(?:0000[1-9]|000[1-9][0-9]|00[1-9][0-9]{2}|0[1-9][0-9]{3}|[1-9][0-9]{4})$", + x = screeningid) | + grepl(pattern = "^(?:E|M)-SCR-(?:E|M)-(?:0[1-9]|[1-2][0-9]|3[0-6])-H(?:00[1-9]|0[1-9][0-9]|1[0-9]{2}|200)-0[1-5]-(?:0[1-9]|1[0-5])$", + x = screeningid))) %>% + + # ScreeningID is unique + group_by(screeningid) %>% + mutate(QC_screeningid_is_unique = case_when(is.na(screeningid) ~ TRUE, + !is.na(screeningid) ~ (n() == 1), + .default = NA)) %>% + ungroup() %>% + + ## CensusID + # Validity of CensusID + mutate(QC_censusid_is_valid = grepl(pattern = "^(?:E|M)-(0[1-9]|[1-2][0-9]|3[0-6])-H(00[1-9]|0[1-9][0-9]|1[0-9]{2}|200)-0[1-5]-(0[1-9]|1[0-5])$", + x = censusid)) %>% + + # CensusID is unique + group_by(censusid) %>% + mutate(QC_censusid_is_unique = case_when(is.na(censusid) ~ TRUE, + !is.na(censusid) ~ (n() == 1), + .default = NA)) %>% + ungroup() %>% + + ## HouseID + # HouseID is valid + mutate(QC_hid_is_valid = case_when(is.na(hid) ~ TRUE, + !is.na(hid) ~ grepl(pattern = "H?(00[1-9]|0[1-9][0-9]|1[0-9]{2}|200)", + x = hid), + .default = NA)) %>% + + # HouseID nested in censusid is identical to hid + separate_wider_regex(col = censusid, + patterns = c(".*", + "H", + censusid_hid = "\\d{3}", + ".*"), + cols_remove = FALSE) %>% + mutate(QC_censusid_hid_is_identical_to_hid = case_when(is.na(censusid_hid) & is.na(hid) ~ TRUE, + is.na(censusid_hid) & !is.na(hid) ~ FALSE, + !is.na(censusid_hid) & is.na(hid) ~ FALSE, + !is.na(censusid_hid) & !is.na(hid) ~ (censusid_hid == hid), + .default = NA)) %>% + select(-censusid_hid) %>% + + ## SubjectID + # Unique SubjectID matches across the two corresponding fields + mutate(QC_uniqueid_is_identical_to_uniqueid2 = case_when(is.na(uniqueid) & is.na(uniqueid2) ~ TRUE, + is.na(uniqueid) & !is.na(uniqueid2) ~ FALSE, + !is.na(uniqueid) & is.na(uniqueid2) ~ FALSE, + !is.na(uniqueid) & !is.na(uniqueid2) ~ (uniqueid == uniqueid2), + .default = NA)) %>% + # Unique SubjectID is valid + mutate(QC_uniqueid_is_valid = case_when(is.na(uniqueid) ~ TRUE, + !is.na(uniqueid) ~ grepl(pattern = "^(?:E|M)-(?:0[1-9]|[1-2][0-9]|3[0-6])-(?:00[1-9]|0[1-9][0-9]|[1-2][0-9]{2}|300)$", + x = uniqueid), + .default = NA)) %>% + mutate(QC_uniqueid2_is_valid = case_when(is.na(uniqueid2) ~ TRUE, + !is.na(uniqueid2) ~ grepl(pattern = "^(?:E|M)-(?:0[1-9]|[1-2][0-9]|3[0-6])-(?:00[1-9]|0[1-9][0-9]|[1-2][0-9]{2}|300)$", + x = uniqueid2), + .default = NA)) %>% + + # Unique SubjectID is actually unique + group_by(uniqueid) %>% + mutate(QC_uniqueid_is_unique = case_when(is.na(uniqueid) ~ TRUE, + !is.na(uniqueid) ~ (n() == 1), + .default = NA)) %>% + ungroup() %>% + group_by(uniqueid2) %>% + mutate(QC_uniqueid2_is_unique = case_when(is.na(uniqueid2) ~ TRUE, + !is.na(uniqueid2) ~ (n() == 1), + .default = NA)) %>% + ungroup() %>% + + ## Lam SampleID + # Lab SampleID is valid + mutate(QC_lb_samplecode_is_unique = case_when(is.na(lb_samplecode) ~ TRUE, + !is.na(lb_samplecode) ~ grepl(pattern = "^(?:E|M)-(?:0[1-9]|[1-2][0-9]|3[0-6])-(?:00[1-9]|0[1-9][0-9]|[1-2][0-9]{2}|300)-OB$", + x = lb_samplecode), + .default = NA)) %>% + + # Lab SampleID is unique + group_by(lb_samplecode) %>% + mutate(QC_lb_samplecode_is_unique = case_when(is.na(lb_samplecode) ~ TRUE, + !is.na(lb_samplecode) ~ (n() == 1), + .default = NA)) %>% + ungroup() %>% + + # Lab SampleID matches SubjectID + mutate(QC_lb_samplecode_matches_uniqueid = case_when(is.na(lb_samplecode) & is.na(uniqueid) ~ TRUE, + is.na(lb_samplecode) & !is.na(uniqueid) ~ FALSE, + !is.na(lb_samplecode) & is.na(uniqueid) ~ FALSE, + !is.na(lb_samplecode) & !is.na(uniqueid) ~ gsub(pattern = "-OB", + replacement = "", + x = lb_samplecode) == uniqueid, + .default = NA), + QC_lb_samplecode_matches_uniqueid2 = case_when(is.na(lb_samplecode) & is.na(uniqueid2) ~ TRUE, + is.na(lb_samplecode) & !is.na(uniqueid2) ~ FALSE, + !is.na(lb_samplecode) & is.na(uniqueid2) ~ FALSE, + !is.na(lb_samplecode) & !is.na(uniqueid2) ~ gsub(pattern = "-OB", + replacement = "", + x = lb_samplecode) == uniqueid2, + .default = NA)) %>% + ## General flag for any QC rule triggered + mutate(QC_any_rule_is_FALSE = !if_all(starts_with("QC_"))) %>% + select(record_id, + QC_any_rule_is_FALSE, starts_with("QC_"), + everything()) + +## Summary of queries +# Full list +observational_curated_QC_summary_full <- dat_observational_curated_QC %>% + select(country, starts_with("QC_")) %>% + pivot_longer(cols = starts_with("QC_")) %>% + count(country, name, value) %>% + pivot_wider(names_from = name, + values_from = n, + values_fill = 0) + +# Restricted to participants consenting to study +observational_curated_QC_summary_consentonly <- dat_observational_curated_QC %>% + filter(consent %in% c("Yes", "Oui")) %>% + select(country, starts_with("QC_")) %>% + pivot_longer(cols = starts_with("QC_")) %>% + count(country, name, value) %>% + pivot_wider(names_from = name, + values_from = n, + values_fill = 0) + + + + + +###################################################################### +### WRITE QC DATA TO OUTPUT DIRECTORY +###################################################################### +## Name of output files +# SUMMARY files contain *summary counts* of QC rules +OBSERVATIONAL_OUT_03_QC_01_FILENAME_SUMMARY_FULL <- paste0("OBSERVATIONAL_OUT_03_QC_01_SUMMARY_FULL", + "_country-", + paste(unique(dat_observational_curated_QC$country), collapse = "-"), + "_timestamp-", + strftime(Sys.time(), format = "%Y%m%d_%H%M%S"), + ".csv") +OBSERVATIONAL_OUT_03_QC_01_FILENAME_SUMMARY_CONSENTONLY <- paste0("OBSERVATIONAL_OUT_03_QC_01_SUMMARY_CONSENTONLY", + "_country-", + paste(unique(dat_observational_curated_QC$country), collapse = "-"), + "_timestamp-", + strftime(Sys.time(), format = "%Y%m%d_%H%M%S"), + ".csv") + +# CONTENT files contain the corresponding *participant-level database content* +OBSERVATIONAL_OUT_03_QC_01_FILENAME_CONTENT_FULL <- paste0("OBSERVATIONAL_OUT_03_QC_01_CONTENT_FULL", + "_country-", + paste(unique(dat_observational_curated_QC$country), collapse = "-"), + "_timestamp-", + strftime(Sys.time(), format = "%Y%m%d_%H%M%S"), + ".csv") +OBSERVATIONAL_OUT_03_QC_01_FILENAME_CONTENT_CONSENTONLY <- paste0("OBSERVATIONAL_OUT_03_QC_01_CONTENT_CONSENTONLY", + "_country-", + paste(unique(dat_observational_curated_QC$country), collapse = "-"), + "_timestamp-", + strftime(Sys.time(), format = "%Y%m%d_%H%M%S"), + ".csv") + +## Write to output files +# Summary tables +write.table(observational_curated_QC_summary_full, + file = paste0("./02_OBSERVATIONAL/outputs/", + OBSERVATIONAL_OUT_03_QC_01_FILENAME_SUMMARY_FULL), + sep = ",", + dec = ".", + quote = TRUE, + col.names = TRUE, + row.names = FALSE) +write.table(observational_curated_QC_summary_consentonly, + file = paste0("./02_OBSERVATIONAL/outputs/", + OBSERVATIONAL_OUT_03_QC_01_FILENAME_SUMMARY_CONSENTONLY), + sep = ",", + dec = ".", + quote = TRUE, + col.names = TRUE, + row.names = FALSE) + +# Content tables +write.table(dat_observational_curated_QC, + file = paste0("./02_OBSERVATIONAL/outputs/", + OBSERVATIONAL_OUT_03_QC_01_FILENAME_CONTENT_FULL), + sep = ",", + dec = ".", + quote = TRUE, + col.names = TRUE, + row.names = FALSE) + +write.table(dat_observational_curated_QC %>% + filter(consent %in% c("Yes", "Oui")) %>% + select(record_id, all_of(QC_VARS), + QC_any_rule_is_FALSE, starts_with("QC_")), + file = paste0("./02_OBSERVATIONAL/outputs/", + OBSERVATIONAL_OUT_03_QC_01_FILENAME_CONTENT_CONSENTONLY), + sep = ",", + dec = ".", + quote = TRUE, + col.names = TRUE, + row.names = FALSE) + + + + + +###################################################################### +### CLEANUP +###################################################################### +rm(QC_VARS, + OBSERVATIONAL_OUT_03_QC_01_FILENAME_SUMMARY_FULL, + OBSERVATIONAL_OUT_03_QC_01_FILENAME_SUMMARY_CONSENTONLY, + OBSERVATIONAL_OUT_03_QC_01_FILENAME_CONTENT_FULL, + OBSERVATIONAL_OUT_03_QC_01_FILENAME_CONTENT_CONSENTONLY) + + + + + +###################################################################### +### UPDATE DATA_EXTRACT_IS_RECENT_OBS +###################################################################### +DATA_EXTRACT_IS_RECENT_OBS <- as.logical(difftime(time1 = Sys.Date(), + time2 = as.Date(ifelse(exists("DATA_EXTRACT_TS_OBS"), DATA_EXTRACT_TS_OBS, DATA_EXTRACT_TS_DEFAULT)), + units = "days") <= DATA_EXTRACT_EXPIRY_TIME_D)