From eefbe5f4cc5af04c369c15e74eb30e48044b9881 Mon Sep 17 00:00:00 2001 From: Thomas OBADIA <tobadia@ml24-0166.corp.pasteur.fr> Date: Wed, 4 Dec 2024 17:35:03 +0100 Subject: [PATCH] Add a script as step 03 in the observational project to run QC rules on the curated dataset. This moves the merging operation with inventory data, which is also a form of QC, into a subsequent, nested QC script. --- ..._of_participants_for_observational_study.R | 2 +- ...VATIONAL_03_QC_01_curated_data_integrity.R | 20 +++++ ...servational_and_inventory_data_integrity.R | 82 +++++++++++++++++++ ...BSERVATIONAL_03_merge_inventory_metadata.R | 36 -------- 4 files changed, 103 insertions(+), 37 deletions(-) create mode 100644 02_OBSERVATIONAL/OBSERVATIONAL_03_QC_01_curated_data_integrity.R create mode 100644 02_OBSERVATIONAL/OBSERVATIONAL_03_QC_02_merging_observational_and_inventory_data_integrity.R delete mode 100644 02_OBSERVATIONAL/OBSERVATIONAL_03_merge_inventory_metadata.R diff --git a/01_INVENTORY/INVENTORY_05_generate_list_of_participants_for_observational_study.R b/01_INVENTORY/INVENTORY_05_generate_list_of_participants_for_observational_study.R index 15fa9b7..d032e13 100644 --- a/01_INVENTORY/INVENTORY_05_generate_list_of_participants_for_observational_study.R +++ b/01_INVENTORY/INVENTORY_05_generate_list_of_participants_for_observational_study.R @@ -1,4 +1,4 @@ -## INVENTORY_05_select_list_function.R +## INVENTORY_05_generate_list_of_participants_for_observational_study.R ## Date : 2024/02/02 ## Author : Eliharintsoa Rajaoranimirana, Thomas Obadia ## diff --git a/02_OBSERVATIONAL/OBSERVATIONAL_03_QC_01_curated_data_integrity.R b/02_OBSERVATIONAL/OBSERVATIONAL_03_QC_01_curated_data_integrity.R new file mode 100644 index 0000000..f3b7fa1 --- /dev/null +++ b/02_OBSERVATIONAL/OBSERVATIONAL_03_QC_01_curated_data_integrity.R @@ -0,0 +1,20 @@ +## OBSERVATIONAL_03_QC_01_curated_data_integrity.R +## Date : 2024/12/04 +## Author : Thomas Obadia +## +## This script processes the curated dataset from +## 
OBSERVATIONAL_02_curate_REDCap_raw_data.R and applies a series of +## QC rules. +## It returns a distinct dataset with columns corresponding to the +## outcome of each QC rule. Any 'TRUE' in these columns should warrant +## further investigation and clarification by the study team. +###################################################################### + + + + + +###################################################################### +### SOURCE THE DATA +###################################################################### +source("./02_OBSERVATIONAL/OBSERVATIONAL_02_curate_REDCap_raw_data.R") \ No newline at end of file diff --git a/02_OBSERVATIONAL/OBSERVATIONAL_03_QC_02_merging_observational_and_inventory_data_integrity.R b/02_OBSERVATIONAL/OBSERVATIONAL_03_QC_02_merging_observational_and_inventory_data_integrity.R new file mode 100644 index 0000000..0f96399 --- /dev/null +++ b/02_OBSERVATIONAL/OBSERVATIONAL_03_QC_02_merging_observational_and_inventory_data_integrity.R @@ -0,0 +1,82 @@ +## OBSERVATIONAL_03_QC_02_merging_observational_and_inventory_data_integrity.R +## Date : 2024/10/17 +## Author : Thomas Obadia +## +## At the population inventory phase, many descriptors including +## GPS coordinates of houses were taken. +## The subset of individuals enrolled in the observational study +## will make use of these metadata to link with epidemiological and +## clinical data such as lab results, malaria prevalence etc. +## +## This script will amend the data from the observational databases +## with descriptors from the inventory databases, using the ID +## allocated at inventory phase. 
+######################################################################
+
+
+
+
+
+######################################################################
+### SOURCE THE DATABASES
+######################################################################
+if (!exists("DATA_EXTRACT_IS_RECENT_OBS") || !DATA_EXTRACT_IS_RECENT_OBS) {
+  cat("Databases are outdated and will be dumped again.\n")
+  source("./01_INVENTORY/INVENTORY_02_list_all_inventory_participants.R")
+  source("./01_INVENTORY/INVENTORY_03_list_all_inventory_houses.R")
+  source("./02_OBSERVATIONAL/OBSERVATIONAL_01_dump_REDCap_database.R")
+  ## The merge below reads dat_observational_curated, so re-run curation
+  ## after a fresh dump. NOTE(review): confirm the curation script does
+  ## not already trigger the dump itself.
+  source("./02_OBSERVATIONAL/OBSERVATIONAL_02_curate_REDCap_raw_data.R")
+}
+
+
+
+
+
+######################################################################
+### MERGE INVENTORY AND OBSERVATIONAL DATA
+######################################################################
+### The list of individuals from the inventory phase is stored in the
+### inventory_list_p table. It merely contains the CensusID (which
+### encodes the country, cluster, house, household and subject),
+### as well as age and gender.
+### As part of the observational study, the same data was collected and
+### *should* report the CensusID when it was available.
+### This section will confront demographics from both studies, and
+### explore if reconciling these two cross-sectional datasets is
+### feasible.
+
+## In the observational data, record_id differs across countries:
+## - Ethiopia used consecutive autonumbering
+## - Madagascar used censusid
+## Check that censusid is actually redundant with record_id in Madagascar
+dat_observational_curated %>%
+  mutate(record_id_is_censusid = (record_id == censusid)) %>%
+  count(country, record_id_is_censusid, .drop = FALSE)
+
+tmp <- dat_observational_curated %>%
+  select(censusid, consent, sex, agey) %>%
+  # REDCap labels were translated in Madagascar.
+  # Handle it here for now; should this move to the curation stage instead?
+  mutate(consent = plyr::mapvalues(x = consent,
+                                   from = c("Oui", "Non"),
+                                   to = c("Yes", "No")),
+         sex = plyr::mapvalues(x = sex,
+                               from = c("Féminin", "Masculin"),
+                               to = c("Female", "Male"))) %>%
+  full_join(inventory_list_p %>% select(censusid, sex, agey),
+            by = join_by(censusid),
+            suffix = c(".obs", ".inv")) %>%
+  # NOTE(review): inventory-only rows have consent = NA and are dropped
+  # here, so the full join behaves like a left join. Confirm the intent.
+  filter(consent == "Yes") %>%
+  separate_wider_regex(cols = censusid,
+                       patterns = c(country = "^(?:E|M)", "-",
+                                    clusterid = "\\d{2}", "-", "H",
+                                    houseid = "\\d{3}", "-",
+                                    nested_hhid = "\\d{2}", "-",
+                                    nested_subjid = "\\d{2}"),
+                       too_few = "debug")
+
diff --git a/02_OBSERVATIONAL/OBSERVATIONAL_03_merge_inventory_metadata.R b/02_OBSERVATIONAL/OBSERVATIONAL_03_merge_inventory_metadata.R
deleted file mode 100644
index f112c0c..0000000
--- a/02_OBSERVATIONAL/OBSERVATIONAL_03_merge_inventory_metadata.R
+++ /dev/null
@@ -1,36 +0,0 @@
-## OBSERVATIONAL_02_merge_inventory_metadata.R
-## Date : 2024/10/17
-## Author : Thomas Obadia
-##
-## At the population inventory phase, many descriptors including
-## GPS coordinates of houses were taken.
-## The subset of individuals enrolled in the observational study
-## will make use of these metadata to link with epidemiological and
-## clinical data such as lab results, malaria prevalence etc.
-##
-## This script will amend the data from the observational databases
-## with descriptors from the inventory databases, using the ID
-## allocated at inventory phase.
-###################################################################### - - - - - -###################################################################### -### SOURCE THE DATABASES -###################################################################### -if (!exists("DATA_EXTRACT_IS_RECENT_OBS") || !DATA_EXTRACT_IS_RECENT_OBS) { - cat("Databases are outdated and will be dumped again.\n") - source("./01_INVENTORY/INVENTORY_02_list_all_inventory_participants.R") - source("./01_INVENTORY/INVENTORY_03_list_all_inventory_houses.R") - source("./02_OBSERVATIONAL/OBSERVATIONAL_01_dump_REDCap_database.R") -} - - - - - -###################################################################### -### -###################################################################### -- GitLab