From eefbe5f4cc5af04c369c15e74eb30e48044b9881 Mon Sep 17 00:00:00 2001
From: Thomas  OBADIA <tobadia@ml24-0166.corp.pasteur.fr>
Date: Wed, 4 Dec 2024 17:35:03 +0100
Subject: [PATCH] Add a script as step 03 in the observational project to run
 QC rules on the curated dataset. This moves the merging operation with
 inventory data, which is also a form of QC, into a subsequent, nested QC
 script.

---
 ..._of_participants_for_observational_study.R |  2 +-
 ...VATIONAL_03_QC_01_curated_data_integrity.R | 20 +++++
 ...servational_and_inventory_data_integrity.R | 82 +++++++++++++++++++
 ...BSERVATIONAL_03_merge_inventory_metadata.R | 36 --------
 4 files changed, 103 insertions(+), 37 deletions(-)
 create mode 100644 02_OBSERVATIONAL/OBSERVATIONAL_03_QC_01_curated_data_integrity.R
 create mode 100644 02_OBSERVATIONAL/OBSERVATIONAL_03_QC_02_merging_observational_and_inventory_data_integrity.R
 delete mode 100644 02_OBSERVATIONAL/OBSERVATIONAL_03_merge_inventory_metadata.R

diff --git a/01_INVENTORY/INVENTORY_05_generate_list_of_participants_for_observational_study.R b/01_INVENTORY/INVENTORY_05_generate_list_of_participants_for_observational_study.R
index 15fa9b7..d032e13 100644
--- a/01_INVENTORY/INVENTORY_05_generate_list_of_participants_for_observational_study.R
+++ b/01_INVENTORY/INVENTORY_05_generate_list_of_participants_for_observational_study.R
@@ -1,4 +1,4 @@
-## INVENTORY_05_select_list_function.R
+## INVENTORY_05_generate_list_of_participants_for_observational_study.R
 ## Date     : 2024/02/02
 ## Author   : Eliharintsoa Rajaoranimirana, Thomas Obadia
 ##
diff --git a/02_OBSERVATIONAL/OBSERVATIONAL_03_QC_01_curated_data_integrity.R b/02_OBSERVATIONAL/OBSERVATIONAL_03_QC_01_curated_data_integrity.R
new file mode 100644
index 0000000..f3b7fa1
--- /dev/null
+++ b/02_OBSERVATIONAL/OBSERVATIONAL_03_QC_01_curated_data_integrity.R
@@ -0,0 +1,20 @@
+## OBSERVATIONAL_03_QC_01_curated_data_integrity.R
+## Date     : 2024/12/04
+## Author   : Thomas Obadia
+##
+## This script processes the curated dataset from 
+## OBSERVATIONAL_02_curate_REDCap_raw_data.R and applies a series of 
+## QC rules. 
+## It returns a distinct dataset with columns corresponding to the 
+## outcome of each QC rule. Any 'TRUE' in these columns should warrant
+## further investigation and clarification by the study team.
+######################################################################
+
+
+
+
+
+######################################################################
+### SOURCE THE DATA
+######################################################################
+source("./02_OBSERVATIONAL/OBSERVATIONAL_02_curate_REDCap_raw_data.R")
\ No newline at end of file
diff --git a/02_OBSERVATIONAL/OBSERVATIONAL_03_QC_02_merging_observational_and_inventory_data_integrity.R b/02_OBSERVATIONAL/OBSERVATIONAL_03_QC_02_merging_observational_and_inventory_data_integrity.R
new file mode 100644
index 0000000..0f96399
--- /dev/null
+++ b/02_OBSERVATIONAL/OBSERVATIONAL_03_QC_02_merging_observational_and_inventory_data_integrity.R
@@ -0,0 +1,82 @@
+## OBSERVATIONAL_03_QC_02_merging_observational_and_inventory_data_integrity.R
+## Date     : 2024/10/17
+## Author   : Thomas Obadia
+##
+## At the population inventory phase, many descriptors including 
+## GPS coordinates of houses were taken.
+## The subset of individuals enrolled in the observational study 
+## will make use of these metadata to link with epidemiological and 
+## clinical data such as lab results, malaria prevalence etc.
+##
+## This script will amend the data from the observational databases
+## with descriptors from the inventory databases, using the ID 
+## allocated at inventory phase.
+######################################################################
+
+
+
+
+
+######################################################################
+### SOURCE THE DATABASES
+######################################################################
+if (!exists("DATA_EXTRACT_IS_RECENT_OBS") || !DATA_EXTRACT_IS_RECENT_OBS) {
+  cat("Databases are outdated and will be dumped again.\n")
+  source("./01_INVENTORY/INVENTORY_02_list_all_inventory_participants.R")
+  source("./01_INVENTORY/INVENTORY_03_list_all_inventory_houses.R")
+  source("./02_OBSERVATIONAL/OBSERVATIONAL_01_dump_REDCap_database.R")
+}
+
+
+
+
+
+######################################################################
+### MERGE INVENTORY AND OBSERVATIONAL DATA
+######################################################################
+### The list of individuals from the inventory phase is stored in the
+### inventory_list_p table. It merely contains the CensusID (which 
+### encodes the country, cluster, house, household and subject), 
+### as well as age and gender.
+### As part of the observational study, the same data was collected and
+### *should* report the CensusID when it was available.
+### This section will confront demographics from both studies, and
+### explore if reconciling these two cross-sectional datasets is 
+### feasible.
+
+## In the observational data, record_id differs across countries:
+##   - Ethiopia used consecutive autonumbering
+##   - Madagascar used censusid
+## Check that censusid is actually redundant with record_id in Madagascar
+dat_observational_curated %>% 
+  mutate(record_id_is_censusid = (record_id == censusid)) %>% 
+  count(country, record_id_is_censusid, 
+        .drop = FALSE)
+
+tmp = dat_observational_curated %>% 
+  select(censusid, consent, sex, agey) %>% 
+  # REDCap labels were translated in Madagascar. 
+  # Handle it here for now; should this be done upstream, at the curation stage?
+  mutate(consent = plyr::mapvalues(x    = consent, 
+                                   from = c("Oui", "Non"), 
+                                   to   = c("Yes", "No")), 
+         sex     = plyr::mapvalues(x    = sex, 
+                                   from = c("Féminin", "Masculin"), 
+                                   to   = c("Female", "Male"))) %>% 
+  full_join(inventory_list_p %>% 
+              select(censusid, sex, agey), 
+            by = join_by(censusid == censusid), 
+            suffix = c(".obs", ".inv")) %>% 
+  filter(consent == "Yes") %>% 
+  separate_wider_regex(cols = censusid, 
+                       patterns = c(country       = "^(?:E|M)", 
+                                    "-", 
+                                    clusterid     = "\\d{2}", 
+                                    "-", 
+                                    "H", 
+                                    houseid       = "\\d{3}", 
+                                    "-", 
+                                    nested_hhid   = "\\d{2}", 
+                                    "-", 
+                                    nested_subjid = "\\d{2}"), too_few = "debug")
+  
diff --git a/02_OBSERVATIONAL/OBSERVATIONAL_03_merge_inventory_metadata.R b/02_OBSERVATIONAL/OBSERVATIONAL_03_merge_inventory_metadata.R
deleted file mode 100644
index f112c0c..0000000
--- a/02_OBSERVATIONAL/OBSERVATIONAL_03_merge_inventory_metadata.R
+++ /dev/null
@@ -1,36 +0,0 @@
-## OBSERVATIONAL_02_merge_inventory_metadata.R
-## Date     : 2024/10/17
-## Author   : Thomas Obadia
-##
-## At the population inventory phase, many descriptors including 
-## GPS coordinates of houses were taken.
-## The subset of individuals enrolled in the observational study 
-## will make use of these metadata to link with epidemiological and 
-## clinical data such as lab results, malaria prevalence etc.
-##
-## This script will amend the data from the observational databases
-## with descriptors from the inventory databases, using the ID 
-## allocated at inventory phase.
-######################################################################
-
-
-
-
-
-######################################################################
-### SOURCE THE DATABASES
-######################################################################
-if (!exists("DATA_EXTRACT_IS_RECENT_OBS") || !DATA_EXTRACT_IS_RECENT_OBS) {
-  cat("Databases are outdated and will be dumped again.\n")
-  source("./01_INVENTORY/INVENTORY_02_list_all_inventory_participants.R")
-  source("./01_INVENTORY/INVENTORY_03_list_all_inventory_houses.R")
-  source("./02_OBSERVATIONAL/OBSERVATIONAL_01_dump_REDCap_database.R")
-}
-
-
-
-
-
-######################################################################
-### 
-######################################################################
-- 
GitLab