From 8c42f1fa274f8dbf53e8e89d92e16ae50a68c47e Mon Sep 17 00:00:00 2001
From: Thomas  OBADIA <thomas.obadia@pasteur.fr>
Date: Thu, 17 Oct 2024 15:48:31 +0200
Subject: [PATCH] Fix some more issues introduced by dumping the databases with
 categorical data as labels, and a more serious issue where record_id was still
 selected when subsetting some lists even though it no longer existed.
 In addition, the censusid no longer needs to be calculated as part of the
 get_target_list_for_observational_study() function.

---
 01_INVENTORY/INVENTORY_00_R_functions.R         | 17 +++--------------
 .../INVENTORY_01_dump_REDCap_database.R         |  4 ++--
 .../INVENTORY_03_list_all_inventory_houses.R    |  5 ++---
 ...st_of_participants_for_observational_study.R |  4 ++--
 4 files changed, 9 insertions(+), 21 deletions(-)

diff --git a/01_INVENTORY/INVENTORY_00_R_functions.R b/01_INVENTORY/INVENTORY_00_R_functions.R
index 9cd9c3f..6f56732 100644
--- a/01_INVENTORY/INVENTORY_00_R_functions.R
+++ b/01_INVENTORY/INVENTORY_00_R_functions.R
@@ -90,12 +90,7 @@ get_target_list_for_observational_study <- function(x, n_target, n_backup) {
   # Order sampling list to appear grouped by house, for ease of use
   observational_list_p_main <- bind_rows(observational_list_p_main) %>% 
     arrange(country, clusterid, hid, nested_hhid, nested_subjid) %>%
-    mutate(censusid  = paste0(country, "-", 
-                              sprintf("%02d", clusterid), "-", 
-                              "H", sprintf("%03d", hid), "-", 
-                              sprintf("%02s", nested_hhid), "-", 
-                              sprintf("%02s", nested_subjid)), 
-           list_name = "main")
+    mutate(list_name = "main")
   
   # A backup list is generated after excluding people participants
   # already sampled in the main list
@@ -107,19 +102,13 @@ get_target_list_for_observational_study <- function(x, n_target, n_backup) {
                                         "n_target"      = sampling_plan$nested_backup_sampling_n, 
                                         MoreArgs        = list("x" = x %>% 
                                                                  anti_join(observational_list_p_main %>% 
-                                                                             select(record_id, 
-                                                                                    country, clusterid, hid, nested_hhid, nested_subjid))), 
+                                                                             select(country, clusterid, hid, nested_hhid, nested_subjid))), 
                                         SIMPLIFY        = FALSE)
   
   # Order sampling list to appear grouped by house, for ease of use
   observational_list_p_backup <- bind_rows(observational_list_p_backup) %>% 
     arrange(country, clusterid, hid, nested_hhid, nested_subjid) %>%
-    mutate(censusid  = paste0(country, "-", 
-                              sprintf("%02d", clusterid), "-", 
-                              "H", sprintf("%03d", hid), "-", 
-                              sprintf("%02s", nested_hhid), "-", 
-                              sprintf("%02s", nested_subjid)), 
-           list_name = "backup")
+    mutate(list_name = "backup")
   
   # Return the full list
   res <- observational_list_p_main %>% 
diff --git a/01_INVENTORY/INVENTORY_01_dump_REDCap_database.R b/01_INVENTORY/INVENTORY_01_dump_REDCap_database.R
index 6ba3730..ea8dbdf 100644
--- a/01_INVENTORY/INVENTORY_01_dump_REDCap_database.R
+++ b/01_INVENTORY/INVENTORY_01_dump_REDCap_database.R
@@ -110,8 +110,8 @@ if (!DATA_EXTRACT_IS_RECENT_INV) {
       mutate(gps_is_manual = case_when(is.na(gps_lat) & is.na(gps_lon) & !is.na(gps_lat_manual) & !is.na(gps_lon_manual) ~ TRUE, 
                                        !is.na(gps_lat) & !is.na(gps_lon) & is.na(gps_lat_manual) & is.na(gps_lon_manual) ~ FALSE), 
              gps_is_valid  = FALSE, 
-             gps_is_valid  = case_when(((country == "E" & (gps_lat >= GPS_LAT_MIN_E & gps_lat <= GPS_LAT_MAX_E & gps_lon >= GPS_LON_MIN_E & gps_lon <= GPS_LON_MAX_E)) | 
-                                          (country == "M" & (gps_lat >= GPS_LAT_MIN_M & gps_lat <= GPS_LAT_MAX_M & gps_lon >= GPS_LON_MIN_M & gps_lon <= GPS_LON_MAX_M))) ~ TRUE)) %>% 
+             gps_is_valid  = case_when(((country %in% c("E", "Ethiopia") & (gps_lat >= GPS_LAT_MIN_E & gps_lat <= GPS_LAT_MAX_E & gps_lon >= GPS_LON_MIN_E & gps_lon <= GPS_LON_MAX_E)) | 
+                                          (country %in% c("M", "Madagascar") & (gps_lat >= GPS_LAT_MIN_M & gps_lat <= GPS_LAT_MAX_M & gps_lon >= GPS_LON_MIN_M & gps_lon <= GPS_LON_MAX_M))) ~ TRUE)) %>% 
       
       # Add data source for convenience
       mutate(data_source   = RCON)
diff --git a/01_INVENTORY/INVENTORY_03_list_all_inventory_houses.R b/01_INVENTORY/INVENTORY_03_list_all_inventory_houses.R
index 7748b7e..0b0d946 100644
--- a/01_INVENTORY/INVENTORY_03_list_all_inventory_houses.R
+++ b/01_INVENTORY/INVENTORY_03_list_all_inventory_houses.R
@@ -40,11 +40,10 @@ inventory_list_h <- dat_inventory_raw %>%
   
   # Merge in the number of individuals per house-and-household
   left_join(inventory_list_p %>% 
-              group_by(record_id, country, clusterid, hid) %>% 
+              group_by(country, clusterid, hid) %>% 
               summarize(n_living_at_house = n(), 
                         .groups = "keep"), 
-            by = join_by("record_id" == "record_id", 
-                         "country"   == "country", 
+            by = join_by("country"   == "country", 
                          "clusterid" == "clusterid", 
                          "hid"       == "hid"))
 
diff --git a/01_INVENTORY/INVENTORY_05_generate_list_of_participants_for_observational_study.R b/01_INVENTORY/INVENTORY_05_generate_list_of_participants_for_observational_study.R
index 06284a3..15fa9b7 100644
--- a/01_INVENTORY/INVENTORY_05_generate_list_of_participants_for_observational_study.R
+++ b/01_INVENTORY/INVENTORY_05_generate_list_of_participants_for_observational_study.R
@@ -33,8 +33,8 @@ set.seed(seed = SEED)
 ### GENERATE SAMPLING LIST
 ######################################################################
 observational_list_p <- get_target_list_for_observational_study(x        = inventory_list_p, 
-                                                                n_target = c("E" = 220, 
-                                                                             "M" = 110), 
+                                                                n_target = c("Ethiopia"   = 220, 
+                                                                             "Madagascar" = 110), 
                                                                 n_backup = 50)
 
 
-- 
GitLab