Merge branch 'devel-refactor-subset-participant-list' into 'main'

Implement sampling from population inventory See merge request !1

Merge branch 'devel-refactor-subset-participant-list' into 'main'
3d725d71 · Thomas OBADIA · 95a3022b · 2ea2fa1f · 3d725d71 · 3d725d71
Commit 3d725d71 authored 1 year ago by Thomas OBADIA
--- a/01_INVENTORY/INVENTORY_00_R_environment.R
+++ b/01_INVENTORY/INVENTORY_00_R_environment.R
@@ -101,3 +101,13 @@ MAP_TYPE                      <- "hybrid"
 ## Layout of panels for plotting GPS maps
 MAP_PLOT_PANEL_LAYOUT_NROW    <- 3
 MAP_PLOT_PANEL_LAYOUT_NCOL    <- 3
+
+
+
+
+
+######################################################################
+### SEED VALUE
+######################################################################
+## A global seed value to be used by set.seed() calls
+SEED <- 12345
--- a/01_INVENTORY/INVENTORY_00_R_functions.R
+++ b/01_INVENTORY/INVENTORY_00_R_functions.R
+## INVENTORY_00_R_functions.R
+## Date     : 2024/02/06
+## Author   : Thomas Obadia
+##
+## Compilation of R functions developed for the 01_INVENTORY project,
+## that will be loaded for use by our analysis pipelines. The functions 
+## should be documented in an Roxygen2 fashion in case corresponding
+## *.Rd doc is generated in the future.
+##
+## Each block of funtions define a collection used for a specific task.
+## 
+## You should usually not source this file alone: it is sourced in the
+## script that handles the REDCap dump.
+######################################################################
+
+
+
+
+
+######################################################################
+### SUBSET POPULATION INVENTORY FOR OBSERVATIONAL STUDY
+######################################################################
+#' Subset a total population to a target sample size stratified by age and sex.
+#'
+#' @param x A data.frame containing the population inventory in a wide format. It should be the output of \code{INVENTORY_02_list_all_inventory_participants.R}.
+#' @param n_target The total target sample size in a cluster.
+#' @return A random sample subset from \code{x} where individuals are representative of those from the whole population in terms of age-(categorical)-and-sex-structure, within each country and cluster.
+get_target_list_for_observational_study <- function(x, n_target) {
+  
+  sampling_plan <- x %>% 
+    count(country, clusterid, sex, agey_cat, name = "group_n") %>% 
+    
+    # Target sample size in a stratified group is:
+    #   - The group size scaled-down by its proportion if the cluster is large enough
+    #   - The whole group if the cluster is too small for the total target sample size
+    group_by(country, clusterid) %>% 
+    mutate(cluster_size_n       = sum(group_n, na.rm = TRUE), 
+           cluster_target_n     = min(c(cluster_size_n, n_target)), 
+           nested_sampling_prob = ifelse(cluster_size_n <= n_target, 1, group_n / cluster_size_n), 
+           nested_sampling_n    = round(cluster_target_n * nested_sampling_prob, 0))
+  
+  # The sampling plan is now applied to the population inventory list
+  res <- mapply(get_group_subset, 
+                "country_f"     = sampling_plan$country, 
+                "clusterid_f"   = sampling_plan$clusterid, 
+                "sex_f"         = sampling_plan$sex, 
+                "agey_cat_f"    = sampling_plan$agey_cat, 
+                "n_target"      = sampling_plan$nested_sampling_n, 
+                MoreArgs        = list("x" = x), 
+                SIMPLIFY        = FALSE)
+  
+  # Order sampling list to appear grouped by house, for ease of use
+  res <- bind_rows(res) %>% 
+    arrange(country, clusterid, hid, nested_hhid, nested_subjid)
+  
+  # Return the list
+  return(res)
+}
+
+
+
+
+
+#' Randomly subset group to a given size
+#'
+#' @param x A data.frame containing the population inventory in a wide format. It should be the output of \code{INVENTORY_02_list_all_inventory_participants.R}.
+#' @param country_f The country to restrict the data to.
+#' @param clusterid_f The cluster ID to restrict the data to.
+#' @param sex_f The sex value to restrict the data to.
+#' @param agey_cat_f The age group to restrict the data to.
+#' @param n_target The target sample size.
+#' @return A subset of of the input data frame with \code{limit} rows each corresponding to a single individual.
+get_group_subset <- function(x, country_f, clusterid_f, sex_f, agey_cat_f, n_target) {
+  res <- x %>% 
+    filter(country == country_f & clusterid == clusterid_f & sex == sex_f & agey_cat == agey_cat_f) %>% 
+    # Shuffle current table to break any possible tie between sequential order and geographic location
+    sample_n(n()) %>% 
+    # Keep only target sample size
+    sample_n(min(n_target, n()))
+  
+  return(as.data.frame(res))
+}
--- a/01_INVENTORY/INVENTORY_01_dump_REDCap_database.R
+++ b/01_INVENTORY/INVENTORY_01_dump_REDCap_database.R
@@ -17,6 +17,7 @@
 ### SOURCE THE R ENVIRONMENT
 ######################################################################
 source("./01_INVENTORY/INVENTORY_00_R_environment.R")
+source("./01_INVENTORY/INVENTORY_00_R_functions.R")




--- a/01_INVENTORY/INVENTORY_02_list_all_inventory_participants.R
+++ b/01_INVENTORY/INVENTORY_02_list_all_inventory_participants.R
-## INVENTORY_02_list_all_inventory_participants.R
-## Date     : 2023/10/31
-## Author   : Thomas Obadia
+## INVENTORY_05_select_list_function.R
+## Date     : 2024/02/02
+## Author   : Eliharintsoa Rajaoranimirana, Thomas Obadia
 ##
-## This script starts by fetching another one to dump the database
-## for the PvSTATEM Inventory project (hosted into REDCap).
-## It subsequently processes the inventory database to generate a list
-## of all individuals that were listed, and saves that list into
-## an output CSV file.
+## This script provides functions to sample from the population from 
+## the inventory project and generate a list of participants of 
+## interest to target for the observational study.
 ######################################################################


@@ -30,60 +28,19 @@ DATA_EXTRACT_IS_RECENT    <- as.logical(as.numeric(difftime(time1 = Sys.Date(),


 ######################################################################
-### DERIVE LIST OF ALL INDIVIDUALS
+### REPLICABILITY
 ######################################################################
-inventory_list_p <- dat_inventory_raw %>% 
-  select(
-    # REDCap internal record_id
-    record_id, 
-    
-    # Items used to derive the house-and-household ID
-    country, clusterid, hid, hh_n, 
-    tidyselect::matches("^hh\\d{2}_subjid\\d{2}_(id|sex|agey)$")) %>% 
-  
-  # Prevent mis-interpretation of *_sex columns because 'F' can be FALSE in some context
-  mutate(across(ends_with("_sex"), function(x) {plyr::mapvalues(x, 
-                                                                from         = c(FALSE), 
-                                                                to           = c("F"), 
-                                                                warn_missing = FALSE)})) %>% 
-  
-  # Go from wide format to long to list household members by row, 
-  # then propagate age and sex to dedicated columns, to end up with
-  # a single row per individual
-  pivot_longer(cols             = starts_with("hh0"), 
-               names_to         = c("pivot_hh_n", "pivot_hh_subjid", "pivot_variable"), 
-               names_pattern    = c("^(hh\\d{2})_(subjid\\d{2})_(id|sex|agey)$"), 
-               values_transform = as.character) %>% 
-  pivot_wider(names_from  = "pivot_variable", 
-              values_from = "value") %>% 
-  mutate(agey = as.numeric(agey)) %>% 
-  extract(pivot_hh_n, 
-          into  = c("nested_hhid"), 
-          regex = "^hh(\\d{2})$") %>% 
-  extract(pivot_hh_subjid, 
-          into  = c("nested_subjid"), 
-          regex = "^subjid(\\d{2})$") %>% 
-  
-  # Add age as a categorical variable
-  mutate(agey_cat = case_when(country == "M" & agey < 1               ~ "y00_01",
-                              country == "M" & agey >= 1 & agey <= 5  ~ "y01_05",
-                              country == "M" & agey > 5  & agey <= 13 ~ "y05_13",
-                              country == "M" & agey > 13 & agey <= 24 ~ "y13_24", 
-                              country == "M" & agey > 24              ~ "y24_inf")) %>% 
-  
-  # Discard rows for whom no data was filled, i.e. empty placeholders in REDCap
-  filter(!is.na(sex) & !is.na(agey)) %>% 
-  
-  # Retain only columns of interest for derived data (list of everyone in inventory) 
-  select(
-    # REDCap internal record_id
-    record_id, 
-    
-    # Items used to derive the house-and-household ID
-    country, clusterid, hid, nested_hhid, nested_subjid, 
-    
-    # Actual demographics
-    sex, agey, agey_cat)
+set.seed(seed = SEED)
+
+
+
+
+
+######################################################################
+### GENERATE SAMPLING LIST
+######################################################################
+observational_list_p <- get_target_list_for_observational_study(x        = inventory_list_p, 
+                                                                n_target = 270)



@@ -93,17 +50,17 @@ inventory_list_p <- dat_inventory_raw %>%
 ### WRITE LIST TO OUTPUT
 ######################################################################
 ## Name of output file
-INVENTORY_OUT_02_FILENAME <- paste0("INVENTORY_OUT_02_list_individuals", 
+INVENTORY_OUT_05_FILENAME <- paste0("INVENTORY_OUT_05_list_participants_for_observational_study", 
                                    "_country-", 
-                                    paste(unique(inventory_list_p$country), collapse = "-"), 
+                                    paste(unique(observational_list_p$country), collapse = "-"), 
                                    "_timestamp-", 
                                    strftime(Sys.time(), format = "%Y%m%d_%H%M%S"), 
                                    ".csv")

 ## Write to output file
-write.table(inventory_list_p, 
+write.table(observational_list_p, 
            file      = paste0("./01_INVENTORY/outputs/", 
-                               INVENTORY_OUT_02_FILENAME), 
+                               INVENTORY_OUT_05_FILENAME), 
            sep       = ",", 
            dec       = ".", 
            quote     = TRUE, 
@@ -117,4 +74,4 @@ write.table(inventory_list_p,
 ######################################################################
 ### CLEANUP
 ######################################################################
-rm(INVENTORY_OUT_02_FILENAME)
+rm(INVENTORY_OUT_05_FILENAME)
--- a/01_INVENTORY/INVENTORY_05_select_list_function.R
+++ b/01_INVENTORY/INVENTORY_05_select_list_function.R
-#function computes the number of samples needed in each cluster
-size_sample <- function(n_observ, n_cluster, n_sample){
-  
-  if(n_cluster <= n_sample){
-    return(n_observ)
-  }
-  else{
-    
-    prop = round((n_observ/n_cluster)*n_sample, 0)
-    return(prop)}
-}
-
-
-#### function select list randomly
-extract_list_sample <- function(clusterFilter, sexFilter, ageClassFilter, limit){
-  inventory_list_p %>% 
-    filter(clusterid == clusterFilter & sex == sexFilter & ageClass == ageClassFilter) %>% 
-    sample_n(n()) %>% 
-    sample_n(limit)
-  
-}
-
-#### function extract final selected list randomly
-
-myfunction <-  function(clusteridFilter, sampleNeeded){
-  df <- inventory_list_p %>% 
-    count(country, clusterid, sex, ageClass, name = "sexage_n") %>% 
-    left_join(
-      inventory_list_p %>% group_by(clusterid) %>% 
-        summarise(pop_cluster = n())) %>%
-    mutate(n_sample = mapply(size_sample, 
-                             sexage_n, 
-                             pop_cluster, 
-                             sampleNeeded),
-           list = mapply(extract_list_sample, 
-                         clusterid, 
-                         sex,
-                         ageClass,
-                         n_sample,
-                         SIMPLIFY = FALSE)) %>% 
-    filter(clusterid == clusteridFilter) 
-  sample_list <- bind_rows(df$list)
-  file_name <- paste0("outputs/sample_cluster_",
-                      clusteridFilter,"_", 
-                      strftime(Sys.time(), 
-                               format = "%Y%m%d_%H%M%S"),".csv")
-  return(write.csv2(sample_list, file_name))
-                 
-  
-  }
-  
-  myfunction(32, 20)
-  
-  #if we need more clusters
-  save_multiple_cluster <- function(listClusterid, numberSampleNeeded){
-    index <- 1
-    for (index in 1:length(listClusterid)) {
-      myfunction(listClusterid[index], numberSampleNeeded[index])
-      index <- index + 1
-    }
-  }
-  
-  save_multiple_cluster(c(3,4), c(10,5))
-  
\ No newline at end of file