diff --git a/01_INVENTORY/INVENTORY_00_R_environment.R b/01_INVENTORY/INVENTORY_00_R_environment.R
index aa7f8ab362bd18070021eab12a5dd37ebb9a708c..58a7fdb06ec1143980415c2271b5e0143ec37976 100644
--- a/01_INVENTORY/INVENTORY_00_R_environment.R
+++ b/01_INVENTORY/INVENTORY_00_R_environment.R
@@ -101,3 +101,13 @@ MAP_TYPE                      <- "hybrid"
 ## Layout of panels for plotting GPS maps
 MAP_PLOT_PANEL_LAYOUT_NROW    <- 3
 MAP_PLOT_PANEL_LAYOUT_NCOL    <- 3
+
+
+
+
+
+######################################################################
+### SEED VALUE
+######################################################################
+## A global seed value to be used by set.seed() calls
+SEED <- 12345
diff --git a/01_INVENTORY/INVENTORY_00_R_functions.R b/01_INVENTORY/INVENTORY_00_R_functions.R
new file mode 100644
index 0000000000000000000000000000000000000000..2f1fe537878fbbbe83434c28e812657a767c643e
--- /dev/null
+++ b/01_INVENTORY/INVENTORY_00_R_functions.R
@@ -0,0 +1,82 @@
+## INVENTORY_00_R_functions.R
+## Date     : 2024/02/06
+## Author   : Thomas Obadia
+##
+## Compilation of R functions developed for the 01_INVENTORY project,
+## that will be loaded for use by our analysis pipelines. The functions 
+## should be documented in an Roxygen2 fashion in case corresponding
+## *.Rd doc is generated in the future.
+##
+## Each block of funtions define a collection used for a specific task.
+## 
+## You should usually not source this file alone: it is sourced in the
+## script that handles the REDCap dump.
+######################################################################
+
+
+
+
+
+######################################################################
+### SUBSET POPULATION INVENTORY FOR OBSERVATIONAL STUDY
+######################################################################
+#' Subset a total population to a target sample size stratified by age and sex.
+#'
+#' @param x A data.frame containing the population inventory in a wide format. It should be the output of \code{INVENTORY_02_list_all_inventory_participants.R}.
+#' @param n_target The total target sample size in a cluster.
+#' @return A random sample subset from \code{x} where individuals are representative of those from the whole population in terms of age-(categorical)-and-sex-structure, within each country and cluster.
+get_target_list_for_observational_study <- function(x, n_target) {
+  
+  sampling_plan <- x %>% 
+    count(country, clusterid, sex, agey_cat, name = "group_n") %>% 
+    
+    # Target sample size in a stratified group is:
+    #   - The group size scaled-down by its proportion if the cluster is large enough
+    #   - The whole group if the cluster is too small for the total target sample size
+    group_by(country, clusterid) %>% 
+    mutate(cluster_size_n       = sum(group_n, na.rm = TRUE), 
+           cluster_target_n     = min(c(cluster_size_n, n_target)), 
+           nested_sampling_prob = ifelse(cluster_size_n <= n_target, 1, group_n / cluster_size_n), 
+           nested_sampling_n    = round(cluster_target_n * nested_sampling_prob, 0))
+  
+  # The sampling plan is now applied to the population inventory list
+  res <- mapply(get_group_subset, 
+                "country_f"     = sampling_plan$country, 
+                "clusterid_f"   = sampling_plan$clusterid, 
+                "sex_f"         = sampling_plan$sex, 
+                "agey_cat_f"    = sampling_plan$agey_cat, 
+                "n_target"      = sampling_plan$nested_sampling_n, 
+                MoreArgs        = list("x" = x), 
+                SIMPLIFY        = FALSE)
+  
+  # Order sampling list to appear grouped by house, for ease of use
+  res <- bind_rows(res) %>% 
+    arrange(country, clusterid, hid, nested_hhid, nested_subjid)
+  
+  # Return the list
+  return(res)
+}
+
+
+
+
+
+#' Randomly subset group to a given size
+#'
+#' @param x A data.frame containing the population inventory in a wide format. It should be the output of \code{INVENTORY_02_list_all_inventory_participants.R}.
+#' @param country_f The country to restrict the data to.
+#' @param clusterid_f The cluster ID to restrict the data to.
+#' @param sex_f The sex value to restrict the data to.
+#' @param agey_cat_f The age group to restrict the data to.
+#' @param n_target The target sample size.
+#' @return A subset of of the input data frame with \code{limit} rows each corresponding to a single individual.
+get_group_subset <- function(x, country_f, clusterid_f, sex_f, agey_cat_f, n_target) {
+  res <- x %>% 
+    filter(country == country_f & clusterid == clusterid_f & sex == sex_f & agey_cat == agey_cat_f) %>% 
+    # Shuffle current table to break any possible tie between sequential order and geographic location
+    sample_n(n()) %>% 
+    # Keep only target sample size
+    sample_n(min(n_target, n()))
+  
+  return(as.data.frame(res))
+}
diff --git a/01_INVENTORY/INVENTORY_01_dump_REDCap_database.R b/01_INVENTORY/INVENTORY_01_dump_REDCap_database.R
index bba087b925adf78624a351ba5bd5a1f737014998..2eea27c0637a8b3265f2548755071f0d138b634a 100644
--- a/01_INVENTORY/INVENTORY_01_dump_REDCap_database.R
+++ b/01_INVENTORY/INVENTORY_01_dump_REDCap_database.R
@@ -17,6 +17,7 @@
 ### SOURCE THE R ENVIRONMENT
 ######################################################################
 source("./01_INVENTORY/INVENTORY_00_R_environment.R")
+source("./01_INVENTORY/INVENTORY_00_R_functions.R")
 
 
 
diff --git a/01_INVENTORY/INVENTORY_02_list_all_inventory_participants.R b/01_INVENTORY/INVENTORY_02_list_all_inventory_participants.R
deleted file mode 100644
index e5faf98dc5de82d5f82630ed56222a59acc4fe18..0000000000000000000000000000000000000000
--- a/01_INVENTORY/INVENTORY_02_list_all_inventory_participants.R
+++ /dev/null
@@ -1,120 +0,0 @@
-## INVENTORY_02_list_all_inventory_participants.R
-## Date     : 2023/10/31
-## Author   : Thomas Obadia
-##
-## This script starts by fetching another one to dump the database
-## for the PvSTATEM Inventory project (hosted into REDCap).
-## It subsequently processes the inventory database to generate a list
-## of all individuals that were listed, and saves that list into
-## an output CSV file.
-######################################################################
-
-
-
-
-
-######################################################################
-### SOURCE THE DATABASE
-######################################################################
-if (!exists("DATA_EXTRACT_IS_RECENT") || DATA_EXTRACT_IS_RECENT) {
-  source("./01_INVENTORY/INVENTORY_01_dump_REDCap_database.R")
-}
-
-## Update DATA_EXTRACT_IS_RECENT_FLAG for future calls
-DATA_EXTRACT_IS_RECENT    <- as.logical(as.numeric(difftime(time1 = Sys.Date(), 
-                                                            time2 = DATA_EXTRACT_TS, 
-                                                            units = "days")) <= DATA_EXTRACT_EXPIRY_TIME_D)
-
-
-
-
-
-######################################################################
-### DERIVE LIST OF ALL INDIVIDUALS
-######################################################################
-inventory_list_p <- dat_inventory_raw %>% 
-  select(
-    # REDCap internal record_id
-    record_id, 
-    
-    # Items used to derive the house-and-household ID
-    country, clusterid, hid, hh_n, 
-    tidyselect::matches("^hh\\d{2}_subjid\\d{2}_(id|sex|agey)$")) %>% 
-  
-  # Prevent mis-interpretation of *_sex columns because 'F' can be FALSE in some context
-  mutate(across(ends_with("_sex"), function(x) {plyr::mapvalues(x, 
-                                                                from         = c(FALSE), 
-                                                                to           = c("F"), 
-                                                                warn_missing = FALSE)})) %>% 
-  
-  # Go from wide format to long to list household members by row, 
-  # then propagate age and sex to dedicated columns, to end up with
-  # a single row per individual
-  pivot_longer(cols             = starts_with("hh0"), 
-               names_to         = c("pivot_hh_n", "pivot_hh_subjid", "pivot_variable"), 
-               names_pattern    = c("^(hh\\d{2})_(subjid\\d{2})_(id|sex|agey)$"), 
-               values_transform = as.character) %>% 
-  pivot_wider(names_from  = "pivot_variable", 
-              values_from = "value") %>% 
-  mutate(agey = as.numeric(agey)) %>% 
-  extract(pivot_hh_n, 
-          into  = c("nested_hhid"), 
-          regex = "^hh(\\d{2})$") %>% 
-  extract(pivot_hh_subjid, 
-          into  = c("nested_subjid"), 
-          regex = "^subjid(\\d{2})$") %>% 
-  
-  # Add age as a categorical variable
-  mutate(agey_cat = case_when(country == "M" & agey < 1               ~ "y00_01",
-                              country == "M" & agey >= 1 & agey <= 5  ~ "y01_05",
-                              country == "M" & agey > 5  & agey <= 13 ~ "y05_13",
-                              country == "M" & agey > 13 & agey <= 24 ~ "y13_24", 
-                              country == "M" & agey > 24              ~ "y24_inf")) %>% 
-  
-  # Discard rows for whom no data was filled, i.e. empty placeholders in REDCap
-  filter(!is.na(sex) & !is.na(agey)) %>% 
-  
-  # Retain only columns of interest for derived data (list of everyone in inventory) 
-  select(
-    # REDCap internal record_id
-    record_id, 
-    
-    # Items used to derive the house-and-household ID
-    country, clusterid, hid, nested_hhid, nested_subjid, 
-    
-    # Actual demographics
-    sex, agey, agey_cat)
-
-
-
-
-
-######################################################################
-### WRITE LIST TO OUTPUT
-######################################################################
-## Name of output file
-INVENTORY_OUT_02_FILENAME <- paste0("INVENTORY_OUT_02_list_individuals", 
-                                    "_country-", 
-                                    paste(unique(inventory_list_p$country), collapse = "-"), 
-                                    "_timestamp-", 
-                                    strftime(Sys.time(), format = "%Y%m%d_%H%M%S"), 
-                                    ".csv")
-
-## Write to output file
-write.table(inventory_list_p, 
-            file      = paste0("./01_INVENTORY/outputs/", 
-                               INVENTORY_OUT_02_FILENAME), 
-            sep       = ",", 
-            dec       = ".", 
-            quote     = TRUE, 
-            col.names = TRUE, 
-            row.names = FALSE)
-
-
-
-
-
-######################################################################
-### CLEANUP
-######################################################################
-rm(INVENTORY_OUT_02_FILENAME)
diff --git a/01_INVENTORY/INVENTORY_05_generate_list_of_participants_for_observational_study.R b/01_INVENTORY/INVENTORY_05_generate_list_of_participants_for_observational_study.R
new file mode 100644
index 0000000000000000000000000000000000000000..3ec471852394810dfd0df468aee4642a82d779c4
--- /dev/null
+++ b/01_INVENTORY/INVENTORY_05_generate_list_of_participants_for_observational_study.R
@@ -0,0 +1,77 @@
+## INVENTORY_05_select_list_function.R
+## Date     : 2024/02/02
+## Author   : Eliharintsoa Rajaoranimirana, Thomas Obadia
+##
+## This script provides functions to sample from the population from 
+## the inventory project and generate a list of participants of 
+## interest to target for the observational study.
+######################################################################
+
+
+
+
+
+######################################################################
+### SOURCE THE DATABASE
+######################################################################
+if (!exists("DATA_EXTRACT_IS_RECENT") || DATA_EXTRACT_IS_RECENT) {
+  source("./01_INVENTORY/INVENTORY_01_dump_REDCap_database.R")
+}
+
+## Update DATA_EXTRACT_IS_RECENT_FLAG for future calls
+DATA_EXTRACT_IS_RECENT    <- as.logical(as.numeric(difftime(time1 = Sys.Date(), 
+                                                            time2 = DATA_EXTRACT_TS, 
+                                                            units = "days")) <= DATA_EXTRACT_EXPIRY_TIME_D)
+
+
+
+
+
+######################################################################
+### REPLICABILITY
+######################################################################
+set.seed(seed = SEED)
+
+
+
+
+
+######################################################################
+### GENERATE SAMPLING LIST
+######################################################################
+observational_list_p <- get_target_list_for_observational_study(x        = inventory_list_p, 
+                                                                n_target = 270)
+
+
+
+
+
+######################################################################
+### WRITE LIST TO OUTPUT
+######################################################################
+## Name of output file
+INVENTORY_OUT_05_FILENAME <- paste0("INVENTORY_OUT_05_list_participants_for_observational_study", 
+                                    "_country-", 
+                                    paste(unique(observational_list_p$country), collapse = "-"), 
+                                    "_timestamp-", 
+                                    strftime(Sys.time(), format = "%Y%m%d_%H%M%S"), 
+                                    ".csv")
+
+## Write to output file
+write.table(observational_list_p, 
+            file      = paste0("./01_INVENTORY/outputs/", 
+                               INVENTORY_OUT_05_FILENAME), 
+            sep       = ",", 
+            dec       = ".", 
+            quote     = TRUE, 
+            col.names = TRUE, 
+            row.names = FALSE)
+
+
+
+
+
+######################################################################
+### CLEANUP
+######################################################################
+rm(INVENTORY_OUT_05_FILENAME)
diff --git a/01_INVENTORY/INVENTORY_05_select_list_function.R b/01_INVENTORY/INVENTORY_05_select_list_function.R
deleted file mode 100644
index 0eb5da21c07e98a836c4e8aa17c73250b28e8493..0000000000000000000000000000000000000000
--- a/01_INVENTORY/INVENTORY_05_select_list_function.R
+++ /dev/null
@@ -1,64 +0,0 @@
-#function computes the number of samples needed in each cluster
-size_sample <- function(n_observ, n_cluster, n_sample){
-  
-  if(n_cluster <= n_sample){
-    return(n_observ)
-  }
-  else{
-    
-    prop = round((n_observ/n_cluster)*n_sample, 0)
-    return(prop)}
-}
-
-
-#### function select list randomly
-extract_list_sample <- function(clusterFilter, sexFilter, ageClassFilter, limit){
-  inventory_list_p %>% 
-    filter(clusterid == clusterFilter & sex == sexFilter & ageClass == ageClassFilter) %>% 
-    sample_n(n()) %>% 
-    sample_n(limit)
-  
-}
-
-#### function extract final selected list randomly
-
-myfunction <-  function(clusteridFilter, sampleNeeded){
-  df <- inventory_list_p %>% 
-    count(country, clusterid, sex, ageClass, name = "sexage_n") %>% 
-    left_join(
-      inventory_list_p %>% group_by(clusterid) %>% 
-        summarise(pop_cluster = n())) %>%
-    mutate(n_sample = mapply(size_sample, 
-                             sexage_n, 
-                             pop_cluster, 
-                             sampleNeeded),
-           list = mapply(extract_list_sample, 
-                         clusterid, 
-                         sex,
-                         ageClass,
-                         n_sample,
-                         SIMPLIFY = FALSE)) %>% 
-    filter(clusterid == clusteridFilter) 
-  sample_list <- bind_rows(df$list)
-  file_name <- paste0("outputs/sample_cluster_",
-                      clusteridFilter,"_", 
-                      strftime(Sys.time(), 
-                               format = "%Y%m%d_%H%M%S"),".csv")
-  return(write.csv2(sample_list, file_name))
-                 
-  
-  }
-  
-  myfunction(32, 20)
-  
-  #if we need more clusters
-  save_multiple_cluster <- function(listClusterid, numberSampleNeeded){
-    index <- 1
-    for (index in 1:length(listClusterid)) {
-      myfunction(listClusterid[index], numberSampleNeeded[index])
-      index <- index + 1
-    }
-  }
-  
-  save_multiple_cluster(c(3,4), c(10,5))
-  
\ No newline at end of file