Skip to content
Snippets Groups Projects
Commit 3d725d71 authored by Thomas  OBADIA's avatar Thomas OBADIA :speech_balloon:
Browse files

Merge branch 'devel-refactor-subset-participant-list' into 'main'

Implement sampling from population inventory

See merge request !1
parents 95a3022b 2ea2fa1f
No related branches found
No related tags found
1 merge request!1Implement sampling from population inventory
......@@ -101,3 +101,13 @@ MAP_TYPE <- "hybrid"
## Layout of panels for plotting GPS maps
MAP_PLOT_PANEL_LAYOUT_NROW <- 3
MAP_PLOT_PANEL_LAYOUT_NCOL <- 3
######################################################################
### SEED VALUE
######################################################################
## A global seed value to be used by set.seed() calls
SEED <- 12345
## INVENTORY_00_R_functions.R
## Date : 2024/02/06
## Author : Thomas Obadia
##
## Compilation of R functions developed for the 01_INVENTORY project,
## that will be loaded for use by our analysis pipelines. The functions
## should be documented in an Roxygen2 fashion in case corresponding
## *.Rd doc is generated in the future.
##
## Each block of funtions define a collection used for a specific task.
##
## You should usually not source this file alone: it is sourced in the
## script that handles the REDCap dump.
######################################################################
######################################################################
### SUBSET POPULATION INVENTORY FOR OBSERVATIONAL STUDY
######################################################################
#' Subset a total population to a target sample size stratified by age and sex.
#'
#' @param x A data.frame containing the population inventory in a wide format. It should be the output of \code{INVENTORY_02_list_all_inventory_participants.R}.
#' @param n_target The total target sample size in a cluster.
#' @return A random sample subset from \code{x} where individuals are representative of those from the whole population in terms of age-(categorical)-and-sex-structure, within each country and cluster.
get_target_list_for_observational_study <- function(x, n_target) {
sampling_plan <- x %>%
count(country, clusterid, sex, agey_cat, name = "group_n") %>%
# Target sample size in a stratified group is:
# - The group size scaled-down by its proportion if the cluster is large enough
# - The whole group if the cluster is too small for the total target sample size
group_by(country, clusterid) %>%
mutate(cluster_size_n = sum(group_n, na.rm = TRUE),
cluster_target_n = min(c(cluster_size_n, n_target)),
nested_sampling_prob = ifelse(cluster_size_n <= n_target, 1, group_n / cluster_size_n),
nested_sampling_n = round(cluster_target_n * nested_sampling_prob, 0))
# The sampling plan is now applied to the population inventory list
res <- mapply(get_group_subset,
"country_f" = sampling_plan$country,
"clusterid_f" = sampling_plan$clusterid,
"sex_f" = sampling_plan$sex,
"agey_cat_f" = sampling_plan$agey_cat,
"n_target" = sampling_plan$nested_sampling_n,
MoreArgs = list("x" = x),
SIMPLIFY = FALSE)
# Order sampling list to appear grouped by house, for ease of use
res <- bind_rows(res) %>%
arrange(country, clusterid, hid, nested_hhid, nested_subjid)
# Return the list
return(res)
}
#' Randomly subset group to a given size
#'
#' @param x A data.frame containing the population inventory in a wide format. It should be the output of \code{INVENTORY_02_list_all_inventory_participants.R}.
#' @param country_f The country to restrict the data to.
#' @param clusterid_f The cluster ID to restrict the data to.
#' @param sex_f The sex value to restrict the data to.
#' @param agey_cat_f The age group to restrict the data to.
#' @param n_target The target sample size.
#' @return A subset of of the input data frame with \code{limit} rows each corresponding to a single individual.
get_group_subset <- function(x, country_f, clusterid_f, sex_f, agey_cat_f, n_target) {
res <- x %>%
filter(country == country_f & clusterid == clusterid_f & sex == sex_f & agey_cat == agey_cat_f) %>%
# Shuffle current table to break any possible tie between sequential order and geographic location
sample_n(n()) %>%
# Keep only target sample size
sample_n(min(n_target, n()))
return(as.data.frame(res))
}
......@@ -17,6 +17,7 @@
### SOURCE THE R ENVIRONMENT
######################################################################
source("./01_INVENTORY/INVENTORY_00_R_environment.R")
source("./01_INVENTORY/INVENTORY_00_R_functions.R")
......
## INVENTORY_02_list_all_inventory_participants.R
## Date : 2023/10/31
## Author : Thomas Obadia
## INVENTORY_05_select_list_function.R
## Date : 2024/02/02
## Author : Eliharintsoa Rajaoranimirana, Thomas Obadia
##
## This script starts by fetching another one to dump the database
## for the PvSTATEM Inventory project (hosted into REDCap).
## It subsequently processes the inventory database to generate a list
## of all individuals that were listed, and saves that list into
## an output CSV file.
## This script provides functions to sample from the population from
## the inventory project and generate a list of participants of
## interest to target for the observational study.
######################################################################
......@@ -30,60 +28,19 @@ DATA_EXTRACT_IS_RECENT <- as.logical(as.numeric(difftime(time1 = Sys.Date(),
######################################################################
### DERIVE LIST OF ALL INDIVIDUALS
### REPLICABILITY
######################################################################
inventory_list_p <- dat_inventory_raw %>%
select(
# REDCap internal record_id
record_id,
# Items used to derive the house-and-household ID
country, clusterid, hid, hh_n,
tidyselect::matches("^hh\\d{2}_subjid\\d{2}_(id|sex|agey)$")) %>%
# Prevent mis-interpretation of *_sex columns because 'F' can be FALSE in some context
mutate(across(ends_with("_sex"), function(x) {plyr::mapvalues(x,
from = c(FALSE),
to = c("F"),
warn_missing = FALSE)})) %>%
# Go from wide format to long to list household members by row,
# then propagate age and sex to dedicated columns, to end up with
# a single row per individual
pivot_longer(cols = starts_with("hh0"),
names_to = c("pivot_hh_n", "pivot_hh_subjid", "pivot_variable"),
names_pattern = c("^(hh\\d{2})_(subjid\\d{2})_(id|sex|agey)$"),
values_transform = as.character) %>%
pivot_wider(names_from = "pivot_variable",
values_from = "value") %>%
mutate(agey = as.numeric(agey)) %>%
extract(pivot_hh_n,
into = c("nested_hhid"),
regex = "^hh(\\d{2})$") %>%
extract(pivot_hh_subjid,
into = c("nested_subjid"),
regex = "^subjid(\\d{2})$") %>%
# Add age as a categorical variable
mutate(agey_cat = case_when(country == "M" & agey < 1 ~ "y00_01",
country == "M" & agey >= 1 & agey <= 5 ~ "y01_05",
country == "M" & agey > 5 & agey <= 13 ~ "y05_13",
country == "M" & agey > 13 & agey <= 24 ~ "y13_24",
country == "M" & agey > 24 ~ "y24_inf")) %>%
# Discard rows for whom no data was filled, i.e. empty placeholders in REDCap
filter(!is.na(sex) & !is.na(agey)) %>%
# Retain only columns of interest for derived data (list of everyone in inventory)
select(
# REDCap internal record_id
record_id,
# Items used to derive the house-and-household ID
country, clusterid, hid, nested_hhid, nested_subjid,
# Actual demographics
sex, agey, agey_cat)
set.seed(seed = SEED)
######################################################################
### GENERATE SAMPLING LIST
######################################################################
observational_list_p <- get_target_list_for_observational_study(x = inventory_list_p,
n_target = 270)
......@@ -93,17 +50,17 @@ inventory_list_p <- dat_inventory_raw %>%
### WRITE LIST TO OUTPUT
######################################################################
## Name of output file
INVENTORY_OUT_02_FILENAME <- paste0("INVENTORY_OUT_02_list_individuals",
INVENTORY_OUT_05_FILENAME <- paste0("INVENTORY_OUT_05_list_participants_for_observational_study",
"_country-",
paste(unique(inventory_list_p$country), collapse = "-"),
paste(unique(observational_list_p$country), collapse = "-"),
"_timestamp-",
strftime(Sys.time(), format = "%Y%m%d_%H%M%S"),
".csv")
## Write to output file
write.table(inventory_list_p,
write.table(observational_list_p,
file = paste0("./01_INVENTORY/outputs/",
INVENTORY_OUT_02_FILENAME),
INVENTORY_OUT_05_FILENAME),
sep = ",",
dec = ".",
quote = TRUE,
......@@ -117,4 +74,4 @@ write.table(inventory_list_p,
######################################################################
### CLEANUP
######################################################################
rm(INVENTORY_OUT_02_FILENAME)
rm(INVENTORY_OUT_05_FILENAME)
#function computes the number of samples needed in each cluster
size_sample <- function(n_observ, n_cluster, n_sample){
if(n_cluster <= n_sample){
return(n_observ)
}
else{
prop = round((n_observ/n_cluster)*n_sample, 0)
return(prop)}
}
#### function select list randomly
extract_list_sample <- function(clusterFilter, sexFilter, ageClassFilter, limit){
inventory_list_p %>%
filter(clusterid == clusterFilter & sex == sexFilter & ageClass == ageClassFilter) %>%
sample_n(n()) %>%
sample_n(limit)
}
#### function extract final selected list randomly
myfunction <- function(clusteridFilter, sampleNeeded){
df <- inventory_list_p %>%
count(country, clusterid, sex, ageClass, name = "sexage_n") %>%
left_join(
inventory_list_p %>% group_by(clusterid) %>%
summarise(pop_cluster = n())) %>%
mutate(n_sample = mapply(size_sample,
sexage_n,
pop_cluster,
sampleNeeded),
list = mapply(extract_list_sample,
clusterid,
sex,
ageClass,
n_sample,
SIMPLIFY = FALSE)) %>%
filter(clusterid == clusteridFilter)
sample_list <- bind_rows(df$list)
file_name <- paste0("outputs/sample_cluster_",
clusteridFilter,"_",
strftime(Sys.time(),
format = "%Y%m%d_%H%M%S"),".csv")
return(write.csv2(sample_list, file_name))
}
myfunction(32, 20)
#if we need more clusters
save_multiple_cluster <- function(listClusterid, numberSampleNeeded){
index <- 1
for (index in 1:length(listClusterid)) {
myfunction(listClusterid[index], numberSampleNeeded[index])
index <- index + 1
}
}
save_multiple_cluster(c(3,4), c(10,5))
\ No newline at end of file
0% Loading or .
You are about to add 0 people to the discussion. Proceed with caution.
Please register or to comment