diff --git a/01_INVENTORY/INVENTORY_00_R_environment.R b/01_INVENTORY/INVENTORY_00_R_environment.R index aa7f8ab362bd18070021eab12a5dd37ebb9a708c..58a7fdb06ec1143980415c2271b5e0143ec37976 100644 --- a/01_INVENTORY/INVENTORY_00_R_environment.R +++ b/01_INVENTORY/INVENTORY_00_R_environment.R @@ -101,3 +101,13 @@ MAP_TYPE <- "hybrid" ## Layout of panels for plotting GPS maps MAP_PLOT_PANEL_LAYOUT_NROW <- 3 MAP_PLOT_PANEL_LAYOUT_NCOL <- 3 + + + + + +###################################################################### +### SEED VALUE +###################################################################### +## A global seed value to be used by set.seed() calls +SEED <- 12345 diff --git a/01_INVENTORY/INVENTORY_00_R_functions.R b/01_INVENTORY/INVENTORY_00_R_functions.R new file mode 100644 index 0000000000000000000000000000000000000000..2f1fe537878fbbbe83434c28e812657a767c643e --- /dev/null +++ b/01_INVENTORY/INVENTORY_00_R_functions.R @@ -0,0 +1,82 @@ +## INVENTORY_00_R_functions.R +## Date : 2024/02/06 +## Author : Thomas Obadia +## +## Compilation of R functions developed for the 01_INVENTORY project, +## that will be loaded for use by our analysis pipelines. The functions +## should be documented in an Roxygen2 fashion in case corresponding +## *.Rd doc is generated in the future. +## +## Each block of funtions define a collection used for a specific task. +## +## You should usually not source this file alone: it is sourced in the +## script that handles the REDCap dump. +###################################################################### + + + + + +###################################################################### +### SUBSET POPULATION INVENTORY FOR OBSERVATIONAL STUDY +###################################################################### +#' Subset a total population to a target sample size stratified by age and sex. +#' +#' @param x A data.frame containing the population inventory in a wide format. It should be the output of \code{INVENTORY_02_list_all_inventory_participants.R}. +#' @param n_target The total target sample size in a cluster. +#' @return A random sample subset from \code{x} where individuals are representative of those from the whole population in terms of age-(categorical)-and-sex-structure, within each country and cluster. +get_target_list_for_observational_study <- function(x, n_target) { + + sampling_plan <- x %>% + count(country, clusterid, sex, agey_cat, name = "group_n") %>% + + # Target sample size in a stratified group is: + # - The group size scaled-down by its proportion if the cluster is large enough + # - The whole group if the cluster is too small for the total target sample size + group_by(country, clusterid) %>% + mutate(cluster_size_n = sum(group_n, na.rm = TRUE), + cluster_target_n = min(c(cluster_size_n, n_target)), + nested_sampling_prob = ifelse(cluster_size_n <= n_target, 1, group_n / cluster_size_n), + nested_sampling_n = round(cluster_target_n * nested_sampling_prob, 0)) + + # The sampling plan is now applied to the population inventory list + res <- mapply(get_group_subset, + "country_f" = sampling_plan$country, + "clusterid_f" = sampling_plan$clusterid, + "sex_f" = sampling_plan$sex, + "agey_cat_f" = sampling_plan$agey_cat, + "n_target" = sampling_plan$nested_sampling_n, + MoreArgs = list("x" = x), + SIMPLIFY = FALSE) + + # Order sampling list to appear grouped by house, for ease of use + res <- bind_rows(res) %>% + arrange(country, clusterid, hid, nested_hhid, nested_subjid) + + # Return the list + return(res) +} + + + + + +#' Randomly subset group to a given size +#' +#' @param x A data.frame containing the population inventory in a wide format. It should be the output of \code{INVENTORY_02_list_all_inventory_participants.R}. +#' @param country_f The country to restrict the data to. +#' @param clusterid_f The cluster ID to restrict the data to. +#' @param sex_f The sex value to restrict the data to. +#' @param agey_cat_f The age group to restrict the data to. +#' @param n_target The target sample size. +#' @return A subset of of the input data frame with \code{limit} rows each corresponding to a single individual. +get_group_subset <- function(x, country_f, clusterid_f, sex_f, agey_cat_f, n_target) { + res <- x %>% + filter(country == country_f & clusterid == clusterid_f & sex == sex_f & agey_cat == agey_cat_f) %>% + # Shuffle current table to break any possible tie between sequential order and geographic location + sample_n(n()) %>% + # Keep only target sample size + sample_n(min(n_target, n())) + + return(as.data.frame(res)) +} diff --git a/01_INVENTORY/INVENTORY_01_dump_REDCap_database.R b/01_INVENTORY/INVENTORY_01_dump_REDCap_database.R index bba087b925adf78624a351ba5bd5a1f737014998..2eea27c0637a8b3265f2548755071f0d138b634a 100644 --- a/01_INVENTORY/INVENTORY_01_dump_REDCap_database.R +++ b/01_INVENTORY/INVENTORY_01_dump_REDCap_database.R @@ -17,6 +17,7 @@ ### SOURCE THE R ENVIRONMENT ###################################################################### source("./01_INVENTORY/INVENTORY_00_R_environment.R") +source("./01_INVENTORY/INVENTORY_00_R_functions.R") diff --git a/01_INVENTORY/INVENTORY_02_list_all_inventory_participants.R b/01_INVENTORY/INVENTORY_02_list_all_inventory_participants.R deleted file mode 100644 index e5faf98dc5de82d5f82630ed56222a59acc4fe18..0000000000000000000000000000000000000000 --- a/01_INVENTORY/INVENTORY_02_list_all_inventory_participants.R +++ /dev/null @@ -1,120 +0,0 @@ -## INVENTORY_02_list_all_inventory_participants.R -## Date : 2023/10/31 -## Author : Thomas Obadia -## -## This script starts by fetching another one to dump the database -## for the PvSTATEM Inventory project (hosted into REDCap). -## It subsequently processes the inventory database to generate a list -## of all individuals that were listed, and saves that list into -## an output CSV file. -###################################################################### - - - - - -###################################################################### -### SOURCE THE DATABASE -###################################################################### -if (!exists("DATA_EXTRACT_IS_RECENT") || DATA_EXTRACT_IS_RECENT) { - source("./01_INVENTORY/INVENTORY_01_dump_REDCap_database.R") -} - -## Update DATA_EXTRACT_IS_RECENT_FLAG for future calls -DATA_EXTRACT_IS_RECENT <- as.logical(as.numeric(difftime(time1 = Sys.Date(), - time2 = DATA_EXTRACT_TS, - units = "days")) <= DATA_EXTRACT_EXPIRY_TIME_D) - - - - - -###################################################################### -### DERIVE LIST OF ALL INDIVIDUALS -###################################################################### -inventory_list_p <- dat_inventory_raw %>% - select( - # REDCap internal record_id - record_id, - - # Items used to derive the house-and-household ID - country, clusterid, hid, hh_n, - tidyselect::matches("^hh\\d{2}_subjid\\d{2}_(id|sex|agey)$")) %>% - - # Prevent mis-interpretation of *_sex columns because 'F' can be FALSE in some context - mutate(across(ends_with("_sex"), function(x) {plyr::mapvalues(x, - from = c(FALSE), - to = c("F"), - warn_missing = FALSE)})) %>% - - # Go from wide format to long to list household members by row, - # then propagate age and sex to dedicated columns, to end up with - # a single row per individual - pivot_longer(cols = starts_with("hh0"), - names_to = c("pivot_hh_n", "pivot_hh_subjid", "pivot_variable"), - names_pattern = c("^(hh\\d{2})_(subjid\\d{2})_(id|sex|agey)$"), - values_transform = as.character) %>% - pivot_wider(names_from = "pivot_variable", - values_from = "value") %>% - mutate(agey = as.numeric(agey)) %>% - extract(pivot_hh_n, - into = c("nested_hhid"), - regex = "^hh(\\d{2})$") %>% - extract(pivot_hh_subjid, - into = c("nested_subjid"), - regex = "^subjid(\\d{2})$") %>% - - # Add age as a categorical variable - mutate(agey_cat = case_when(country == "M" & agey < 1 ~ "y00_01", - country == "M" & agey >= 1 & agey <= 5 ~ "y01_05", - country == "M" & agey > 5 & agey <= 13 ~ "y05_13", - country == "M" & agey > 13 & agey <= 24 ~ "y13_24", - country == "M" & agey > 24 ~ "y24_inf")) %>% - - # Discard rows for whom no data was filled, i.e. empty placeholders in REDCap - filter(!is.na(sex) & !is.na(agey)) %>% - - # Retain only columns of interest for derived data (list of everyone in inventory) - select( - # REDCap internal record_id - record_id, - - # Items used to derive the house-and-household ID - country, clusterid, hid, nested_hhid, nested_subjid, - - # Actual demographics - sex, agey, agey_cat) - - - - - -###################################################################### -### WRITE LIST TO OUTPUT -###################################################################### -## Name of output file -INVENTORY_OUT_02_FILENAME <- paste0("INVENTORY_OUT_02_list_individuals", - "_country-", - paste(unique(inventory_list_p$country), collapse = "-"), - "_timestamp-", - strftime(Sys.time(), format = "%Y%m%d_%H%M%S"), - ".csv") - -## Write to output file -write.table(inventory_list_p, - file = paste0("./01_INVENTORY/outputs/", - INVENTORY_OUT_02_FILENAME), - sep = ",", - dec = ".", - quote = TRUE, - col.names = TRUE, - row.names = FALSE) - - - - - -###################################################################### -### CLEANUP -###################################################################### -rm(INVENTORY_OUT_02_FILENAME) diff --git a/01_INVENTORY/INVENTORY_05_generate_list_of_participants_for_observational_study.R b/01_INVENTORY/INVENTORY_05_generate_list_of_participants_for_observational_study.R new file mode 100644 index 0000000000000000000000000000000000000000..3ec471852394810dfd0df468aee4642a82d779c4 --- /dev/null +++ b/01_INVENTORY/INVENTORY_05_generate_list_of_participants_for_observational_study.R @@ -0,0 +1,77 @@ +## INVENTORY_05_select_list_function.R +## Date : 2024/02/02 +## Author : Eliharintsoa Rajaoranimirana, Thomas Obadia +## +## This script provides functions to sample from the population from +## the inventory project and generate a list of participants of +## interest to target for the observational study. +###################################################################### + + + + + +###################################################################### +### SOURCE THE DATABASE +###################################################################### +if (!exists("DATA_EXTRACT_IS_RECENT") || DATA_EXTRACT_IS_RECENT) { + source("./01_INVENTORY/INVENTORY_01_dump_REDCap_database.R") +} + +## Update DATA_EXTRACT_IS_RECENT_FLAG for future calls +DATA_EXTRACT_IS_RECENT <- as.logical(as.numeric(difftime(time1 = Sys.Date(), + time2 = DATA_EXTRACT_TS, + units = "days")) <= DATA_EXTRACT_EXPIRY_TIME_D) + + + + + +###################################################################### +### REPLICABILITY +###################################################################### +set.seed(seed = SEED) + + + + + +###################################################################### +### GENERATE SAMPLING LIST +###################################################################### +observational_list_p <- get_target_list_for_observational_study(x = inventory_list_p, + n_target = 270) + + + + + +###################################################################### +### WRITE LIST TO OUTPUT +###################################################################### +## Name of output file +INVENTORY_OUT_05_FILENAME <- paste0("INVENTORY_OUT_05_list_participants_for_observational_study", + "_country-", + paste(unique(observational_list_p$country), collapse = "-"), + "_timestamp-", + strftime(Sys.time(), format = "%Y%m%d_%H%M%S"), + ".csv") + +## Write to output file +write.table(observational_list_p, + file = paste0("./01_INVENTORY/outputs/", + INVENTORY_OUT_05_FILENAME), + sep = ",", + dec = ".", + quote = TRUE, + col.names = TRUE, + row.names = FALSE) + + + + + +###################################################################### +### CLEANUP +###################################################################### +rm(INVENTORY_OUT_05_FILENAME) diff --git a/01_INVENTORY/INVENTORY_05_select_list_function.R b/01_INVENTORY/INVENTORY_05_select_list_function.R deleted file mode 100644 index 0eb5da21c07e98a836c4e8aa17c73250b28e8493..0000000000000000000000000000000000000000 --- a/01_INVENTORY/INVENTORY_05_select_list_function.R +++ /dev/null @@ -1,64 +0,0 @@ -#function computes the number of samples needed in each cluster -size_sample <- function(n_observ, n_cluster, n_sample){ - - if(n_cluster <= n_sample){ - return(n_observ) - } - else{ - - prop = round((n_observ/n_cluster)*n_sample, 0) - return(prop)} -} - - -#### function select list randomly -extract_list_sample <- function(clusterFilter, sexFilter, ageClassFilter, limit){ - inventory_list_p %>% - filter(clusterid == clusterFilter & sex == sexFilter & ageClass == ageClassFilter) %>% - sample_n(n()) %>% - sample_n(limit) - -} - -#### function extract final selected list randomly - -myfunction <- function(clusteridFilter, sampleNeeded){ - df <- inventory_list_p %>% - count(country, clusterid, sex, ageClass, name = "sexage_n") %>% - left_join( - inventory_list_p %>% group_by(clusterid) %>% - summarise(pop_cluster = n())) %>% - mutate(n_sample = mapply(size_sample, - sexage_n, - pop_cluster, - sampleNeeded), - list = mapply(extract_list_sample, - clusterid, - sex, - ageClass, - n_sample, - SIMPLIFY = FALSE)) %>% - filter(clusterid == clusteridFilter) - sample_list <- bind_rows(df$list) - file_name <- paste0("outputs/sample_cluster_", - clusteridFilter,"_", - strftime(Sys.time(), - format = "%Y%m%d_%H%M%S"),".csv") - return(write.csv2(sample_list, file_name)) - - - } - - myfunction(32, 20) - - #if we need more clusters - save_multiple_cluster <- function(listClusterid, numberSampleNeeded){ - index <- 1 - for (index in 1:length(listClusterid)) { - myfunction(listClusterid[index], numberSampleNeeded[index]) - index <- index + 1 - } - } - - save_multiple_cluster(c(3,4), c(10,5)) - \ No newline at end of file