diff --git a/00_COMMON/COMMON_00_R_functions.R b/00_COMMON/COMMON_00_R_functions.R new file mode 100644 index 0000000000000000000000000000000000000000..b44475ba610f148f1101c3f7ddaa40028c15ca63 --- /dev/null +++ b/00_COMMON/COMMON_00_R_functions.R @@ -0,0 +1,142 @@ +## COMMON_00_R_functions.R +## Date : 2025/05/22 +## Author : Thomas Obadia +## +## Compilation of R functions developed for the 00_INVENTORY project, +## that will be loaded for use by our analysis pipelines. The functions +## should be documented in an Roxygen2 fashion in case corresponding +## *.Rd doc is generated in the future. +## +## Each block of functions defines a collection used for a specific +## task. +## This file is always sourced in the environment when sourcing a +## subproject environment, because it contains functions that could be +## of interest regardless of what study stage we're at. +###################################################################### + + + + + +###################################################################### +### SUBSET POPULATION INVENTORY FOR TARGETED STUDY +###################################################################### +#' Subset a total population to a target sample size stratified by age and sex. +#' +#' @param x A data.frame containing the population inventory in a wide format. It should be the output of \code{INVENTORY_02_list_all_inventory_participants.R}. +#' @param n_target A (named) vector containing the target sample size in a cluster. If named, the names are expected to exactly match the country codes in the data.frame \code{x}. In case \code{n_target} is unnamed and/or has length 1, its value is recycled for all countries in \code{x}. +#' @param n_backup A (named) vector containing the number of individuals to include in a backup list that will come in addition to the primary list. If named, the names are expected to exactly match the country codes in the data.frame \code{x}. In case \code{n_backup} is unnamed and/or has length 1, its value is recycled for all countries in \code{x}. +#' @return A random sample subset from \code{x} where individuals are representative of those from the whole population in terms of age-(categorical)-and-sex-structure, within each country and cluster. +get_target_list_for_study <- function(x, n_target, n_backup) { + + # Harmonize structure of n_target and n_backup + if (length(n_target) == 1) { + n_target <- data.frame("target_n" = rep(n_target, times = length(unique(x$country))), + "country" = unique(x$country), + row.names = NULL) + } else if (length(n_target) != length(unique(x$country))) { + stop("n_target should have length 1 or as many elements as unique values of x$country.") + } else if (!any(names(n_target) %in% unique(x$country)) | !any(unique(x$country) %in% names(n_target))) { + stop("n_target names should match exactly the values of x$country.") + } else { + n_target <- data.frame("target_n" = n_target, + "country" = names(n_target), + row.names = NULL) + } + + if (length(n_backup) == 1) { + n_backup <- data.frame("backup_n" = rep(n_backup, times = length(unique(x$country))), + "country" = unique(x$country), + row.names = NULL) + } else if (length(n_backup) != length(unique(x$country))) { + stop("n_backup should have length 1 or as many elements as unique values of x$country.") + } else if (!any(names(n_backup) %in% unique(x$country)) | !any(unique(x$country) %in% names(n_backup))) { + stop("n_backup names should match exactly the values of x$country.") + } else { + n_backup <- data.frame("backup_n" = n_backup, + "country" = names(n_backup), + row.names = NULL) + } + + # Elaborate sampling plan + sampling_plan <- x %>% + count(country, clusterid, sex, agey_cat, name = "group_n") %>% + # Join number of individuals in target and backup lists (can be unveven by country) + left_join(n_target, by = "country") %>% + left_join(n_backup, by = "country") %>% + + # Target sample size in a stratified group is: + # - The group size scaled-down by its proportion if the cluster is large enough + # - The whole group if the cluster is too small for the total target sample size + group_by(country, clusterid) %>% + mutate(cluster_size_n = sum(group_n, na.rm = TRUE), + + cluster_target_n = min(c(cluster_size_n, target_n)), + nested_sampling_prob = ifelse(cluster_size_n <= target_n, 1, group_n / cluster_size_n), + nested_sampling_n = round(cluster_target_n * nested_sampling_prob, 0), + + cluster_backup_n = min(c(cluster_size_n, backup_n)), + nested_backup_sampling_n = round(cluster_backup_n * nested_sampling_prob, 0)) + + # The sampling plan is now applied to the population inventory list + observational_list_p_main <- mapply(get_group_subset, + "country_f" = sampling_plan$country, + "clusterid_f" = sampling_plan$clusterid, + "sex_f" = sampling_plan$sex, + "agey_cat_f" = sampling_plan$agey_cat, + "n_target" = sampling_plan$nested_sampling_n, + MoreArgs = list("x" = x), + SIMPLIFY = FALSE) + + # Order sampling list to appear grouped by house, for ease of use + observational_list_p_main <- bind_rows(observational_list_p_main) %>% + arrange(country, clusterid, hid, nested_hhid, nested_subjid) %>% + mutate(list_name = "main") + + # A backup list is generated after excluding people participants + # already sampled in the main list + observational_list_p_backup <- mapply(get_group_subset, + "country_f" = sampling_plan$country, + "clusterid_f" = sampling_plan$clusterid, + "sex_f" = sampling_plan$sex, + "agey_cat_f" = sampling_plan$agey_cat, + "n_target" = sampling_plan$nested_backup_sampling_n, + MoreArgs = list("x" = x %>% + anti_join(observational_list_p_main %>% + select(country, clusterid, hid, nested_hhid, nested_subjid))), + SIMPLIFY = FALSE) + + # Order sampling list to appear grouped by house, for ease of use + observational_list_p_backup <- bind_rows(observational_list_p_backup) %>% + arrange(country, clusterid, hid, nested_hhid, nested_subjid) %>% + mutate(list_name = "backup") + + # Return the full list + res <- observational_list_p_main %>% + bind_rows(observational_list_p_backup) + return(res) +} + + + + + +#' Randomly subset group to a given size +#' +#' @param x A data.frame containing the population inventory in a wide format. It should be the output of \code{INVENTORY_02_list_all_inventory_participants.R}. +#' @param country_f The country to restrict the data to. +#' @param clusterid_f The cluster ID to restrict the data to. +#' @param sex_f The sex value to restrict the data to. +#' @param agey_cat_f The age group to restrict the data to. +#' @param n_target The target sample size. +#' @return A subset of of the input data frame with \code{limit} rows each corresponding to a single individual. +get_group_subset <- function(x, country_f, clusterid_f, sex_f, agey_cat_f, n_target) { + res <- x %>% + filter(country == country_f & clusterid == clusterid_f & sex == sex_f & agey_cat == agey_cat_f) %>% + # Shuffle current table to break any possible tie between sequential order and geographic location + sample_n(n()) %>% + # Keep only target sample size + sample_n(min(c(n_target, n()))) + + return(as.data.frame(res)) +} diff --git a/01_INVENTORY/INVENTORY_00_R_environment.R b/01_INVENTORY/INVENTORY_00_R_environment.R index 6d65f64261a3420a42ce906a658b72bddb1612ff..2c22c3a8fcfd393815e57170258e3544291f0c02 100644 --- a/01_INVENTORY/INVENTORY_00_R_environment.R +++ b/01_INVENTORY/INVENTORY_00_R_environment.R @@ -27,6 +27,16 @@ require(ggmap) # More general set of methods +###################################################################### +### CUSTOM R FUNCTIONS +###################################################################### +source("./00_COMMON/COMMON_00_R_functions.R.R") +source("./01_INVENTORY/INVENTORY_00_R_functions.R") + + + + + ###################################################################### ### SOURCE .env FILE ###################################################################### diff --git a/01_INVENTORY/INVENTORY_00_R_functions.R b/01_INVENTORY/INVENTORY_00_R_functions.R index 6f567322c54fa84e9cdf689098a8872d9a139452..c932537b7078300f01936bba03d673ff988e7e84 100644 --- a/01_INVENTORY/INVENTORY_00_R_functions.R +++ b/01_INVENTORY/INVENTORY_00_R_functions.R @@ -7,8 +7,8 @@ ## should be documented in an Roxygen2 fashion in case corresponding ## *.Rd doc is generated in the future. ## -## Each block of functions define a collection used for a specific task. -## +## Each block of functions defines a collection used for a specific +## task. ## You should usually not source this file alone: it is sourced in the ## script that handles the REDCap dump. ###################################################################### @@ -17,133 +17,6 @@ -###################################################################### -### SUBSET POPULATION INVENTORY FOR OBSERVATIONAL STUDY -###################################################################### -#' Subset a total population to a target sample size stratified by age and sex. -#' -#' @param x A data.frame containing the population inventory in a wide format. It should be the output of \code{INVENTORY_02_list_all_inventory_participants.R}. -#' @param n_target A (named) vector containing the target sample size in a cluster. If named, the names are expected to exactly match the country codes in the data.frame \code{x}. In case \code{n_target} is unnamed and/or has length 1, its value is recycled for all countries in \code{x}. -#' @param n_backup A (named) vector containing the number of individuals to include in a backup list that will come in addition to the primary list. If named, the names are expected to exactly match the country codes in the data.frame \code{x}. In case \code{n_backup} is unnamed and/or has length 1, its value is recycled for all countries in \code{x}. -#' @return A random sample subset from \code{x} where individuals are representative of those from the whole population in terms of age-(categorical)-and-sex-structure, within each country and cluster. -get_target_list_for_observational_study <- function(x, n_target, n_backup) { - - # Harmonize structure of n_target and n_backup - if (length(n_target) == 1) { - n_target <- data.frame("target_n" = rep(n_target, times = length(unique(x$country))), - "country" = unique(x$country), - row.names = NULL) - } else if (length(n_target) != length(unique(x$country))) { - stop("n_target should have length 1 or as many elements as unique values of x$country.") - } else if (!any(names(n_target) %in% unique(x$country)) | !any(unique(x$country) %in% names(n_target))) { - stop("n_target names should match exactly the values of x$country.") - } else { - n_target <- data.frame("target_n" = n_target, - "country" = names(n_target), - row.names = NULL) - } - - if (length(n_backup) == 1) { - n_backup <- data.frame("backup_n" = rep(n_backup, times = length(unique(x$country))), - "country" = unique(x$country), - row.names = NULL) - } else if (length(n_backup) != length(unique(x$country))) { - stop("n_backup should have length 1 or as many elements as unique values of x$country.") - } else if (!any(names(n_backup) %in% unique(x$country)) | !any(unique(x$country) %in% names(n_backup))) { - stop("n_backup names should match exactly the values of x$country.") - } else { - n_backup <- data.frame("backup_n" = n_backup, - "country" = names(n_backup), - row.names = NULL) - } - - # Elaborate sampling plan - sampling_plan <- x %>% - count(country, clusterid, sex, agey_cat, name = "group_n") %>% - # Join number of individuals in target and backup lists (can be unveven by country) - left_join(n_target, by = "country") %>% - left_join(n_backup, by = "country") %>% - - # Target sample size in a stratified group is: - # - The group size scaled-down by its proportion if the cluster is large enough - # - The whole group if the cluster is too small for the total target sample size - group_by(country, clusterid) %>% - mutate(cluster_size_n = sum(group_n, na.rm = TRUE), - - cluster_target_n = min(c(cluster_size_n, target_n)), - nested_sampling_prob = ifelse(cluster_size_n <= target_n, 1, group_n / cluster_size_n), - nested_sampling_n = round(cluster_target_n * nested_sampling_prob, 0), - - cluster_backup_n = min(c(cluster_size_n, backup_n)), - nested_backup_sampling_n = round(cluster_backup_n * nested_sampling_prob, 0)) - - # The sampling plan is now applied to the population inventory list - observational_list_p_main <- mapply(get_group_subset, - "country_f" = sampling_plan$country, - "clusterid_f" = sampling_plan$clusterid, - "sex_f" = sampling_plan$sex, - "agey_cat_f" = sampling_plan$agey_cat, - "n_target" = sampling_plan$nested_sampling_n, - MoreArgs = list("x" = x), - SIMPLIFY = FALSE) - - # Order sampling list to appear grouped by house, for ease of use - observational_list_p_main <- bind_rows(observational_list_p_main) %>% - arrange(country, clusterid, hid, nested_hhid, nested_subjid) %>% - mutate(list_name = "main") - - # A backup list is generated after excluding people participants - # already sampled in the main list - observational_list_p_backup <- mapply(get_group_subset, - "country_f" = sampling_plan$country, - "clusterid_f" = sampling_plan$clusterid, - "sex_f" = sampling_plan$sex, - "agey_cat_f" = sampling_plan$agey_cat, - "n_target" = sampling_plan$nested_backup_sampling_n, - MoreArgs = list("x" = x %>% - anti_join(observational_list_p_main %>% - select(country, clusterid, hid, nested_hhid, nested_subjid))), - SIMPLIFY = FALSE) - - # Order sampling list to appear grouped by house, for ease of use - observational_list_p_backup <- bind_rows(observational_list_p_backup) %>% - arrange(country, clusterid, hid, nested_hhid, nested_subjid) %>% - mutate(list_name = "backup") - - # Return the full list - res <- observational_list_p_main %>% - bind_rows(observational_list_p_backup) - return(res) -} - - - - - -#' Randomly subset group to a given size -#' -#' @param x A data.frame containing the population inventory in a wide format. It should be the output of \code{INVENTORY_02_list_all_inventory_participants.R}. -#' @param country_f The country to restrict the data to. -#' @param clusterid_f The cluster ID to restrict the data to. -#' @param sex_f The sex value to restrict the data to. -#' @param agey_cat_f The age group to restrict the data to. -#' @param n_target The target sample size. -#' @return A subset of of the input data frame with \code{limit} rows each corresponding to a single individual. -get_group_subset <- function(x, country_f, clusterid_f, sex_f, agey_cat_f, n_target) { - res <- x %>% - filter(country == country_f & clusterid == clusterid_f & sex == sex_f & agey_cat == agey_cat_f) %>% - # Shuffle current table to break any possible tie between sequential order and geographic location - sample_n(n()) %>% - # Keep only target sample size - sample_n(min(c(n_target, n()))) - - return(as.data.frame(res)) -} - - - - - ###################################################################### ### FIX FOR ggmap::get_map ###################################################################### diff --git a/01_INVENTORY/INVENTORY_01_dump_REDCap_database.R b/01_INVENTORY/INVENTORY_01_dump_REDCap_database.R index 60797f004fd1dc8f8dd279a6d020082c07b9b5b6..ead0a7017b6e18131c16419bcb44b3931587b4b4 100644 --- a/01_INVENTORY/INVENTORY_01_dump_REDCap_database.R +++ b/01_INVENTORY/INVENTORY_01_dump_REDCap_database.R @@ -17,7 +17,6 @@ ### SOURCE THE R ENVIRONMENT ###################################################################### source("./01_INVENTORY/INVENTORY_00_R_environment.R") -source("./01_INVENTORY/INVENTORY_00_R_functions.R") diff --git a/01_INVENTORY/INVENTORY_05_generate_list_of_participants_for_observational_study.R b/01_INVENTORY/INVENTORY_05_generate_list_of_participants_for_observational_study.R index d032e135587af75bcc7926387c6454e3f564c643..add5761ba332511e40ff5468b5da683eb847eecd 100644 --- a/01_INVENTORY/INVENTORY_05_generate_list_of_participants_for_observational_study.R +++ b/01_INVENTORY/INVENTORY_05_generate_list_of_participants_for_observational_study.R @@ -32,10 +32,10 @@ set.seed(seed = SEED) ###################################################################### ### GENERATE SAMPLING LIST ###################################################################### -observational_list_p <- get_target_list_for_observational_study(x = inventory_list_p, - n_target = c("Ethiopia" = 220, - "Madagascar" = 110), - n_backup = 50) +observational_list_p <- get_target_list_for_study(x = inventory_list_p, + n_target = c("Ethiopia" = 220, + "Madagascar" = 110), + n_backup = 50) diff --git a/02_OBSERVATIONAL/OBSERVATIONAL_00_R_environment.R b/02_OBSERVATIONAL/OBSERVATIONAL_00_R_environment.R index 4ebfe7e8d93403202765fd3287541bf2aa0a31a5..e2701bc73638300e99b7844955efefb5f6f5b380 100644 --- a/02_OBSERVATIONAL/OBSERVATIONAL_00_R_environment.R +++ b/02_OBSERVATIONAL/OBSERVATIONAL_00_R_environment.R @@ -27,6 +27,15 @@ require(tidyverse) # Data manipulation made easy +###################################################################### +### CUSTOM R FUNCTIONS +###################################################################### +source("./00_COMMON/COMMON_00_R_functions.R.R") + + + + + ###################################################################### ### SOURCE .env FILE ###################################################################### diff --git a/03_INTERVENTIONAL_M0/INTERVENTIONAL_M0_00_R_environment.R b/03_INTERVENTIONAL_M0/INTERVENTIONAL_M0_00_R_environment.R index 44cfb4cb80e6faabc7d9af888b7a085efce5806e..bfb8285e6270abe7beb53bb1eaaa85731f3e3183 100644 --- a/03_INTERVENTIONAL_M0/INTERVENTIONAL_M0_00_R_environment.R +++ b/03_INTERVENTIONAL_M0/INTERVENTIONAL_M0_00_R_environment.R @@ -27,6 +27,16 @@ require(tidyverse) # Data manipulation made easy +###################################################################### +### CUSTOM R FUNCTIONS +###################################################################### +source("./00_COMMON/COMMON_00_R_functions.R.R") +source("./03_INTERVENTIONAL_M0/INTERVENTIONAL_M0_00_R_functions.R") + + + + + ###################################################################### ### SOURCE .env FILE ###################################################################### diff --git a/03_INTERVENTIONAL_M0/INTERVENTIONAL_M0_00_R_functions.R b/03_INTERVENTIONAL_M0/INTERVENTIONAL_M0_00_R_functions.R new file mode 100644 index 0000000000000000000000000000000000000000..b061ccf2611d92b91af23c45ecf8c525679520d1 --- /dev/null +++ b/03_INTERVENTIONAL_M0/INTERVENTIONAL_M0_00_R_functions.R @@ -0,0 +1,51 @@ +## INTERVENTIONAL_M0_00_R_functions.R +## Date : 2025/05/19 +## Author : Thomas Obadia +## +## Compilation of R functions developed for the 03_INTERVENTIONAL project, +## that will be loaded for use by our analysis pipelines. The functions +## should be documented in an Roxygen2 fashion in case corresponding +## *.Rd doc is generated in the future. +## +## Each block of functions define a collection used for a specific task. +## +## You should usually not source this file alone: it is sourced in the +## script that handles the REDCap dump. +###################################################################### + + + + + +#' Propagate unique data from non-repeated forms +#' +#' When longitudinal data is collected, some REDCap instruments may only be +#' prompted at baseline and not repeated at follow-ups (e.g., demographics). +#' This function will propagate these measurements to all subsequent events, +#' in case if they were not collected again. +#' +#' Note that this function assumes the record_id field is indeed named 'record_id'. +#' +#' @param x A REDCap data dump. +#' @param metadata The metadata from the the `rcon` object that served to get the data in `x`. +#' @param events.from A character vector indicating for which events the initial value should be propagated. +#' @return A data.frame with dimensions identical to `x` where the first value from certain instruments is propagated to all events. +#' @export +propagate_unique_data <- function(x, + metadata, + events.from) { + + res <- x %>% + group_by(record_id) %>% + # Use 'starts_with()' to also match checkboxes, not matched with 'any_of()' + mutate(across(starts_with(metadata %>% + filter(form_name %in% events.from) %>% + pull(field_name)), + function(x) { + if (length(na.omit(x)) == 1) {x[which(!is.na(x))]} else {NA} + })) %>% + ungroup + + ## Return output + return(res) +}