From 20a22e1e93eddb69a829fff3620ff7f00835718b Mon Sep 17 00:00:00 2001
From: Thomas  OBADIA <tobadia@ml24-0166.corp.pasteur.fr>
Date: Tue, 12 Nov 2024 17:08:44 +0100
Subject: [PATCH] Following discovery of inconsistencies in REDCap automatic
 calculations with datediff, implement a curation pipeline as step 02 in the
 02_OBSERVATIONAL project. The current curation is for fixing the agey
 calculation

---
 .../OBSERVATIONAL_00_R_environment.R          | 13 +++
 .../OBSERVATIONAL_02_curate_REDCap_raw_data.R | 80 +++++++++++++++++++
 ...SERVATIONAL_03_merge_inventory_metadata.R} |  0
 3 files changed, 93 insertions(+)
 create mode 100644 02_OBSERVATIONAL/OBSERVATIONAL_02_curate_REDCap_raw_data.R
 rename 02_OBSERVATIONAL/{OBSERVATIONAL_02_merge_inventory_metadata.R => OBSERVATIONAL_03_merge_inventory_metadata.R} (100%)

diff --git a/02_OBSERVATIONAL/OBSERVATIONAL_00_R_environment.R b/02_OBSERVATIONAL/OBSERVATIONAL_00_R_environment.R
index 59bdeb3..ef3c615 100644
--- a/02_OBSERVATIONAL/OBSERVATIONAL_00_R_environment.R
+++ b/02_OBSERVATIONAL/OBSERVATIONAL_00_R_environment.R
@@ -77,3 +77,16 @@ DATA_EXTRACT_EXPIRY_TIME_D <- 1
 ######################################################################
 ## A global seed value to be used by set.seed() calls
 SEED <- 12345
+
+
+
+
+
+######################################################################
+### CURATION
+######################################################################
+## Global flags that define whether a given curation rule should be 
+## applied to the raw data dumped from REDCap or not.
+#    - TRUE : curation rule will be enforced
+#    - FALSE: curation rule will not be enforced (default)
+CURATE_DATA_RECALCULATE_AGEY <- TRUE # agey recalculation rule; NOTE(review): enabled here although FALSE is documented as the default
diff --git a/02_OBSERVATIONAL/OBSERVATIONAL_02_curate_REDCap_raw_data.R b/02_OBSERVATIONAL/OBSERVATIONAL_02_curate_REDCap_raw_data.R
new file mode 100644
index 0000000..3100c4b
--- /dev/null
+++ b/02_OBSERVATIONAL/OBSERVATIONAL_02_curate_REDCap_raw_data.R
@@ -0,0 +1,80 @@
+## OBSERVATIONAL_02_curate_REDCap_raw_data.R
+## Date     : 2024/11/12
+## Author   : Thomas Obadia
+##
+## This script processes the raw data dump from 
+## OBSERVATIONAL_01_dump_REDCap_database.R and applies a series of 
+## curation rules. 
+## This particular script was developed after we realized some fields
+## automatically calculated by REDCap may yield inconsistent results
+## as a result of bugs that are fixed by the REDCap development team.
+## The primary example is the agey variable, for which calculated 
+## values are inconsistent depending on whether a true DoB is
+## available or the DoB is estimated.
+######################################################################
+
+
+
+
+
+######################################################################
+### SOURCE THE R ENVIRONMENT
+######################################################################
+source("./02_OBSERVATIONAL/OBSERVATIONAL_01_dump_REDCap_database.R") # runs step 01, expected to provide dat_observational_raw; presumably it also loads the 00 environment - verify
+
+
+
+
+
+######################################################################
+### APPLY CURATION RULES
+######################################################################
+## Build the curated dataset from the raw dataset generated in OBSERVATIONAL_01_dump_REDCap_database.R
+dat_observational_curated <- dat_observational_raw %>% 
+  # Only if CURATE_DATA_RECALCULATE_AGEY is TRUE, recalculate agey as round((visdat - DoB) in days / 365.25): DoB is dob when dob_yn is 1/"Yes" and dob is not NA, or calc_dob_from_myob (parsed as %d-%m-%Y) when dob_yn is 0/"No" or dob is NA
+  mutate(agey = case_when(CURATE_DATA_RECALCULATE_AGEY & (dob_yn %in% c(1, "Yes") & !is.na(dob)) ~ round(x      = as.numeric(difftime(time1 = visdat, 
+                                                                                                                                      time2 = dob, 
+                                                                                                                                      units = "days")) / 365.25, 
+                                                                                                         digits = 0), 
+                          CURATE_DATA_RECALCULATE_AGEY & (dob_yn %in% c(0, "No") | is.na(dob)) ~ round(x      = as.numeric(difftime(time1 = visdat, 
+                                                                                                                                    time2 = as.Date(x      = calc_dob_from_myob, 
+                                                                                                                                                    format = "%d-%m-%Y"), 
+                                                                                                                                    units = "days")) / 365.25, 
+                                                                                                       digits = 0), 
+                          .default = agey)) # rows matching neither rule (or all rows when the flag is FALSE) keep their existing agey
+
+
+
+
+
+######################################################################
+### WRITE CURATED DATA TO OUTPUT DIRECTORY
+######################################################################
+## Name of output file: fixed prefix, dash-separated list of the countries present in the data, and the current timestamp
+OBSERVATIONAL_OUT_02_FILENAME <- paste0("OBSERVATIONAL_OUT_02_raw-curated_data", 
+                                        "_country-", 
+                                        paste(unique(dat_observational_curated$country), collapse = "-"), 
+                                        "_timestamp-", 
+                                        strftime(Sys.time(), format = "%Y%m%d_%H%M%S"), 
+                                        ".csv")
+
+## Write the curated dataset (not the raw one) to the output file as a comma-separated table
+write.table(dat_observational_curated, 
+            file      = paste0("./02_OBSERVATIONAL/outputs/", 
+                               OBSERVATIONAL_OUT_02_FILENAME), 
+            sep       = ",", 
+            dec       = ".", 
+            quote     = TRUE, 
+            col.names = TRUE, 
+            row.names = FALSE)
+
+
+
+
+
+######################################################################
+### UPDATE DATA_EXTRACT_IS_RECENT_OBS
+######################################################################
+DATA_EXTRACT_IS_RECENT_OBS    <- as.logical(difftime(time1 = Sys.Date(), # TRUE when the extract timestamp is at most DATA_EXTRACT_EXPIRY_TIME_D days old
+                                                     time2 = as.Date(if (exists("DATA_EXTRACT_TS_OBS")) DATA_EXTRACT_TS_OBS else DATA_EXTRACT_TS_DEFAULT), # scalar if/else keeps the timestamp's class, which ifelse() would strip
+                                                     units = "days") <= DATA_EXTRACT_EXPIRY_TIME_D)
diff --git a/02_OBSERVATIONAL/OBSERVATIONAL_02_merge_inventory_metadata.R b/02_OBSERVATIONAL/OBSERVATIONAL_03_merge_inventory_metadata.R
similarity index 100%
rename from 02_OBSERVATIONAL/OBSERVATIONAL_02_merge_inventory_metadata.R
rename to 02_OBSERVATIONAL/OBSERVATIONAL_03_merge_inventory_metadata.R
-- 
GitLab