diff --git a/02_OBSERVATIONAL/OBSERVATIONAL_03_QC_01_curated_data_integrity.R b/02_OBSERVATIONAL/OBSERVATIONAL_03_QC_01_curated_data_integrity.R
index f3b7fa1ccb5550dd0b419602b30626e0bf7acadc..a45d4ee6d45105a86a3af17cf1d43d8075e64e71 100644
--- a/02_OBSERVATIONAL/OBSERVATIONAL_03_QC_01_curated_data_integrity.R
+++ b/02_OBSERVATIONAL/OBSERVATIONAL_03_QC_01_curated_data_integrity.R
@@ -1,4 +1,4 @@
-## OBSERVATIONAL_03_QC_curated_data.R
+## OBSERVATIONAL_03_QC_01_curated_data_integrity.R
 ## Date     : 2024/12/04
 ## Author   : Thomas Obadia
 ##
@@ -17,4 +17,270 @@
 ######################################################################
 ### SOURCE THE DATA
 ######################################################################
-source("./02_OBSERVATIONAL/OBSERVATIONAL_02_curate_REDCap_raw_data.R")
\ No newline at end of file
+source("./02_OBSERVATIONAL/OBSERVATIONAL_02_curate_REDCap_raw_data.R")
+
+
+
+
+
+######################################################################
+### QC RULES
+######################################################################
+## List of variables involved in QC investigations
+## Note that some variables are repeated, because they are used for 
+## multiple QC rules.
+QC_VARS <- c(
+  # Unique REDCap identifier
+  "record_id", 
+  
+  # ScreeningID
+  "screeningid", 
+  
+  # CensusID
+  "censusid", 
+  
+  # HouseID
+  "censusid", "hid", 
+  
+  # SubjectID
+  "uniqueid", "uniqueid2", 
+  
+  # Lab SampleID
+  "lb_samplecode", "uniqueid", "uniqueid2"
+)
+
+## Calculate QC rules triggers
+dat_observational_curated_QC <- dat_observational_curated %>% 
+  ## ScreeningID
+  # Validity of ScreeningID
+  # (Need to account for ill-formed record_id in Madagascar as it did 
+  # not use auto numbering)
+  mutate(QC_screeningid_is_valid = (grepl(pattern = "^(?:E|M)-SCR-(?:0000[1-9]|000[1-9][0-9]|00[1-9][0-9]{2}|0[1-9][0-9]{3}|[1-9][0-9]{4})$", 
+                                          x       = screeningid) | 
+                                      grepl(pattern = "^(?:E|M)-SCR-(?:E|M)-(?:0[1-9]|[1-2][0-9]|3[0-6])-H(?:00[1-9]|0[1-9][0-9]|1[0-9]{2}|200)-0[1-5]-(?:0[1-9]|1[0-5])$", 
+                                            x       = screeningid))) %>% 
+  
+  # ScreeningID is unique
+  group_by(screeningid) %>% 
+  mutate(QC_screeningid_is_unique = case_when(is.na(screeningid)  ~ TRUE, 
+                                              !is.na(screeningid) ~ (n() == 1), 
+                                              .default = NA)) %>% 
+  ungroup() %>% 
+  
+  ## CensusID
+  # Validity of CensusID
+  mutate(QC_censusid_is_valid = grepl(pattern = "^(?:E|M)-(0[1-9]|[1-2][0-9]|3[0-6])-H(00[1-9]|0[1-9][0-9]|1[0-9]{2}|200)-0[1-5]-(0[1-9]|1[0-5])$", 
+                                      x       = censusid)) %>% 
+  
+  # CensusID is unique
+  group_by(censusid) %>% 
+  mutate(QC_censusid_is_unique = case_when(is.na(censusid)  ~ TRUE, 
+                                           !is.na(censusid) ~ (n() == 1), 
+                                           .default = NA)) %>% 
+  ungroup() %>% 
+  
+  ## HouseID
+  # HouseID is valid
+  mutate(QC_hid_is_valid = case_when(is.na(hid)  ~ TRUE, 
+                                     !is.na(hid) ~ grepl(pattern = "H?(00[1-9]|0[1-9][0-9]|1[0-9]{2}|200)", 
+                                                         x       = hid), 
+                                     .default = NA)) %>% 
+  
+  # HouseID nested in censusid is identical to hid
+  separate_wider_regex(col         = censusid, 
+                       patterns    = c(".*", 
+                                       "H", 
+                                       censusid_hid = "\\d{3}", 
+                                       ".*"), 
+                       cols_remove = FALSE) %>% 
+  mutate(QC_censusid_hid_is_identical_to_hid = case_when(is.na(censusid_hid) & is.na(hid)   ~ TRUE, 
+                                                         is.na(censusid_hid) & !is.na(hid)  ~ FALSE, 
+                                                         !is.na(censusid_hid) & is.na(hid)  ~ FALSE, 
+                                                         !is.na(censusid_hid) & !is.na(hid) ~ (censusid_hid == hid), 
+                                                         .default = NA)) %>%
+  select(-censusid_hid) %>% 
+  
+  ## SubjectID
+  # Unique SubjectID matches across the two corresponding fields
+  mutate(QC_uniqueid_is_identical_to_uniqueid2 = case_when(is.na(uniqueid) &  is.na(uniqueid2)  ~ TRUE, 
+                                                           is.na(uniqueid) & !is.na(uniqueid2)  ~ FALSE, 
+                                                           !is.na(uniqueid) &  is.na(uniqueid2) ~ FALSE, 
+                                                           !is.na(uniqueid) & !is.na(uniqueid2) ~ (uniqueid == uniqueid2), 
+                                                           .default = NA)) %>% 
+  # Unique SubjectID is valid
+  mutate(QC_uniqueid_is_valid = case_when(is.na(uniqueid)  ~ TRUE, 
+                                          !is.na(uniqueid) ~ grepl(pattern = "^(?:E|M)-(?:0[1-9]|[1-2][0-9]|3[0-6])-(?:00[1-9]|0[1-9][0-9]|[1-2][0-9]{2}|300)$", 
+                                                                   x       = uniqueid), 
+                                          .default = NA)) %>% 
+  mutate(QC_uniqueid2_is_valid = case_when(is.na(uniqueid2)  ~ TRUE, 
+                                           !is.na(uniqueid2) ~ grepl(pattern = "^(?:E|M)-(?:0[1-9]|[1-2][0-9]|3[0-6])-(?:00[1-9]|0[1-9][0-9]|[1-2][0-9]{2}|300)$", 
+                                                                     x       = uniqueid2), 
+                                           .default = NA)) %>% 
+  
+  # Unique SubjectID is actually unique
+  group_by(uniqueid) %>% 
+  mutate(QC_uniqueid_is_unique = case_when(is.na(uniqueid)  ~ TRUE, 
+                                           !is.na(uniqueid) ~ (n() == 1), 
+                                           .default = NA)) %>% 
+  ungroup() %>% 
+  group_by(uniqueid2) %>% 
+  mutate(QC_uniqueid2_is_unique = case_when(is.na(uniqueid2)  ~ TRUE, 
+                                            !is.na(uniqueid2) ~ (n() == 1), 
+                                            .default = NA)) %>% 
+  ungroup() %>% 
+  
+  ## Lam SampleID
+  # Lab SampleID is valid
+  mutate(QC_lb_samplecode_is_unique = case_when(is.na(lb_samplecode) ~ TRUE, 
+                                                !is.na(lb_samplecode) ~ grepl(pattern = "^(?:E|M)-(?:0[1-9]|[1-2][0-9]|3[0-6])-(?:00[1-9]|0[1-9][0-9]|[1-2][0-9]{2}|300)-OB$", 
+                                                                              x       = lb_samplecode), 
+                                                .default = NA)) %>% 
+  
+  # Lab SampleID is unique
+  group_by(lb_samplecode) %>% 
+  mutate(QC_lb_samplecode_is_unique = case_when(is.na(lb_samplecode) ~ TRUE, 
+                                                !is.na(lb_samplecode) ~ (n() == 1), 
+                                                .default = NA)) %>% 
+  ungroup() %>% 
+  
+  # Lab SampleID matches SubjectID
+  mutate(QC_lb_samplecode_matches_uniqueid = case_when(is.na(lb_samplecode) & is.na(uniqueid)   ~ TRUE, 
+                                                       is.na(lb_samplecode) & !is.na(uniqueid)  ~ FALSE, 
+                                                       !is.na(lb_samplecode) & is.na(uniqueid)  ~ FALSE, 
+                                                       !is.na(lb_samplecode) & !is.na(uniqueid) ~ gsub(pattern     = "-OB", 
+                                                                                                       replacement = "", 
+                                                                                                       x           = lb_samplecode) == uniqueid, 
+                                                       .default = NA), 
+         QC_lb_samplecode_matches_uniqueid2 = case_when(is.na(lb_samplecode) & is.na(uniqueid2)   ~ TRUE, 
+                                                        is.na(lb_samplecode) & !is.na(uniqueid2)  ~ FALSE, 
+                                                        !is.na(lb_samplecode) & is.na(uniqueid2)  ~ FALSE, 
+                                                        !is.na(lb_samplecode) & !is.na(uniqueid2) ~ gsub(pattern     = "-OB", 
+                                                                                                         replacement = "", 
+                                                                                                         x           = lb_samplecode) == uniqueid2, 
+                                                        .default = NA)) %>% 
+  ## General flag for any QC rule triggered
+  mutate(QC_any_rule_is_FALSE = !if_all(starts_with("QC_"))) %>% 
+  select(record_id, 
+         QC_any_rule_is_FALSE, starts_with("QC_"), 
+         everything())
+
+## Summary of queries
+# Full list
+observational_curated_QC_summary_full <- dat_observational_curated_QC %>% 
+  select(country, starts_with("QC_")) %>% 
+  pivot_longer(cols = starts_with("QC_")) %>% 
+  count(country, name, value) %>% 
+  pivot_wider(names_from  = name, 
+              values_from = n, 
+              values_fill = 0)
+
+# Restricted to participants consenting to study
+observational_curated_QC_summary_consentonly <- dat_observational_curated_QC %>% 
+  filter(consent %in% c("Yes", "Oui")) %>% 
+  select(country, starts_with("QC_")) %>% 
+  pivot_longer(cols = starts_with("QC_")) %>% 
+  count(country, name, value) %>% 
+  pivot_wider(names_from  = name, 
+              values_from = n, 
+              values_fill = 0)
+
+
+
+
+
+######################################################################
+### WRITE QC DATA TO OUTPUT DIRECTORY
+######################################################################
+## Name of output files
+# SUMMARY files contain *summary counts* of QC rules
+OBSERVATIONAL_OUT_03_QC_01_FILENAME_SUMMARY_FULL <- paste0("OBSERVATIONAL_OUT_03_QC_01_SUMMARY_FULL", 
+                                                           "_country-", 
+                                                           paste(unique(dat_observational_curated_QC$country), collapse = "-"), 
+                                                           "_timestamp-", 
+                                                           strftime(Sys.time(), format = "%Y%m%d_%H%M%S"), 
+                                                           ".csv")
+OBSERVATIONAL_OUT_03_QC_01_FILENAME_SUMMARY_CONSENTONLY <- paste0("OBSERVATIONAL_OUT_03_QC_01_SUMMARY_CONSENTONLY", 
+                                                                  "_country-", 
+                                                                  paste(unique(dat_observational_curated_QC$country), collapse = "-"), 
+                                                                  "_timestamp-", 
+                                                                  strftime(Sys.time(), format = "%Y%m%d_%H%M%S"), 
+                                                                  ".csv")
+
+# CONTENT files contain the corresponding *participant-level database content*
+OBSERVATIONAL_OUT_03_QC_01_FILENAME_CONTENT_FULL <- paste0("OBSERVATIONAL_OUT_03_QC_01_CONTENT_FULL", 
+                                                           "_country-", 
+                                                           paste(unique(dat_observational_curated_QC$country), collapse = "-"), 
+                                                           "_timestamp-", 
+                                                           strftime(Sys.time(), format = "%Y%m%d_%H%M%S"), 
+                                                           ".csv")
+OBSERVATIONAL_OUT_03_QC_01_FILENAME_CONTENT_CONSENTONLY <- paste0("OBSERVATIONAL_OUT_03_QC_01_CONTENT_CONSENTONLY", 
+                                                                  "_country-", 
+                                                                  paste(unique(dat_observational_curated_QC$country), collapse = "-"), 
+                                                                  "_timestamp-", 
+                                                                  strftime(Sys.time(), format = "%Y%m%d_%H%M%S"), 
+                                                                  ".csv")
+
+## Write to output files
+# Summary tables
+write.table(observational_curated_QC_summary_full, 
+            file      = paste0("./02_OBSERVATIONAL/outputs/", 
+                               OBSERVATIONAL_OUT_03_QC_01_FILENAME_SUMMARY_FULL), 
+            sep       = ",", 
+            dec       = ".", 
+            quote     = TRUE, 
+            col.names = TRUE, 
+            row.names = FALSE)
+write.table(observational_curated_QC_summary_consentonly, 
+            file      = paste0("./02_OBSERVATIONAL/outputs/", 
+                               OBSERVATIONAL_OUT_03_QC_01_FILENAME_SUMMARY_CONSENTONLY), 
+            sep       = ",", 
+            dec       = ".", 
+            quote     = TRUE, 
+            col.names = TRUE, 
+            row.names = FALSE)
+
+# Content tables
+write.table(dat_observational_curated_QC, 
+            file      = paste0("./02_OBSERVATIONAL/outputs/", 
+                               OBSERVATIONAL_OUT_03_QC_01_FILENAME_CONTENT_FULL), 
+            sep       = ",", 
+            dec       = ".", 
+            quote     = TRUE, 
+            col.names = TRUE, 
+            row.names = FALSE)
+
+write.table(dat_observational_curated_QC %>% 
+              filter(consent %in% c("Yes", "Oui")) %>% 
+              select(record_id, all_of(QC_VARS), 
+                     QC_any_rule_is_FALSE, starts_with("QC_")), 
+            file      = paste0("./02_OBSERVATIONAL/outputs/", 
+                               OBSERVATIONAL_OUT_03_QC_01_FILENAME_CONTENT_CONSENTONLY), 
+            sep       = ",", 
+            dec       = ".", 
+            quote     = TRUE, 
+            col.names = TRUE, 
+            row.names = FALSE)
+
+
+
+
+
+######################################################################
+### CLEANUP
+######################################################################
+rm(QC_VARS, 
+   OBSERVATIONAL_OUT_03_QC_01_FILENAME_SUMMARY_FULL, 
+   OBSERVATIONAL_OUT_03_QC_01_FILENAME_SUMMARY_CONSENTONLY, 
+   OBSERVATIONAL_OUT_03_QC_01_FILENAME_CONTENT_FULL, 
+   OBSERVATIONAL_OUT_03_QC_01_FILENAME_CONTENT_CONSENTONLY)
+
+
+
+
+
+######################################################################
+### UPDATE DATA_EXTRACT_IS_RECENT_OBS
+######################################################################
+DATA_EXTRACT_IS_RECENT_OBS    <- as.logical(difftime(time1 = Sys.Date(), 
+                                                     time2 = as.Date(ifelse(exists("DATA_EXTRACT_TS_OBS"), DATA_EXTRACT_TS_OBS, DATA_EXTRACT_TS_DEFAULT)), 
+                                                     units = "days") <= DATA_EXTRACT_EXPIRY_TIME_D)