From 3b4b5a1612e935e5e194adb739f786434c40ec83 Mon Sep 17 00:00:00 2001
From: Thomas  OBADIA <thomas.obadia@pasteur.fr>
Date: Fri, 31 Jan 2025 11:08:09 +0100
Subject: [PATCH] Add QC checks for data consistency comparing INV and OBS
 datasets.

---
 ...servational_and_inventory_data_integrity.R | 43 +++++++++++++++++--
 PvSTATEM R Pipeline.Rproj                     |  1 +
 2 files changed, 40 insertions(+), 4 deletions(-)

diff --git a/02_OBSERVATIONAL/OBSERVATIONAL_03_QC_02_merging_observational_and_inventory_data_integrity.R b/02_OBSERVATIONAL/OBSERVATIONAL_03_QC_02_merging_observational_and_inventory_data_integrity.R
index 39d1d53..55519f7 100644
--- a/02_OBSERVATIONAL/OBSERVATIONAL_03_QC_02_merging_observational_and_inventory_data_integrity.R
+++ b/02_OBSERVATIONAL/OBSERVATIONAL_03_QC_02_merging_observational_and_inventory_data_integrity.R
@@ -134,8 +134,31 @@ print(OBSERVATIONAL_OUT_03_QC_02_PLOT_SEX_01)
 ######################################################################
 ### DEMOGRAPHICS CONSISTENCY: AGE / PLOT
 ######################################################################
+## Contingency table
+OBSERVATIONAL_OUT_03_QC_02_PLOT_AGEY_01 <- ggplot(dat_observational_inventory_demographics_fulljoin %>% 
+                                                   count(country, 
+                                                         agey_obs_inv_match), 
+                                                 aes(x = agey_obs_inv_match, y = country)) + 
+  geom_tile(aes(fill = agey_obs_inv_match), 
+            color       = "#000000FF", 
+            show.legend = FALSE) + 
+  geom_text(aes(label = n)) + 
+  
+  labs(x    = "Agreement between INVENTORY and OBSERVATIONAL phase", 
+       y    = "Country") + 
+  
+  scale_fill_manual(values   = c("TRUE"  = "#00FF007F", 
+                                 "FALSE" = "#FF00007F"), 
+                    na.value = "#D3D3D37F", 
+                    guide = "none") + 
+  
+  theme_bw() + 
+  theme(legend.position = "bottom")
+
+print(OBSERVATIONAL_OUT_03_QC_02_PLOT_AGEY_01)
+
 ## Scatterplot
-OBSERVATIONAL_OUT_03_QC_02_PLOT_AGEY_01 <- ggplot(dat_observational_inventory_demographics_fulljoin, 
+OBSERVATIONAL_OUT_03_QC_02_PLOT_AGEY_02 <- ggplot(dat_observational_inventory_demographics_fulljoin, 
                                                   aes(x = agey.inv, y = agey.obs)) + 
   geom_point(aes(color = agey_obs_inv_match)) + 
   
@@ -153,10 +176,10 @@ OBSERVATIONAL_OUT_03_QC_02_PLOT_AGEY_01 <- ggplot(dat_observational_inventory_de
   theme_bw() + 
   theme(legend.position = "bottom")
 
-print(OBSERVATIONAL_OUT_03_QC_02_PLOT_AGEY_01)
+print(OBSERVATIONAL_OUT_03_QC_02_PLOT_AGEY_02)
 
 ## Histogram
-OBSERVATIONAL_OUT_03_QC_02_PLOT_AGEY_02 <- ggplot(dat_observational_inventory_demographics_fulljoin, 
+OBSERVATIONAL_OUT_03_QC_02_PLOT_AGEY_03 <- ggplot(dat_observational_inventory_demographics_fulljoin, 
                                                   aes(x = agey.obs - agey.inv)) + 
   geom_histogram(aes(fill = agey_obs_inv_match), binwidth = 1) + 
   geom_vline(xintercept = ACCEPTABLE_DIFF_AGEY_OBS_MINUS_INV_MAX, 
@@ -182,7 +205,7 @@ OBSERVATIONAL_OUT_03_QC_02_PLOT_AGEY_02 <- ggplot(dat_observational_inventory_de
   theme_bw() + 
   theme(legend.position = "bottom")
 
-print(OBSERVATIONAL_OUT_03_QC_02_PLOT_AGEY_02)
+print(OBSERVATIONAL_OUT_03_QC_02_PLOT_AGEY_03)
 
 
 
@@ -219,6 +242,12 @@ OBSERVATIONAL_OUT_03_QC_02_PLOTNAME_AGEY_02 <- paste0("OBSERVATIONAL_OUT_03_QC_0
                                                       "_timestamp-", 
                                                       strftime(Sys.time(), format = "%Y%m%d_%H%M%S"), 
                                                       ".pdf")
+OBSERVATIONAL_OUT_03_QC_02_PLOTNAME_AGEY_03 <- paste0("OBSERVATIONAL_OUT_03_QC_02_AGEY_PLOT_03", 
+                                                      "_country-", 
+                                                      paste(unique(dat_observational_inventory_demographics_fulljoin$country), collapse = "-"), 
+                                                      "_timestamp-", 
+                                                      strftime(Sys.time(), format = "%Y%m%d_%H%M%S"), 
+                                                      ".pdf")
 
 ## Write to output files
 # The graphs
@@ -240,6 +269,12 @@ ggsave(filename = OBSERVATIONAL_OUT_03_QC_02_PLOTNAME_AGEY_02,
        width    = 8, 
        height   = 8)
 
+ggsave(filename = OBSERVATIONAL_OUT_03_QC_02_PLOTNAME_AGEY_03, 
+       path     = "02_OBSERVATIONAL/outputs/", 
+       plot     = OBSERVATIONAL_OUT_03_QC_02_PLOT_AGEY_03, 
+       width    = 8, 
+       height   = 8)
+
 # The data
 write.table(dat_observational_inventory_demographics_fulljoin, 
             file      = paste0("./02_OBSERVATIONAL/outputs/", 
diff --git a/PvSTATEM R Pipeline.Rproj b/PvSTATEM R Pipeline.Rproj
index 8e3c2eb..e9f1a1b 100644
--- a/PvSTATEM R Pipeline.Rproj	
+++ b/PvSTATEM R Pipeline.Rproj	
@@ -1,4 +1,5 @@
 Version: 1.0
+ProjectId: 37109f91-1530-42f3-b3fe-656347abd584
 
 RestoreWorkspace: Default
 SaveWorkspace: Default
-- 
GitLab