From 87f4ef0b766701999bfd900c33a385253f0c7eb9 Mon Sep 17 00:00:00 2001
From: "Jean-Paul R. Soucy" <jeanpaul.r.soucy@gmail.com>
Date: Mon, 14 Aug 2023 16:50:33 -0400
Subject: [PATCH] Update MB report script (#95)

- Update MB report script to reflect new "season" and removal of health region data table
- Health region data must now be manually estimated from figure 3 of the report
---
 utils/report_mb.R | 37 +++++++++++++++----------------------
 1 file changed, 15 insertions(+), 22 deletions(-)

diff --git a/utils/report_mb.R b/utils/report_mb.R
index 2537fd741..4bbd82d6a 100644
--- a/utils/report_mb.R
+++ b/utils/report_mb.R
@@ -27,7 +27,6 @@ ds <- rvest::read_html(url)
 tb_cases <- ds %>% rvest::html_elements(".tgN") %>% {.[grep("tb_cases", .)]} %>% rvest::html_table() %>% dplyr::nth(1)
 tb_severity <- ds %>% rvest::html_elements(".tgN") %>% {.[grep("tb_severity", .)]} %>% rvest::html_table() %>% dplyr::nth(1)
 tb_testing <- ds %>% rvest::html_elements(".tgN") %>% {.[grep("tb_testing", .)]} %>% rvest::html_table() %>% dplyr::nth(1)
-tb_hr <- ds %>% rvest::html_table() %>% {.[grep("Health Region|Helath region", .)]} %>% dplyr::nth(1)
 
 # function: extract number from specific substring in specific row of an HTML table
 extract_from_tab <- function(tab, row_n, string, parse_num = TRUE) {
@@ -54,28 +53,26 @@ mb <- dplyr::bind_rows(
     sub_region_1 = NA,
     cases = NA, # calculated by formula
     cases_weekly = extract_from_tab(tb_cases, 2, "(?<=Cases this week: )\\d*", parse_num = FALSE),
-    `cumulative_cases_since_2022-07-03` = extract_from_tab(tb_cases, 2, "(?<=Total cases: )\\d*", parse_num = FALSE),
-    `cumulative_cases_since_2022-07-03_weekly_diff` = NA, # calculated by formula
+    `cumulative_cases_since_2022-07-03` = NA,
+    `cumulative_cases_since_2022-07-03_weekly_diff` = NA,
+    `cumulative_cases_since_2023-07-02` = extract_from_tab(tb_cases, 2, "(?<=Total cases: )\\d*", parse_num = FALSE),
+    `cumulative_cases_since_2023-07-02_weekly_diff` = NA, # calculated by formula
     deaths_weekly = extract_from_tab(tb_severity, 2, "(?<=Severe outcomes this week.{0,1000}Deaths: ).{1,5}(?!\r)", parse_num = FALSE),
-    `cumulative_deaths_since_2022-07-03` = extract_from_tab(tb_severity, 2, "(?<=Total severe outcomes.{0,1000}Deaths: ).{1,5}(?!\r)", parse_num = FALSE),
-    `cumulative_hospitalizations_since_2022-07-03` = extract_from_tab(tb_severity, 2, "(?<=Total severe outcomes.{0,1000}Hospital admissions: ).{1,5}(?!\r)", parse_num = FALSE),
+    `cumulative_deaths_since_2022-07-03` = NA,
+    `cumulative_deaths_since_2023-07-02` = extract_from_tab(tb_severity, 2, "(?<=Total severe outcomes.{0,1000}Deaths: ).{1,5}(?!\r)", parse_num = FALSE),
+    `cumulative_hospitalizations_since_2022-07-03` = NA,
+    `cumulative_hospitalizations_since_2023-07-02` = extract_from_tab(tb_severity, 2, "(?<=Total severe outcomes.{0,1000}Hospital admissions: ).{1,5}(?!\r)", parse_num = FALSE),
     new_hospitalizations = extract_from_tab(tb_severity, 2, "(?<=Severe outcomes this week.{0,1000}Hospital admissions: ).{1,5}(?!\r)", parse_num = FALSE),
-    `cumulative_icu_since_2022-07-03` = extract_from_tab(tb_severity, 2, "(?<=Total severe outcomes.{0,1000}ICU admissions: ).{1,5}(?!\r)", parse_num = FALSE),
+    `cumulative_icu_since_2022-07-03` = NA,
+    `cumulative_icu_since_2023-07-02` = extract_from_tab(tb_severity, 2, "(?<=Total severe outcomes.{0,1000}ICU admissions: ).{1,5}(?!\r)", parse_num = FALSE),
     new_icu = extract_from_tab(tb_severity, 2, "(?<=Severe outcomes this week.{0,1000}ICU admissions[\\d]{0,1}: ).{1,5}(?!\r)", parse_num = FALSE),
-    `cumulative_tests_completed_2022-07-03` = extract_from_tab(tb_testing, 2, "Specimens tested: \\d*"),
+    `cumulative_tests_completed_2022-07-03` = NA,
+    `cumulative_tests_completed_2023-07-02` = extract_from_tab(tb_testing, 2, "Specimens tested: \\d*"),
     new_tests_completed_daily = extract_from_tab(tb_testing, 2, "Average daily specimens: \\d*"),
     positivity_rate_weekly = extract_from_tab(tb_testing, 2, "Weekly positivity rate: \\d*\\.\\d*(?=%)"),
-    `cumulative_people_tested_2022-07-03` = extract_from_tab(tb_testing, 2, "Tested people: \\d*")
-  ),
-  dplyr::tibble(
-    date = date_local,
-    source = url,
-    date_start = date_start,
-    date_end = date_end,
-    region = "MB",
-    sub_region_1 = tb_hr %>% dplyr::pull(1),
-    `cumulative_cases_since_2022-07-03` = tb_hr %>% dplyr::pull("Total cases") %>% as.character(),
-    cases_weekly = tb_hr %>% dplyr::pull("Cases this week") %>% as.character()
+    `cumulative_people_tested_2022-07-03` = NA,
+    `cumulative_people_tested_2023-07-02` = extract_from_tab(tb_testing, 2, "Tested people: \\d*"),
+    notes = NA
   )
 )
 
@@ -87,9 +84,5 @@ for (i in 1:ncol(mb)) {
   }
 }
 
-# check column sums
-mb[["cumulative_cases_since_2022-07-03"]][1] == sum(mb[["cumulative_cases_since_2022-07-03"]][2:6])
-mb[["cases_weekly"]][1] == sum(mb[["cases_weekly"]][2:6])
-
 # append data
 googlesheets4::sheet_append(data = mb, ss = "1ZTUb3fVzi6CLZAbU3lj6T6FTzl5Aq-arBNL49ru3VLo", sheet = "mb_weekly_report_2")