From 87f4ef0b766701999bfd900c33a385253f0c7eb9 Mon Sep 17 00:00:00 2001 From: "Jean-Paul R. Soucy" Date: Mon, 14 Aug 2023 16:50:33 -0400 Subject: [PATCH] Update MB report script (#95) - Update MB report script to reflect new "season" and removal of health region data table - Health region data must now be manually estimated from figure 3 of the report --- utils/report_mb.R | 37 +++++++++++++++---------------------- 1 file changed, 15 insertions(+), 22 deletions(-) diff --git a/utils/report_mb.R b/utils/report_mb.R index 2537fd741..4bbd82d6a 100644 --- a/utils/report_mb.R +++ b/utils/report_mb.R @@ -27,7 +27,6 @@ ds <- rvest::read_html(url) tb_cases <- ds %>% rvest::html_elements(".tgN") %>% {.[grep("tb_cases", .)]} %>% rvest::html_table() %>% dplyr::nth(1) tb_severity <- ds %>% rvest::html_elements(".tgN") %>% {.[grep("tb_severity", .)]} %>% rvest::html_table() %>% dplyr::nth(1) tb_testing <- ds %>% rvest::html_elements(".tgN") %>% {.[grep("tb_testing", .)]} %>% rvest::html_table() %>% dplyr::nth(1) -tb_hr <- ds %>% rvest::html_table() %>% {.[grep("Health Region|Helath region", .)]} %>% dplyr::nth(1) # function: extract number from specific substring in specific row of an HTML table extract_from_tab <- function(tab, row_n, string, parse_num = TRUE) { @@ -54,28 +53,26 @@ mb <- dplyr::bind_rows( sub_region_1 = NA, cases = NA, # calculated by formula cases_weekly = extract_from_tab(tb_cases, 2, "(?<=Cases this week: )\\d*", parse_num = FALSE), - `cumulative_cases_since_2022-07-03` = extract_from_tab(tb_cases, 2, "(?<=Total cases: )\\d*", parse_num = FALSE), - `cumulative_cases_since_2022-07-03_weekly_diff` = NA, # calculated by formula + `cumulative_cases_since_2022-07-03` = NA, + `cumulative_cases_since_2022-07-03_weekly_diff` = NA, + `cumulative_cases_since_2023-07-02` = extract_from_tab(tb_cases, 2, "(?<=Total cases: )\\d*", parse_num = FALSE), + `cumulative_cases_since_2023-07-02_weekly_diff` = NA, # calculated by formula deaths_weekly = extract_from_tab(tb_severity, 2, "(?<=Severe outcomes this week.{0,1000}Deaths: ).{1,5}(?!\r)", parse_num = FALSE), - `cumulative_deaths_since_2022-07-03` = extract_from_tab(tb_severity, 2, "(?<=Total severe outcomes.{0,1000}Deaths: ).{1,5}(?!\r)", parse_num = FALSE), - `cumulative_hospitalizations_since_2022-07-03` = extract_from_tab(tb_severity, 2, "(?<=Total severe outcomes.{0,1000}Hospital admissions: ).{1,5}(?!\r)", parse_num = FALSE), + `cumulative_deaths_since_2022-07-03` = NA, + `cumulative_deaths_since_2023-07-02` = extract_from_tab(tb_severity, 2, "(?<=Total severe outcomes.{0,1000}Deaths: ).{1,5}(?!\r)", parse_num = FALSE), + `cumulative_hospitalizations_since_2022-07-03` = NA, + `cumulative_hospitalizations_since_2023-07-02` = extract_from_tab(tb_severity, 2, "(?<=Total severe outcomes.{0,1000}Hospital admissions: ).{1,5}(?!\r)", parse_num = FALSE), new_hospitalizations = extract_from_tab(tb_severity, 2, "(?<=Severe outcomes this week.{0,1000}Hospital admissions: ).{1,5}(?!\r)", parse_num = FALSE), - `cumulative_icu_since_2022-07-03` = extract_from_tab(tb_severity, 2, "(?<=Total severe outcomes.{0,1000}ICU admissions: ).{1,5}(?!\r)", parse_num = FALSE), + `cumulative_icu_since_2022-07-03` = NA, + `cumulative_icu_since_2023-07-02` = extract_from_tab(tb_severity, 2, "(?<=Total severe outcomes.{0,1000}ICU admissions: ).{1,5}(?!\r)", parse_num = FALSE), new_icu = extract_from_tab(tb_severity, 2, "(?<=Severe outcomes this week.{0,1000}ICU admissions[\\d]{0,1}: ).{1,5}(?!\r)", parse_num = FALSE), - `cumulative_tests_completed_2022-07-03` = extract_from_tab(tb_testing, 2, "Specimens tested: \\d*"), + `cumulative_tests_completed_2022-07-03` = NA, + `cumulative_tests_completed_2023-07-02` = extract_from_tab(tb_testing, 2, "Specimens tested: \\d*"), new_tests_completed_daily = extract_from_tab(tb_testing, 2, "Average daily specimens: \\d*"), positivity_rate_weekly = extract_from_tab(tb_testing, 2, "Weekly positivity rate: \\d*\\.\\d*(?=%)"), - `cumulative_people_tested_2022-07-03` = extract_from_tab(tb_testing, 2, "Tested people: \\d*") - ), - dplyr::tibble( - date = date_local, - source = url, - date_start = date_start, - date_end = date_end, - region = "MB", - sub_region_1 = tb_hr %>% dplyr::pull(1), - `cumulative_cases_since_2022-07-03` = tb_hr %>% dplyr::pull("Total cases") %>% as.character(), - cases_weekly = tb_hr %>% dplyr::pull("Cases this week") %>% as.character() + `cumulative_people_tested_2022-07-03` = NA, + `cumulative_people_tested_2023-07-02` = extract_from_tab(tb_testing, 2, "Tested people: \\d*"), + notes = NA ) ) @@ -87,9 +84,5 @@ for (i in 1:ncol(mb)) { } } -# check column sums -mb[["cumulative_cases_since_2022-07-03"]][1] == sum(mb[["cumulative_cases_since_2022-07-03"]][2:6]) -mb[["cases_weekly"]][1] == sum(mb[["cases_weekly"]][2:6]) - # append data googlesheets4::sheet_append(data = mb, ss = "1ZTUb3fVzi6CLZAbU3lj6T6FTzl5Aq-arBNL49ru3VLo", sheet = "mb_weekly_report_2")