From 97d43b80620063bc7f29128409737b08b0cf3fe6 Mon Sep 17 00:00:00 2001 From: Ernest Guevarra Date: Mon, 18 May 2020 16:25:46 +0100 Subject: [PATCH 01/40] update v0.2.0 --- .Rbuildignore | 1 + R/02-get_doh.R | 354 +++++++++++++++++++++++++++++++ R/02-get_press_release.R | 178 ---------------- R/03-get_dfa.R | 44 ++++ R/{03-combine.R => 10-combine.R} | 0 scrapeDFA.js | 16 ++ 6 files changed, 415 insertions(+), 178 deletions(-) create mode 100644 R/02-get_doh.R delete mode 100644 R/02-get_press_release.R create mode 100644 R/03-get_dfa.R rename R/{03-combine.R => 10-combine.R} (100%) create mode 100644 scrapeDFA.js diff --git a/.Rbuildignore b/.Rbuildignore index 209aae9..004d8dd 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -7,3 +7,4 @@ ^\.travis\.yml$ ^appveyor\.yml$ ^codecov\.yml$ +^scrapeDFA\.js$ diff --git a/R/02-get_doh.R b/R/02-get_doh.R new file mode 100644 index 0000000..7db866d --- /dev/null +++ b/R/02-get_doh.R @@ -0,0 +1,354 @@ +################################################################################ +# +#' +#' Extract title of press releases from Department of Health website +#' +#' @param base Base URL for press releases in the Department of Health website. +#' Default is \url{https://www.doh.gov.ph/press-releases} +#' @param pages A vector of page numbers corresponding to the page panel +#' containing the press release link. Default is 1:25. Press releases only go +#' up to page 25. Values higher than 25 will error. +#' +#' @return A tibble of 2 columns: 1) press release title; 2) press release +#' unique identifier; and, 3) date of press release. +#' +#' @examples +#' ## Get press release URLs for first page of press release panel. The base +#' ## argument has been specified using non-SSL version of base URL to prevent +#' ## erroring for those with proxy server connections (i.e., in Travis tests) +#' get_pr_url(base = "http://www.doh.gov.ph/press-releases", +#' pages = 1) +#' +#' @export +#' +# +################################################################################ + +get_pr_url <- function(base = "https://www.doh.gov.ph/press-releases", + pages = 1:25) { + ## Check that pages goes up to 25 only + if(max(pages) > 25) { + stop("The current maximum pages for press releases in the DoH website + is 25. Please try again.", call. = TRUE) + } + + ## Concatenating vectors + prURL <- NULL + prID <- NULL + prDate <- NULL + + ## Cycle through pages + for(i in pages) { + wp <- paste(base, "?page=", i - 1, sep = "") + if(i == 1) wp <- base + + ## Extract and process press release relative links + href <- xml2::read_html(x = wp) %>% + rvest::html_nodes(css = ".view-content .views-field-title .field-content a") %>% + rvest::html_attr(name = "href") + + href <- stringr::str_subset(string = href, pattern = "press-release|node") + href <- stringr::str_subset(string = href, pattern = "20343|19904", negate = TRUE) + + ## Generate unique identifiers for each press release + id <- stringr::str_split_fixed(string = href, pattern = "/", n = 3)[ , 3] + id <- id %>% + stringr::str_remove_all(pattern = "-") %>% + stringr::str_extract(pattern = "[a-zA-Z]{8}") %>% + stringr::str_to_upper() %>% + stringr::str_split(pattern = "", simplify = TRUE) + + id <- matrix(data = match(id, LETTERS[1:26]), + nrow = nrow(id), + ncol = 8, + byrow = FALSE) + + hrefID <- NULL + + for(j in 1:nrow(id)) { + if(all(is.na(id[j, ]))) { + hrefID <- c(hrefID, + stringr::str_extract(string = href[j], + pattern = "[0-9]{4}|[0-9]{5}|[0-9]{6}|[0-9]{7}|[0-9]{8}")) + } else { + hrefID <- c(hrefID, paste(id[j, ], collapse = "")) + } + } + + hrefID <- stringr::str_extract(string = hrefID, + pattern = "[0-9]{4}|[0-9]{5}|[0-9]{6}|[0-9]{7}|[0-9]{8}") + + ## Extract and process press release issue date + hrefDate <- xml2::read_html(x = wp) %>% + rvest::html_nodes(css = ".view-content .content-time") %>% + rvest::html_text() + + hrefDate <- hrefDate[1:length(href)] + + ## Concatenate url, id and date + prURL <- c(prURL, href) + prID <- c(prID, hrefID) + prDate <- c(prDate, hrefDate) + } + + ## Convert prDate to date format + prDate <- lubridate::mdy(prDate) + + ## Convert prID to numeric + prID <- as.numeric(prID) + + ## Create tibble + pr <- tibble::tibble(data.frame(url = prURL, + id = prID, + date = prDate, + stringsAsFactors = FALSE)) + + ## Return DF + return(pr) +} + + +################################################################################ +# +#' +#' Extract title, URLs, unique identifiers and date of release of press releases +#' from the Department of Health website +#' +#' @param base Base URL for press releases in the Department of Health website. +#' Default is \url{https://www.doh.gov.ph/press-releases} +#' @param pages A vector of page numbers corresponding to the page panel +#' containing the press release link. Default is 1:25. Press releases only go +#' up to page 25. Values higher than 25 will error. +#' +#' @return A tibble of 4 columns: 1) relative URLs of press release; 2) press +#' release title; 3) press release unique identifier; and, 4) date of press +#' release. +#' +#' @examples +#' ## Get press release URLs for first page of press release panel. The base +#' ## argument has been specified using non-SSL version of base URL to prevent +#' ## erroring for those with proxy server connections (i.e., in Travis tests) +#' get_doh_links(base = "http://www.doh.gov.ph/press-releases", +#' pages = 1) +#' +#' @export +#' +# +################################################################################ + +get_doh_links <- function(base = "https://www.doh.gov.ph/press-releases", + pages = 1:25) { + ## Check that pages goes up to 25 only + if(max(pages) > 25) { + stop("The current maximum pages for press releases in the DoH website + is 25. Please try again.", call. = TRUE) + } + + ## Concatenating vectors + prURL <- NULL + prID <- NULL + prDate <- NULL + + ## Cycle through pages + for(i in pages) { + wp <- paste(base, "?page=", i - 1, sep = "") + if(i == 1) wp <- base + + ## Extract and process press release relative links + href <- xml2::read_html(x = wp) %>% + rvest::html_nodes(css = ".view-content .views-field-title .field-content a") %>% + rvest::html_attr(name = "href") + + href <- stringr::str_subset(string = href, pattern = "press-release|node") + href <- stringr::str_subset(string = href, pattern = "20343|19904", negate = TRUE) + + ## Generate unique identifiers for each press release + id <- stringr::str_split_fixed(string = href, pattern = "/", n = 3)[ , 3] + id <- id %>% + stringr::str_remove_all(pattern = "-") %>% + stringr::str_extract(pattern = "[a-zA-Z]{8}") %>% + stringr::str_to_upper() %>% + stringr::str_split(pattern = "", simplify = TRUE) + + id <- matrix(data = match(id, LETTERS[1:26]), + nrow = nrow(id), + ncol = 8, + byrow = FALSE) + + hrefID <- NULL + + for(j in 1:nrow(id)) { + if(all(is.na(id[j, ]))) { + hrefID <- c(hrefID, + stringr::str_extract(string = href[j], + pattern = "[0-9]{4}|[0-9]{5}|[0-9]{6}|[0-9]{7}|[0-9]{8}")) + } else { + hrefID <- c(hrefID, paste(id[j, ], collapse = "")) + } + } + + hrefID <- stringr::str_extract(string = hrefID, + pattern = "[0-9]{4}|[0-9]{5}|[0-9]{6}|[0-9]{7}|[0-9]{8}") + + ## Extract and process press release issue date + hrefDate <- xml2::read_html(x = wp) %>% + rvest::html_nodes(css = ".view-content .content-time") %>% + rvest::html_text() + + hrefDate <- hrefDate[1:length(href)] + + ## Concatenate url, id and date + prURL <- c(prURL, href) + prID <- c(prID, hrefID) + prDate <- c(prDate, hrefDate) + } + + ## Convert prDate to date format + prDate <- lubridate::mdy(prDate) + + ## Convert prID to numeric + prID <- as.numeric(prID) + + ## Create tibble + pr <- tibble::tibble(data.frame(url = prURL, + id = prID, + date = prDate, + stringsAsFactors = FALSE)) + + ## Return DF + return(pr) +} + + +################################################################################ +# +#' Extract text of press release from the Philippines Department of Health +#' website +#' +#' @param base Base URL for press releases in the Department of Health website. +#' Default is \url{https://www.doh.gov.ph} +#' @param df A data.frame created using \code{get_pr_url} providing values for +#' relative URL of press release/s, unique identifier of press release and, +#' date of issue of press release. +#' +#' @return A tibble containing text of the press release with additional +#' information on line number, type of text, unique identifier and date of +#' press release. +#' +#' @examples +#' prURL <- get_pr_url(base = "http://www.doh.gov.ph/press-releases", +#' pages = 1) +#' get_press_release(base = "http://www.doh.gov.ph", +#' df = prURL[1, ]) +#' +#' @export +#' +#' +# +################################################################################ + +get_press_release <- function(base = "https://www.doh.gov.ph", + df) { + ## Form URL + url <- paste(base, df$url, sep = "") + + ## Extract text from URL + z <- xml2::read_html(x = url) %>% + rvest::html_nodes(css = ".panel") %>% + rvest::html_text() %>% + stringr::str_split(pattern = "\n") %>% + unlist() %>% + stringr::str_trim(side = "both") + + ## Remove empty elements + z <- z[z != ""] + + ## Split z to 80 characters width + pressRelease <- stringr::str_wrap(string = z[[3]], width = 80) + pressRelease <- stringr::str_split(string = pressRelease, pattern = "\n") + + ## Concatenate title with body of press release + pressRelease <- c(z[[2]], pressRelease[[1]]) + + ## Create pressRelease data.frame + pressRelease <- data.frame(linenumber = 1:length(pressRelease), + text = pressRelease, + source = "DOH", + type = "press release", + id = df$id, + date = df$date, + stringsAsFactors = FALSE) + + ## Convert pressRelease to tibble + pressRelease <- tibble::tibble(pressRelease) + + ## Return pressRelease + return(pressRelease) +} + + +################################################################################ +# +#' Extract text of press release from the Philippines Department of Health +#' website +#' +#' @param base Base URL for press releases in the Department of Health website. +#' Default is \url{https://www.doh.gov.ph} +#' @param df A data.frame created using \code{get_pr_url} providing values for +#' relative URL of press release/s, unique identifier of press release and, +#' date of issue of press release. +#' +#' @return A tibble containing text of the press release with additional +#' information on line number, type of text, unique identifier and date of +#' press release. +#' +#' @examples +#' prURL <- get_pr_url(base = "http://www.doh.gov.ph/press-releases", +#' pages = 1) +#' get_press_release(base = "http://www.doh.gov.ph", +#' df = prURL[1, ]) +#' +#' @export +#' +#' +# +################################################################################ + +get_doh_release <- function(base = "https://www.doh.gov.ph", df) { + ## Form URL + url <- paste(base, df$url, sep = "") + + ## Extract text from URL + z <- xml2::read_html(x = url) %>% + rvest::html_nodes(css = ".panel") %>% + rvest::html_text() %>% + stringr::str_split(pattern = "\n") %>% + unlist() %>% + stringr::str_trim(side = "both") + + ## Remove empty elements + z <- z[z != ""] + + ## Split z to 80 characters width + pressRelease <- stringr::str_wrap(string = z[[3]], width = 80) + pressRelease <- stringr::str_split(string = pressRelease, pattern = "\n") + + ## Concatenate title with body of press release + pressRelease <- c(z[[2]], pressRelease[[1]]) + + ## Create pressRelease data.frame + pressRelease <- data.frame(linenumber = 1:length(pressRelease), + text = pressRelease, + source = "DOH", + type = "press release", + id = df$id, + date = df$date, + stringsAsFactors = FALSE) + + ## Convert pressRelease to tibble + pressRelease <- tibble::tibble(pressRelease) + + ## Return pressRelease + return(pressRelease) +} + diff --git a/R/02-get_press_release.R b/R/02-get_press_release.R deleted file mode 100644 index e0810d0..0000000 --- a/R/02-get_press_release.R +++ /dev/null @@ -1,178 +0,0 @@ -################################################################################ -# -#' -#' Extract title of press releases from Department of Health website -#' -#' @param base Base URL for press releases in the Department of Health website. -#' Default is \url{https://www.doh.gov.ph/press-releases} -#' @param pages A vector of page numbers corresponding to the page panel -#' containing the press release link. Default is 1:25. Press releases only go -#' up to page 25. Values higher than 25 will error. -#' -#' @return A tibble of 2 columns: 1) press release title; 2) press release -#' unique identifier; and, 3) date of press release. -#' -#' @examples -#' ## Get press release URLs for first page of press release panel. The base -#' ## argument has been specified using non-SSL version of base URL to prevent -#' ## erroring for those with proxy server connections (i.e., in Travis tests) -#' get_pr_url(base = "http://www.doh.gov.ph/press-releases", -#' pages = 1) -#' -#' @export -#' -# -################################################################################ - -get_pr_url <- function(base = "https://www.doh.gov.ph/press-releases", - pages = 1:25) { - ## Check that pages goes up to 25 only - if(max(pages) > 25) { - stop("The current maximum pages for press releases in the DoH website - is 25. Please try again.", call. = TRUE) - } - - ## Concatenating vectors - prURL <- NULL - prID <- NULL - prDate <- NULL - - ## Cycle through pages - for(i in pages) { - wp <- paste(base, "?page=", i - 1, sep = "") - if(i == 1) wp <- base - - ## Extract and process press release relative links - href <- xml2::read_html(x = wp) %>% - rvest::html_nodes(css = ".view-content .views-field-title .field-content a") %>% - rvest::html_attr(name = "href") - - href <- stringr::str_subset(string = href, pattern = "press-release|node") - href <- stringr::str_subset(string = href, pattern = "20343|19904", negate = TRUE) - - ## Generate unique identifiers for each press release - id <- stringr::str_split_fixed(string = href, pattern = "/", n = 3)[ , 3] - id <- id %>% - stringr::str_remove_all(pattern = "-") %>% - stringr::str_extract(pattern = "[a-zA-Z]{8}") %>% - stringr::str_to_upper() %>% - stringr::str_split(pattern = "", simplify = TRUE) - - id <- matrix(data = match(id, LETTERS[1:26]), - nrow = nrow(id), - ncol = 8, - byrow = FALSE) - - hrefID <- NULL - - for(j in 1:nrow(id)) { - if(all(is.na(id[j, ]))) { - hrefID <- c(hrefID, - stringr::str_extract(string = href[j], - pattern = "[0-9]{4}|[0-9]{5}|[0-9]{6}|[0-9]{7}|[0-9]{8}")) - } else { - hrefID <- c(hrefID, paste(id[j, ], collapse = "")) - } - } - - hrefID <- stringr::str_extract(string = hrefID, - pattern = "[0-9]{4}|[0-9]{5}|[0-9]{6}|[0-9]{7}|[0-9]{8}") - - ## Extract and process press release issue date - hrefDate <- xml2::read_html(x = wp) %>% - rvest::html_nodes(css = ".view-content .content-time") %>% - rvest::html_text() - - hrefDate <- hrefDate[1:length(href)] - - ## Concatenate url, id and date - prURL <- c(prURL, href) - prID <- c(prID, hrefID) - prDate <- c(prDate, hrefDate) - } - - ## Convert prDate to date format - prDate <- lubridate::mdy(prDate) - - ## Convert prID to numeric - prID <- as.numeric(prID) - - ## Create tibble - pr <- tibble::tibble(data.frame(url = prURL, - id = prID, - date = prDate, - stringsAsFactors = FALSE)) - - ## Return DF - return(pr) -} - - -################################################################################ -# -#' Extract text of press release from the Philippines Department of Health -#' website -#' -#' @param base Base URL for press releases in the Department of Health website. -#' Default is \url{https://www.doh.gov.ph} -#' @param df A data.frame created using \code{get_pr_url} providing values for -#' relative URL of press release/s, unique identifier of press release and, -#' date of issue of press release. -#' -#' @return A tibble containing text of the press release with additional -#' information on line number, type of text, unique identifier and date of -#' press release. -#' -#' @examples -#' prURL <- get_pr_url(base = "http://www.doh.gov.ph/press-releases", -#' pages = 1) -#' get_press_release(base = "http://www.doh.gov.ph", -#' df = prURL[1, ]) -#' -#' @export -#' -#' -# -################################################################################ - -get_press_release <- function(base = "https://www.doh.gov.ph", - df) { - ## Form URL - url <- paste(base, df$url, sep = "") - - ## Extract text from URL - z <- xml2::read_html(x = url) %>% - rvest::html_nodes(css = ".panel") %>% - rvest::html_text() %>% - stringr::str_split(pattern = "\n") %>% - unlist() %>% - stringr::str_trim(side = "both") - - ## Remove empty elements - z <- z[z != ""] - - ## Split z to 80 characters width - pressRelease <- stringr::str_wrap(string = z[[3]], width = 80) - pressRelease <- stringr::str_split(string = pressRelease, pattern = "\n") - - ## Concatenate title with body of press release - pressRelease <- c(z[[2]], pressRelease[[1]]) - - ## Create pressRelease data.frame - pressRelease <- data.frame(linenumber = 1:length(pressRelease), - text = pressRelease, - source = "DOH", - type = "press release", - id = df$id, - date = df$date, - stringsAsFactors = FALSE) - - ## Convert pressRelease to tibble - pressRelease <- tibble::tibble(pressRelease) - - ## Return pressRelease - return(pressRelease) -} - - - diff --git a/R/03-get_dfa.R b/R/03-get_dfa.R new file mode 100644 index 0000000..e499e7a --- /dev/null +++ b/R/03-get_dfa.R @@ -0,0 +1,44 @@ +################################################################################ +# +#' +#' +#' +#' +# +################################################################################ + +get_dfa_links <- function(base = "https://www.dfa.gov.ph/dfa-news/dfa-releasesupdate") { + ## Extract number of pages + end <- xml2::read_html(x = base) %>% + rvest::html_node(css = ".pagination ul .pagination-end a") %>% + rvest::html_attr(name = "href") %>% + stringr::str_extract(pattern = "[0-9]+") + + ## Get sequence of page numbers based on filter and ending number + nPages <- seq(from = 10, to = as.numeric(end), by = 10) + + ## Check if last number in sequence nPages is lower or equal to end + if(max(nPages < end)) nPages <- c(nPages, end) + + ## + titleTable <- xml2::read_html(x = base) %>% + rvest::html_node(css = ".table-noheader") %>% + rvest::html_table() + + ## Cycle through pages + for(i in nPages) { + ## Extract table from current set of pages + xx <- xml2::read_html(x = paste(base, "?start=", i, sep= "")) %>% + rvest::html_node(css = ".table-noheader") %>% + rvest::html_table() + + ## Concatenate tables by page + titleTable <- rbind(titleTable, xx) + } + + ## Convert to tibble + titleTable <- tibble::tibble(titleTable) + + ## Retrun linksTable + return(titleTable) +} diff --git a/R/03-combine.R b/R/10-combine.R similarity index 100% rename from R/03-combine.R rename to R/10-combine.R diff --git a/scrapeDFA.js b/scrapeDFA.js new file mode 100644 index 0000000..0724d10 --- /dev/null +++ b/scrapeDFA.js @@ -0,0 +1,16 @@ +var url = 'https://www.dfa.gov.ph/dfa-news/dfa-releasesupdate'; +var fs = require('fs'); +var page = require('webpage').create(); +page.open(url, function(status) { + if (status === 'success') { + var html = page.evaluate(function() { + return document.documentElement.outerHTML; + }); + try { + fs.write("dfaPressReleases.txt", html, 'w'); + } catch(e) { + console.log(e); + } + } + phantom.exit(); +}); From 485b42641ce889a59d8c69574d65bfb1964727bc Mon Sep 17 00:00:00 2001 From: Ernest Guevarra Date: Mon, 18 May 2020 16:29:27 +0100 Subject: [PATCH 02/40] get html text of DFA press releases site --- data-raw/dfaPressReleases.txt | 527 +++++++++++++++++++++++++++++++++ dfaPressReleases.txt | 528 ++++++++++++++++++++++++++++++++++ scrapeDFA.js | 2 +- 3 files changed, 1056 insertions(+), 1 deletion(-) create mode 100644 data-raw/dfaPressReleases.txt create mode 100644 dfaPressReleases.txt diff --git a/data-raw/dfaPressReleases.txt b/data-raw/dfaPressReleases.txt new file mode 100644 index 0000000..ebd53c8 --- /dev/null +++ b/data-raw/dfaPressReleases.txt @@ -0,0 +1,527 @@ + + + + + DFA Releases + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + + + + + + +
+ +
+
+

GOVPH

+
+
+ MENU + +
+
+ + + + +
+ +
+ + + +
+
+ + + +
+
+ +
+ + +
+
+ +
+
+
+ + + +
+
+ +
+ + +
+
+
+ + + +
+
+ +
+
+
+ + + +
+ + + + + + \ No newline at end of file diff --git a/dfaPressReleases.txt b/dfaPressReleases.txt new file mode 100644 index 0000000..aefe120 --- /dev/null +++ b/dfaPressReleases.txt @@ -0,0 +1,528 @@ + + + + + DFA Releases + + + + + + + + + + + + + + + + + + + + + + + + + + + + +
+
+ + + + + + + + + +
+ +
+
+

GOVPH

+
+
+ MENU + +
+
+ + + + +
+ +
+ + + +
+
+ + + +
+
+ +
+ + +
+
+ +
+
+
+ + + +
+
+ +
+ + +
+
+
+ + + +
+
+ +
+
+
+ + + +
+ + + + + + \ No newline at end of file diff --git a/scrapeDFA.js b/scrapeDFA.js index 0724d10..3ec328a 100644 --- a/scrapeDFA.js +++ b/scrapeDFA.js @@ -7,7 +7,7 @@ page.open(url, function(status) { return document.documentElement.outerHTML; }); try { - fs.write("dfaPressReleases.txt", html, 'w'); + fs.write("data-raw/dfaPressReleases.txt", html, 'w'); } catch(e) { console.log(e); } From 36906e8d1de1ac0eabd002a0b3a134348213b842 Mon Sep 17 00:00:00 2001 From: Ernest Guevarra Date: Mon, 18 May 2020 17:41:52 +0100 Subject: [PATCH 03/40] update scraped dfa data --- data-raw/dfaPressReleases.txt | 58 +++++++++++++++++------------------ scrapeDFA.js | 2 +- 2 files changed, 30 insertions(+), 30 deletions(-) diff --git a/data-raw/dfaPressReleases.txt b/data-raw/dfaPressReleases.txt index ebd53c8..bcbb210 100644 --- a/data-raw/dfaPressReleases.txt +++ b/data-raw/dfaPressReleases.txt @@ -3,10 +3,10 @@ DFA Releases - - + + - + @@ -20,7 +20,7 @@ .table-noheader thead { display: none; } - + @@ -82,7 +82,7 @@ jQuery(function($){ initTooltips(); $("body").on("subform-row-add", initTooltips -