diff --git a/.Rbuildignore b/.Rbuildignore index 209aae9..382c288 100644 --- a/.Rbuildignore +++ b/.Rbuildignore @@ -7,3 +7,4 @@ ^\.travis\.yml$ ^appveyor\.yml$ ^codecov\.yml$ +^\.github$ diff --git a/.github/.gitignore b/.github/.gitignore new file mode 100644 index 0000000..2d19fc7 --- /dev/null +++ b/.github/.gitignore @@ -0,0 +1 @@ +*.html diff --git a/.github/workflows/R-CMD-check.yaml b/.github/workflows/R-CMD-check.yaml new file mode 100644 index 0000000..1f2d1b0 --- /dev/null +++ b/.github/workflows/R-CMD-check.yaml @@ -0,0 +1,80 @@ +on: + push: + branches: + - master + pull_request: + branches: + - master + +name: R-CMD-check + +jobs: + R-CMD-check: + runs-on: ${{ matrix.config.os }} + + name: ${{ matrix.config.os }} (${{ matrix.config.r }}) + + strategy: + fail-fast: false + matrix: + config: + - {os: windows-latest, r: 'release'} + - {os: macOS-latest, r: 'release'} + - {os: macOS-latest, r: 'devel'} + - {os: ubuntu-16.04, r: 'release', rspm: "https://packagemanager.rstudio.com/cran/__linux__/xenial/latest"} + + env: + R_REMOTES_NO_ERRORS_FROM_WARNINGS: true + RSPM: ${{ matrix.config.rspm }} + + steps: + - uses: actions/checkout@v2 + + - uses: r-lib/actions/setup-r@master + with: + r-version: ${{ matrix.config.r }} + + - uses: r-lib/actions/setup-pandoc@master + + - name: Query dependencies + run: | + install.packages('remotes') + saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) + writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") + shell: Rscript {0} + + - name: Cache R packages + if: runner.os != 'Windows' + uses: actions/cache@v1 + with: + path: ${{ env.R_LIBS_USER }} + key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- + + - name: Install system dependencies + if: runner.os == 'Linux' + env: + RHUB_PLATFORM: linux-x86_64-ubuntu-gcc + run: | + 
Rscript -e "remotes::install_github('r-hub/sysreqs')" + sysreqs=$(Rscript -e "cat(sysreqs::sysreq_commands('DESCRIPTION'))") + sudo -s eval "$sysreqs" + + - name: Install dependencies + run: | + remotes::install_deps(dependencies = TRUE) + remotes::install_cran("rcmdcheck") + shell: Rscript {0} + + - name: Check + env: + _R_CHECK_CRAN_INCOMING_REMOTE_: false + run: rcmdcheck::rcmdcheck(args = c("--no-manual", "--as-cran"), error_on = "warning", check_dir = "check") + shell: Rscript {0} + + - name: Upload check results + if: failure() + uses: actions/upload-artifact@master + with: + name: ${{ runner.os }}-r${{ matrix.config.r }}-results + path: check diff --git a/.github/workflows/test-coverage.yaml b/.github/workflows/test-coverage.yaml new file mode 100644 index 0000000..3058d03 --- /dev/null +++ b/.github/workflows/test-coverage.yaml @@ -0,0 +1,46 @@ +on: + push: + branches: + - master + pull_request: + branches: + - master + +name: test-coverage + +jobs: + test-coverage: + runs-on: macOS-latest + env: + GITHUB_PAT: ${{ secrets.GITHUB_TOKEN }} + steps: + - uses: actions/checkout@v2 + + - uses: r-lib/actions/setup-r@master + + - uses: r-lib/actions/setup-pandoc@master + + - name: Query dependencies + run: | + install.packages('remotes') + saveRDS(remotes::dev_package_deps(dependencies = TRUE), ".github/depends.Rds", version = 2) + writeLines(sprintf("R-%i.%i", getRversion()$major, getRversion()$minor), ".github/R-version") + shell: Rscript {0} + + - name: Cache R packages + uses: actions/cache@v1 + with: + path: ${{ env.R_LIBS_USER }} + key: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1-${{ hashFiles('.github/depends.Rds') }} + restore-keys: ${{ runner.os }}-${{ hashFiles('.github/R-version') }}-1- + + - name: Install dependencies + run: | + install.packages(c("remotes")) + remotes::install_deps(dependencies = TRUE) + remotes::install_cran("covr") + shell: Rscript {0} + + - name: Test coverage + run: covr::codecov() + shell: Rscript {0} diff --git 
a/NAMESPACE b/NAMESPACE index 20566b6..8373ffe 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -2,10 +2,14 @@ export(combine_docs) export(combine_iatf) +export(get_dfa_links) +export(get_doh_links) +export(get_doh_release) export(get_iatf_links) export(get_pr_url) export(get_press_release) importFrom(httr,user_agent) +importFrom(lubridate,dmy) importFrom(lubridate,mdy) importFrom(magrittr,"%>%") importFrom(rvest,html_attr) @@ -27,3 +31,4 @@ importFrom(tibble,tibble) importFrom(utils,data) importFrom(utils,download.file) importFrom(xml2,read_html) +importFrom(xml2,url_absolute) diff --git a/NEWS.md b/NEWS.md index 2cea6f0..12b868f 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,4 +1,15 @@ -# comotext 0.1.0 +# comotext v0.2.0 + +* created new versions of `get_` functions to allow for naming of source of text data; deprecation process started on previous versions + +* re-structured the output text tibbles from the functions to a more logical sequence of data columns with a general sequence of `linenumber`, `identifier`, `text`, `date`, `source`, `type`, `url` whichever is present for the given tibble + +* converted URLs to absolute URLs rather than relative URLs + +* renamed datasets to include source of data as prefix to the name + + +# comotext v0.1.0 * Created pull data functions for the Department of Health (DoH) press releases found at [https://www.doh.gov.ph/press-releases](https://www.doh.gov.ph/press-releases) diff --git a/R/01-get_iatf.R b/R/01-get_iatf.R index aae7a8b..2113563 100644 --- a/R/01-get_iatf.R +++ b/R/01-get_iatf.R @@ -37,11 +37,16 @@ get_iatf_links <- function(base = "https://www.doh.gov.ph/COVID-19/IATF-Resoluti id <- stringr::str_extract(string = id, pattern = "[0-9]+") ## Add links id to links - yy <- data.frame(id, link = yy, stringsAsFactors = FALSE) + yy <- data.frame(id, + source = "IATF", + type = "resolution", + link = yy, + stringsAsFactors = FALSE) ## Merge links with page table linkTable <- merge(xx[[1]], yy, by.x = "Resolution No.", by.y =
"id") - names(linkTable) <- c("id", "title", "date", "link") + + names(linkTable) <- c("id", "title", "date", "source", "type", "url") ## Convert to tibble linkTable <- tibble::tibble(linkTable) diff --git a/R/02-get_doh.R b/R/02-get_doh.R new file mode 100644 index 0000000..2e1a55a --- /dev/null +++ b/R/02-get_doh.R @@ -0,0 +1,383 @@ +################################################################################ +# +#' +#' Extract relative URLs of press releases from Department of Health website +#' +#' @param base Base URL for press releases in the Department of Health website. +#' Default is \url{https://www.doh.gov.ph/press-releases} +#' @param pages A vector of page numbers corresponding to the page panel +#' containing the press release link. Default is 1:25. Press releases only go +#' up to page 25. Values higher than 25 will error. +#' +#' @return A tibble of 3 columns: 1) press release relative URL; 2) press +#' release unique identifier; and, 3) date of press release. +#' +#' @examples +#' ## Get press release URLs for first page of press release panel. The base +#' ## argument has been specified using non-SSL version of base URL to prevent +#' ## erroring for those with proxy server connections (i.e., in Travis tests) +#' get_pr_url(base = "http://www.doh.gov.ph/press-releases", +#' pages = 1) +#' +#' @export +#' +# +################################################################################ + +get_pr_url <- function(base = "https://www.doh.gov.ph/press-releases", + pages = 1:25) { + ## Issue deprecation message + .Deprecated(new = "get_doh_links", + package = "comotext", + msg = "'get_pr_url' is now in the process of deprecation and will + be unavailable in the next version. Please use 'get_doh_links' + instead.") + + ## Check that pages goes up to 25 only + if(max(pages) > 25) { + stop("The current maximum pages for press releases in the DoH website + is 25. Please try again.", call.
= TRUE) + } + + ## Concatenating vectors + prURL <- NULL + prID <- NULL + prDate <- NULL + + ## Cycle through pages + for(i in pages) { + wp <- paste(base, "?page=", i - 1, sep = "") + if(i == 1) wp <- base + + ## Parse the page once, then extract press release relative links + xHTML <- xml2::read_html(x = wp) + href <- rvest::html_nodes(x = xHTML, css = ".view-content .views-field-title .field-content a") %>% + rvest::html_attr(name = "href") + + href <- stringr::str_subset(string = href, pattern = "press-release|node") + href <- stringr::str_subset(string = href, pattern = "20343|19904", negate = TRUE) + + ## Generate unique identifiers for each press release + id <- stringr::str_split_fixed(string = href, pattern = "/", n = 3)[ , 3] + id <- id %>% + stringr::str_remove_all(pattern = "-") %>% + stringr::str_extract(pattern = "[a-zA-Z]{8}") %>% + stringr::str_to_upper() %>% + stringr::str_split(pattern = "", simplify = TRUE) + + id <- matrix(data = match(id, LETTERS[1:26]), + nrow = nrow(id), + ncol = 8, + byrow = FALSE) + + hrefID <- NULL + + for(j in seq_len(nrow(id))) { + if(all(is.na(id[j, ]))) { + hrefID <- c(hrefID, + stringr::str_extract(string = href[j], + pattern = "[0-9]{4}|[0-9]{5}|[0-9]{6}|[0-9]{7}|[0-9]{8}")) + } else { + hrefID <- c(hrefID, paste(id[j, ], collapse = "")) + } + } + + hrefID <- stringr::str_extract(string = hrefID, + pattern = "[0-9]{4}|[0-9]{5}|[0-9]{6}|[0-9]{7}|[0-9]{8}") + + ## Extract and process press release issue date from the already parsed page + hrefDate <- xHTML %>% + rvest::html_nodes(css = ".view-content .content-time") %>% + rvest::html_text() + + hrefDate <- hrefDate[seq_along(href)] + + ## Concatenate url, id and date + prURL <- c(prURL, href) + prID <- c(prID, hrefID) + prDate <- c(prDate, hrefDate) + } + + ## Convert prDate to date format + prDate <- lubridate::mdy(prDate) + + ## Convert prID to numeric + prID <- as.numeric(prID) + + ## Create tibble + pr <- tibble::tibble(data.frame(url = prURL, + id = prID, + date = prDate, + stringsAsFactors = FALSE)) + + ##
Return DF + return(pr) +} + + +################################################################################ +# +#' +#' Extract title, URLs, unique identifiers and date of release of press releases +#' from the Department of Health website +#' +#' @param base Base URL for press releases in the Department of Health website. +#' Default is \url{https://www.doh.gov.ph/press-releases} +#' @param pages A vector of page numbers corresponding to the page panel +#' containing the press release link. Default is 1:25. Press releases only go +#' up to page 25. Values higher than 25 will error. +#' +#' @return A tibble of 6 columns: 1) press release unique identifier; 2) press +#' release title; 3) date of press release; 4) source; 5) type of document; +#' and, 6) absolute URL of press release. +#' +#' @examples +#' ## Get press release URLs for first page of press release panel. The base +#' ## argument has been specified using non-SSL version of base URL to prevent +#' ## erroring for those with proxy server connections (i.e., in Travis tests) +#' get_doh_links(base = "http://www.doh.gov.ph/press-releases", +#' pages = 1) +#' +#' @export +#' +# +################################################################################ + +get_doh_links <- function(base = "https://www.doh.gov.ph/press-releases", + pages = 1:25) { + ## Check that pages goes up to 25 only + if(max(pages) > 25) { + stop("The current maximum pages for press releases in the DoH website + is 25. Please try again.", call.
= TRUE) + } + + ## Concatenating vectors + prTitle <- NULL + prURL <- NULL + prID <- NULL + prDate <- NULL + + ## Cycle through pages + for(i in pages) { + wp <- paste(base, "?page=", i - 1, sep = "") + if(i == 1) wp <- base + + xHTML <- xml2::read_html(x = wp) + + ## Extract and process press release relative links + href <- xHTML %>% + rvest::html_nodes(css = ".view-content .views-field-title .field-content a") %>% + rvest::html_attr(name = "href") + + href <- stringr::str_subset(string = href, pattern = "press-release|node") + href <- stringr::str_subset(string = href, pattern = "20343|19904", negate = TRUE) + + ## Generate unique identifiers for each press release. NOTE(review): the digit alternation used below tries "[0-9]{4}" first, so every id is cut to 4 digits - confirm this is intended + id <- stringr::str_split_fixed(string = href, pattern = "/", n = 3)[ , 3] + id <- id %>% + stringr::str_remove_all(pattern = "-") %>% + stringr::str_extract(pattern = "[a-zA-Z]{8}") %>% + stringr::str_to_upper() %>% + stringr::str_split(pattern = "", simplify = TRUE) + + id <- matrix(data = match(id, LETTERS[1:26]), + nrow = nrow(id), + ncol = 8, + byrow = FALSE) + + hrefID <- NULL + + for(j in seq_len(nrow(id))) { + if(all(is.na(id[j, ]))) { + hrefID <- c(hrefID, + stringr::str_extract(string = href[j], + pattern = "[0-9]{4}|[0-9]{5}|[0-9]{6}|[0-9]{7}|[0-9]{8}")) + } else { + hrefID <- c(hrefID, paste(id[j, ], collapse = "")) + } + } + + hrefID <- stringr::str_extract(string = hrefID, + pattern = "[0-9]{4}|[0-9]{5}|[0-9]{6}|[0-9]{7}|[0-9]{8}") + + + ## Extract and process press release issue date from the already parsed page + hrefDate <- xHTML %>% + rvest::html_nodes(css = ".view-content .content-time") %>% + rvest::html_text() + + hrefDate <- hrefDate[seq_along(href)] + + ## Extract titles. NOTE(review): dates and titles are cut to the first length(href) entries, which presumably assumes the links filtered out above come last on the page - verify + urlTitle <- xHTML %>% + rvest::html_nodes(css = ".view-content .views-field-title") %>% + rvest::html_text() %>% + stringr::str_trim(side = "both") + + urlTitle <- urlTitle[seq_along(href)] + + ## Concatenate title, url, id and date + prTitle <- c(prTitle, urlTitle) + prURL <- c(prURL, href) + prID <- c(prID, hrefID) + prDate <-
c(prDate, hrefDate) + } + ## Convert URL to absolute path + prURL <- xml2::url_absolute(x = prURL, base = base) + + ## Convert prDate to date format + prDate <- lubridate::mdy(prDate) + + ## Convert prID to numeric + prID <- as.numeric(prID) + + ## Create tibble + pr <- tibble::tibble(data.frame(id = prID, + title = prTitle, + date = prDate, + source = "DOH", + type = "press release", + url = prURL, + stringsAsFactors = FALSE)) + + ## Return DF + return(pr) +} + + +################################################################################ +# +#' Extract text of press release from the Philippines Department of Health +#' website +#' +#' @param base Base URL for press releases in the Department of Health website. +#' Default is \url{https://www.doh.gov.ph} +#' @param df A data.frame created using \code{get_pr_url} providing values for +#' relative URL of press release/s, unique identifier of press release and, +#' date of issue of press release. +#' +#' @return A tibble containing text of the press release with additional +#' information on line number, type of text, unique identifier and date of +#' press release. +#' +#' @examples +#' prURL <- get_pr_url(base = "http://www.doh.gov.ph/press-releases", +#' pages = 1) +#' get_press_release(base = "http://www.doh.gov.ph", +#' df = prURL[1, ]) +#' +#' @export +#' +#' +# +################################################################################ + +get_press_release <- function(base = "https://www.doh.gov.ph", + df) { + ## Issue deprecation message + .Deprecated(new = "get_doh_release", + package = "comotext", + msg = "'get_press_release' is now in the process of deprecation and will + be unavailable in the next version.
Please use 'get_doh_release' + instead.") + + ## Form URL from base and the relative URL held in df + url <- paste(base, df$url, sep = "") + + ## Extract text from URL + z <- xml2::read_html(x = url) %>% + rvest::html_nodes(css = ".panel") %>% + rvest::html_text() %>% + stringr::str_split(pattern = "\n") %>% + unlist() %>% + stringr::str_trim(side = "both") + + ## Remove empty elements + z <- z[z != ""] + + ## Wrap the third non-empty element of z to 80 characters width and split into lines + pressRelease <- stringr::str_wrap(string = z[[3]], width = 80) + pressRelease <- stringr::str_split(string = pressRelease, pattern = "\n") + + ## Concatenate title (second non-empty element of z) with body of press release + pressRelease <- c(z[[2]], pressRelease[[1]]) + + ## Create pressRelease data.frame + pressRelease <- data.frame(linenumber = 1:length(pressRelease), + text = pressRelease, + source = "DOH", + type = "press release", + id = df$id, + date = df$date, + stringsAsFactors = FALSE) + + ## Convert pressRelease to tibble + pressRelease <- tibble::tibble(pressRelease) + + ## Return pressRelease + return(pressRelease) +} + + +################################################################################ +# +#' Extract text of press release from the Philippines Department of Health +#' website +#' +#' @param df A data.frame created using \code{get_doh_links} providing values +#' for absolute URL of press release, unique identifier of press release and, +#' date of issue of press release. +#' +#' @return A tibble containing text of the press release with additional +#' information on line number, type of text, unique identifier and date of +#' press release.
+#' +#' @examples +#' prURL <- get_doh_links(base = "http://www.doh.gov.ph/press-releases", +#' pages = 1) +#' get_doh_release(df = prURL[1, ]) +#' +#' @export +#' +#' +# +################################################################################ + +get_doh_release <- function(df) { + ## Use the URL held in df as is (no base is prepended here) + url <- df$url + + ## Extract text from URL + z <- xml2::read_html(x = url) %>% + rvest::html_nodes(css = ".panel") %>% + rvest::html_text() %>% + stringr::str_split(pattern = "\n") %>% + unlist() %>% + stringr::str_trim(side = "both") + + ## Remove empty elements + z <- z[z != ""] + + ## Wrap the third non-empty element of z to 80 characters width and split into lines + pressRelease <- stringr::str_wrap(string = z[[3]], width = 80) + pressRelease <- stringr::str_split(string = pressRelease, pattern = "\n") + + ## Concatenate title (second non-empty element of z) with body of press release + pressRelease <- c(z[[2]], pressRelease[[1]]) + + ## Create pressRelease data.frame + pressRelease <- data.frame(linenumber = 1:length(pressRelease), + text = pressRelease, + source = "DOH", + type = "press release", + id = df$id, + date = df$date, + stringsAsFactors = FALSE) + + ## Convert pressRelease to tibble + pressRelease <- tibble::tibble(pressRelease) + + ## Return pressRelease + return(pressRelease) +} + diff --git a/R/02-get_press_release.R b/R/02-get_press_release.R deleted file mode 100644 index e0810d0..0000000 --- a/R/02-get_press_release.R +++ /dev/null @@ -1,178 +0,0 @@ -################################################################################ -# -#' -#' Extract title of press releases from Department of Health website -#' -#' @param base Base URL for press releases in the Department of Health website. -#' Default is \url{https://www.doh.gov.ph/press-releases} -#' @param pages A vector of page numbers corresponding to the page panel -#' containing the press release link. Default is 1:25. Press releases only go -#' up to page 25. Values higher than 25 will error.
-#' -#' @return A tibble of 2 columns: 1) press release title; 2) press release -#' unique identifier; and, 3) date of press release. -#' -#' @examples -#' ## Get press release URLs for first page of press release panel. The base -#' ## argument has been specified using non-SSL version of base URL to prevent -#' ## erroring for those with proxy server connections (i.e., in Travis tests) -#' get_pr_url(base = "http://www.doh.gov.ph/press-releases", -#' pages = 1) -#' -#' @export -#' -# -################################################################################ - -get_pr_url <- function(base = "https://www.doh.gov.ph/press-releases", - pages = 1:25) { - ## Check that pages goes up to 25 only - if(max(pages) > 25) { - stop("The current maximum pages for press releases in the DoH website - is 25. Please try again.", call. = TRUE) - } - - ## Concatenating vectors - prURL <- NULL - prID <- NULL - prDate <- NULL - - ## Cycle through pages - for(i in pages) { - wp <- paste(base, "?page=", i - 1, sep = "") - if(i == 1) wp <- base - - ## Extract and process press release relative links - href <- xml2::read_html(x = wp) %>% - rvest::html_nodes(css = ".view-content .views-field-title .field-content a") %>% - rvest::html_attr(name = "href") - - href <- stringr::str_subset(string = href, pattern = "press-release|node") - href <- stringr::str_subset(string = href, pattern = "20343|19904", negate = TRUE) - - ## Generate unique identifiers for each press release - id <- stringr::str_split_fixed(string = href, pattern = "/", n = 3)[ , 3] - id <- id %>% - stringr::str_remove_all(pattern = "-") %>% - stringr::str_extract(pattern = "[a-zA-Z]{8}") %>% - stringr::str_to_upper() %>% - stringr::str_split(pattern = "", simplify = TRUE) - - id <- matrix(data = match(id, LETTERS[1:26]), - nrow = nrow(id), - ncol = 8, - byrow = FALSE) - - hrefID <- NULL - - for(j in 1:nrow(id)) { - if(all(is.na(id[j, ]))) { - hrefID <- c(hrefID, - stringr::str_extract(string = href[j], - pattern = 
"[0-9]{4}|[0-9]{5}|[0-9]{6}|[0-9]{7}|[0-9]{8}")) - } else { - hrefID <- c(hrefID, paste(id[j, ], collapse = "")) - } - } - - hrefID <- stringr::str_extract(string = hrefID, - pattern = "[0-9]{4}|[0-9]{5}|[0-9]{6}|[0-9]{7}|[0-9]{8}") - - ## Extract and process press release issue date - hrefDate <- xml2::read_html(x = wp) %>% - rvest::html_nodes(css = ".view-content .content-time") %>% - rvest::html_text() - - hrefDate <- hrefDate[1:length(href)] - - ## Concatenate url, id and date - prURL <- c(prURL, href) - prID <- c(prID, hrefID) - prDate <- c(prDate, hrefDate) - } - - ## Convert prDate to date format - prDate <- lubridate::mdy(prDate) - - ## Convert prID to numeric - prID <- as.numeric(prID) - - ## Create tibble - pr <- tibble::tibble(data.frame(url = prURL, - id = prID, - date = prDate, - stringsAsFactors = FALSE)) - - ## Return DF - return(pr) -} - - -################################################################################ -# -#' Extract text of press release from the Philippines Department of Health -#' website -#' -#' @param base Base URL for press releases in the Department of Health website. -#' Default is \url{https://www.doh.gov.ph} -#' @param df A data.frame created using \code{get_pr_url} providing values for -#' relative URL of press release/s, unique identifier of press release and, -#' date of issue of press release. -#' -#' @return A tibble containing text of the press release with additional -#' information on line number, type of text, unique identifier and date of -#' press release. 
-#' -#' @examples -#' prURL <- get_pr_url(base = "http://www.doh.gov.ph/press-releases", -#' pages = 1) -#' get_press_release(base = "http://www.doh.gov.ph", -#' df = prURL[1, ]) -#' -#' @export -#' -#' -# -################################################################################ - -get_press_release <- function(base = "https://www.doh.gov.ph", - df) { - ## Form URL - url <- paste(base, df$url, sep = "") - - ## Extract text from URL - z <- xml2::read_html(x = url) %>% - rvest::html_nodes(css = ".panel") %>% - rvest::html_text() %>% - stringr::str_split(pattern = "\n") %>% - unlist() %>% - stringr::str_trim(side = "both") - - ## Remove empty elements - z <- z[z != ""] - - ## Split z to 80 characters width - pressRelease <- stringr::str_wrap(string = z[[3]], width = 80) - pressRelease <- stringr::str_split(string = pressRelease, pattern = "\n") - - ## Concatenate title with body of press release - pressRelease <- c(z[[2]], pressRelease[[1]]) - - ## Create pressRelease data.frame - pressRelease <- data.frame(linenumber = 1:length(pressRelease), - text = pressRelease, - source = "DOH", - type = "press release", - id = df$id, - date = df$date, - stringsAsFactors = FALSE) - - ## Convert pressRelease to tibble - pressRelease <- tibble::tibble(pressRelease) - - ## Return pressRelease - return(pressRelease) -} - - - diff --git a/R/03-get_dfa.R b/R/03-get_dfa.R new file mode 100644 index 0000000..246b29e --- /dev/null +++ b/R/03-get_dfa.R @@ -0,0 +1,100 @@ +################################################################################ +# +#' +#' Get URLs of Philippines Department of Foreign Affairs (DFA) press releases +#' +#' @param base Base URL for press releases in the Department of Foreign Affairs +#' website. Default is \url{https://www.dfa.gov.ph/dfa-news/dfa-releasesupdate} +#' @param type Type of text contained in the URLs. This can be one of the +#' following: resolution, press release, advisory, statement, etc. 
+#' +#' @return A tibble containing the unique identifier, title, release date, +#' source, type and absolute URL link of each press release +#' +#' @examples +#' get_dfa_links(type = "press release") +#' +#' @export +#' +# +################################################################################ + +get_dfa_links <- function(base = "https://www.dfa.gov.ph/dfa-news/dfa-releasesupdate", + type) { + ## Get html + xHTML <- xml2::read_html(x = base) + + ## Extract the start offset of the last page from the pagination "end" link (a character string) + end <- xHTML %>% + rvest::html_node(css = ".pagination ul .pagination-end a") %>% + rvest::html_attr(name = "href") %>% + stringr::str_extract(pattern = "[0-9]+") + + ## Get sequence of page start offsets in steps of 10 up to the ending number + nPages <- seq(from = 10, to = as.numeric(end), by = 10) + + ## Append end when the sequence stops short of it; compare and append numerically because end is a character string + if(max(nPages) < as.numeric(end)) nPages <- c(nPages, as.numeric(end)) + + ## Title + titleTable <- xHTML %>% + rvest::html_nodes(css = ".table-noheader") %>% + rvest::html_table() + + titleTable <- titleTable[[1]] + + ## URL + urlList <- xHTML %>% + rvest::html_nodes(css = ".table-noheader .list-title a") %>% + rvest::html_attr(name = "href") %>% + xml2::url_absolute(base = base) + + ## Cycle through pages + for(i in nPages) { + ## Get html + yHTML <- xml2::read_html(x = paste(base, "?start=", i, sep = "")) + + ## Extract table from current set of pages + xx <- yHTML %>% + rvest::html_nodes(css = ".table-noheader") %>% + rvest::html_table() + + xx <- xx[[1]] + + ## Extract url from current set of pages + yy <- yHTML %>% + rvest::html_nodes(css = ".table-noheader .list-title a") %>% + rvest::html_attr(name = "href") + + ## Concatenate tables by page + titleTable <- rbind(titleTable, xx) + + ## Concatenate URLs by page, converted to absolute URLs like those of the first page + urlList <- c(urlList, xml2::url_absolute(x = yy, base = base)) + } + + ## Extract id (first run of digits) from URLs + id <- stringr::str_extract(string = urlList, pattern = "[0-9]+") + + ## Concatenate titles with urls + urlTable <- data.frame(id, + titleTable, + "DFA", +
type, + urlList, + stringsAsFactors = FALSE) + + ## Rename table + names(urlTable) <- c("id", "title", "date", "source", "type", "url") + + ## date to Date format + urlTable$date <- lubridate::dmy(urlTable$date) + + ## Convert to tibble + urlTable <- tibble::tibble(urlTable) + + ## Return urlTable + return(urlTable) +} + + diff --git a/R/03-combine.R b/R/10-combine.R similarity index 100% rename from R/03-combine.R rename to R/10-combine.R diff --git a/R/comotext.R b/R/comotext.R index 3d3198a..af66305 100644 --- a/R/comotext.R +++ b/R/comotext.R @@ -19,10 +19,10 @@ #' str_split str_split_fixed str_remove_all str_extract str_to_upper #' str_replace #' @importFrom tibble tibble -#' @importFrom xml2 read_html +#' @importFrom xml2 read_html url_absolute #' @importFrom rvest html_nodes html_text html_attr html_table #' @importFrom magrittr %>% -#' @importFrom lubridate mdy +#' @importFrom lubridate mdy dmy #' @importFrom httr user_agent #' # @@ -31,5 +31,24 @@ ## quiets concerns of R CMD check re: iatfResList -if(getRversion() >= "2.15.1") utils::globalVariables("iatfResList") +##if(getRversion() >= "2.15.1") utils::globalVariables("iatfResolutionLinks") +################################################################################ +# +#' +#' Deprecated functions in comotext +#' +#' These functions still work but will be removed in the next version. +#' +#' \itemize{ +#' \item \code{\link{get_pr_url}}: This function is deprecated and will be +#' removed in the next version of comotext. +#' \item \code{\link{get_press_release}}: This function is deprecated and +#' will be removed in the next version of comotext.
+#' } +#' +#' @name comotext-deprecated +#' +# +################################################################################ +NULL diff --git a/R/data.R b/R/data.R index 188f364..2691387 100644 --- a/R/data.R +++ b/R/data.R @@ -233,7 +233,7 @@ #' @examples #' iatfResolution17 #' -#' @source \url{https://www.doh.gov.ph/sites/default/files/health-update/IATF Resolution No. 17.pdf} +#' @source \url{https://www.doh.gov.ph/sites/default/files/health-update/IATF\%20Resolution\%20No.\%2017.pdf} #' #' # @@ -287,7 +287,7 @@ #' @examples #' iatfResolution19 #' -#' @source \url{https://www.doh.gov.ph/sites/default/files/health-update/IATF Resolution No. 19.pdf} +#' @source \url{https://www.doh.gov.ph/sites/default/files/health-update/IATF\%20Resolution\%20No.\%2019.pdf} #' #' # @@ -673,12 +673,93 @@ "iatfResolution34" +################################################################################ +# +#' +#' COVID-19 Inter-Agency Task Force for the Management of Emerging Infectious +#' Diseases Resolution Number 35 +#' +#' @format A tibble with 140 rows and 6 columns: +#' \describe{ +#' \item{\code{linenumber}}{Linenumber} +#' \item{\code{text}}{Character string of text about 70 characters} +#' \item{\code{source}}{Issuer of resolution} +#' \item{\code{type}}{Type of document} +#' \item{\code{id}}{Text document identifier} +#' \item{\code{date}}{Date in format when resolution was made} +#' } +#' +#' @examples +#' iatfResolution35 +#' +#' @source \url{https://doh.gov.ph/sites/default/files/health-update/IATF-Resolution-No.-35.pdf} +#' +#' +# +################################################################################ +"iatfResolution35" + + +################################################################################ +# +#' +#' COVID-19 Inter-Agency Task Force for the Management of Emerging Infectious +#' Diseases Resolution Number 36 +#' +#' @format A tibble with 90 rows and 6 columns: +#' \describe{ +#' \item{\code{linenumber}}{Linenumber} +#' 
\item{\code{text}}{Character string of text about 70 characters} +#' \item{\code{source}}{Issuer of resolution} +#' \item{\code{type}}{Type of document} +#' \item{\code{id}}{Text document identifier} +#' \item{\code{date}}{Date in format when resolution was made} +#' } +#' +#' @examples +#' iatfResolution36 +#' +#' @source \url{https://doh.gov.ph/sites/default/files/health-update/IATF-Resolution-No.-36.pdf} +#' +#' +# +################################################################################ +"iatfResolution36" + + +################################################################################ +# +#' +#' COVID-19 Inter-Agency Task Force for the Management of Emerging Infectious +#' Diseases Resolution Number 37 +#' +#' @format A tibble with 49 rows and 6 columns: +#' \describe{ +#' \item{\code{linenumber}}{Linenumber} +#' \item{\code{text}}{Character string of text about 70 characters} +#' \item{\code{source}}{Issuer of resolution} +#' \item{\code{type}}{Type of document} +#' \item{\code{id}}{Text document identifier} +#' \item{\code{date}}{Date in format when resolution was made} +#' } +#' +#' @examples +#' iatfResolution37 +#' +#' @source \url{https://doh.gov.ph/sites/default/files/health-update/IATF-Resolution-No.-37.pdf} +#' +#' +# +################################################################################ +"iatfResolution37" + + ################################################################################ # #' #' Department of Health press releases from 2017 to May 2020 #' -#' @format A tibble with 13243 rows and 6 columns: +#' @format A tibble with 13325 rows and 6 columns: #' \describe{ #' \item{\code{linenumber}}{Linenumber} #' \item{\code{text}}{Character string of text about 70 characters} @@ -689,15 +770,15 @@ #' } #' #' @examples -#' pressRelease +#' dohRelease #' #' @source \url{https://www.doh.gov.ph/press-releases}. Updated as of -#' 16 May 2020. +#' 19 May 2020. 
#' #' # ################################################################################ -"pressRelease" +"dohRelease" ################################################################################ @@ -713,21 +794,21 @@ #' } #' #' @examples -#' prLinks +#' dohLinks #' #' @source Produced by \code{get_pr_url(pages = 1:25)}. Updated as of -#' 16 May 2020. +#' 19 May 2020. #' #' # ################################################################################ -"prLinks" +"dohLinks" ################################################################################ # #' -#' IATF resolutions URL links +#' IATF resolutions URLs #' #' @format A tibble with 25 rows and 4 columns: #' \describe{ @@ -738,12 +819,38 @@ #' } #' #' @examples -#' iatfResList +#' iatfLinks #' #' @source \url{https://www.doh.gov.ph/COVID-19/IATF-Resolutions}. Updated -#' as of 16 May 2020. +#' as of 19 May 2020. +#' +#' +# +################################################################################ +"iatfLinks" + + +################################################################################ +# +#' +#' Philippines Department of Foreign Affairs (DFA) press releases URLs +#' +#' @format A tibble with 2352 rows and 6 columns: +#' \describe{ +#' \item{\code{id}}{Unique identifier} +#' \item{\code{title}}{Title of press release} +#' \item{\code{date}}{Date of issue of press release} +#' \item{\code{source}}{Source of press release} +#' \item{\code{type}}{Type of text data - press release} +#' \item{\code{url}}{Absolute URLs of press releases} +#' } +#' +#' @examples +#' dfaLinks #' +#' @source \url{https://www.dfa.gov.ph/dfa-news/dfa-releasesupdate}.
Updated +#' as of 19 May 2020 #' # ################################################################################ -"iatfResList" +"dfaLinks" diff --git a/README.Rmd b/README.Rmd index 9b122af..06a124a 100644 --- a/README.Rmd +++ b/README.Rmd @@ -21,6 +21,7 @@ library(comotext) [![Lifecycle: experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](https://www.tidyverse.org/lifecycle/#experimental) [![Travis build status](https://travis-ci.org/como-ph/comotext.svg?branch=master)](https://travis-ci.org/como-ph/comotext) [![AppVeyor build status](https://ci.appveyor.com/api/projects/status/github/como-ph/comotext?branch=master&svg=true)](https://ci.appveyor.com/project/como-ph/comotext) +[![R build status](https://github.com/como-ph/comotext/workflows/R-CMD-check/badge.svg)](https://github.com/como-ph/comotext/actions) [![Codecov test coverage](https://codecov.io/gh/como-ph/comotext/branch/master/graph/badge.svg)](https://codecov.io/gh/como-ph/comotext?branch=master) [![DOI](https://zenodo.org/badge/255823130.svg)](https://zenodo.org/badge/latestdoi/255823130) @@ -40,59 +41,59 @@ remotes::install_github("como-ph/comotext") ### Datasets -`comotext` currently has 24 datasets of COVID-19-related resolutions and policies in the Philippines. These datasets are 24 resolutions made by the Inter-Agency Task Force for the Management of Emerging Infectious Diseases (IATF). +`comotext` currently has 28 datasets of COVID-19-related resolutions and policies in the Philippines. These datasets are 28 resolutions made by the Inter-Agency Task Force for the Management of Emerging Infectious Diseases (IATF). A description of the available datasets can be found [here](https://como-ph.github.io/comotext/reference/index.html#section-datasets). 
-A table of the ```r nrow(iatfResList)``` IATF resolutions and the URLs to download them can be generated using the function `get_iatf_links()` as follows: +A table of the ```r nrow(iatfLinks)``` IATF resolutions and the URLs to download them can be generated using the function `get_iatf_links()` as follows: ```{r usage, echo = TRUE, eval = TRUE} get_iatf_links() ``` -`comotext` also holds 1 dataset of all [Department of Health](http://www.doh.gov.ph) press releases to date. A description of the `pressRelease` dataset can be found [here](https://como-ph.github.io/comotext/reference/pressRelease.html). This dataset has been generated using the `get_press_release()` function (see description below) included in `comotext`. Related to this is the dataset `prLinks` which holds the relative URL links for each of the press releases in the [Department of Health](http://www.doh.gov.ph) website to date. This dataset has been produced using the `get_pr_url()` function (see description below) included in `comotext`. A description of the `prLinks` dataset can be found [here](https://como-ph.github.io/comotext/reference/prLinks.html). +`comotext` also holds 1 dataset of all [Department of Health](http://www.doh.gov.ph) press releases to date. A description of the `dohRelease` dataset can be found [here](https://como-ph.github.io/comotext/reference/dohRelease.html). This dataset has been generated using the `get_doh_release()` function (see description below) included in `comotext`. Related to this is the dataset `dohLinks` which holds the absolute URL links for each of the press releases in the [Department of Health](http://www.doh.gov.ph) website to date. This dataset has been produced using the `get_doh_links()` function (see description below) included in `comotext`. A description of the `dohLinks` dataset can be found [here](https://como-ph.github.io/comotext/reference/dohLinks.html). 
### Extracting text data from press releases -Press releases issued by the [Department of Health](https://www.doh.gov.ph) are available publicly via their [website](https://www.doh.gov.ph/press-releases). The structure of the press releases page is that the section with the links to the press releases text is in a panel within the web page with the panel itself having pagination with each page containing links to 15 press releases with press releases ordered in reverse chronological order. +Press releases issued by the [Department of Health](https://www.doh.gov.ph) are available publicly via their [website](https://www.doh.gov.ph/press-releases). The structure of the press releases page is that the section with the links to the press releases text is in a panel within the web page with the panel itself having pagination with each page containing links to 15 press releases with press releases ordered in reverse chronological order. -The function `get_pr_url` extracts the relative URL links to each of the press releases on a current page within the press releases panel. If we want to get the relative URL links for the press releases on page 1 of the press releases panel, we use: +The function `get_doh_links()` extracts the absolute URL links to each of the press releases on a current page within the press releases panel. If we want to get the absolute URL links for the press releases on page 1 of the press releases panel, we use: ```{r usage1, echo = TRUE, eval = TRUE} -get_pr_url(pages = 1) +get_doh_links(pages = 1) ``` -The function `get_press_releases` creates a dataset of text of press releases given a URL of a specific press release text and the date of release. This information is provided for by `get_pr_url`. If we want to get the text data of the press releases from page 1 of the press release panel, we use: +The function `get_doh_release()` creates a dataset of text of press releases given a URL of a specific press release text and the date of release. 
This information is provided for by `get_doh_links()`. If we want to get the text data of the press releases from page 1 of the press release panel, we use: ```{r usage2, echo = TRUE, eval = TRUE} ## Extract URLs from DoH press releases page 1 -prURL <- get_pr_url(pages = 1) +prURL <- get_doh_links(pages = 1) ## Extract text from first press release -get_press_release(df = prURL[1, ]) +get_doh_release(df = prURL[1, ]) ``` To get all the [DoH](https://www.doh.gov.ph) press releases available from their [website](https://www.doh.gov.ph/press-releases), use: ```{r usage3, echo = TRUE, eval = FALSE} ## Extract URLs -pr <- get_pr_url(pages = 1:25) +pr <- get_doh_links(pages = 1:25) ## Extract all press releases text pressRelease <- NULL for(i in 1:nrow(pr)) { - currentPR <- get_press_release(df = pr[i, ]) + currentPR <- get_doh_release(df = pr[i, ]) pressRelease <- rbind(pressRelease, currentPR) } ``` ```{r usage4, echo = FALSE, eval = TRUE} -pressRelease +dohRelease ``` -This produces the same dataset as `pressRelease` included in `comotext`. +This produces the same dataset as `dohRelease` included in `comotext`. 
### Concatenating text datasets diff --git a/README.md b/README.md index 36312b4..a9874ef 100644 --- a/README.md +++ b/README.md @@ -11,6 +11,8 @@ experimental](https://img.shields.io/badge/lifecycle-experimental-orange.svg)](h status](https://travis-ci.org/como-ph/comotext.svg?branch=master)](https://travis-ci.org/como-ph/comotext) [![AppVeyor build status](https://ci.appveyor.com/api/projects/status/github/como-ph/comotext?branch=master&svg=true)](https://ci.appveyor.com/project/como-ph/comotext) +[![R build +status](https://github.com/como-ph/comotext/workflows/R-CMD-check/badge.svg)](https://github.com/como-ph/comotext/actions) [![Codecov test coverage](https://codecov.io/gh/como-ph/comotext/branch/master/graph/badge.svg)](https://codecov.io/gh/como-ph/comotext?branch=master) [![DOI](https://zenodo.org/badge/255823130.svg)](https://zenodo.org/badge/latestdoi/255823130) @@ -38,47 +40,47 @@ remotes::install_github("como-ph/comotext") ### Datasets -`comotext` currently has 24 datasets of COVID-19-related resolutions and -policies in the Philippines. These datasets are 24 resolutions made by +`comotext` currently has 28 datasets of COVID-19-related resolutions and +policies in the Philippines. These datasets are 28 resolutions made by the Inter-Agency Task Force for the Management of Emerging Infectious Diseases (IATF). A description of the available datasets can be found [here](https://como-ph.github.io/comotext/reference/index.html#section-datasets). 
-A table of the `25` IATF resolutions and the URLs to download them can +A table of the `29` IATF resolutions and the URLs to download them can be generated using the function `get_iatf_links()` as follows: ``` r get_iatf_links() -#> # A tibble: 25 x 4 -#> id title date link -#> -#> 1 9 Recommendations for the Manag… 2020-03-03 https://doh.gov.ph/sites/def… -#> 2 10 Recommendations for the Manag… 2020-03-09 https://doh.gov.ph/sites/def… -#> 3 11 Recommendations for the Manag… 2020-03-12 https://doh.gov.ph/sites/def… -#> 4 12 Recommendations for the Manag… 2020-03-13 https://doh.gov.ph/sites/def… -#> 5 13 Recommendations for the Manag… 2020-03-17 https://doh.gov.ph/sites/def… -#> 6 14 Resolutions Relative to the M… 2020-03-20 https://doh.gov.ph/sites/def… -#> 7 15 Resolutions Relative to the M… 2020-03-25 https://doh.gov.ph/sites/def… -#> 8 16 Additional Guidelines for the… 2020-03-30 https://doh.gov.ph/sites/def… -#> 9 17 Recommendations Relative to t… 2020-03-30 https://doh.gov.ph/sites/def… -#> 10 18 Recommendations Relative to t… 2020-04-01 https://doh.gov.ph/sites/def… -#> # … with 15 more rows +#> # A tibble: 29 x 6 +#> id title date source type url +#> +#> 1 9 Recommendations for th… 2020-03-03 IATF resol… https://doh.gov.ph/si… +#> 2 10 Recommendations for th… 2020-03-09 IATF resol… https://doh.gov.ph/si… +#> 3 11 Recommendations for th… 2020-03-12 IATF resol… https://doh.gov.ph/si… +#> 4 12 Recommendations for th… 2020-03-13 IATF resol… https://doh.gov.ph/si… +#> 5 13 Recommendations for th… 2020-03-17 IATF resol… https://doh.gov.ph/si… +#> 6 14 Resolutions Relative t… 2020-03-20 IATF resol… https://doh.gov.ph/si… +#> 7 15 Resolutions Relative t… 2020-03-25 IATF resol… https://doh.gov.ph/si… +#> 8 16 Additional Guidelines … 2020-03-30 IATF resol… https://doh.gov.ph/si… +#> 9 17 Recommendations Relati… 2020-03-30 IATF resol… https://doh.gov.ph/si… +#> 10 18 Recommendations Relati… 2020-04-01 IATF resol… https://doh.gov.ph/si… +#> # … with 19 more rows ``` 
`comotext` also holds 1 dataset of all [Department of Health](http://www.doh.gov.ph) press releases to date. A description of -the `pressRelease` dataset can be found -[here](https://como-ph.github.io/comotext/reference/pressRelease.html). -This dataset has been generated using the `get_press_release()` function +the `dohRelease` dataset can be found +[here](https://como-ph.github.io/comotext/reference/dohRelease.html). +This dataset has been generated using the `get_doh_release()` function (see description below) included in `comotext`. Related to this is the -dataset `prLinks` which holds the relative URL links for each of the +dataset `dohLinks` which holds the relative URL links for each of the press releases in the [Department of Health](http://www.doh.gov.ph) -website to date. This dataset has been produced using the `get_pr_url()` -function (see description below) included in `comotext`. A description -of the `prLinks` dataset can be found -[here](https://como-ph.github.io/comotext/reference/prLinks.html). +website to date. This dataset has been produced using the +`get_doh_links()` function (see description below) included in +`comotext`. A description of the `dohLinks` dataset can be found +[here](https://como-ph.github.io/comotext/reference/dohLinks.html). ### Extracting text data from press releases @@ -87,62 +89,62 @@ Health](https://www.doh.gov.ph) are available publicly via their [website](https://www.doh.gov.ph/press-releases). The structure of the press releases page is that the section with the links to the press releases text is in a panel within the web page with the panel itself -having pagination with each page containing links to 15 press releases +having pagination with each page containing links to 28 press releases with press releases ordered in reverse chronological order. -The function `get_pr_url` extracts the relative URL links to each of the -press releases on a current page within the press releases panel. 
If we -want to get the relative URL links for the press releases on page 1 of -the press releases panel, we use: +The function `get_doh_links()` extracts the relative URL links to each +of the press releases on a current page within the press releases panel. +If we want to get the absolute URL links for the press releases on page +1 of the press releases panel, we use: ``` r -get_pr_url(pages = 1) -#> # A tibble: 15 x 3 -#> url id date -#> -#> 1 /press-release/Malasakit%3A-panlaban-natin-sa-COVID-19%3B-f… 1311 2020-05-16 -#> 2 /doh-press-release/EXPERTS-RALLY-BEHIND-DOH-DATA-INTEGRITY-… 5241 2020-05-14 -#> 3 /doh-press-release/EXPERTS-RALLY-BEHIND-DOH-DATA-INTEGRITY-… 5241 2020-05-14 -#> 4 /doh-press-release/NEW-QUARANTINE-SITE-IN-ALABANG-INAUGURAT… 1452 2020-05-13 -#> 5 /doh-press-release/STATEMENT-ON-DATA-INTEGRITY 1920 2020-05-13 -#> 6 /doh-press-release/HONORING-NURSE%E2%80%99S-DAY%3A-DOH-CALL… 8151 2020-05-13 -#> 7 /doh-press-release/SAN-JUAN-CITY-THANKS-DOH-AS-COVID-19-CAS… 1911 2020-05-13 -#> 8 /press-release/ADB-Sponsored-COVID-19-Lab-in-Pampanga-Launc… 1421 2020-05-10 -#> 9 /press-release/ECQ-Buys-PH-Time-Continued-Practice-of-Healt… 5317 2020-05-09 -#> 10 /press-release/biggest-mega-swabbing-center-in-moa-arena-to… 2977 2020-05-08 -#> 11 /doh-press-release/Press%20Release/DUQUE-THANKS-NAVY-FRONTL… 4211 2020-05-07 -#> 12 /doh-press-release/PH-GOV%E2%80%99T-RECEIVES-7-METRIC-TONS-… 2018 2020-05-07 -#> 13 /doh-press-release/BEYOND-NUMBERS%3A-WHAT-THE-FLATTENING-CU… 2525 2020-05-07 -#> 14 /doh-press-release/2ND-MEGA-SWABBING-CENTER-SET-TO-OPERATE%… 1441 2020-05-06 -#> 15 /doh-press-release/NEW-MEGA-SWABBING-CENTER-TO-RAMP-UP-COVI… 1452 2020-05-06 +get_doh_links(pages = 1) +#> # A tibble: 15 x 6 +#> id title date source type url +#> +#> 1 4158 "DOH TO DEPUTIZE MED G… 2020-05-19 DOH press… https://www.doh.gov.p… +#> 2 4158 "DOH ENCOURAGES VIRTUA… 2020-05-18 DOH press… https://www.doh.gov.p… +#> 3 1311 "Malasakit: panlaban n… 2020-05-16 DOH press… 
https://www.doh.gov.p… +#> 4 5241 "EXPERTS RALLY BEHIND … 2020-05-14 DOH press… https://www.doh.gov.p… +#> 5 5241 "GOVERNMENT OFFERS FUL… 2020-05-14 DOH press… https://www.doh.gov.p… +#> 6 1452 "NEW QUARANTINE SITE I… 2020-05-13 DOH press… https://www.doh.gov.p… +#> 7 1920 "STATEMENT ON DATA INT… 2020-05-13 DOH press… https://www.doh.gov.p… +#> 8 8151 "HONORING NURSE’S DAY:… 2020-05-13 DOH press… https://www.doh.gov.p… +#> 9 1911 "SAN JUAN CITY THANKS … 2020-05-13 DOH press… https://www.doh.gov.p… +#> 10 1421 "ADB-Sponsored COVID-1… 2020-05-10 DOH press… https://www.doh.gov.p… +#> 11 5317 "ECQ Buys PH Time; Con… 2020-05-09 DOH press… https://www.doh.gov.p… +#> 12 2977 "BIGGEST MEGA SWABBING… 2020-05-08 DOH press… https://www.doh.gov.p… +#> 13 4211 "DUQUE THANKS NAVY FRO… 2020-05-07 DOH press… https://www.doh.gov.p… +#> 14 2018 "PH GOV’T RECEIVES 7 M… 2020-05-07 DOH press… https://www.doh.gov.p… +#> 15 2525 "BEYOND\tNUMBERS:\tWHA… 2020-05-07 DOH press… https://www.doh.gov.p… ``` -The function `get_press_releases` creates a dataset of text of press +The function `get_doh_release()` creates a dataset of text of press releases given a URL of a specific press release text and the date of -release. This information is provided for by `get_pr_url`. If we want to -get the text data of the press releases from page 1 of the press release -panel, we use: +release. This information is provided for by `get_doh_links()`. 
If we +want to get the text data of the press releases from page 1 of the press +release panel, we use: ``` r ## Extract URLs from DoH press releases page 1 -prURL <- get_pr_url(pages = 1) +prURL <- get_doh_links(pages = 1) ## Extract text from first press release -get_press_release(df = prURL[1, ]) -#> # A tibble: 64 x 6 +get_doh_release(df = prURL[1, ]) +#> # A tibble: 49 x 6 #> linenumber text source type id date #> -#> 1 1 Malasakit: panlaban natin sa COV… DOH press r… 1311 2020-05-16 -#> 2 2 Press Release / 16 May 2020The e… DOH press r… 1311 2020-05-16 -#> 3 3 Philippines is a huge breath of … DOH press r… 1311 2020-05-16 -#> 4 4 livelihoods have been displaced,… DOH press r… 1311 2020-05-16 -#> 5 5 that day-to-day behavior has to … DOH press r… 1311 2020-05-16 -#> 6 6 16 media forum, medical anthropo… DOH press r… 1311 2020-05-16 -#> 7 7 professor, Dr. Michael Tan, appe… DOH press r… 1311 2020-05-16 -#> 8 8 practice healthy behaviors while… DOH press r… 1311 2020-05-16 -#> 9 9 UP Chancellor also called on all… DOH press r… 1311 2020-05-16 -#> 10 10 highlighting the importance of s… DOH press r… 1311 2020-05-16 -#> # … with 54 more rows +#> 1 1 DOH TO DEPUTIZE MED GRADS TO ASS… DOH press r… 4158 2020-05-19 +#> 2 2 Press Release/18 May 2020 Consis… DOH press r… 4158 2020-05-19 +#> 3 3 11469 or the “Bayanihan to Heal … DOH press r… 4158 2020-05-19 +#> 4 4 of 1959,” the Department of Heal… DOH press r… 4158 2020-05-19 +#> 5 5 2020-0169 last Saturday, which a… DOH press r… 4158 2020-05-19 +#> 6 6 practice of medicine as deputize… DOH press r… 4158 2020-05-19 +#> 7 7 of registration from the Profess… DOH press r… 4158 2020-05-19 +#> 8 8 in the President the power “to e… DOH press r… 4158 2020-05-19 +#> 9 9 complement or supplement the cur… DOH press r… 4158 2020-05-19 +#> 10 10 R.A. 
2382 stipulates “Medical gr… DOH press r… 4158 2020-05-19 +#> # … with 39 more rows ``` To get all the [DoH](https://www.doh.gov.ph) press releases available @@ -150,34 +152,34 @@ from their [website](https://www.doh.gov.ph/press-releases), use: ``` r ## Extract URLs -pr <- get_pr_url(pages = 1:25) +pr <- get_doh_links(pages = 1:25) ## Extract all press releases text pressRelease <- NULL for(i in 1:nrow(pr)) { - currentPR <- get_press_release(df = pr[i, ]) + currentPR <- get_doh_release(df = pr[i, ]) pressRelease <- rbind(pressRelease, currentPR) } ``` - #> # A tibble: 13,243 x 6 + #> # A tibble: 13,325 x 6 #> linenumber text source type id date #> - #> 1 1 Malasakit: panlaban natin sa COV… DOH press r… 1311 2020-05-16 - #> 2 2 Press Release / 16 May 2020The e… DOH press r… 1311 2020-05-16 - #> 3 3 Philippines is a huge breath of … DOH press r… 1311 2020-05-16 - #> 4 4 livelihoods have been displaced,… DOH press r… 1311 2020-05-16 - #> 5 5 that day-to-day behavior has to … DOH press r… 1311 2020-05-16 - #> 6 6 16 media forum, medical anthropo… DOH press r… 1311 2020-05-16 - #> 7 7 professor, Dr. Michael Tan, appe… DOH press r… 1311 2020-05-16 - #> 8 8 practice healthy behaviors while… DOH press r… 1311 2020-05-16 - #> 9 9 UP Chancellor also called on all… DOH press r… 1311 2020-05-16 - #> 10 10 highlighting the importance of s… DOH press r… 1311 2020-05-16 - #> # … with 13,233 more rows - -This produces the same dataset as `pressRelease` included in `comotext`. 
+ #> 1 1 DOH TO DEPUTIZE MED GRADS TO ASS… DOH press r… 4158 2020-05-19 + #> 2 2 Press Release/18 May 2020 Consis… DOH press r… 4158 2020-05-19 + #> 3 3 11469 or the “Bayanihan to Heal … DOH press r… 4158 2020-05-19 + #> 4 4 of 1959,” the Department of Heal… DOH press r… 4158 2020-05-19 + #> 5 5 2020-0169 last Saturday, which a… DOH press r… 4158 2020-05-19 + #> 6 6 practice of medicine as deputize… DOH press r… 4158 2020-05-19 + #> 7 7 of registration from the Profess… DOH press r… 4158 2020-05-19 + #> 8 8 in the President the power “to e… DOH press r… 4158 2020-05-19 + #> 9 9 complement or supplement the cur… DOH press r… 4158 2020-05-19 + #> 10 10 R.A. 2382 stipulates “Medical gr… DOH press r… 4158 2020-05-19 + #> # … with 13,315 more rows + +This produces the same dataset as `dohRelease` included in `comotext`. ### Concatenating text datasets @@ -197,7 +199,7 @@ will be returned. ``` r combine_docs(docs = "resolution") -#> # A tibble: 2,907 x 6 +#> # A tibble: 3,186 x 6 #> linenumber text source type id date #> #> 1 1 WHEREAS, on January 31, 2020, upo… IATF resolu… 9 2020-03-03 @@ -210,7 +212,7 @@ combine_docs(docs = "resolution") #> 8 8 exemptions in favor of certain cl… IATF resolu… 9 2020-03-03 #> 9 9 SAR, and Macau SAR; IATF resolu… 9 2020-03-03 #> 10 10 WHEREAS, on February 26, 2020, fo… IATF resolu… 9 2020-03-03 -#> # … with 2,897 more rows +#> # … with 3,176 more rows ``` The `combine_iatf` function is a specialised wrapper of the diff --git a/appveyor.yml b/appveyor.yml index 3a75e16..d3baaff 100644 --- a/appveyor.yml +++ b/appveyor.yml @@ -17,7 +17,7 @@ environment: NOT_CRAN: true # env vars that may need to be set, at least temporarily, from time to time # see https://github.com/krlmlr/r-appveyor#readme for details - # USE_RTOOLS: true + USE_RTOOLS: true # R_REMOTES_STANDALONE: true # Adapt as necessary starting from here diff --git a/data-raw/IATF/IATF-Resolution-No.-35.pdf b/data-raw/IATF/IATF-Resolution-No.-35.pdf new file mode 100644 index 
0000000..625245f Binary files /dev/null and b/data-raw/IATF/IATF-Resolution-No.-35.pdf differ diff --git a/data-raw/IATF/IATF-Resolution-No.-36.pdf b/data-raw/IATF/IATF-Resolution-No.-36.pdf new file mode 100644 index 0000000..7afeed7 Binary files /dev/null and b/data-raw/IATF/IATF-Resolution-No.-36.pdf differ diff --git a/data-raw/IATF/IATF-Resolution-No.-37.pdf b/data-raw/IATF/IATF-Resolution-No.-37.pdf new file mode 100644 index 0000000..d6226ef Binary files /dev/null and b/data-raw/IATF/IATF-Resolution-No.-37.pdf differ diff --git a/data-raw/IATF/Omnibus-Guidelines-community-quarantine.pdf b/data-raw/IATF/Omnibus-Guidelines-community-quarantine.pdf new file mode 100644 index 0000000..f189e48 Binary files /dev/null and b/data-raw/IATF/Omnibus-Guidelines-community-quarantine.pdf differ diff --git a/data-raw/dfaRelease.R b/data-raw/dfaRelease.R new file mode 100644 index 0000000..c42d4d3 --- /dev/null +++ b/data-raw/dfaRelease.R @@ -0,0 +1,10 @@ +library(stringr) + +## Extract URLs of DFA press releases ########################################## + +dfaLinks <- get_dfa_links(type = "press release") + +usethis::use_data(dfaLinks, overwrite = TRUE, compress = "xz") + +## Extract text from URLs of press releases #################################### + diff --git a/data-raw/dohRelease.R b/data-raw/dohRelease.R new file mode 100644 index 0000000..760e99b --- /dev/null +++ b/data-raw/dohRelease.R @@ -0,0 +1,22 @@ +library(rvest) +library(stringr) +library(comotext) + +## Extract press releases URLs ################################################# + +dohLinks <- get_doh_links(pages = 1:25) + +usethis::use_data(dohLinks, overwrite = TRUE, compress = "xz") + +## Extract text from press releases ############################################ + +dohRelease <- NULL + +for(i in 1:nrow(dohReleaseLinks)) { + currentPR <- get_doh_release(df = dohReleaseLinks[i, ]) + + dohRelease <- rbind(dohRelease, currentPR) +} + +usethis::use_data(dohRelease, overwrite = TRUE, compress = "xz") + 
diff --git a/data-raw/prepareResolutions.R b/data-raw/iatfResolution.R similarity index 91% rename from data-raw/prepareResolutions.R rename to data-raw/iatfResolution.R index 47ccadf..8091ee0 100644 --- a/data-raw/prepareResolutions.R +++ b/data-raw/iatfResolution.R @@ -4,9 +4,9 @@ library(stringr) ## Resolutions table ########################################################### -iatfResList <- get_iatf_links() +iatfLinks <- get_iatf_links() -usethis::use_data(iatfResList, overwrite = TRUE, compress = "xz") +usethis::use_data(iatfLinks, overwrite = TRUE, compress = "xz") ## Resolution 9 ################################################################ @@ -926,3 +926,91 @@ y <- data.frame(linenumber = 1:length(y), iatfResolution34 <- tibble::tibble(y) usethis::use_data(iatfResolution34, overwrite = TRUE, compress = "xz") + +## Resolution 35 ############################################################### + +x <- pdf_ocr_text(pdf = "data-raw/IATF/IATF-Resolution-No.-35.pdf") + +## Restructure text +y <- unlist(stringr::str_split(string = x, pattern = "\n")) + +y <- y[c(9:45, 52:90, 98:131, 141:165, 173:207)] +y <- y[y != ""] + +y[115] <- "Francisco T. Duque III Karlo Alexei B. 
Nograles" +y[116] <- "Secretary, Department of Health Cabinet Secretary, Office of the Cabinet Secretary" +y[117] <- "IATF Chairperson IATF Co-Chairperson" + +y <- stringr::str_replace_all(string = y, pattern = "\\[ATF", replacement = "IATF") + +y <- stringr::str_replace_all(string = y, pattern = "\\[", replacement = "I") + +y <- stringr::str_trim(string = y, side = "both") + +y <- data.frame(linenumber = 1:length(y), + text = y, + source = "IATF", + type = "resolution", + id = 35, + date = as.Date("11/05/2020", format = "%d/%m/%y"), + stringsAsFactors = FALSE) + +iatfResolution35 <- tibble::tibble(y) + +usethis::use_data(iatfResolution35, overwrite = TRUE, compress = "xz") + +## Resolution 36 ############################################################### + +x <- pdf_ocr_text(pdf = "data-raw/IATF/IATF-Resolution-No.-36.pdf") + +## Restructure text +y <- unlist(stringr::str_split(string = x, pattern = "\n")) + +y <- y[c(9:46, 54:93, 101:131)] +y <- y[y != ""] + +y[88] <- "Francisco T. Duque III Karlo Alexei B. Nograles" +y[89] <- "Secretary, Department of Health Cabinet Secretary, Office of the Cabinet Secretary" +y[90] <- "IATF Chairperson IATF Co-Chairperson" + +y <- stringr::str_trim(string = y, side = "both") + +y <- data.frame(linenumber = 1:length(y), + text = y, + source = "IATF", + type = "resolution", + id = 36, + date = as.Date("13/05/2020", format = "%d/%m/%y"), + stringsAsFactors = FALSE) + +iatfResolution36 <- tibble::tibble(y) + +usethis::use_data(iatfResolution36, overwrite = TRUE, compress = "xz") + +## Resolution 37 ############################################################### + +x <- pdf_ocr_text(pdf = "data-raw/IATF/IATF-Resolution-No.-37.pdf") + +## Restructure text +y <- unlist(stringr::str_split(string = x, pattern = "\n")) + +y <- y[c(13:49, 58:76)] +y <- y[y != ""] + +y[47] <- "Francisco T. Duque III Karlo Alexei B. 
Nograles" +y[48] <- "Secretary, Department of Health Cabinet Secretary, Office of the Cabinet Secretary" +y[49] <- "IATF Chairperson IATF Co-Chairperson" + +y <- stringr::str_trim(string = y, side = "both") + +y <- data.frame(linenumber = 1:length(y), + text = y, + source = "IATF", + type = "resolution", + id = 37, + date = as.Date("15/05/2020", format = "%d/%m/%y"), + stringsAsFactors = FALSE) + +iatfResolution37 <- tibble::tibble(y) + +usethis::use_data(iatfResolution37, overwrite = TRUE, compress = "xz") diff --git a/data-raw/preparePress.R b/data-raw/preparePress.R deleted file mode 100644 index 929ac56..0000000 --- a/data-raw/preparePress.R +++ /dev/null @@ -1,24 +0,0 @@ -library(rvest) -library(stringr) -library(comotext) - -## Extract press releases ###################################################### - -pr <- get_pr_url(pages = 1:25) - -prLinks <- pr - -usethis::use_data(prLinks, overwrite = TRUE, compress = "xz") - -## Extract text from press releases ############################################ - -pressRelease <- NULL - -for(i in 1:nrow(pr)) { - currentPR <- get_press_release(df = pr[i, ]) - - pressRelease <- rbind(pressRelease, currentPR) -} - -usethis::use_data(pressRelease, overwrite = TRUE, compress = "xz") - diff --git a/data/dfaLinks.rda b/data/dfaLinks.rda new file mode 100644 index 0000000..d098b43 Binary files /dev/null and b/data/dfaLinks.rda differ diff --git a/data/dohLinks.rda b/data/dohLinks.rda new file mode 100644 index 0000000..2a68671 Binary files /dev/null and b/data/dohLinks.rda differ diff --git a/data/dohRelease.rda b/data/dohRelease.rda new file mode 100644 index 0000000..bc8210e Binary files /dev/null and b/data/dohRelease.rda differ diff --git a/data/iatfLinks.rda b/data/iatfLinks.rda new file mode 100644 index 0000000..552402c Binary files /dev/null and b/data/iatfLinks.rda differ diff --git a/data/iatfResList.rda b/data/iatfResList.rda deleted file mode 100644 index a1105f3..0000000 Binary files a/data/iatfResList.rda and 
/dev/null differ diff --git a/data/iatfResolution35.rda b/data/iatfResolution35.rda new file mode 100644 index 0000000..2fffcc6 Binary files /dev/null and b/data/iatfResolution35.rda differ diff --git a/data/iatfResolution36.rda b/data/iatfResolution36.rda new file mode 100644 index 0000000..e92c77a Binary files /dev/null and b/data/iatfResolution36.rda differ diff --git a/data/iatfResolution37.rda b/data/iatfResolution37.rda new file mode 100644 index 0000000..4c8b002 Binary files /dev/null and b/data/iatfResolution37.rda differ diff --git a/data/prLinks.rda b/data/prLinks.rda deleted file mode 100644 index feb9ca5..0000000 Binary files a/data/prLinks.rda and /dev/null differ diff --git a/data/pressRelease.rda b/data/pressRelease.rda deleted file mode 100644 index 3f7a413..0000000 Binary files a/data/pressRelease.rda and /dev/null differ diff --git a/docs/index.html b/docs/index.html index 1e76acf..30a7225 100644 --- a/docs/index.html +++ b/docs/index.html @@ -96,97 +96,97 @@

Datasets

-

comotext currently has 24 datasets of COVID-19-related resolutions and policies in the Philippines. These datasets are 24 resolutions made by the Inter-Agency Task Force for the Management of Emerging Infectious Diseases (IATF).

+

comotext currently has 28 datasets of COVID-19-related resolutions and policies in the Philippines. These datasets are 28 resolutions made by the Inter-Agency Task Force for the Management of Emerging Infectious Diseases (IATF).

A description of the available datasets can be found here.

-

A table of the 25 IATF resolutions and the URLs to download them can be generated using the function get_iatf_links() as follows:

+

A table of the 29 IATF resolutions and the URLs to download them can be generated using the function get_iatf_links() as follows:

get_iatf_links()
-#> # A tibble: 25 x 4
-#>       id title                          date       link                         
-#>    <dbl> <chr>                          <date>     <chr>                        
-#>  1     9 Recommendations for the Manag… 2020-03-03 https://doh.gov.ph/sites/def…
-#>  2    10 Recommendations for the Manag… 2020-03-09 https://doh.gov.ph/sites/def…
-#>  3    11 Recommendations for the Manag… 2020-03-12 https://doh.gov.ph/sites/def…
-#>  4    12 Recommendations for the Manag… 2020-03-13 https://doh.gov.ph/sites/def…
-#>  5    13 Recommendations for the Manag… 2020-03-17 https://doh.gov.ph/sites/def…
-#>  6    14 Resolutions Relative to the M… 2020-03-20 https://doh.gov.ph/sites/def…
-#>  7    15 Resolutions Relative to the M… 2020-03-25 https://doh.gov.ph/sites/def…
-#>  8    16 Additional Guidelines for the… 2020-03-30 https://doh.gov.ph/sites/def…
-#>  9    17 Recommendations Relative to t… 2020-03-30 https://doh.gov.ph/sites/def…
-#> 10    18 Recommendations Relative to t… 2020-04-01 https://doh.gov.ph/sites/def…
-#> # … with 15 more rows
-

comotext also holds 1 dataset of all Department of Health press releases to date. A description of the pressRelease dataset can be found here. This dataset has been generated using the get_press_release() function (see description below) included in comotext. Related to this is the dataset prLinks which holds the relative URL links for each of the press releases in the Department of Health website to date. This dataset has been produced using the get_pr_url() function (see description below) included in comotext. A description of the prLinks dataset can be found here.

+#> # A tibble: 29 x 6 +#> id title date source type url +#> <dbl> <chr> <date> <chr> <chr> <chr> +#> 1 9 Recommendations for th… 2020-03-03 IATF resol… https://doh.gov.ph/si… +#> 2 10 Recommendations for th… 2020-03-09 IATF resol… https://doh.gov.ph/si… +#> 3 11 Recommendations for th… 2020-03-12 IATF resol… https://doh.gov.ph/si… +#> 4 12 Recommendations for th… 2020-03-13 IATF resol… https://doh.gov.ph/si… +#> 5 13 Recommendations for th… 2020-03-17 IATF resol… https://doh.gov.ph/si… +#> 6 14 Resolutions Relative t… 2020-03-20 IATF resol… https://doh.gov.ph/si… +#> 7 15 Resolutions Relative t… 2020-03-25 IATF resol… https://doh.gov.ph/si… +#> 8 16 Additional Guidelines … 2020-03-30 IATF resol… https://doh.gov.ph/si… +#> 9 17 Recommendations Relati… 2020-03-30 IATF resol… https://doh.gov.ph/si… +#> 10 18 Recommendations Relati… 2020-04-01 IATF resol… https://doh.gov.ph/si… +#> # … with 19 more rows
+

comotext also holds one dataset of all Department of Health press releases to date. A description of the dohRelease dataset can be found here. This dataset has been generated using the get_doh_release() function (see description below) included in comotext. Related to this is the dataset dohLinks, which holds the absolute URL links for each of the press releases on the Department of Health website to date. This dataset has been produced using the get_doh_links() function (see description below) included in comotext. A description of the dohLinks dataset can be found here.

Extracting text data from press releases

-

Press releases issued by the Department of Health are available publicly via their website. The structure of the press releases page is that the section with the links to the press releases text is in a panel within the web page with the panel itself having pagination with each page containing links to 15 press releases with press releases ordered in reverse chronological order.

-

The function get_pr_url extracts the relative URL links to each of the press releases on a current page within the press releases panel. If we want to get the relative URL links for the press releases on page 1 of the press releases panel, we use:

-
get_pr_url(pages = 1)
-#> # A tibble: 15 x 3
-#>    url                                                             id date      
-#>    <chr>                                                        <dbl> <date>    
-#>  1 /press-release/Malasakit%3A-panlaban-natin-sa-COVID-19%3B-f…  1311 2020-05-16
-#>  2 /doh-press-release/EXPERTS-RALLY-BEHIND-DOH-DATA-INTEGRITY-…  5241 2020-05-14
-#>  3 /doh-press-release/EXPERTS-RALLY-BEHIND-DOH-DATA-INTEGRITY-…  5241 2020-05-14
-#>  4 /doh-press-release/NEW-QUARANTINE-SITE-IN-ALABANG-INAUGURAT…  1452 2020-05-13
-#>  5 /doh-press-release/STATEMENT-ON-DATA-INTEGRITY                1920 2020-05-13
-#>  6 /doh-press-release/HONORING-NURSE%E2%80%99S-DAY%3A-DOH-CALL…  8151 2020-05-13
-#>  7 /doh-press-release/SAN-JUAN-CITY-THANKS-DOH-AS-COVID-19-CAS…  1911 2020-05-13
-#>  8 /press-release/ADB-Sponsored-COVID-19-Lab-in-Pampanga-Launc…  1421 2020-05-10
-#>  9 /press-release/ECQ-Buys-PH-Time-Continued-Practice-of-Healt…  5317 2020-05-09
-#> 10 /press-release/biggest-mega-swabbing-center-in-moa-arena-to…  2977 2020-05-08
-#> 11 /doh-press-release/Press%20Release/DUQUE-THANKS-NAVY-FRONTL…  4211 2020-05-07
-#> 12 /doh-press-release/PH-GOV%E2%80%99T-RECEIVES-7-METRIC-TONS-…  2018 2020-05-07
-#> 13 /doh-press-release/BEYOND-NUMBERS%3A-WHAT-THE-FLATTENING-CU…  2525 2020-05-07
-#> 14 /doh-press-release/2ND-MEGA-SWABBING-CENTER-SET-TO-OPERATE%…  1441 2020-05-06
-#> 15 /doh-press-release/NEW-MEGA-SWABBING-CENTER-TO-RAMP-UP-COVI…  1452 2020-05-06
-

The function get_press_releases creates a dataset of text of press releases given a URL of a specific press release text and the date of release. This information is provided for by get_pr_url. If we want to get the text data of the press releases from page 1 of the press release panel, we use:

+

Press releases issued by the Department of Health are publicly available via their website. The links to the press release texts sit in a paginated panel within the press releases page. Each page of the panel contains links to 28 press releases, ordered in reverse chronological order.

+

The function get_doh_links() extracts the absolute URL links to each of the press releases on a given page within the press releases panel. If we want to get the absolute URL links for the press releases on page 1 of the press releases panel, we use:

+
get_doh_links(pages = 1)
+#> # A tibble: 15 x 6
+#>       id title                   date       source type   url                   
+#>    <dbl> <chr>                   <date>     <chr>  <chr>  <chr>                 
+#>  1  4158 "DOH TO DEPUTIZE MED G… 2020-05-19 DOH    press… https://www.doh.gov.p…
+#>  2  4158 "DOH ENCOURAGES VIRTUA… 2020-05-18 DOH    press… https://www.doh.gov.p…
+#>  3  1311 "Malasakit: panlaban n… 2020-05-16 DOH    press… https://www.doh.gov.p…
+#>  4  5241 "EXPERTS RALLY BEHIND … 2020-05-14 DOH    press… https://www.doh.gov.p…
+#>  5  5241 "GOVERNMENT OFFERS FUL… 2020-05-14 DOH    press… https://www.doh.gov.p…
+#>  6  1452 "NEW QUARANTINE SITE I… 2020-05-13 DOH    press… https://www.doh.gov.p…
+#>  7  1920 "STATEMENT ON DATA INT… 2020-05-13 DOH    press… https://www.doh.gov.p…
+#>  8  8151 "HONORING NURSE’S DAY:… 2020-05-13 DOH    press… https://www.doh.gov.p…
+#>  9  1911 "SAN JUAN CITY THANKS … 2020-05-13 DOH    press… https://www.doh.gov.p…
+#> 10  1421 "ADB-Sponsored COVID-1… 2020-05-10 DOH    press… https://www.doh.gov.p…
+#> 11  5317 "ECQ Buys PH Time; Con… 2020-05-09 DOH    press… https://www.doh.gov.p…
+#> 12  2977 "BIGGEST MEGA SWABBING… 2020-05-08 DOH    press… https://www.doh.gov.p…
+#> 13  4211 "DUQUE THANKS NAVY FRO… 2020-05-07 DOH    press… https://www.doh.gov.p…
+#> 14  2018 "PH GOV’T RECEIVES 7 M… 2020-05-07 DOH    press… https://www.doh.gov.p…
+#> 15  2525 "BEYOND\tNUMBERS:\tWHA… 2020-05-07 DOH    press… https://www.doh.gov.p…
+

The function get_doh_release() creates a dataset of the text of a press release given the URL of that press release and its date of release. This information is provided by get_doh_links(). If we want to get the text data of the first press release from page 1 of the press release panel, we use:

## Extract URLs from DoH press releases page 1
-prURL <- get_pr_url(pages = 1)
+prURL <- get_doh_links(pages = 1)
 
 ## Extract text from first press release
-get_press_release(df = prURL[1, ])
-#> # A tibble: 64 x 6
+get_doh_release(df = prURL[1, ])
+#> # A tibble: 49 x 6
 #>    linenumber text                              source type        id date      
 #>         <int> <chr>                             <chr>  <chr>    <dbl> <date>    
-#>  1          1 Malasakit: panlaban natin sa COV… DOH    press r…  1311 2020-05-16
-#>  2          2 Press Release / 16 May 2020The e… DOH    press r…  1311 2020-05-16
-#>  3          3 Philippines is a huge breath of … DOH    press r…  1311 2020-05-16
-#>  4          4 livelihoods have been displaced,… DOH    press r…  1311 2020-05-16
-#>  5          5 that day-to-day behavior has to … DOH    press r…  1311 2020-05-16
-#>  6          6 16 media forum, medical anthropo… DOH    press r…  1311 2020-05-16
-#>  7          7 professor, Dr. Michael Tan, appe… DOH    press r…  1311 2020-05-16
-#>  8          8 practice healthy behaviors while… DOH    press r…  1311 2020-05-16
-#>  9          9 UP Chancellor also called on all… DOH    press r…  1311 2020-05-16
-#> 10         10 highlighting the importance of s… DOH    press r…  1311 2020-05-16
-#> # … with 54 more rows
+#> 1 1 DOH TO DEPUTIZE MED GRADS TO ASS… DOH press r… 4158 2020-05-19 +#> 2 2 Press Release/18 May 2020 Consis… DOH press r… 4158 2020-05-19 +#> 3 3 11469 or the “Bayanihan to Heal … DOH press r… 4158 2020-05-19 +#> 4 4 of 1959,” the Department of Heal… DOH press r… 4158 2020-05-19 +#> 5 5 2020-0169 last Saturday, which a… DOH press r… 4158 2020-05-19 +#> 6 6 practice of medicine as deputize… DOH press r… 4158 2020-05-19 +#> 7 7 of registration from the Profess… DOH press r… 4158 2020-05-19 +#> 8 8 in the President the power “to e… DOH press r… 4158 2020-05-19 +#> 9 9 complement or supplement the cur… DOH press r… 4158 2020-05-19 +#> 10 10 R.A. 2382 stipulates “Medical gr… DOH press r… 4158 2020-05-19 +#> # … with 39 more rows

To get all the DoH press releases available from their website, use:

## Extract URLs
-pr <- get_pr_url(pages = 1:25)
+pr <- get_doh_links(pages = 1:25)
 
 ## Extract all press releases text
 pressRelease <- NULL
 
 for(i in 1:nrow(pr)) {
-  currentPR <- get_press_release(df = pr[i, ])
+  currentPR <- get_doh_release(df = pr[i, ])
 
   pressRelease <- rbind(pressRelease, currentPR)
 }
-
#> # A tibble: 13,243 x 6
+
-

This produces the same dataset as pressRelease included in comotext.

+#> 1 1 DOH TO DEPUTIZE MED GRADS TO ASS… DOH press r… 4158 2020-05-19 +#> 2 2 Press Release/18 May 2020 Consis… DOH press r… 4158 2020-05-19 +#> 3 3 11469 or the “Bayanihan to Heal … DOH press r… 4158 2020-05-19 +#> 4 4 of 1959,” the Department of Heal… DOH press r… 4158 2020-05-19 +#> 5 5 2020-0169 last Saturday, which a… DOH press r… 4158 2020-05-19 +#> 6 6 practice of medicine as deputize… DOH press r… 4158 2020-05-19 +#> 7 7 of registration from the Profess… DOH press r… 4158 2020-05-19 +#> 8 8 in the President the power “to e… DOH press r… 4158 2020-05-19 +#> 9 9 complement or supplement the cur… DOH press r… 4158 2020-05-19 +#> 10 10 R.A. 2382 stipulates “Medical gr… DOH press r… 4158 2020-05-19 +#> # … with 13,315 more rows
+

This produces the same dataset as dohRelease included in comotext.

@@ -197,7 +197,7 @@

Concatenating datasets based on a specific search term

The combine_docs function allows the user to specify search terms to use in identifying datasets provided by the comotext package. The docs argument allows the specification of a vector of search terms to use to identify the names of datasets to concatenate. If the name/s of the datasets contain these search terms, the datasets with these name/s will be returned.

combine_docs(docs = "resolution")
-#> # A tibble: 2,907 x 6
+#> # A tibble: 3,186 x 6
 #>    linenumber text                               source type       id date      
 #>         <int> <chr>                              <chr>  <chr>   <dbl> <date>    
 #>  1          1 WHEREAS, on January 31, 2020, upo… IATF   resolu…     9 2020-03-03
@@ -210,7 +210,7 @@ 

#> 8 8 exemptions in favor of certain cl… IATF resolu… 9 2020-03-03 #> 9 9 SAR, and Macau SAR; IATF resolu… 9 2020-03-03 #> 10 10 WHEREAS, on February 26, 2020, fo… IATF resolu… 9 2020-03-03 -#> # … with 2,897 more rows

+#> # … with 3,176 more rows

The combine_iatf function is a specialised wrapper of the combine_docs function that specifically returns datasets containing IATF resolutions. An additional argument res allows users to specify which IATF resolutions to return. To get IATF resolutions 10, 11, and 12, call combine_iatf as follows:

combine_iatf(docs = "resolution", res = 10:12)
 #> # A tibble: 324 x 6
@@ -281,6 +281,7 @@ 

Dev status

  • Lifecycle: experimental
  • Travis build status
  • AppVeyor build status
  • +
  • R build status
  • Codecov test coverage
  • DOI
  • diff --git a/docs/news/index.html b/docs/news/index.html index f0523df..95a4392 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -114,9 +114,19 @@

    Changelog

    Source: NEWS.md
    -
    +
    +

    +comotext v0.2.0

    +
      +
    • created new versions of get_ functions to allow for naming of source of text data; deprecation process started on previous versions

    • +
    • re-structured the output text tibbles from the functions to a more logical sequence of data columns with a general sequence of linenumber, identifier, text, date, source, type, url whichever is present for the given tibble

    • +
    • converted URLs to absolute URLs rather than relative URLs

    • +
    • renamed datasets to include source of data as prefix to the name

    • +
    +
    +

    -comotext 0.1.0

    +comotext v0.1.0

    • Created pull data functions for the Department of Health (DoH) press releases found at https://www.doh.gov.ph/press-releases

    • Created datasets of press releases from the DoH using the pull data functions above

    • diff --git a/docs/pkgdown.yml b/docs/pkgdown.yml index f11a5e1..08d1397 100644 --- a/docs/pkgdown.yml +++ b/docs/pkgdown.yml @@ -2,7 +2,7 @@ pandoc: 2.3.1 pkgdown: 1.5.1 pkgdown_sha: ~ articles: [] -last_built: 2020-05-17T00:08Z +last_built: 2020-05-19T21:31Z urls: reference: https://como-ph.github.io/comotext/reference article: https://como-ph.github.io/comotext/articles diff --git a/docs/reference/combine_docs.html b/docs/reference/combine_docs.html index 011f99c..7a9d46c 100644 --- a/docs/reference/combine_docs.html +++ b/docs/reference/combine_docs.html @@ -116,7 +116,7 @@ @@ -141,7 +141,7 @@

      Value

      A tibble of all document types called for.

      Examples

      -
      combine_docs(docs = "resolution")
      #> # A tibble: 2,907 x 6 +
      combine_docs(docs = "resolution")
      #> # A tibble: 3,186 x 6 #> linenumber text source type id date #> <int> <chr> <chr> <chr> <dbl> <date> #> 1 1 WHEREAS, on January 31, 2020, upo… IATF resolu… 9 2020-03-03 @@ -154,7 +154,7 @@

      Examp #> 8 8 exemptions in favor of certain cl… IATF resolu… 9 2020-03-03 #> 9 9 SAR, and Macau SAR; IATF resolu… 9 2020-03-03 #> 10 10 WHEREAS, on February 26, 2020, fo… IATF resolu… 9 2020-03-03 -#> # … with 2,897 more rows

      +#> # … with 3,176 more rows