From 9f2a7fad516af3dd5a6b1692400af636e665fa0c Mon Sep 17 00:00:00 2001 From: Petr Bouchal Date: Wed, 30 Sep 2020 22:03:32 +0200 Subject: [PATCH] add function for retrieving catalogue of available files, closes #60, closes #59 --- NAMESPACE | 1 + NEWS.md | 4 ++ R/catalogue.R | 110 ++++++++++++++++++++++++++++++++++++++++ cran-comments.md | 6 ++- man/sp_codelists.Rd | 1 + man/sp_datasets.Rd | 1 + man/sp_get_catalogue.Rd | 41 +++++++++++++++ man/sp_tables.Rd | 3 +- 8 files changed, 165 insertions(+), 2 deletions(-) create mode 100644 R/catalogue.R create mode 100644 man/sp_get_catalogue.Rd diff --git a/NAMESPACE b/NAMESPACE index ba18817..b45f08b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,6 +7,7 @@ export(get_dataset) export(get_dataset_doc) export(get_table) export(sp_add_codelist) +export(sp_get_catalogue) export(sp_get_codelist) export(sp_get_dataset) export(sp_get_dataset_doc) diff --git a/NEWS.md b/NEWS.md index c14dfb4..8f0fc4c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ # statnipokladna (development version) +## New features + +* `sp_get_catalogue()` returns a list of all currently available files for download along with their metadata (temporal coverage, URL), using the SPARQL endpoint at <https://opendata.mfcr.cz/lod/sparql>. + ## Improvements * directory for downloading raw data and documentation in all `sp_get_*()` functions can now be set in the option `statnipokladna.dest_dir`. Set this at the beginning of your script or once per session, or in your `.Rprofile` (e.g. `setOption(statnipokladna.dest_dir = "~/sp_data")` for cross-session storage or `setOption(statnipokladna.dest_dir = "sp_data")` for storing all data downloaded by statnipokladna in an `sp_data` subfolder of the current project.) 
diff --git a/R/catalogue.R b/R/catalogue.R new file mode 100644 index 0000000..5b5a792 --- /dev/null +++ b/R/catalogue.R @@ -0,0 +1,110 @@ + +#' List all files currently available from the data provider +#' +#' Queries the SPARQL endpoint at <https://opendata.mfcr.cz/lod/monitor> +#' and <https://opendata.mfcr.cz/lod/sparql> +#' +#' @return A [tibble][tibble::tibble-package] with one row per downloadable file +#' @examples +#' \donttest{ +#' sp_get_catalogue() +#' } +#' @format A data frame with these variables: +#' \describe{ +#' \item{\code{table}}{character. Table name incl. period (long name, does not correspond to dataset label in `sp_tables`).} +#' \item{\code{dataset}}{character. Dataset (long name, does not correspond to dataset label in `sp_datasets`).} +#' \item{\code{start}}{date. Start date of temporal coverage for this file.} +#' \item{\code{end}}{date. End date of temporal coverage for this file.} +#' \item{\code{filetype}}{character. Filetype. Currently 'csv' for all files.} +#' \item{\code{compression}}{character. Type of compression. Currently 'zip' for all files.} +#' \item{\code{url}}{character. Link to downloadable file/archive.} +#' \item{\code{doc}}{character. Link to documentation. Currently empty as not provided by endpoint.} +#' \item{\code{schema}}{character. Link to schema. Currently empty as not provided by endpoint.} +#' } +#' @family Lists of available entities +#' @export +sp_get_catalogue <- function() { + + sparql_url <- "https://opendata.mfcr.cz/lod/sparql" + + sparqlquery_datasets_byczso <- stringr::str_c(" + PREFIX dct: + PREFIX dcterm: + PREFIX dcterms: + PREFIX rdf: + PREFIX purl: + PREFIX dcat: + PREFIX foaf: + SELECT ?dist_iri ?subds_iri ?dl_url ?start ?end ?media_type + ?subds_title ?ds_title ?schema ?compression ?dist_title ?doc + WHERE + { + {?ds_iri dct:isPartOf . + ?ds_iri purl:title ?ds_title} + + VALUES ?cat_iri {} + + {?subds_iri dct:isPartOf ?ds_iri} + + {?subds_iri dcat:distribution ?dist_iri . + ?subds_iri purl:title ?subds_title . + ?dist_iri dcat:accessURL ?dl_url . 
+ OPTIONAL {?subds_iri foaf:page ?doc . } + ?subds_iri dct:temporal ?tmprl . + OPTIONAL {?dist_iri dct:title ?dist_title . } + ?dist_iri dcat:compressFormat ?compression . + OPTIONAL {?dist_iri dct:conformsTo ?schema .} + ?tmprl dcat:startDate ?start . + ?tmprl dcat:endDate ?end . + {?dist_iri dcat:mediaType ?media_type .} + + } + } + LIMIT 2000") %>% + stringi::stri_unescape_unicode() + + params = list(`default-graph-uri` = "", + query = sparqlquery_datasets_byczso, + # format = "application/sparql-results+json", + format = "text/csv", + timeout = 30000, + debug = "on", + run = "Run Query") + if(!curl::has_internet()) usethis::ui_stop(c("No internet connection. Cannot continue. Retry when connected.")) + usethis::ui_info("Reading data from data.gov.cz") + cat_rslt <- httr::GET(sparql_url, query = params, + # accept("application/sparql-results+json"), + httr::user_agent(usr), + httr::add_headers(c("Accept-Charset" = "utf-8")), + httr::accept("text/csv;charset=UTF-8")) %>% + httr::stop_for_status() + + # print(params$query) + + if(httr::status_code(cat_rslt) > 200) { + print(httr::http_status(cat_rslt)) + rslt <- httr::content(cat_rslt, as = "text") + } else + rslt <- cat_rslt %>% httr::content(as = "text") + rslt <- readr::read_csv(rslt, col_types = readr::cols(start = "D", + end = "D", + doc = "c", + schema = "c")) + usethis::ui_done("Done downloading and reading data") + usethis::ui_info("Transforming data") + rslt <- rslt %>% + dplyr::mutate(media_type = stringr::str_extract(media_type, "(?<=/)[a-zA-Z]*$"), + compression = stringr::str_extract(compression, "(?<=/)[a-zA-Z]*$")) %>% + dplyr::select(table = subds_title, dataset = ds_title, + start, end, + filetype = media_type, compression, + url = dl_url, doc, schema) + return(rslt) +} + +# spd <- sp_get_catalogue() +# unique(spd$filetype) +# unique(spd$compression) +# spd +# +# spd %>% group_by(ds_title) %>% slice_max(end) diff --git a/cran-comments.md b/cran-comments.md index 0a07a4e..9f91e9d 100644 --- 
a/cran-comments.md +++ b/cran-comments.md @@ -1,6 +1,10 @@ ## New minor version -This version includes several minor improvements and bug fixes and a single new feature: an option can be set to store data dumps downloaded from the external source in a custom directory so as to avoid redownloading them later. By default, `tempdir()` is used and the user has to actively set a parameter or option for the package to store any data outside working or temporary directories. +This version includes + +* several minor improvements and bug fixes +* a single new feature: the `sp_get_catalogue()` function drawing on the new SPARQL endpoint made accessible by the data provider at <https://opendata.mfcr.cz/lod/sparql>. +* a new option can be set to store data dumps downloaded from the external source in a custom directory so as to avoid redownloading them later. By default, `tempdir()` is used and the user has to actively set a parameter or option for the package to store any data outside working or temporary directories. ## Test environments diff --git a/man/sp_codelists.Rd b/man/sp_codelists.Rd index f2afe56..7f86b72 100644 --- a/man/sp_codelists.Rd +++ b/man/sp_codelists.Rd @@ -25,6 +25,7 @@ descriptions and a GUI for exploring the lists. \seealso{ Other Lists of available entities: \code{\link{sp_datasets}}, +\code{\link{sp_get_catalogue}()}, \code{\link{sp_tables}} } \concept{Lists of available entities} diff --git a/man/sp_datasets.Rd b/man/sp_datasets.Rd index 5a3da9c..855ca65 100644 --- a/man/sp_datasets.Rd +++ b/man/sp_datasets.Rd @@ -24,6 +24,7 @@ of the datasets. 
\seealso{ Other Lists of available entities: \code{\link{sp_codelists}}, +\code{\link{sp_get_catalogue}()}, \code{\link{sp_tables}} } \concept{Lists of available entities} diff --git a/man/sp_get_catalogue.Rd b/man/sp_get_catalogue.Rd new file mode 100644 index 0000000..509d941 --- /dev/null +++ b/man/sp_get_catalogue.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/catalogue.R +\name{sp_get_catalogue} +\alias{sp_get_catalogue} +\title{List all files currently available from the data provider} +\format{ +A data frame with these variables: +\describe{ +\item{\code{table}}{character. Table name incl. period (long name, does not correspond to dataset label in \code{sp_tables}).} +\item{\code{dataset}}{character. Dataset (long name, does not correspond to dataset label in \code{sp_datasets}).} +\item{\code{start}}{date. Start date of temporal coverage for this file.} +\item{\code{end}}{date. End date of temporal coverage for this file.} +\item{\code{filetype}}{character. Filetype. Currently 'csv' for all files.} +\item{\code{compression}}{character. Type of compression. Currently 'zip' for all files.} +\item{\code{url}}{character. Link to downloadable file/archive.} +\item{\code{doc}}{character. Link to documentation. Currently empty as not provided by endpoint.} +\item{\code{schema}}{character. Link to schema. 
Currently empty as not provided by endpoint.} } } \usage{ sp_get_catalogue() } \value{ A \link[tibble:tibble-package]{tibble} with one row per downloadable file } \description{ Queries the SPARQL endpoint at \url{https://opendata.mfcr.cz/lod/monitor} and \url{https://opendata.mfcr.cz/lod/sparql} } \examples{ \donttest{ sp_get_catalogue() } } \seealso{ Other Lists of available entities: \code{\link{sp_codelists}}, \code{\link{sp_datasets}}, \code{\link{sp_tables}} } \concept{Lists of available entities} diff --git a/man/sp_tables.Rd b/man/sp_tables.Rd index 2883bdf..d0a4dfc 100644 --- a/man/sp_tables.Rd +++ b/man/sp_tables.Rd @@ -24,7 +24,8 @@ to see more detailed descriptions. Note that tables do not correspond to the tab \seealso{ Other Lists of available entities: \code{\link{sp_codelists}}, -\code{\link{sp_datasets}} +\code{\link{sp_datasets}}, +\code{\link{sp_get_catalogue}()} } \concept{Lists of available entities} \keyword{datasets}