Commit
add function for retrieving catalogue of available files, closes #60, closes #59
petrbouchal committed Sep 30, 2020
1 parent 9f23d76 commit 9f2a7fa
Showing 8 changed files with 165 additions and 2 deletions.
1 change: 1 addition & 0 deletions NAMESPACE
@@ -7,6 +7,7 @@ export(get_dataset)
export(get_dataset_doc)
export(get_table)
export(sp_add_codelist)
export(sp_get_catalogue)
export(sp_get_codelist)
export(sp_get_dataset)
export(sp_get_dataset_doc)
4 changes: 4 additions & 0 deletions NEWS.md
@@ -1,5 +1,9 @@
# statnipokladna (development version)

## New features

* `sp_get_catalogue()` returns a list of all currently available files for download along with their metadata (temporal coverage, URL), using the SPARQL endpoint at <https://opendata.mfcr.cz/lod/monitor>.

## Improvements

* the directory for downloading raw data and documentation in all `sp_get_*()` functions can now be set in the option `statnipokladna.dest_dir`. Set this at the beginning of your script, once per session, or in your `.Rprofile`: e.g. `options(statnipokladna.dest_dir = "~/sp_data")` for cross-session storage, or `options(statnipokladna.dest_dir = "sp_data")` to store all data downloaded by statnipokladna in an `sp_data` subfolder of the current project (see the sketch below).
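
A minimal usage sketch of the option, assuming `sp_get_dataset()` accepts a dataset id and year (the id `"finm"` and the year argument are purely illustrative here):

```r
library(statnipokladna)

# set once per session, or in .Rprofile for cross-session storage;
# later sp_get_*() calls reuse files already present in this directory
options(statnipokladna.dest_dir = "~/sp_data")

# "finm" and the year are assumptions for illustration
finm_path <- sp_get_dataset("finm", 2019)
```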
110 changes: 110 additions & 0 deletions R/catalogue.R
@@ -0,0 +1,110 @@

#' List all files currently available from the data provider
#'
#' Queries the SPARQL endpoint at <https://opendata.mfcr.cz/lod/monitor>
#' and returns a list of all files currently available for download,
#' along with their metadata (temporal coverage, URL).
#'
#' @return A [tibble][tibble::tibble-package] with one row per downloadable file
#' @examples
#' \donttest{
#' sp_get_catalogue()
#' }
#' @format A data frame with these variables:
#' \describe{
#' \item{\code{table}}{character. Table name incl. period (long name, does not correspond to dataset label in `sp_tables`).}
#' \item{\code{dataset}}{character. Dataset (long name, does not correspond to dataset label in `sp_datasets`).}
#' \item{\code{start}}{date. Start date of temporal coverage for this file.}
#' \item{\code{end}}{date. End date of temporal coverage for this file.}
#' \item{\code{filetype}}{character. File type. Currently 'csv' for all files.}
#' \item{\code{compression}}{character. Type of compression. Currently 'zip' for all files.}
#' \item{\code{url}}{character. Link to downloadable file/archive.}
#' \item{\code{doc}}{character. Link to documentation. Currently empty as not provided by the endpoint.}
#' \item{\code{schema}}{character. Link to schema. Currently empty as not provided by the endpoint.}
#' }
#' @family Lists of available entities
#' @export
sp_get_catalogue <- function() {

sparql_url <- "https://opendata.mfcr.cz/lod/sparql"

sparqlquery_catalogue <- "
PREFIX dct: <http://purl.org/dc/terms/>
PREFIX dcat: <http://www.w3.org/ns/dcat#>
PREFIX foaf: <http://xmlns.com/foaf/0.1/>
SELECT ?dist_iri ?subds_iri ?dl_url ?start ?end ?media_type
?subds_title ?ds_title ?schema ?compression ?dist_title ?doc
WHERE
{
{?ds_iri dct:isPartOf <https://opendata.mfcr.cz/lod/monitor/MONITOR> .
?ds_iri dct:title ?ds_title}
{?subds_iri dct:isPartOf ?ds_iri}
{?subds_iri dcat:distribution ?dist_iri .
?subds_iri dct:title ?subds_title .
?dist_iri dcat:accessURL ?dl_url .
OPTIONAL {?subds_iri foaf:page ?doc . }
?subds_iri dct:temporal ?tmprl .
OPTIONAL {?dist_iri dct:title ?dist_title . }
?dist_iri dcat:compressFormat ?compression .
OPTIONAL {?dist_iri dct:conformsTo ?schema .}
?tmprl dcat:startDate ?start .
?tmprl dcat:endDate ?end .
{?dist_iri dcat:mediaType ?media_type .}
}
}
LIMIT 2000"

params <- list(`default-graph-uri` = "",
               query = sparqlquery_catalogue,
               format = "text/csv",
               timeout = 30000,
               debug = "on",
               run = "Run Query")
if (!curl::has_internet()) usethis::ui_stop("No internet connection. Cannot continue. Retry when connected.")
usethis::ui_info("Reading data from opendata.mfcr.cz")
cat_rslt <- httr::GET(sparql_url, query = params,
                      httr::user_agent(usr), # `usr` is the user agent string defined elsewhere in the package
                      httr::add_headers(c("Accept-Charset" = "utf-8")),
                      httr::accept("text/csv;charset=UTF-8")) %>%
  httr::stop_for_status()

# stop_for_status() has already errored on HTTP failures; any remaining
# non-200 status is informational, so report it and read the body either way
if (httr::status_code(cat_rslt) > 200) {
  print(httr::http_status(cat_rslt))
}
rslt <- httr::content(cat_rslt, as = "text")
rslt <- readr::read_csv(rslt, col_types = readr::cols(start = "D",
end = "D",
doc = "c",
schema = "c"))
usethis::ui_done("Done downloading and reading data")
usethis::ui_info("Transforming data")
rslt <- rslt %>%
  # keep only the segment after the last slash, e.g. "text/csv" -> "csv"
  dplyr::mutate(media_type = stringr::str_extract(media_type, "(?<=/)[a-zA-Z]*$"),
                compression = stringr::str_extract(compression, "(?<=/)[a-zA-Z]*$")) %>%
dplyr::select(table = subds_title, dataset = ds_title,
start, end,
filetype = media_type, compression,
url = dl_url, doc, schema)
return(rslt)
}

# spd <- sp_get_catalogue()
# unique(spd$filetype)
# unique(spd$compression)
# spd
#
# spd %>% group_by(dataset) %>% slice_max(end)
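
Building on the commented-out exploration above, a self-contained usage sketch for the new function (not part of the commit; assumes `dplyr` is attached and relies on the columns documented in the roxygen block):

```r
library(dplyr)
library(statnipokladna)

catalogue <- sp_get_catalogue()

# most recent file per dataset, judged by the end of its temporal coverage
latest <- catalogue %>%
  group_by(dataset) %>%
  slice_max(end, n = 1) %>%
  ungroup()

# `url` points at a zip archive ("compression" is currently "zip" for all files)
download.file(latest$url[[1]], destfile = tempfile(fileext = ".zip"))
```
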
6 changes: 5 additions & 1 deletion cran-comments.md
@@ -1,6 +1,10 @@
## New minor version

This version includes several minor improvements and bug fixes and a single new feature: an option can be set to store data dumps downloaded from the external source in a custom directory so as to avoid redownloading them later. By default, `tempdir()` is used and the user has to actively set a parameter or option for the package to store any data outside working or temporary directories.
This version includes

* several minor improvements and bug fixes
* a single new feature: the `sp_get_catalogue()` function, which draws on the new SPARQL endpoint made accessible by the data provider at <https://opendata.mfcr.cz/lod/monitor>
* a new option to store data dumps downloaded from the external source in a custom directory so as to avoid redownloading them later. By default, `tempdir()` is used and the user has to actively set a parameter or option for the package to store any data outside working or temporary directories.

## Test environments

1 change: 1 addition & 0 deletions man/sp_codelists.Rd


1 change: 1 addition & 0 deletions man/sp_datasets.Rd


41 changes: 41 additions & 0 deletions man/sp_get_catalogue.Rd


3 changes: 2 additions & 1 deletion man/sp_tables.Rd

