From 9f2a7fad516af3dd5a6b1692400af636e665fa0c Mon Sep 17 00:00:00 2001 From: Petr Bouchal Date: Wed, 30 Sep 2020 22:03:32 +0200 Subject: [PATCH] add function for retrieving catalogue of available files, closes #60, closes #59 --- NAMESPACE | 1 + NEWS.md | 4 ++ R/catalogue.R | 110 ++++++++++++++++++++++++++++++++++++++++ cran-comments.md | 6 ++- man/sp_codelists.Rd | 1 + man/sp_datasets.Rd | 1 + man/sp_get_catalogue.Rd | 41 +++++++++++++++ man/sp_tables.Rd | 3 +- 8 files changed, 165 insertions(+), 2 deletions(-) create mode 100644 R/catalogue.R create mode 100644 man/sp_get_catalogue.Rd diff --git a/NAMESPACE b/NAMESPACE index ba18817..b45f08b 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -7,6 +7,7 @@ export(get_dataset) export(get_dataset_doc) export(get_table) export(sp_add_codelist) +export(sp_get_catalogue) export(sp_get_codelist) export(sp_get_dataset) export(sp_get_dataset_doc) diff --git a/NEWS.md b/NEWS.md index c14dfb4..8f0fc4c 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,9 @@ # statnipokladna (development version) +## New features + +* `sp_get_catalogue()` returns a list of all currently available files for download along with their metadata (temporal coverage, URL), using the SPARQL endpoint at <https://opendata.mfcr.cz/lod/sparql>. + ## Improvements * directory for downloading raw data and documentation in all `sp_get_*()` functions can now be set in the option `statnipokladna.dest_dir`. Set this at the beginning of your script or once per session, or in your `.Rprofile` (e.g. `setOption(statnipokladna.dest_dir = "~/sp_data")` for cross-session storage or `setOption(statnipokladna.dest_dir = "sp_data")` for storing all data downloaded by statnipokladna in an `sp_data` subfolder of the current project.) 
diff --git a/R/catalogue.R b/R/catalogue.R new file mode 100644 index 0000000..5b5a792 --- /dev/null +++ b/R/catalogue.R @@ -0,0 +1,110 @@ + +#' List all files currently available from the data provider +#' +#' Queries the SPARQL endpoint at <https://opendata.mfcr.cz/lod/monitor> +#' and <https://opendata.mfcr.cz/lod/sparql> +#' +#' @return A [tibble][tibble::tibble-package] with one row per downloadable file +#' @examples +#' \donttest{ +#' sp_get_catalogue() +#' } +#' @format A data frame with these variables: +#' \describe{ +#' \item{\code{table}}{character. Table name incl. period (long name, does not correspond to dataset label in `sp_tables`).} +#' \item{\code{dataset}}{character. Dataset (long name, does not correspond to dataset label in `sp_datasets`).} +#' \item{\code{start}}{date. Start date of temporal coverage for this file.} +#' \item{\code{end}}{date. End date of temporal coverage for this file.} +#' \item{\code{filetype}}{character. Filetype. Currently 'csv' for all files.} +#' \item{\code{compression}}{character. Type of compression. Currently 'zip' for all files.} +#' \item{\code{url}}{character. Link to downloadable file/archive.} +#' \item{\code{doc}}{character. Link to documentation. Currently empty as not provided by endpoint.} +#' \item{\code{schema}}{character. Link to schema. Currently empty as not provided by endpoint.} +#' } +#' @family Lists of available entities +#' @export +sp_get_catalogue <- function() { + + sparql_url <- "https://opendata.mfcr.cz/lod/sparql" + + sparqlquery_datasets_byczso <- stringr::str_c(" + PREFIX dct: + PREFIX dcterm: + PREFIX dcterms: + PREFIX rdf: + PREFIX purl: + PREFIX dcat: + PREFIX foaf: + SELECT ?dist_iri ?subds_iri ?dl_url ?start ?end ?media_type + ?subds_title ?ds_title ?schema ?compression ?dist_title ?doc + WHERE + { + {?ds_iri dct:isPartOf . + ?ds_iri purl:title ?ds_title} + + VALUES ?cat_iri {} + + {?subds_iri dct:isPartOf ?ds_iri} + + {?subds_iri dcat:distribution ?dist_iri . + ?subds_iri purl:title ?subds_title . + ?dist_iri dcat:accessURL ?dl_url . 
+ OPTIONAL {?subds_iri foaf:page ?doc . } + ?subds_iri dct:temporal ?tmprl . + OPTIONAL {?dist_iri dct:title ?dist_title . } + ?dist_iri dcat:compressFormat ?compression . + OPTIONAL {?dist_iri dct:conformsTo ?schema .} + ?tmprl dcat:startDate ?start . + ?tmprl dcat:endDate ?end . + {?dist_iri dcat:mediaType ?media_type .} + + } + } + LIMIT 2000") %>% + stringi::stri_unescape_unicode() + + params = list(`default-graph-uri` = "", + query = sparqlquery_datasets_byczso, + # format = "application/sparql-results+json", + format = "text/csv", + timeout = 30000, + debug = "on", + run = "Run Query") + if(!curl::has_internet()) usethis::ui_stop(c("No internet connection. Cannot continue. Retry when connected.")) + usethis::ui_info("Reading data from data.gov.cz") + cat_rslt <- httr::GET(sparql_url, query = params, + # accept("application/sparql-results+json"), + httr::user_agent(usr), + httr::add_headers(c("Accept-Charset" = "utf-8")), + httr::accept("text/csv;charset=UTF-8")) %>% + httr::stop_for_status() + + # print(params$query) + + if(httr::status_code(cat_rslt) > 200) { + print(httr::http_status(cat_rslt)) + rslt <- httr::content(cat_rslt, as = "text") + } else + rslt <- cat_rslt %>% httr::content(as = "text") + rslt <- readr::read_csv(rslt, col_types = readr::cols(start = "D", + end = "D", + doc = "c", + schema = "c")) + usethis::ui_done("Done downloading and reading data") + usethis::ui_info("Transforming data") + rslt <- rslt %>% + dplyr::mutate(media_type = stringr::str_extract(media_type, "(?<=/)[a-zA-Z]*$"), + compression = stringr::str_extract(compression, "(?<=/)[a-zA-Z]*$")) %>% + dplyr::select(table = subds_title, dataset = ds_title, + start, end, + filetype = media_type, compression, + url = dl_url, doc, schema) + return(rslt) +} + +# spd <- sp_get_catalogue() +# unique(spd$filetype) +# unique(spd$compression) +# spd +# +# spd %>% group_by(ds_title) %>% slice_max(end) diff --git a/cran-comments.md b/cran-comments.md index 0a07a4e..9f91e9d 100644 --- 
a/cran-comments.md +++ b/cran-comments.md @@ -1,6 +1,10 @@ ## New minor version -This version includes several minor improvements and bug fixes and a single new feature: an option can be set to store data dumps downloaded from the external source in a custom directory so as to avoid redownloading them later. By default, `tempdir()` is used and the user has to actively set a parameter or option for the package to store any data outside working or temporary directories. +This version includes + +* several minor improvements and bug fixes +* a single new feature: the `sp_get_catalogue()` function drawing on the new SPARQL endpoint made accessible by the data provider at <https://opendata.mfcr.cz/lod/sparql>. +* a new option can be set to store data dumps downloaded from the external source in a custom directory so as to avoid redownloading them later. By default, `tempdir()` is used and the user has to actively set a parameter or option for the package to store any data outside working or temporary directories. ## Test environments diff --git a/man/sp_codelists.Rd b/man/sp_codelists.Rd index f2afe56..7f86b72 100644 --- a/man/sp_codelists.Rd +++ b/man/sp_codelists.Rd @@ -25,6 +25,7 @@ descriptions and a GUI for exploring the lists. \seealso{ Other Lists of available entities: \code{\link{sp_datasets}}, +\code{\link{sp_get_catalogue}()}, \code{\link{sp_tables}} } \concept{Lists of available entities} diff --git a/man/sp_datasets.Rd b/man/sp_datasets.Rd index 5a3da9c..855ca65 100644 --- a/man/sp_datasets.Rd +++ b/man/sp_datasets.Rd @@ -24,6 +24,7 @@ of the datasets. 
\seealso{ Other Lists of available entities: \code{\link{sp_codelists}}, +\code{\link{sp_get_catalogue}()}, \code{\link{sp_tables}} } \concept{Lists of available entities} diff --git a/man/sp_get_catalogue.Rd b/man/sp_get_catalogue.Rd new file mode 100644 index 0000000..509d941 --- /dev/null +++ b/man/sp_get_catalogue.Rd @@ -0,0 +1,41 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/catalogue.R +\name{sp_get_catalogue} +\alias{sp_get_catalogue} +\title{List all files currently available from the data provider} +\format{ +A data frame with these variables: +\describe{ +\item{\code{table}}{character. Table name incl. period (long name, does not correspond to dataset label in \code{sp_tables}).} +\item{\code{dataset}}{character. Dataset (long name, does not correspond to dataset label in \code{sp_datasets}).} +\item{\code{start}}{date. Start date of temporal coverage for this file.} +\item{\code{end}}{date. End date of temporal coverage for this file.} +\item{\code{filetype}}{character. Filetype. Currently 'csv' for all files.} +\item{\code{compression}}{character. Type of compression. Currently 'zip' for all files.} +\item{\code{url}}{character. Link to downloadable file/archive.} +\item{\code{doc}}{character. Link to documentation. Currently empty as not provided by endpoint.} +\item{\code{schema}}{character. Link to schema. 
Currently empty as not provided by endpoint.} } } \usage{ sp_get_catalogue() } \value{ A \link[tibble:tibble-package]{tibble} with one row per downloadable file } \description{ Queries the SPARQL endpoint at \url{https://opendata.mfcr.cz/lod/monitor} and \url{https://opendata.mfcr.cz/lod/sparql} } \examples{ \donttest{ sp_get_catalogue() } } \seealso{ Other Lists of available entities: \code{\link{sp_codelists}}, \code{\link{sp_datasets}}, \code{\link{sp_tables}} } \concept{Lists of available entities} diff --git a/man/sp_tables.Rd b/man/sp_tables.Rd index 2883bdf..d0a4dfc 100644 --- a/man/sp_tables.Rd +++ b/man/sp_tables.Rd @@ -24,7 +24,8 @@ to see more detailed descriptions. Note that tables do not correspond to the tab \seealso{ Other Lists of available entities: \code{\link{sp_codelists}}, -\code{\link{sp_datasets}} +\code{\link{sp_datasets}}, +\code{\link{sp_get_catalogue}()} } \concept{Lists of available entities} \keyword{datasets}