diff --git a/DESCRIPTION b/DESCRIPTION
index 2bee96c..dd9ae25 100644
--- a/DESCRIPTION
+++ b/DESCRIPTION
@@ -1,18 +1,19 @@
Package: czso
Title: Use Open Data from the Czech Statistical Office in R
-Version: 0.1.4
+Version: 0.1.5
Authors@R:
person(given = "Petr",
family = "Bouchal",
role = c("aut", "cre"),
email = "pbouchal@gmail.com")
-Description: Get programmatic access to the open data provided
- by the Czech Statistical Office (CZSO).
+Description: Get programmatic access to the open data provided by the
+ Czech Statistical Office (CZSO).
License: MIT + file LICENSE
URL: https://github.com/petrbouchal/czso
BugReports: https://github.com/petrbouchal/czso/issues
Imports:
dplyr (>= 0.8.3),
+ httr (>= 1.4.1),
jsonlite (>= 1.6),
lifecycle,
magrittr,
@@ -20,6 +21,7 @@ Imports:
stringi (>= 1.4.4),
stringr (>= 1.4.0),
tools (>= 3.6.0),
+ usethis (>= 1.5.1),
utils (>= 3.6.0),
vroom (>= 1.0.2)
RdMacros:
diff --git a/NEWS.md b/NEWS.md
index cf0c329..896eea7 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -1,3 +1,10 @@
+# czso 0.1.5
+
+## Improvements
+
+* get_czso_catalogue() is now much faster as it uses the open data catalogue's API instead of downloading a huge CSV list of all datasets. It is less flexible as it does not allow direct filtering.
+* handle encoding of some older datasets, which may not be UTF-8
+
# czso 0.1.4
* relaxed stringi version requirement to make Win build work
diff --git a/R/core.R b/R/core.R
index 5ef9de8..f68ae5f 100644
--- a/R/core.R
+++ b/R/core.R
@@ -1,102 +1,101 @@
#' Get catalogue of open CZSO datasets
#'
-#' Downloads and processes a list of all registered Czech open data datasets,
-#' returning (by default) those accessible through get_table() from the CZSO.
+#' Retrieves a list of all CZSO's open datasets available from the Czech Open data catalogue.
#'
-#' If `provider` is NULL, returns the whole list, without CZSO-specific identifier
-#' usable in `get_table()`.
+#' Use the dataset_id column as an argument to `get_czso_table()`.
#'
-#' If `provider` is left unset, returns data frame listing CZSO's datasets, with a
-#' `czso_id` column usable in `get_table`.
-#'
-#' Other values of `provider` must be exact matches. Use `provider_filter` for text/regex matching.
-#'
-#' All `*_filter` arguments are case sensitive and can be regular expressions.
-#'
-#' Original data: https://data.gov.cz/soubor/datov%C3%A9-sady.csv
-#'
-#' @param provider character, can be of length > 1. Provider to select for. Defaults to (the Czech name of) CZSO. Must be exact match. If set to NULL, returns full list of all datasets.
-#' @param title_filter character, text to use for filtering the set by title. Case sensitive. Can be a regular expression.
-#' @param description_filter character, text to use for filtering the set by description. Case sensitive. Can be a regular expression.
-#' @param keyword_filter character, text to use for filtering the set by keyword. Case sensitive. Can be a regular expression.
-#' @param provider_filter character, text to use for filtering the set by provider Case sensitive. Can be a regular expression.
-#' @param force_redownload integer. Whether to redownload data source file even if already cached. Defaults to FALSE.
-#' @return a data frame. If `provider` param is left to default, contains a column called czso_id, which can be used as dataset_id parameter in get_table().
+#' @return a data frame with details on all CZSO datasets available in the Czech National Open Data Catalogue.
#' @export
#' @family Core workflow
#' @examples
#' \dontrun{
#' get_czso_catalogue()
-#' get_czso_catalogue(NULL)
-#' get_czso_catalogue(title_filter = "[Mm]zd[ay]")
-#' get_czso_catalogue(provider = "Ministerstvo vnitra")
-#' get_czso_catalogue(provider_filter = "[Mm]inisterstvo")
#' }
-get_czso_catalogue <- function(provider = "\\u010cesk\\u00fd statistick\\u00fd \\u00fa\\u0159ad",
- title_filter = NULL,
- description_filter = NULL,
- keyword_filter = NULL,
- provider_filter = NULL,
- force_redownload = F)
- {
- if(!is.null(provider))
- provider_uni <- stringi::stri_unescape_unicode(provider)
- else provider_uni <- NULL
- td <- paste(tempdir(), "czso", sep = "/")
- dir.create(td, showWarnings = F, recursive = T)
- tf <- paste0(td, "/", "dataset_list.csv")
- if(file.exists(tf) & !force_redownload) {
- message(stringr::str_glue("File already in {td}, not downloading. Set `force_redownload` to TRUE if needed."))
- } else {
- utils::download.file("https://data.gov.cz/soubor/datov%C3%A9-sady.csv", tf, headers = ua_header)
- }
- message("Reading full list of all datasets available on data.gov.cz...")
- dslist0 <- suppressWarnings(suppressMessages(vroom::vroom(tf, num_threads = 1,
- col_types = readr::cols(.default = "c")))) %>%
- dplyr::rename_all(~stringi::stri_trans_general(., "latin-ascii")) %>%
- dplyr::select(provider = poskytovatel,
- title = nazev, description = popis, dataset = datova_sada,
- keywords0 = klicova_slova, topic = tema,
- update_frequency = periodicita_aktualizace,
- spatial_coverage = prostorove_pokryti)
- if(is.null(provider)) {
- dslist <- dslist0 %>%
- dplyr::group_by(dataset) %>%
- dplyr::mutate(keywords = stringr::str_c(keywords0, collapse = "; ")) %>%
- dplyr::ungroup() %>%
- dplyr::select(-keywords0) %>%
- dplyr::distinct()
- } else {
- message("Filtering...")
- dslist <- dslist0 %>%
- dplyr::filter(.$provider %in% provider_uni) %>%
- dplyr::group_by(dataset) %>%
- dplyr::mutate(keywords = stringr::str_c(keywords0, collapse = "; ")) %>%
- dplyr::ungroup() %>%
- dplyr::select(-keywords0) %>%
- dplyr::distinct()
- if(provider == "\\u010cesk\\u00fd statistick\\u00fd \\u00fa\\u0159ad") {
- dslist <- dslist %>%
- dplyr::mutate(czso_id = stringr::str_extract(dataset, "(?<=package_show-id-).*$")) %>%
- dplyr::select(czso_id, -provider, dplyr::everything())
- }
- }
-
- if(!is.null(title_filter)) {
- dslist <- dplyr::filter(dslist, stringr::str_detect(title, title_filter))
- }
- if(!is.null(description_filter)) {
- dslist <- dplyr::filter(dslist, stringr::str_detect(description, description_filter))
- }
- if(!is.null(keyword_filter)) {
- dslist <- dplyr::filter(dslist, stringr::str_detect(keyword_filter, description_filter))
- }
- if(!is.null(provider_filter)) {
- dslist <- dplyr::filter(dslist, stringr::str_detect(provider, description_filter))
- }
-
- return(dslist)
+get_czso_catalogue <- function() {
+  # SPARQL endpoint that the catalogue query below is sent to.
+  sparql_url <- "https://data.gov.cz/sparql"
+
+  sparqlquery_datasets_byczso <- stringr::str_glue(
+ "PREFIX foaf:
+ PREFIX dcterms:
+ PREFIX dcat:
+ PREFIX rdf:
+ PREFIX rdfs:
+
+ SELECT ?dataset_iri
+ ?dataset_id
+ ?title
+ ?provider
+ ?description
+ ?spatial
+ ?temporal
+ ?modified
+ ?page
+ ?periodicity
+ ?periodicity_abb
+ ?start
+ ?end
+ ?keywords_all
+ WHERE {{
+ GRAPH ?g {{
+ ?dataset_iri a dcat:Dataset .
+ ?dataset_iri dcterms:publisher ?publisher .
+ ?dataset_iri dcterms:title ?title .
+ ?dataset_iri dcterms:description ?description .
+ OPTIONAL {{ ?dataset_iri dcterms:identifier ?dataset_id .}}
+ OPTIONAL {{ ?dataset_iri dcterms:spatial ?spatial .}}
+ OPTIONAL {{ ?dataset_iri foaf:page ?page.}}
+ OPTIONAL {{ ?dataset_iri dcterms:temporal ?temporal .}}
+ OPTIONAL {{ ?dataset_iri dcterms:modified ?modified .}}
+ OPTIONAL {{ ?dataset_iri dcat:keyword ?keywords_all .}}
+ OPTIONAL {{ ?dataset_iri dcterms:accrualPeriodicity ?periodicity .}}
+ OPTIONAL {{ ?dataset_iri ?periodicity_abb .}}
+
+ ?publisher foaf:name ?provider .
+
+ OPTIONAL {{ ?temporal schema:startDate ?start .}}
+ OPTIONAL {{ ?temporal schema:endDate ?end .}}
+
+ VALUES ?publisher {{
+ # IRI pro CZSO
+ # # IRI pro Prahu
+ }}
+ FILTER(lang(?provider) = \"cs\")
+ FILTER(lang(?keywords_all) = \"cs\")
+ FILTER(lang(?title) = \"cs\")
+ }}
+ }}") %>% stringi::stri_unescape_unicode()
+
+  # Request the result as CSV so it can be parsed with readr below.
+  query_params <- list(`default-graph-uri` = "",
+                       query = sparqlquery_datasets_byczso,
+                       format = "text/csv",
+                       timeout = 30000,
+                       debug = "on",
+                       run = "Run Query")
+  usethis::ui_info("Downloading")
+  cat_rslt <- httr::GET(sparql_url, query = query_params,
+                        httr::add_headers(c("Accept-Charset" = "utf-8")),
+                        httr::accept("text/csv;charset=UTF-8"))
+
+  # Stop with the HTTP status of a failed request instead of going on to
+  # parse the error response as if it were the CSV result.
+  httr::stop_for_status(cat_rslt, task = "query the data.gov.cz catalogue")
+
+  usethis::ui_info("Reading data")
+  rslt_text <- httr::content(cat_rslt, as = "text", encoding = "UTF-8")
+  rslt <- readr::read_csv(rslt_text, col_types = readr::cols(modified = "T"))
+
+  usethis::ui_info("Transforming data")
+  # Collapse all keywords of a dataset into one "; "-separated string, then
+  # drop the duplicate rows this leaves behind.
+  rslt <- dplyr::group_by(rslt, dataset_iri) %>%
+    dplyr::mutate(keywords = stringr::str_c(keywords_all, collapse = "; ")) %>%
+    dplyr::ungroup() %>%
+    dplyr::select(-keywords_all) %>%
+    dplyr::distinct()
+  return(rslt)
}
#' Deprecated: Retrieve and read dataset from CZSO
@@ -109,19 +108,9 @@ get_czso_catalogue <- function(provider = "\\u010cesk\\u00fd statistick\\u00fd \
#' @examples
#' # see `get_czso_catalogue()`
#' @export
-get_catalogue <- function(provider = "\\u010cesk\\u00fd statistick\\u00fd \\u00fa\\u0159ad",
- title_filter = NULL,
- description_filter = NULL,
- keyword_filter = NULL,
- provider_filter = NULL,
- force_redownload = F) {
+get_catalogue <- function() {
.Deprecated("get_czso_catalogue")
- get_czso_catalogue(provider = provider,
- title_filter = title_filter,
- description_filter = description_filter,
- keyword_filter = keyword_filter,
- provider_filter = provider_filter,
- force_redownload = force_redownload)
+ get_czso_catalogue()
}
get_czso_dataset_metadata <- function(dataset_id) {
@@ -194,11 +183,14 @@ get_czso_table <- function(dataset_id, resource_num = 1, force_redownload = F) {
}
switch (action,
read = {
+ guessed_enc <- readr::guess_encoding(dfile)[[1,1]]
+ if(guessed_enc == "windows-1252") guessed_enc <- "windows-1250"
dt <- suppressWarnings(suppressMessages(readr::read_csv(dfile, col_types = readr::cols(.default = "c",
- rok = "i",
- casref_do = "T",
- ctvrtleti = "i",
- hodnota = "d"))))
+ rok = "i",
+ casref_do = "T",
+ ctvrtleti = "i",
+ hodnota = "d"),
+ locale = readr::locale(encoding = guessed_enc))))
rtrn <- dt
},
listone = {
diff --git a/README.Rmd b/README.Rmd
index 5ba368b..63a74af 100644
--- a/README.Rmd
+++ b/README.Rmd
@@ -47,40 +47,39 @@ remotes::install_github("petrbouchal/czso")
## Example
-Imagine you are looking for a dataset whose title refers to wages (mzda/mzdy):
+Say you are looking for a dataset whose title refers to wages (mzda/mzdy):
-```{r example}
+First, retrieve the list of available CZSO datasets:
+
+```{r example-catalogue}
library(czso)
+suppressPackageStartupMessages(library(dplyr))
+suppressPackageStartupMessages(library(stringr))
-# first, retrieve the list of available CZSO datasets, filtering for mzda/mzdy
-get_czso_catalogue(title_filter = "mzd[ay]")
+catalogue <- get_czso_catalogue()
```
-We can see the `czso_id` for the required dataset - now use it to get the dataset:
-
-```{r example-cont}
-get_czso_table("110080")
+```{r example-filter}
+catalogue %>%
+ filter(str_detect(title, "[Mm]zd[ay]")) %>%
+ select(dataset_id, title, description)
```
-Alternatively, you could store the whole CZSO catalogue in an object and filter yourself. This is especially useful if you expect to need multiple tries.
-```{r example-alt}
-library(dplyr, warn.conflicts = F)
-library(stringr, warn.conflicts = F)
-catalogue <- get_czso_catalogue()
+We can see the `dataset_id` for the required dataset - now use it to get the dataset:
-catalogue %>%
- filter(str_detect(title, "mzda"))
+```{r example-cont}
+get_czso_table("110080")
```
-The latter allows you to search through the list - or simply look through it - without the overhead of reusing the `get_dataset()` function which downloads and transforms the underlying data.
-
## Credit and notes
- not created or endorsed by the Czech Statistical Office, though they, as well as [the open data team at the Ministry of Interior](https://data.gov.cz/) deserve credit for getting the data out there.
- the package relies on the data.gov.cz catalogue of open data and on the CZSO's local catalogue
- NB: The robots.txt at the domain hosting the CZSO's catalogue prohibits robots from accessing it; while this may be an inappropriate/erroneous setting for what is in essence a data API, this package tries to honor the spirit of that setting by only accessing the API once per `get_table()` call, relying on a different system for `get_catalogue()`. Hence, *do not use this package for harvesting large numbers of datasets from the CZSO.*
+Thanks to @jakubklimek and @martinnecasky for [helping me figure out](https://github.com/opendata-mvcr/nkod/issues/19) the [SPARQL endpoint](https://data.gov.cz/sparql) on the Czech National Open Data Catalogue.
+
## See also
This package takes inspiration from the packages
@@ -94,5 +93,7 @@ For Czech geospatial data, see [CzechData](https://github.com/JanCaha/CzechData/
For Czech fiscal data, see [statnipokladna](https://github.com/petrbouchal/statnipokladna).
+For various transparency disclosures, see [Hlídač státu](https://hlidacstatu.cz).
+
For access to some of Prague's open geospatial data in R, see [pragr](https://github.com/petrbouchal/pragr).
diff --git a/README.md b/README.md
index fdc1496..6a81fb5 100644
--- a/README.md
+++ b/README.md
@@ -49,25 +49,34 @@ remotes::install_github("petrbouchal/czso")
## Example
-Imagine you are looking for a dataset whose title refers to wages
+Say you are looking for a dataset whose title refers to wages
(mzda/mzdy):
+First, retrieve the list of available CZSO datasets:
+
``` r
library(czso)
+suppressPackageStartupMessages(library(dplyr))
+suppressPackageStartupMessages(library(stringr))
+
+catalogue <- get_czso_catalogue()
+#> ℹ Downloading
+#> ℹ Reading data
+#> ℹ Transforming data
+```
-# first, retrieve the list of available CZSO datasets, filtering for mzda/mzdy
-get_czso_catalogue(title_filter = "mzd[ay]")
-#> Reading full list of all datasets available on data.gov.cz...
-#> Filtering...
-#> # A tibble: 2 x 9
-#> czso_id provider title description dataset topic update_frequency
-#>
-#> 1 110079 Český s… Zamě… Datová sad… https:… čtvrtletní
-#> 2 110080 Český s… Prům… Datová sad… https:… roční
-#> # … with 2 more variables: spatial_coverage , keywords
+``` r
+catalogue %>%
+ filter(str_detect(title, "[Mm]zd[ay]")) %>%
+ select(dataset_id, title, description)
+#> # A tibble: 2 x 3
+#> dataset_id title description
+#>
+#> 1 110080 Průměrná hrubá měsíční mzd… Datová sada obsahuje časovou řadu prům…
+#> 2 110079 Zaměstnanci a průměrné hru… Datová sada obsahuje časovou řadu počt…
```
-We can see the `czso_id` for the required dataset - now use it to get
+We can see the `dataset_id` for the required dataset - now use it to get
the dataset:
``` r
@@ -90,31 +99,6 @@ get_czso_table("110080")
#> # POHLAVI_txt
```
-Alternatively, you could store the whole CZSO catalogue in an object and
-filter yourself. This is especially useful if you expect to need
-multiple tries.
-
-``` r
-library(dplyr, warn.conflicts = F)
-library(stringr, warn.conflicts = F)
-catalogue <- get_czso_catalogue()
-#> File already in /var/folders/c8/pj33jytj233g8vr0tw4b2h7m0000gn/T//RtmpypkVs8/czso, not downloading. Set `force_redownload` to TRUE if needed.
-#> Reading full list of all datasets available on data.gov.cz...
-#> Filtering...
-
-catalogue %>%
- filter(str_detect(title, "mzda"))
-#> # A tibble: 1 x 9
-#> czso_id provider title description dataset topic update_frequency
-#>
-#> 1 110080 Český s… Prům… Datová sad… https:… roční
-#> # … with 2 more variables: spatial_coverage , keywords
-```
-
-The latter allows you to search through the list - or simply look
-through it - without the overhead of reusing the `get_dataset()`
-function which downloads and transforms the underlying data.
-
## Credit and notes
- not created or endorsed by the Czech Statistical Office, though
@@ -131,6 +115,11 @@ function which downloads and transforms the underlying data.
different system for `get_catalogue()`. Hence, *do not use this
package for harvesting large numbers of datasets from the CZSO.*
+Thanks to @jakubklimek and @martinnecasky for [helping me figure
+out](https://github.com/opendata-mvcr/nkod/issues/19) the [SPARQL
+endpoint](https://data.gov.cz/sparql) on the Czech National Open Data
+Catalogue.
+
## See also
This package takes inspiration from the packages
@@ -147,5 +136,8 @@ For Czech geospatial data, see
For Czech fiscal data, see
[statnipokladna](https://github.com/petrbouchal/statnipokladna).
+For various transparency disclosures, see [Hlídač
+státu](https://hlidacstatu.cz).
+
For access to some of Prague’s open geospatial data in R, see
[pragr](https://github.com/petrbouchal/pragr).
diff --git a/_pkgdown.yml b/_pkgdown.yml
index a1d453d..79c38f5 100644
--- a/_pkgdown.yml
+++ b/_pkgdown.yml
@@ -1,2 +1,10 @@
destination: docs
url: https://petrbouchal.github.io/czso
+template:
+ params:
+ ganalytics: UA-156460113-4
+reference:
+- title: Core workflow
+ desc: Functions for executing the core workflow.
+ contents:
+ - has_concept("Core workflow")
diff --git a/man/czso-package.Rd b/man/czso-package.Rd
index 47050ea..ea2ef42 100644
--- a/man/czso-package.Rd
+++ b/man/czso-package.Rd
@@ -8,8 +8,8 @@
\description{
\if{html}{\figure{logo.png}{options: align='right' alt='logo' width='120'}}
-Get programmatic access to the open data provided
- by the Czech Statistical Office (CZSO).
+Get programmatic access to the open data provided by the
+ Czech Statistical Office (CZSO).
}
\seealso{
Useful links:
diff --git a/man/get_catalogue.Rd b/man/get_catalogue.Rd
index d4dad8b..f9dca05 100644
--- a/man/get_catalogue.Rd
+++ b/man/get_catalogue.Rd
@@ -4,27 +4,7 @@
\alias{get_catalogue}
\title{Deprecated: Retrieve and read dataset from CZSO}
\usage{
-get_catalogue(
- provider = "\\\\u010cesk\\\\u00fd statistick\\\\u00fd \\\\u00fa\\\\u0159ad",
- title_filter = NULL,
- description_filter = NULL,
- keyword_filter = NULL,
- provider_filter = NULL,
- force_redownload = F
-)
-}
-\arguments{
-\item{provider}{character, can be of length > 1. Provider to select for. Defaults to (the Czech name of) CZSO. Must be exact match. If set to NULL, returns full list of all datasets.}
-
-\item{title_filter}{character, text to use for filtering the set by title. Case sensitive. Can be a regular expression.}
-
-\item{description_filter}{character, text to use for filtering the set by description. Case sensitive. Can be a regular expression.}
-
-\item{keyword_filter}{character, text to use for filtering the set by keyword. Case sensitive. Can be a regular expression.}
-
-\item{provider_filter}{character, text to use for filtering the set by provider Case sensitive. Can be a regular expression.}
-
-\item{force_redownload}{integer. Whether to redownload data source file even if already cached. Defaults to FALSE.}
+get_catalogue()
}
\value{
a tibble
diff --git a/man/get_czso_catalogue.Rd b/man/get_czso_catalogue.Rd
index 079222d..db3fca3 100644
--- a/man/get_czso_catalogue.Rd
+++ b/man/get_czso_catalogue.Rd
@@ -4,55 +4,20 @@
\alias{get_czso_catalogue}
\title{Get catalogue of open CZSO datasets}
\usage{
-get_czso_catalogue(
- provider = "\\\\u010cesk\\\\u00fd statistick\\\\u00fd \\\\u00fa\\\\u0159ad",
- title_filter = NULL,
- description_filter = NULL,
- keyword_filter = NULL,
- provider_filter = NULL,
- force_redownload = F
-)
-}
-\arguments{
-\item{provider}{character, can be of length > 1. Provider to select for. Defaults to (the Czech name of) CZSO. Must be exact match. If set to NULL, returns full list of all datasets.}
-
-\item{title_filter}{character, text to use for filtering the set by title. Case sensitive. Can be a regular expression.}
-
-\item{description_filter}{character, text to use for filtering the set by description. Case sensitive. Can be a regular expression.}
-
-\item{keyword_filter}{character, text to use for filtering the set by keyword. Case sensitive. Can be a regular expression.}
-
-\item{provider_filter}{character, text to use for filtering the set by provider Case sensitive. Can be a regular expression.}
-
-\item{force_redownload}{integer. Whether to redownload data source file even if already cached. Defaults to FALSE.}
+get_czso_catalogue()
}
\value{
-a data frame. If \code{provider} param is left to default, contains a column called czso_id, which can be used as dataset_id parameter in get_table().
+a data frame with details on all CZSO datasets available in the Czech National Open Data Catalogue.
}
\description{
-Downloads and processes a list of all registered Czech open data datasets,
-returning (by default) those accessible through get_table() from the CZSO.
+Retrieves a list of all CZSO's open datasets available from the Czech Open data catalogue.
}
\details{
-If \code{provider} is NULL, returns the whole list, without CZSO-specific identifier
-usable in \code{get_table()}.
-
-If \code{provider} is left unset, returns data frame listing CZSO's datasets, with a
-\code{czso_id} column usable in \code{get_table}.
-
-Other values of \code{provider} must be exact matches. Use \code{provider_filter} for text/regex matching.
-
-All \verb{*_filter} arguments are case sensitive and can be regular expressions.
-
-Original data: https://data.gov.cz/soubor/datov\%C3\%A9-sady.csv
+Use the dataset_id column as an argument to \code{get_czso_table()}.
}
\examples{
\dontrun{
get_czso_catalogue()
-get_czso_catalogue(NULL)
-get_czso_catalogue(title_filter = "[Mm]zd[ay]")
-get_czso_catalogue(provider = "Ministerstvo vnitra")
-get_czso_catalogue(provider_filter = "[Mm]inisterstvo")
}
}
\seealso{
diff --git a/prep/datagovcz_apiback.R b/prep/datagovcz_apiback.R
index 4b78165..42734e0 100644
--- a/prep/datagovcz_apiback.R
+++ b/prep/datagovcz_apiback.R
@@ -28,6 +28,37 @@ titles <- nn %>% map(1) %>%
dist <- GET(nn[[9]][[1]]$`http://www.w3.org/ns/dcat#distribution`[[1]]$value, accept_json()) %>%
content()
+GET("https://data.gov.cz/api/v1/resource/distribution?iri=https://data.gov.cz/zdroj/datov%C3%A9-sady/http---vdb.czso.cz-pll-eweb-package_show-id-110080/distribuce/ca24b1ef468d717e0d8f14fa366e1a12",
+ accept_json()) %>%
+ content() %>%
+ pluck("jsonld", 2) %>%
+ pluck("http://www.w3.org/ns/dcat#downloadURL", 1, "@id")
+
+
+ss <- GET("https://data.gov.cz/zdroj/lok%C3%A1ln%C3%AD-katalogy/CSttstckyU/214608232", accept_json()) %>%
+ content()
+tt <- ss[[1]][[6]]
+urls <- map_chr(tt, 2)
+
+nn <- urls[400] %>% map(~GET(.x, accept_json()) %>% content(auto_unbox = T))
+
+zzz <- nn %>%
+ pluck(1) %>%
+ purrr::set_names(str_extract, "[:alnum:]*$") %>%
+ map(~purrr::set_names(.x, str_extract, "[:alnum:]*$")) %>%
+ pluck(1) %>%
+ map_dfc(~pluck(.x, 1, "value"))
+
+
+
+ddd <- GET("https://data.gov.cz/zdroj/datov%C3%A9-sady/http---vdb.czso.cz-pll-eweb-package_show-id-110080/distribuce/ca24b1ef468d717e0d8f14fa366e1a12",
+ accept_json()) %>%
+ content()
+
+ddd %>%
+ pluck(1) %>%
+ map_df(~pluck(.x, 1, "value")) %>%
+ purrr::set_names(str_extract, "[:alnum:]*$")
diff --git a/prep/datagovcz_apiback.R.orig b/prep/datagovcz_apiback.R.orig
new file mode 100644
index 0000000..b083dab
--- /dev/null
+++ b/prep/datagovcz_apiback.R.orig
@@ -0,0 +1,67 @@
+library(httr)
+library(tidyverse)
+library(jsonlite)
+
+ss <- GET("https://data.gov.cz/zdroj/lok%C3%A1ln%C3%AD-katalogy/CSttstckyU/214608232", accept_json()) %>%
+ content()
+tt <- ss[[1]][[6]]
+urls <- map_chr(tt, 2)
+
+nn <- urls %>% map(~GET(.x, accept_json()) %>% content(auto_unbox = T))
+names(nn[[1]][[1]])
+
+nn[[1]][[1]]$`http://www.w3.org/ns/dcat#keyword` %>%
+ map(`[[`, 'value') %>%
+ map_chr(1)
+nn[[1]][[1]]$`http://www.w3.org/ns/dcat#distribution` %>%
+ map(`[[`, 'value') %>%
+ map_chr(1)
+
+nn %>% map(names)
+innams <- nn[[1]] %>% map_dfc(names)
+
+titles <- nn %>% map(1) %>%
+ map(`[[`, "http://purl.org/dc/terms/title") %>%
+ map(1) %>%
+ map_chr(`[[`, "value")
+
+dist <- GET(nn[[9]][[1]]$`http://www.w3.org/ns/dcat#distribution`[[1]]$value, accept_json()) %>%
+ content()
+
+<<<<<<< HEAD
+GET("https://data.gov.cz/api/v1/resource/distribution?iri=https://data.gov.cz/zdroj/datov%C3%A9-sady/http---vdb.czso.cz-pll-eweb-package_show-id-110080/distribuce/ca24b1ef468d717e0d8f14fa366e1a12",
+ accept_json()) %>%
+ content() %>%
+ pluck("jsonld", 2) %>%
+ pluck("http://www.w3.org/ns/dcat#downloadURL", 1, "@id")
+
+
+ss <- GET("https://data.gov.cz/zdroj/lok%C3%A1ln%C3%AD-katalogy/CSttstckyU/214608232", accept_json()) %>%
+ content()
+tt <- ss[[1]][[6]]
+urls <- map_chr(tt, 2)
+
+nn <- urls[400] %>% map(~GET(.x, accept_json()) %>% content(auto_unbox = T))
+
+zzz <- nn %>%
+ pluck(1) %>%
+ purrr::set_names(str_extract, "[:alnum:]*$") %>%
+ map(~purrr::set_names(.x, str_extract, "[:alnum:]*$")) %>%
+ pluck(1) %>%
+ map_dfc(~pluck(.x, 1, "value"))
+
+
+
+ddd <- GET("https://data.gov.cz/zdroj/datov%C3%A9-sady/http---vdb.czso.cz-pll-eweb-package_show-id-110080/distribuce/ca24b1ef468d717e0d8f14fa366e1a12",
+ accept_json()) %>%
+ content()
+
+ddd %>%
+ pluck(1) %>%
+ map_df(~pluck(.x, 1, "value")) %>%
+ purrr::set_names(str_extract, "[:alnum:]*$")
+=======
+>>>>>>> master
+
+
+
diff --git a/prep/komplet.sql b/prep/komplet.sql
new file mode 100644
index 0000000..e6ca6af
--- /dev/null
+++ b/prep/komplet.sql
@@ -0,0 +1,48 @@
+PREFIX foaf:
+PREFIX dcterms:
+PREFIX dcat:
+PREFIX rdf:
+PREFIX rdfs:
+
+SELECT ?dataset_iri
+?dataset_id
+?title ?provider ?description
+?spatial
+?temporal
+?modified
+?page
+?periodicity
+?periodicity_abb
+?start
+?end
+?keyword
+WHERE {
+ GRAPH ?g {
+ ?dataset_iri a dcat:Dataset .
+ ?dataset_iri dcterms:title ?title .
+ ?dataset_iri dcterms:description ?description .
+ ?dataset_iri dcterms:publisher ?publisher .
+ OPTIONAL { ?dataset_iri dcterms:identifier ?dataset_id .}
+ OPTIONAL { ?dataset_iri dcterms:spatial ?spatial .}
+ OPTIONAL { ?dataset_iri foaf:page ?page.}
+ OPTIONAL { ?dataset_iri dcterms:temporal ?temporal .}
+ OPTIONAL { ?dataset_iri dcterms:modified ?modified .}
+ OPTIONAL { ?dataset_iri dcat:keyword ?keyword .}
+ OPTIONAL { ?dataset_iri dcterms:accrualPeriodicity ?periodicity .}
+ OPTIONAL { ?dataset_iri ?periodicity_abb .}
+
+ ?publisher foaf:name ?provider .
+
+ OPTIONAL { ?temporal schema:startDate ?start .}
+ OPTIONAL { ?temporal schema:endDate ?end .}
+
+ VALUES ?publisher {
+ # IRI pro CZSO
+ # IRI pro PlzKraj
+ }
+
+ FILTER(lang(?provider) = "cs")
+ FILTER(lang(?keyword) = "cs")
+ FILTER(lang(?title) = "cs")
+ }
+}
diff --git a/prep/list_datasets_simple.sql b/prep/list_datasets_simple.sql
new file mode 100644
index 0000000..346b04c
--- /dev/null
+++ b/prep/list_datasets_simple.sql
@@ -0,0 +1,20 @@
+PREFIX foaf:
+ PREFIX dcterms:
+ PREFIX dcat:
+PREFIX rdf:
+PREFIX rdfs:
+
+SELECT DISTINCT ?title ?publisher WHERE {
+ GRAPH ?g {
+ ?d a dcat:Dataset
+ ?d dcterms:publisher ?publisher .
+ ?d dcterms:title ?title
+
+ FILTER(lang(?poskytovatel) = "cs")
+ VALUES ?publisher {
+
+
+ }
+ }
+
+}
diff --git a/prep/partial_label.sql b/prep/partial_label.sql
new file mode 100644
index 0000000..f58efa9
--- /dev/null
+++ b/prep/partial_label.sql
@@ -0,0 +1,17 @@
+PREFIX foaf:
+PREFIX dcterms:
+PREFIX skos:
+PREFIX dcat:
+PREFIX rdf:
+PREFIX rdfs:
+
+SELECT ?polozka ?label
+WHERE {
+ GRAPH ?g {
+
+ ?polozka skos:inScheme .
+ ?polozka skos:prefLabel ?label
+ FILTER(lang(?label) = "cs")
+
+ }
+}
diff --git a/prep/sparql-links.md b/prep/sparql-links.md
new file mode 100644
index 0000000..db866c3
--- /dev/null
+++ b/prep/sparql-links.md
@@ -0,0 +1,3 @@
+http://www.iro.umontreal.ca/~lapalme/ift6281/sparql-1_1-cheat-sheet.pdf
+https://www.youtube.com/watch?v=FvGndkpa4K0
+https://en.wikibooks.org/wiki/SPARQL/
diff --git a/prep/sparql_workflow.R b/prep/sparql_workflow.R
new file mode 100644
index 0000000..2075226
--- /dev/null
+++ b/prep/sparql_workflow.R
@@ -0,0 +1,190 @@
+library(httr)
+library(jsonlite)
+library(stringr)
+library(readr)
+
+url <- "https://data.gov.cz/sparql"
+
+# Všechny datasety jednoho providera, podle IRI ---------------------------
+
+sparqlquery_datasets_provider <- str_glue(
+ "PREFIX foaf:
+ PREFIX dcterms:
+ PREFIX dcat:
+ PREFIX rdf:
+ PREFIX rdfs:
+
+ SELECT ?dataset_iri ?title ?provider ?description ?spatial ?issued ?periodicity WHERE {{
+ GRAPH ?g {{
+ ?dataset_iri a dcat:Dataset ;
+ dcterms:title ?title ;
+ dcterms:spatial ?spatial ;
+ dcterms:issued ?issued ;
+ dcterms:accrualPeriodicity ?periodicity ;
+ dcterms:description ?description ;
+ dcterms:publisher ?publisher .
+
+ ?publisher foaf:name ?provider .
+
+ VALUES ?publisher {{
+ # IRI pro CZSO
+ }}
+ FILTER(lang(?provider) = \"cs\")
+ FILTER(lang(?title) = \"cs\")
+ }}
+}}")
+
+# Všechny datasety jednoho providera, podle názvu (přesně) ----------------
+
+sparqlquery_datasets_provider_name <- str_glue(
+ "PREFIX foaf:
+ PREFIX dcterms:
+ PREFIX dcat:
+ PREFIX rdf:
+ PREFIX rdfs:
+
+ SELECT ?dataset ?název ?provider ?popis WHERE {{
+ GRAPH ?g {{
+ ?dataset a dcat:Dataset ;
+ dcterms:title ?název ;
+ dcterms:description ?popis ;
+ dcterms:publisher ?publisher .
+
+ ?publisher foaf:name ?provider .
+
+ }}
+ VALUES ?poskytovatel {{
+ \"Ministerstvo vnitra\"@cs # IRI pro CZSO
+ }}
+ FILTER(lang(?poskytovatel) = \"cs\")
+ FILTER(lang(?název) = \"cs\")
+}}")
+
+params = list(`default-graph-uri` = "",
+ query = sparqlquery_datasets_provider,
+ # format = "application/sparql-results+json",
+ format = "text/csv",
+ timeout = 0,
+ debug = "on",
+ run = "Run Query")
+
+plz <- httr::GET(url, query = params,
+ # accept("application/sparql-results+json"),
+ add_headers(c("Accept-Charset" = "utf-8")),
+ accept("text/csv;charset=UTF-8")
+)
+
+plz %>% stop_for_status()
+
+plz$request$headers
+
+plz$headers$`content-type`
+plzd <- plz %>% content(as = "text")
+
+plzd <- plz %>% content(as = "text") %>%
+ read_csv()
+
+plzd$results$bindings %>% names()
+plzd$results$bindings %>% head()
+
+# Všechny distribuce jednoho datasetu, podle IRI --------------------------
+
+sparqlquery_distribs_dataset <- str_glue(
+ "PREFIX foaf:
+ PREFIX dcterms:
+ PREFIX dcat:
+ PREFIX rdf:
+ PREFIX rdfs:
+
+
+
+ SELECT ?url, ?format WHERE {{
+ GRAPH ?g {{
+ ?dataset a dcat:Dataset ;
+ dcat:distribution ?distribution .
+
+ ?distribution dcat:downloadURL ?url .
+ ?distribution dct:format ?format .
+
+ VALUES ?dataset {{
+ # IRI pro dataset
+ }}
+ }}
+ }}"
+)
+
+url <- "https://data.gov.cz/sparql"
+params_ds = list(`default-graph-uri` = "",
+ query = sparqlquery_distribs_dataset,
+ # format = "application/sparql-results+json",
+ format = "text/csv",
+ timeout = 0,
+ debug = "on",
+ run = "Run Query")
+
+ds <- httr::GET(url, query = params_ds,
+ # accept("application/sparql-results+json"),
+ config = add_headers(c("Accept-charset" = "utf-8"))
+) %>%
+ stop_for_status()
+
+# ds$headers$`content-type`
+# ds$status_code
+#
+# dst <- ds %>% content(as = "text") %>%
+# fromJSON()
+
+dst <- ds %>%
+ content(as = "text") %>%
+ read_csv()
+
+ss <- GET("https://data.gov.cz/zdroj/lok%C3%A1ln%C3%AD-katalogy/CSttstckyU/214608232", accept_json()) %>%
+ content()
+tt <- ss[[1]][[6]]
+s <- map_chr(tt, 2)
+
+# Všichni providers -------------------------------------------------------
+
+sparqlquery_providers <- "PREFIX foaf:
+PREFIX dcterms:
+PREFIX dcat:
+PREFIX rdf:
+PREFIX rdfs:
+
+SELECT DISTINCT ?poskytovatel ?publisher WHERE {
+ GRAPH ?g {
+
+ ?publisher foaf:name ?poskytovatel .
+
+ FILTER(lang(?poskytovatel) = \"cs\")
+ FILTER(?poskytovatel = \"Ministerstvo vnitra\"@cs)
+ }
+
+}"
+
+params_prv = list(`default-graph-uri` = "",
+ query = sparqlquery_providers,
+ # format = "application/sparql-results+json",
+ format = "text/csv",
+ timeout = 0,
+ debug = "on",
+ run = "Run Query")
+
+prv <- httr::GET(url, query = params_prv,
+ # accept("application/sparql-results+json"),
+ config = add_headers(c("Accept-charset" = "utf-8"))
+) %>%
+ stop_for_status()
+
+# ds$headers$`content-type`
+# ds$status_code
+#
+# dst <- ds %>% content(as = "text") %>%
+# fromJSON()
+
+prvt <- prv %>%
+ content(as = "text") %>%
+ read_csv()
+
+
+
diff --git a/renv.lock b/renv.lock
index 2ecb5e1..4b0c387 100644
--- a/renv.lock
+++ b/renv.lock
@@ -1,6 +1,6 @@
{
"R": {
- "Version": "3.6.0",
+ "Version": "3.6.2",
"Repositories": [
{
"Name": "CRAN",
@@ -158,6 +158,13 @@
"Repository": "CRAN",
"Hash": "08cf4045c149a0f0eaf405324c7495bd"
},
+ "clisymbols": {
+ "Package": "clisymbols",
+ "Version": "1.2.0",
+ "Source": "Repository",
+ "Repository": "CRAN",
+ "Hash": "96c01552bfd5661b9bbdefbc762f4bcd"
+ },
"colorspace": {
"Package": "colorspace",
"Version": "1.4-1",
@@ -306,6 +313,20 @@
"RemoteSha": "d05e437eb3cd8dd52a4aeed8298a6efba23c1d1f",
"Hash": "93bf157d44cfd990a33b36e78c0317cb"
},
+ "gh": {
+ "Package": "gh",
+ "Version": "1.0.1",
+ "Source": "Repository",
+ "Repository": "CRAN",
+ "Hash": "4da58d15239da30de8c7de088250d3be"
+ },
+ "git2r": {
+ "Package": "git2r",
+ "Version": "0.26.1",
+ "Source": "Repository",
+ "Repository": "CRAN",
+ "Hash": "135db4dbc94ed18f629ff8843a8064b7"
+ },
"glue": {
"Package": "glue",
"Version": "1.3.1.9000",
@@ -374,6 +395,13 @@
"Repository": "CRAN",
"Hash": "7146fea4685b4252ebf478978c75f597"
},
+ "ini": {
+ "Package": "ini",
+ "Version": "0.3.1",
+ "Source": "Repository",
+ "Repository": "CRAN",
+ "Hash": "6154ec2223172bce8162d4153cda21f7"
+ },
"isoband": {
"Package": "isoband",
"Version": "0.2.0",
@@ -828,6 +856,13 @@
"Repository": "CRAN",
"Hash": "35c5245622df501b51263b565d18c053"
},
+ "usethis": {
+ "Package": "usethis",
+ "Version": "1.5.1",
+ "Source": "Repository",
+ "Repository": "CRAN",
+ "Hash": "30ee6fa315a020d5db6f28adbb7fea83"
+ },
"utf8": {
"Package": "utf8",
"Version": "1.1.4",