diff --git a/DESCRIPTION b/DESCRIPTION index 2bee96c..dd9ae25 100644 --- a/DESCRIPTION +++ b/DESCRIPTION @@ -1,18 +1,19 @@ Package: czso Title: Use Open Data from the Czech Statistical Office in R -Version: 0.1.4 +Version: 0.1.5 Authors@R: person(given = "Petr", family = "Bouchal", role = c("aut", "cre"), email = "pbouchal@gmail.com") -Description: Get programmatic access to the open data provided - by the Czech Statistical Office (CZSO). +Description: Get programmatic access to the open data provided by the + Czech Statistical Office (CZSO). License: MIT + file LICENSE URL: https://github.com/petrbouchal/czso BugReports: https://github.com/petrbouchal/czso/issues Imports: dplyr (>= 0.8.3), + httr (>= 1.4.1), jsonlite (>= 1.6), lifecycle, magrittr, @@ -20,6 +21,7 @@ Imports: stringi (>= 1.4.4), stringr (>= 1.4.0), tools (>= 3.6.0), + usethis (>= 1.5.1), utils (>= 3.6.0), vroom (>= 1.0.2) RdMacros: diff --git a/NEWS.md b/NEWS.md index cf0c329..896eea7 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,3 +1,10 @@ +# czso 0.1.5 + +## Improvements + +* get_czso_catalogue() is now much faster as it uses the open data catalogue's API instead of downloading a huge CSV list of all datasets. It is less flexible as it does not allow direct filtering. +* handle encoding of some older datasets, which may not be UTF-8 + # czso 0.1.4 * relaxed stringi version requirement to make Win build work diff --git a/R/core.R b/R/core.R index 5ef9de8..f68ae5f 100644 --- a/R/core.R +++ b/R/core.R @@ -1,102 +1,101 @@ #' Get catalogue of open CZSO datasets #' -#' Downloads and processes a list of all registered Czech open data datasets, -#' returning (by default) those accessible through get_table() from the CZSO. +#' Retrieves a list of all CZSO's open datasets available from the Czech Open data catalogue. #' -#' If `provider` is NULL, returns the whole list, without CZSO-specific identifier -#' usable in `get_table()`. +#' Use the dataset_id column as an argument to `get_czso_table()`. 
#' -#' If `provider` is left unset, returns data frame listing CZSO's datasets, with a -#' `czso_id` column usable in `get_table`. -#' -#' Other values of `provider` must be exact matches. Use `provider_filter` for text/regex matching. -#' -#' All `*_filter` arguments are case sensitive and can be regular expressions. -#' -#' Original data: https://data.gov.cz/soubor/datov%C3%A9-sady.csv -#' -#' @param provider character, can be of length > 1. Provider to select for. Defaults to (the Czech name of) CZSO. Must be exact match. If set to NULL, returns full list of all datasets. -#' @param title_filter character, text to use for filtering the set by title. Case sensitive. Can be a regular expression. -#' @param description_filter character, text to use for filtering the set by description. Case sensitive. Can be a regular expression. -#' @param keyword_filter character, text to use for filtering the set by keyword. Case sensitive. Can be a regular expression. -#' @param provider_filter character, text to use for filtering the set by provider Case sensitive. Can be a regular expression. -#' @param force_redownload integer. Whether to redownload data source file even if already cached. Defaults to FALSE. -#' @return a data frame. If `provider` param is left to default, contains a column called czso_id, which can be used as dataset_id parameter in get_table(). +#' @return a data frame with details on all CZSO datasets available in the Czech National Open Data Catalogue. 
#' @export #' @family Core workflow #' @examples #' \dontrun{ #' get_czso_catalogue() -#' get_czso_catalogue(NULL) -#' get_czso_catalogue(title_filter = "[Mm]zd[ay]") -#' get_czso_catalogue(provider = "Ministerstvo vnitra") -#' get_czso_catalogue(provider_filter = "[Mm]inisterstvo") #' } -get_czso_catalogue <- function(provider = "\\u010cesk\\u00fd statistick\\u00fd \\u00fa\\u0159ad", - title_filter = NULL, - description_filter = NULL, - keyword_filter = NULL, - provider_filter = NULL, - force_redownload = F) - { - if(!is.null(provider)) - provider_uni <- stringi::stri_unescape_unicode(provider) - else provider_uni <- NULL - td <- paste(tempdir(), "czso", sep = "/") - dir.create(td, showWarnings = F, recursive = T) - tf <- paste0(td, "/", "dataset_list.csv") - if(file.exists(tf) & !force_redownload) { - message(stringr::str_glue("File already in {td}, not downloading. Set `force_redownload` to TRUE if needed.")) - } else { - utils::download.file("https://data.gov.cz/soubor/datov%C3%A9-sady.csv", tf, headers = ua_header) - } - message("Reading full list of all datasets available on data.gov.cz...") - dslist0 <- suppressWarnings(suppressMessages(vroom::vroom(tf, num_threads = 1, - col_types = readr::cols(.default = "c")))) %>% - dplyr::rename_all(~stringi::stri_trans_general(., "latin-ascii")) %>% - dplyr::select(provider = poskytovatel, - title = nazev, description = popis, dataset = datova_sada, - keywords0 = klicova_slova, topic = tema, - update_frequency = periodicita_aktualizace, - spatial_coverage = prostorove_pokryti) - if(is.null(provider)) { - dslist <- dslist0 %>% - dplyr::group_by(dataset) %>% - dplyr::mutate(keywords = stringr::str_c(keywords0, collapse = "; ")) %>% - dplyr::ungroup() %>% - dplyr::select(-keywords0) %>% - dplyr::distinct() - } else { - message("Filtering...") - dslist <- dslist0 %>% - dplyr::filter(.$provider %in% provider_uni) %>% - dplyr::group_by(dataset) %>% - dplyr::mutate(keywords = stringr::str_c(keywords0, collapse = "; ")) %>% - 
dplyr::ungroup() %>% - dplyr::select(-keywords0) %>% - dplyr::distinct() - if(provider == "\\u010cesk\\u00fd statistick\\u00fd \\u00fa\\u0159ad") { - dslist <- dslist %>% - dplyr::mutate(czso_id = stringr::str_extract(dataset, "(?<=package_show-id-).*$")) %>% - dplyr::select(czso_id, -provider, dplyr::everything()) - } - } - - if(!is.null(title_filter)) { - dslist <- dplyr::filter(dslist, stringr::str_detect(title, title_filter)) - } - if(!is.null(description_filter)) { - dslist <- dplyr::filter(dslist, stringr::str_detect(description, description_filter)) - } - if(!is.null(keyword_filter)) { - dslist <- dplyr::filter(dslist, stringr::str_detect(keyword_filter, description_filter)) - } - if(!is.null(provider_filter)) { - dslist <- dplyr::filter(dslist, stringr::str_detect(provider, description_filter)) - } - - return(dslist) +get_czso_catalogue <- function() { + + sparql_url <- "https://data.gov.cz/sparql" + + sparqlquery_datasets_byczso <- stringr::str_glue( + "PREFIX foaf: + PREFIX dcterms: + PREFIX dcat: + PREFIX rdf: + PREFIX rdfs: + + SELECT ?dataset_iri + ?dataset_id + ?title + ?provider + ?description + ?spatial + ?temporal + ?modified + ?page + ?periodicity + ?periodicity_abb + ?start + ?end + ?keywords_all + WHERE {{ + GRAPH ?g {{ + ?dataset_iri a dcat:Dataset . + ?dataset_iri dcterms:publisher ?publisher . + ?dataset_iri dcterms:title ?title . + ?dataset_iri dcterms:description ?description . + OPTIONAL {{ ?dataset_iri dcterms:identifier ?dataset_id .}} + OPTIONAL {{ ?dataset_iri dcterms:spatial ?spatial .}} + OPTIONAL {{ ?dataset_iri foaf:page ?page.}} + OPTIONAL {{ ?dataset_iri dcterms:temporal ?temporal .}} + OPTIONAL {{ ?dataset_iri dcterms:modified ?modified .}} + OPTIONAL {{ ?dataset_iri dcat:keyword ?keywords_all .}} + OPTIONAL {{ ?dataset_iri dcterms:accrualPeriodicity ?periodicity .}} + OPTIONAL {{ ?dataset_iri ?periodicity_abb .}} + + ?publisher foaf:name ?provider . 
+ + OPTIONAL {{ ?temporal schema:startDate ?start .}} + OPTIONAL {{ ?temporal schema:endDate ?end .}} + + VALUES ?publisher {{ + # IRI pro CZSO + # # IRI pro Prahu + }} + FILTER(lang(?provider) = \"cs\") + FILTER(lang(?keywords_all) = \"cs\") + FILTER(lang(?title) = \"cs\") + }} + }}") %>% stringi::stri_unescape_unicode() + + params = list(`default-graph-uri` = "", + query = sparqlquery_datasets_byczso, + # format = "application/sparql-results+json", + format = "text/csv", + timeout = 30000, + debug = "on", + run = "Run Query") + usethis::ui_info("Downloading") + cat_rslt <- httr::GET(sparql_url, query = params, + # accept("application/sparql-results+json"), + httr::add_headers(c("Accept-Charset" = "utf-8")), + httr::accept("text/csv;charset=UTF-8")) + + # print(params$query) + + usethis::ui_info("Reading data") + if(httr::status_code(cat_rslt) > 200) { + print(httr::http_status(cat_rslt)) + rslt <- httr::content(cat_rslt, as = "text") + } else + rslt <- cat_rslt %>% httr::content(as = "text") + rslt <- readr::read_csv(rslt, col_types = readr::cols(modified = "T")) + usethis::ui_info("Transforming data") + rslt <- dplyr::group_by(rslt, dataset_iri) %>% + dplyr::mutate(keywords = stringr::str_c(keywords_all, collapse = "; ")) %>% + dplyr::ungroup() %>% + dplyr::select(-keywords_all) %>% + dplyr::distinct() + return(rslt) } #' Deprecated: Retrieve and read dataset from CZSO @@ -109,19 +108,9 @@ get_czso_catalogue <- function(provider = "\\u010cesk\\u00fd statistick\\u00fd \ #' @examples #' # see `get_czso_catalogue()` #' @export -get_catalogue <- function(provider = "\\u010cesk\\u00fd statistick\\u00fd \\u00fa\\u0159ad", - title_filter = NULL, - description_filter = NULL, - keyword_filter = NULL, - provider_filter = NULL, - force_redownload = F) { +get_catalogue <- function() { .Deprecated("get_czso_catalogue") - get_czso_catalogue(provider = provider, - title_filter = title_filter, - description_filter = description_filter, - keyword_filter = keyword_filter, - 
provider_filter = provider_filter, - force_redownload = force_redownload) + get_czso_catalogue() } get_czso_dataset_metadata <- function(dataset_id) { @@ -194,11 +183,14 @@ get_czso_table <- function(dataset_id, resource_num = 1, force_redownload = F) { } switch (action, read = { + guessed_enc <- readr::guess_encoding(dfile)[[1,1]] + if(guessed_enc == "windows-1252") guessed_enc <- "windows-1250" dt <- suppressWarnings(suppressMessages(readr::read_csv(dfile, col_types = readr::cols(.default = "c", - rok = "i", - casref_do = "T", - ctvrtleti = "i", - hodnota = "d")))) + rok = "i", + casref_do = "T", + ctvrtleti = "i", + hodnota = "d"), + locale = readr::locale(encoding = guessed_enc)))) rtrn <- dt }, listone = { diff --git a/README.Rmd b/README.Rmd index 5ba368b..63a74af 100644 --- a/README.Rmd +++ b/README.Rmd @@ -47,40 +47,39 @@ remotes::install_github("petrbouchal/czso") ## Example -Imagine you are looking for a dataset whose title refers to wages (mzda/mzdy): +Say you are looking for a dataset whose title refers to wages (mzda/mzdy): -```{r example} +First, retrieve the list of available CZSO datasets: + +```{r example-catalogue} library(czso) +suppressPackageStartupMessages(library(dplyr)) +suppressPackageStartupMessages(library(stringr)) -# first, retrieve the list of available CZSO datasets, filtering for mzda/mzdy -get_czso_catalogue(title_filter = "mzd[ay]") +catalogue <- get_czso_catalogue() ``` -We can see the `czso_id` for the required dataset - now use it to get the dataset: - -```{r example-cont} -get_czso_table("110080") +```{r example-filter} +catalogue %>% + filter(str_detect(title, "[Mm]zd[ay]")) %>% + select(dataset_id, title, description) ``` -Alternatively, you could store the whole CZSO catalogue in an object and filter yourself. This is especially useful if you expect to need multiple tries. 
-```{r example-alt} -library(dplyr, warn.conflicts = F) -library(stringr, warn.conflicts = F) -catalogue <- get_czso_catalogue() +We can see the `dataset_id` for the required dataset - now use it to get the dataset: -catalogue %>% - filter(str_detect(title, "mzda")) +```{r example-cont} +get_czso_table("110080") ``` -The latter allows you to search through the list - or simply look through it - without the overhead of reusing the `get_dataset()` function which downloads and transforms the underlying data. - ## Credit and notes - not created or endorsed by the Czech Statistical Office, though they, as well as [the open data team at the Ministry of Interior](https://data.gov.cz/) deserve credit for getting the data out there. - the package relies on the data.gov.cz catalogue of open data and on the CZSO's local catalogue - NB: The robots.txt at the domain hosting the CZSO's catalogue prohibits robots from accessing it; while this may be an inappropriate/erroneous setting for what is in essence a data API, this package tries to honor the spirit of that setting by only accessing the API once per `get_table()` call, relying on a different system for `get_catalogue()`. Hence, *do not use this package for harvesting large numbers of datasets from the CZSO.* +Thanks to @jakubklimek and @martinnecasky for [helping me figure out](https://github.com/opendata-mvcr/nkod/issues/19) the [SPARQL endpoint](https://data.gov.cz/sparql) on the Czech National Open Data Catalogue. + ## See also This package takes inspiration from the packages @@ -94,5 +93,7 @@ For Czech geospatial data, see [CzechData](https://github.com/JanCaha/CzechData/ For Czech fiscal data, see [statnipokladna](https://github.com/petrbouchal/statnipokladna). +For various transparency disclosures, see [Hlídač státu](https://hlidacstatu.cz). + For access to some of Prague's open geospatial data in R, see [pragr](https://github.com/petrbouchal/pragr). 
diff --git a/README.md b/README.md index fdc1496..6a81fb5 100644 --- a/README.md +++ b/README.md @@ -49,25 +49,34 @@ remotes::install_github("petrbouchal/czso") ## Example -Imagine you are looking for a dataset whose title refers to wages +Say you are looking for a dataset whose title refers to wages (mzda/mzdy): +First, retrieve the list of available CZSO datasets: + ``` r library(czso) +suppressPackageStartupMessages(library(dplyr)) +suppressPackageStartupMessages(library(stringr)) + +catalogue <- get_czso_catalogue() +#> ℹ Downloading +#> ℹ Reading data +#> ℹ Transforming data +``` -# first, retrieve the list of available CZSO datasets, filtering for mzda/mzdy -get_czso_catalogue(title_filter = "mzd[ay]") -#> Reading full list of all datasets available on data.gov.cz... -#> Filtering... -#> # A tibble: 2 x 9 -#> czso_id provider title description dataset topic update_frequency -#> -#> 1 110079 Český s… Zamě… Datová sad… https:… čtvrtletní -#> 2 110080 Český s… Prům… Datová sad… https:… roční -#> # … with 2 more variables: spatial_coverage , keywords +``` r +catalogue %>% + filter(str_detect(title, "[Mm]zd[ay]")) %>% + select(dataset_id, title, description) +#> # A tibble: 2 x 3 +#> dataset_id title description +#> +#> 1 110080 Průměrná hrubá měsíční mzd… Datová sada obsahuje časovou řadu prům… +#> 2 110079 Zaměstnanci a průměrné hru… Datová sada obsahuje časovou řadu počt… ``` -We can see the `czso_id` for the required dataset - now use it to get +We can see the `dataset_id` for the required dataset - now use it to get the dataset: ``` r @@ -90,31 +99,6 @@ get_czso_table("110080") #> # POHLAVI_txt ``` -Alternatively, you could store the whole CZSO catalogue in an object and -filter yourself. This is especially useful if you expect to need -multiple tries. 
- -``` r -library(dplyr, warn.conflicts = F) -library(stringr, warn.conflicts = F) -catalogue <- get_czso_catalogue() -#> File already in /var/folders/c8/pj33jytj233g8vr0tw4b2h7m0000gn/T//RtmpypkVs8/czso, not downloading. Set `force_redownload` to TRUE if needed. -#> Reading full list of all datasets available on data.gov.cz... -#> Filtering... - -catalogue %>% - filter(str_detect(title, "mzda")) -#> # A tibble: 1 x 9 -#> czso_id provider title description dataset topic update_frequency -#> -#> 1 110080 Český s… Prům… Datová sad… https:… roční -#> # … with 2 more variables: spatial_coverage , keywords -``` - -The latter allows you to search through the list - or simply look -through it - without the overhead of reusing the `get_dataset()` -function which downloads and transforms the underlying data. - ## Credit and notes - not created or endorsed by the Czech Statistical Office, though @@ -131,6 +115,11 @@ function which downloads and transforms the underlying data. different system for `get_catalogue()`. Hence, *do not use this package for harvesting large numbers of datasets from the CZSO.* +Thanks to @jakubklimek and @martinnecasky for [helping me figure +out](https://github.com/opendata-mvcr/nkod/issues/19) the [SPARQL +endpoint](https://data.gov.cz/sparql) on the Czech National Open Data +Catalogue. + ## See also This package takes inspiration from the packages @@ -147,5 +136,8 @@ For Czech geospatial data, see For Czech fiscal data, see [statnipokladna](https://github.com/petrbouchal/statnipokladna). +For various transparency disclosures, see [Hlídač +státu](https://hlidacstatu.cz). + For access to some of Prague’s open geospatial data in R, see [pragr](https://github.com/petrbouchal/pragr). 
diff --git a/_pkgdown.yml b/_pkgdown.yml index a1d453d..79c38f5 100644 --- a/_pkgdown.yml +++ b/_pkgdown.yml @@ -1,2 +1,10 @@ destination: docs url: https://petrbouchal.github.io/czso +template: + params: + ganalytics: UA-156460113-4 +reference: +- title: Core workflow + desc: Functions for executing the core workflow. + contents: + - has_concept("Core workflow") diff --git a/man/czso-package.Rd b/man/czso-package.Rd index 47050ea..ea2ef42 100644 --- a/man/czso-package.Rd +++ b/man/czso-package.Rd @@ -8,8 +8,8 @@ \description{ \if{html}{\figure{logo.png}{options: align='right' alt='logo' width='120'}} -Get programmatic access to the open data provided - by the Czech Statistical Office (CZSO). +Get programmatic access to the open data provided by the + Czech Statistical Office (CZSO). } \seealso{ Useful links: diff --git a/man/get_catalogue.Rd b/man/get_catalogue.Rd index d4dad8b..f9dca05 100644 --- a/man/get_catalogue.Rd +++ b/man/get_catalogue.Rd @@ -4,27 +4,7 @@ \alias{get_catalogue} \title{Deprecated: Retrieve and read dataset from CZSO} \usage{ -get_catalogue( - provider = "\\\\u010cesk\\\\u00fd statistick\\\\u00fd \\\\u00fa\\\\u0159ad", - title_filter = NULL, - description_filter = NULL, - keyword_filter = NULL, - provider_filter = NULL, - force_redownload = F -) -} -\arguments{ -\item{provider}{character, can be of length > 1. Provider to select for. Defaults to (the Czech name of) CZSO. Must be exact match. If set to NULL, returns full list of all datasets.} - -\item{title_filter}{character, text to use for filtering the set by title. Case sensitive. Can be a regular expression.} - -\item{description_filter}{character, text to use for filtering the set by description. Case sensitive. Can be a regular expression.} - -\item{keyword_filter}{character, text to use for filtering the set by keyword. Case sensitive. Can be a regular expression.} - -\item{provider_filter}{character, text to use for filtering the set by provider Case sensitive. 
Can be a regular expression.} - -\item{force_redownload}{integer. Whether to redownload data source file even if already cached. Defaults to FALSE.} +get_catalogue() } \value{ a tibble diff --git a/man/get_czso_catalogue.Rd b/man/get_czso_catalogue.Rd index 079222d..db3fca3 100644 --- a/man/get_czso_catalogue.Rd +++ b/man/get_czso_catalogue.Rd @@ -4,55 +4,20 @@ \alias{get_czso_catalogue} \title{Get catalogue of open CZSO datasets} \usage{ -get_czso_catalogue( - provider = "\\\\u010cesk\\\\u00fd statistick\\\\u00fd \\\\u00fa\\\\u0159ad", - title_filter = NULL, - description_filter = NULL, - keyword_filter = NULL, - provider_filter = NULL, - force_redownload = F -) -} -\arguments{ -\item{provider}{character, can be of length > 1. Provider to select for. Defaults to (the Czech name of) CZSO. Must be exact match. If set to NULL, returns full list of all datasets.} - -\item{title_filter}{character, text to use for filtering the set by title. Case sensitive. Can be a regular expression.} - -\item{description_filter}{character, text to use for filtering the set by description. Case sensitive. Can be a regular expression.} - -\item{keyword_filter}{character, text to use for filtering the set by keyword. Case sensitive. Can be a regular expression.} - -\item{provider_filter}{character, text to use for filtering the set by provider Case sensitive. Can be a regular expression.} - -\item{force_redownload}{integer. Whether to redownload data source file even if already cached. Defaults to FALSE.} +get_czso_catalogue() } \value{ -a data frame. If \code{provider} param is left to default, contains a column called czso_id, which can be used as dataset_id parameter in get_table(). +a data frame with details on all CZSO datasets available in the Czech National Open Data Catalogue. } \description{ -Downloads and processes a list of all registered Czech open data datasets, -returning (by default) those accessible through get_table() from the CZSO. 
+Retrieves a list of all CZSO's open datasets available from the Czech Open data catalogue. } \details{ -If \code{provider} is NULL, returns the whole list, without CZSO-specific identifier -usable in \code{get_table()}. - -If \code{provider} is left unset, returns data frame listing CZSO's datasets, with a -\code{czso_id} column usable in \code{get_table}. - -Other values of \code{provider} must be exact matches. Use \code{provider_filter} for text/regex matching. - -All \verb{*_filter} arguments are case sensitive and can be regular expressions. - -Original data: https://data.gov.cz/soubor/datov\%C3\%A9-sady.csv +Use the dataset_id column as an argument to \code{get_czso_table()}. } \examples{ \dontrun{ get_czso_catalogue() -get_czso_catalogue(NULL) -get_czso_catalogue(title_filter = "[Mm]zd[ay]") -get_czso_catalogue(provider = "Ministerstvo vnitra") -get_czso_catalogue(provider_filter = "[Mm]inisterstvo") } } \seealso{ diff --git a/prep/datagovcz_apiback.R b/prep/datagovcz_apiback.R index 4b78165..42734e0 100644 --- a/prep/datagovcz_apiback.R +++ b/prep/datagovcz_apiback.R @@ -28,6 +28,37 @@ titles <- nn %>% map(1) %>% dist <- GET(nn[[9]][[1]]$`http://www.w3.org/ns/dcat#distribution`[[1]]$value, accept_json()) %>% content() +GET("https://data.gov.cz/api/v1/resource/distribution?iri=https://data.gov.cz/zdroj/datov%C3%A9-sady/http---vdb.czso.cz-pll-eweb-package_show-id-110080/distribuce/ca24b1ef468d717e0d8f14fa366e1a12", + accept_json()) %>% + content() %>% + pluck("jsonld", 2) %>% + pluck("http://www.w3.org/ns/dcat#downloadURL", 1, "@id") + + +ss <- GET("https://data.gov.cz/zdroj/lok%C3%A1ln%C3%AD-katalogy/CSttstckyU/214608232", accept_json()) %>% + content() +tt <- ss[[1]][[6]] +urls <- map_chr(tt, 2) + +nn <- urls[400] %>% map(~GET(.x, accept_json()) %>% content(auto_unbox = T)) + +zzz <- nn %>% + pluck(1) %>% + purrr::set_names(str_extract, "[:alnum:]*$") %>% + map(~purrr::set_names(.x, str_extract, "[:alnum:]*$")) %>% + pluck(1) %>% + map_dfc(~pluck(.x, 1, 
"value")) + + + +ddd <- GET("https://data.gov.cz/zdroj/datov%C3%A9-sady/http---vdb.czso.cz-pll-eweb-package_show-id-110080/distribuce/ca24b1ef468d717e0d8f14fa366e1a12", + accept_json()) %>% + content() + +ddd %>% + pluck(1) %>% + map_df(~pluck(.x, 1, "value")) %>% + purrr::set_names(str_extract, "[:alnum:]*$") diff --git a/prep/datagovcz_apiback.R.orig b/prep/datagovcz_apiback.R.orig new file mode 100644 index 0000000..b083dab --- /dev/null +++ b/prep/datagovcz_apiback.R.orig @@ -0,0 +1,67 @@ +library(httr) +library(tidyverse) +library(jsonlite) + +ss <- GET("https://data.gov.cz/zdroj/lok%C3%A1ln%C3%AD-katalogy/CSttstckyU/214608232", accept_json()) %>% + content() +tt <- ss[[1]][[6]] +urls <- map_chr(tt, 2) + +nn <- urls %>% map(~GET(.x, accept_json()) %>% content(auto_unbox = T)) +names(nn[[1]][[1]]) + +nn[[1]][[1]]$`http://www.w3.org/ns/dcat#keyword` %>% + map(`[[`, 'value') %>% + map_chr(1) +nn[[1]][[1]]$`http://www.w3.org/ns/dcat#distribution` %>% + map(`[[`, 'value') %>% + map_chr(1) + +nn %>% map(names) +innams <- nn[[1]] %>% map_dfc(names) + +titles <- nn %>% map(1) %>% + map(`[[`, "http://purl.org/dc/terms/title") %>% + map(1) %>% + map_chr(`[[`, "value") + +dist <- GET(nn[[9]][[1]]$`http://www.w3.org/ns/dcat#distribution`[[1]]$value, accept_json()) %>% + content() + +<<<<<<< HEAD +GET("https://data.gov.cz/api/v1/resource/distribution?iri=https://data.gov.cz/zdroj/datov%C3%A9-sady/http---vdb.czso.cz-pll-eweb-package_show-id-110080/distribuce/ca24b1ef468d717e0d8f14fa366e1a12", + accept_json()) %>% + content() %>% + pluck("jsonld", 2) %>% + pluck("http://www.w3.org/ns/dcat#downloadURL", 1, "@id") + + +ss <- GET("https://data.gov.cz/zdroj/lok%C3%A1ln%C3%AD-katalogy/CSttstckyU/214608232", accept_json()) %>% + content() +tt <- ss[[1]][[6]] +urls <- map_chr(tt, 2) + +nn <- urls[400] %>% map(~GET(.x, accept_json()) %>% content(auto_unbox = T)) + +zzz <- nn %>% + pluck(1) %>% + purrr::set_names(str_extract, "[:alnum:]*$") %>% + map(~purrr::set_names(.x, 
str_extract, "[:alnum:]*$")) %>% + pluck(1) %>% + map_dfc(~pluck(.x, 1, "value")) + + + +ddd <- GET("https://data.gov.cz/zdroj/datov%C3%A9-sady/http---vdb.czso.cz-pll-eweb-package_show-id-110080/distribuce/ca24b1ef468d717e0d8f14fa366e1a12", + accept_json()) %>% + content() + +ddd %>% + pluck(1) %>% + map_df(~pluck(.x, 1, "value")) %>% + purrr::set_names(str_extract, "[:alnum:]*$") +======= +>>>>>>> master + + + diff --git a/prep/komplet.sql b/prep/komplet.sql new file mode 100644 index 0000000..e6ca6af --- /dev/null +++ b/prep/komplet.sql @@ -0,0 +1,48 @@ +PREFIX foaf: +PREFIX dcterms: +PREFIX dcat: +PREFIX rdf: +PREFIX rdfs: + +SELECT ?dataset_iri +?dataset_id +?title ?provider ?description +?spatial +?temporal +?modified +?page +?periodicity +?periodicity_abb +?start +?end +?keyword +WHERE { + GRAPH ?g { + ?dataset_iri a dcat:Dataset . + ?dataset_iri dcterms:title ?title . + ?dataset_iri dcterms:description ?description . + ?dataset_iri dcterms:publisher ?publisher . + OPTIONAL { ?dataset_iri dcterms:identifier ?dataset_id .} + OPTIONAL { ?dataset_iri dcterms:spatial ?spatial .} + OPTIONAL { ?dataset_iri foaf:page ?page.} + OPTIONAL { ?dataset_iri dcterms:temporal ?temporal .} + OPTIONAL { ?dataset_iri dcterms:modified ?modified .} + OPTIONAL { ?dataset_iri dcat:keyword ?keyword .} + OPTIONAL { ?dataset_iri dcterms:accrualPeriodicity ?periodicity .} + OPTIONAL { ?dataset_iri ?periodicity_abb .} + + ?publisher foaf:name ?provider . 
+ + OPTIONAL { ?temporal schema:startDate ?start .} + OPTIONAL { ?temporal schema:endDate ?end .} + + VALUES ?publisher { + # IRI pro CZSO + # IRI pro PlzKraj + } + + FILTER(lang(?provider) = "cs") + FILTER(lang(?keyword) = "cs") + FILTER(lang(?title) = "cs") + } +} diff --git a/prep/list_datasets_simple.sql b/prep/list_datasets_simple.sql new file mode 100644 index 0000000..346b04c --- /dev/null +++ b/prep/list_datasets_simple.sql @@ -0,0 +1,20 @@ +PREFIX foaf: + PREFIX dcterms: + PREFIX dcat: +PREFIX rdf: +PREFIX rdfs: + +SELECT DISTINCT ?title ?publisher WHERE { + GRAPH ?g { + ?d a dcat:Dataset + ?d dcterms:publisher ?publisher . + ?d dcterms:title ?title + + FILTER(lang(?poskytovatel) = "cs") + VALUES ?publisher { + + + } + } + +} diff --git a/prep/partial_label.sql b/prep/partial_label.sql new file mode 100644 index 0000000..f58efa9 --- /dev/null +++ b/prep/partial_label.sql @@ -0,0 +1,17 @@ +PREFIX foaf: +PREFIX dcterms: +PREFIX skos: +PREFIX dcat: +PREFIX rdf: +PREFIX rdfs: + +SELECT ?polozka ?label +WHERE { + GRAPH ?g { + + ?polozka skos:inScheme . 
+ ?polozka skos:prefLabel ?label + FILTER(lang(?label) = "cs") + + } +} diff --git a/prep/sparql-links.md b/prep/sparql-links.md new file mode 100644 index 0000000..db866c3 --- /dev/null +++ b/prep/sparql-links.md @@ -0,0 +1,3 @@ +http://www.iro.umontreal.ca/~lapalme/ift6281/sparql-1_1-cheat-sheet.pdf +https://www.youtube.com/watch?v=FvGndkpa4K0 +https://en.wikibooks.org/wiki/SPARQL/ diff --git a/prep/sparql_workflow.R b/prep/sparql_workflow.R new file mode 100644 index 0000000..2075226 --- /dev/null +++ b/prep/sparql_workflow.R @@ -0,0 +1,190 @@ +library(httr) +library(jsonlite) +library(stringr) +library(readr) + +url <- "https://data.gov.cz/sparql" + +# Všechny datasety jednoho providera, podle IRI --------------------------- + +sparqlquery_datasets_provider <- str_glue( + "PREFIX foaf: + PREFIX dcterms: + PREFIX dcat: + PREFIX rdf: + PREFIX rdfs: + + SELECT ?dataset_iri ?title ?provider ?description ?spatial ?issued ?periodicity WHERE {{ + GRAPH ?g {{ + ?dataset_iri a dcat:Dataset ; + dcterms:title ?title ; + dcterms:spatial ?spatial ; + dcterms:issued ?issued ; + dcterms:accrualPeriodicity ?periodicity ; + dcterms:description ?description ; + dcterms:publisher ?publisher . + + ?publisher foaf:name ?provider . + + VALUES ?publisher {{ + # IRI pro CZSO + }} + FILTER(lang(?provider) = \"cs\") + FILTER(lang(?title) = \"cs\") + }} +}}") + +# Všechny datasety jednoho providera, podle názvu (přesně) ---------------- + +sparqlquery_datasets_provider_name <- str_glue( + "PREFIX foaf: + PREFIX dcterms: + PREFIX dcat: + PREFIX rdf: + PREFIX rdfs: + + SELECT ?dataset ?název ?provider ?popis WHERE {{ + GRAPH ?g {{ + ?dataset a dcat:Dataset ; + dcterms:title ?název ; + dcterms:description ?popis ; + dcterms:publisher ?publisher . + + ?publisher foaf:name ?provider . 
+ + }} + VALUES ?poskytovatel {{ + \"Ministerstvo vnitra\"@cs # IRI pro CZSO + }} + FILTER(lang(?poskytovatel) = \"cs\") + FILTER(lang(?název) = \"cs\") +}}") + +params = list(`default-graph-uri` = "", + query = sparqlquery_datasets_provider, + # format = "application/sparql-results+json", + format = "text/csv", + timeout = 0, + debug = "on", + run = "Run Query") + +plz <- httr::GET(url, query = params, + # accept("application/sparql-results+json"), + add_headers(c("Accept-Charset" = "utf-8")), + accept("text/csv;charset=UTF-8") +) + +plz %>% stop_for_status() + +plz$request$headers + +plz$headers$`content-type` +plzd <- plz %>% content(as = "text") + +plzd <- plz %>% content(as = "text") %>% + read_csv() + +plzd$results$bindings %>% names() +plzd$results$bindings %>% head() + +# Všechny distribuce jednoho datasetu, podle IRI -------------------------- + +sparqlquery_distribs_dataset <- str_glue( + "PREFIX foaf: + PREFIX dcterms: + PREFIX dcat: + PREFIX rdf: + PREFIX rdfs: + + + + SELECT ?url, ?format WHERE {{ + GRAPH ?g {{ + ?dataset a dcat:Dataset ; + dcat:distribution ?distribution . + + ?distribution dcat:downloadURL ?url . + ?distribution dct:format ?format . 
+ + VALUES ?dataset {{ + # IRI pro dataset + }} + }} + }}" +) + +url <- "https://data.gov.cz/sparql" +params_ds = list(`default-graph-uri` = "", + query = sparqlquery_distribs_dataset, + # format = "application/sparql-results+json", + format = "text/csv", + timeout = 0, + debug = "on", + run = "Run Query") + +ds <- httr::GET(url, query = params_ds, + # accept("application/sparql-results+json"), + config = add_headers(c("Accept-charset" = "utf-8")) +) %>% + stop_for_status() + +# ds$headers$`content-type` +# ds$status_code +# +# dst <- ds %>% content(as = "text") %>% +# fromJSON() + +dst <- ds %>% + content(as = "text") %>% + read_csv() + +ss <- GET("https://data.gov.cz/zdroj/lok%C3%A1ln%C3%AD-katalogy/CSttstckyU/214608232", accept_json()) %>% + content() +tt <- ss[[1]][[6]] +s <- map_chr(tt, 2) + +# Všichni providers ------------------------------------------------------- + +sparqlquery_providers <- "PREFIX foaf: +PREFIX dcterms: +PREFIX dcat: +PREFIX rdf: +PREFIX rdfs: + +SELECT DISTINCT ?poskytovatel ?publisher WHERE { + GRAPH ?g { + + ?publisher foaf:name ?poskytovatel . 
+ + FILTER(lang(?poskytovatel) = \"cs\") + FILTER(?poskytovatel = \"Ministerstvo vnitra\"@cs) + } + +}" + +params_prv = list(`default-graph-uri` = "", + query = sparqlquery_providers, + # format = "application/sparql-results+json", + format = "text/csv", + timeout = 0, + debug = "on", + run = "Run Query") + +prv <- httr::GET(url, query = params_prv, + # accept("application/sparql-results+json"), + config = add_headers(c("Accept-charset" = "utf-8")) +) %>% + stop_for_status() + +# ds$headers$`content-type` +# ds$status_code +# +# dst <- ds %>% content(as = "text") %>% +# fromJSON() + +prvt <- prv %>% + content(as = "text") %>% + read_csv() + + + diff --git a/renv.lock b/renv.lock index 2ecb5e1..4b0c387 100644 --- a/renv.lock +++ b/renv.lock @@ -1,6 +1,6 @@ { "R": { - "Version": "3.6.0", + "Version": "3.6.2", "Repositories": [ { "Name": "CRAN", @@ -158,6 +158,13 @@ "Repository": "CRAN", "Hash": "08cf4045c149a0f0eaf405324c7495bd" }, + "clisymbols": { + "Package": "clisymbols", + "Version": "1.2.0", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "96c01552bfd5661b9bbdefbc762f4bcd" + }, "colorspace": { "Package": "colorspace", "Version": "1.4-1", @@ -306,6 +313,20 @@ "RemoteSha": "d05e437eb3cd8dd52a4aeed8298a6efba23c1d1f", "Hash": "93bf157d44cfd990a33b36e78c0317cb" }, + "gh": { + "Package": "gh", + "Version": "1.0.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "4da58d15239da30de8c7de088250d3be" + }, + "git2r": { + "Package": "git2r", + "Version": "0.26.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "135db4dbc94ed18f629ff8843a8064b7" + }, "glue": { "Package": "glue", "Version": "1.3.1.9000", @@ -374,6 +395,13 @@ "Repository": "CRAN", "Hash": "7146fea4685b4252ebf478978c75f597" }, + "ini": { + "Package": "ini", + "Version": "0.3.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "6154ec2223172bce8162d4153cda21f7" + }, "isoband": { "Package": "isoband", "Version": "0.2.0", @@ -828,6 +856,13 @@ "Repository": "CRAN", 
"Hash": "35c5245622df501b51263b565d18c053" }, + "usethis": { + "Package": "usethis", + "Version": "1.5.1", + "Source": "Repository", + "Repository": "CRAN", + "Hash": "30ee6fa315a020d5db6f28adbb7fea83" + }, "utf8": { "Package": "utf8", "Version": "1.1.4",