Commit: v1.0.0 release

bearloga committed Jun 16, 2020
1 parent 5051819 commit 2180af3
Showing 15 changed files with 285 additions and 206 deletions.
2 changes: 1 addition & 1 deletion .lintr
@@ -1,5 +1,5 @@
 linters: with_defaults(line_length_linter(120), object_usage_linter = NULL, closed_curly_linter = NULL, open_curly_linter = NULL, spaces_left_parentheses_linter = NULL, camel_case_linter = NULL)
-exclusions: list()
+exclusions: list("R/zzz.R", "R/WikidataQueryServiceR-package.R")
 exclude: "# Exclude Linting"
 exclude_start: "# Begin Exclude Linting"
 exclude_end: "# End Exclude Linting"
16 changes: 10 additions & 6 deletions DESCRIPTION
@@ -1,7 +1,7 @@
 Package: WikidataQueryServiceR
 Title: API Client Library for 'Wikidata Query Service'
 Version: 1.0.0
-Date: 2017-08-05
+Date: 2020-06-16
 Authors@R: c(
     person("Mikhail", "Popov", email = "mikhail@wikimedia.org",
            role = c("aut", "cre"), comment = "@bearloga on Twitter"),
@@ -13,16 +13,20 @@ Depends:
     R (>= 3.1.2)
 Imports:
     httr (>= 1.2.1),
-    dplyr (>= 0.5.0),
+    dplyr (>= 1.0.0),
     jsonlite (>= 1.2),
-    WikipediR (>= 1.5.0)
+    WikipediR (>= 1.5.0),
+    ratelimitr (>= 0.4.1),
+    purrr (>= 0.3.4),
+    readr (>= 1.3.1),
+    rex (>= 1.2.0)
 Suggests:
-    testthat,
-    lintr
+    testthat (>= 2.3.0),
+    lintr (>= 2.0.1)
 URL: https://github.com/bearloga/WikidataQueryServiceR
 BugReports: https://github.com/bearloga/WikidataQueryServiceR/issues
 License: MIT + file LICENSE
 Encoding: UTF-8
 LazyData: true
 Roxygen: list(markdown = TRUE)
-RoxygenNote: 6.0.1
+RoxygenNote: 7.1.0
1 change: 1 addition & 0 deletions NAMESPACE
@@ -3,3 +3,4 @@
 export(get_example)
 export(query_wikidata)
 export(scrape_example)
+import(ratelimitr)
13 changes: 9 additions & 4 deletions NEWS.md
@@ -1,5 +1,11 @@
-WikidataQueryServiceR 0.1.1
----------------------------
+# WikidataQueryServiceR 1.0.0
+
+* Fixed example retrieval (was broken due to translation wikitext markers)
+* Rate-limiting ([11](https://github.com/bearloga/WikidataQueryServiceR/issues/11))
+* Using tidyverse family of packages (tibble, dplyr, purrr, readr)
+* Various improvements and modernizations
+
+# WikidataQueryServiceR 0.1.1
 
 ## Changes
 
@@ -11,8 +17,7 @@ WikidataQueryServiceR 0.1.1
 
 * Fixed a bug with JSON-formatted results ([#3](https://github.com/bearloga/WikidataQueryServiceR/issues/3))
 
-WikidataQueryServiceR 0.1.0
----------------------------
+# WikidataQueryServiceR 0.1.0
 
 * Initial CRAN release:
     - Support for multiple SPARQL queries
20 changes: 10 additions & 10 deletions R/wdqs.R → R/WikidataQueryServiceR-package.R
@@ -1,11 +1,8 @@
 #' @title WikidataQueryServiceR: An R Wrapper For Wikidata Query Service API
-#' @description This is an R wrapper for the
-#' [Wikidata Query Service](https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service)
-#' (WDQS) which provides a way for tools to query Wikidata via
-#' [SPARQL](https://en.wikipedia.org/wiki/SPARQL).
+#' @keywords internal
+#' @aliases WDQS
-#' @details [Wikidata Query Service](https://www.mediawiki.org/wiki/Wikidata_query_service)
-#' is maintained by [Wikimedia Foundation](https://wikimediafoundation.org/).
-#' @references
+#' is maintained by the [Wikimedia Foundation](https://wikimediafoundation.org/).
+#' @section Resources:
 #' - [A beginner-friendly course for SPARQL](https://www.wikidata.org/wiki/Wikidata:A_beginner-friendly_course_for_SPARQL)
 #' - Building a SPARQL query: [Museums on Instagram](https://www.wikidata.org/wiki/Help:SPARQL/Building_a_query/Museums_on_Instagram)
 #' - [SPARQL Query Examples](https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/queries/examples) for WDQS
@@ -20,7 +17,10 @@
 #' - [WDQS User Manual](https://www.mediawiki.org/wiki/Wikidata_query_service/User_Manual)
 #' - [Quick intro to WDQS & SPARQL](https://github.com/bearloga/wmf/blob/master/presentations/talks/Cascadia\%20R\%20Conference\%202017/presentation.md#wikidata-query-service-wdqs)
 #' from [my Cascadia R Conference 2017 talk](https://github.com/bearloga/wmf/tree/master/presentations/talks/Cascadia\%20R\%20Conference\%202017)
-#' @aliases WDQS
-#' @docType package
-#' @name WDQSR-package
+"_PACKAGE"
+
+# The following block is used by usethis to automatically manage
+# roxygen namespace tags. Modify with care!
+## usethis namespace: start
+## usethis namespace: end
 NULL
12 changes: 12 additions & 0 deletions R/http.R
@@ -0,0 +1,12 @@
+#' @import ratelimitr
+wdqs_requester <- function() {
+  req <- function(query, ...) {
+    httr::POST(
+      url = "https://query.wikidata.org/sparql",
+      query = list(query = query),
+      httr::user_agent("https://github.com/bearloga/WikidataQueryServiceR"),
+      ...
+    )
+  }
+  return(limit_rate(req, rate(n = 30, period = 60)))
+}
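The new requester centralizes all WDQS HTTP calls and throttles them with ratelimitr: `limit_rate()` wraps the plain `httr::POST` closure so that at most 30 requests go out per rolling 60-second window, matching the WDQS limit of 30 error queries per minute. A minimal sketch of the same pattern, with an illustrative stand-in function instead of the real HTTP call:

```R
library(ratelimitr)

# Stand-in for the real request function (illustrative only)
echo <- function(x) x

# At most 30 calls per rolling 60-second window; excess calls wait, they don't fail
limited_echo <- limit_rate(echo, rate(n = 30, period = 60))

limited_echo("hello")  # behaves like echo(), but throttled
```

Returning the rate-limited closure from a factory function (rather than limiting a top-level function at load time) means each call site gets the throttling without any package-level mutable state.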
89 changes: 43 additions & 46 deletions R/query.R
@@ -4,85 +4,82 @@
 #' @param format "simple" uses CSV and returns pure character data frame, while
 #' "smart" fetches JSON-formatted data and returns a data frame with datetime
 #' columns converted to `POSIXct`
-#' @param ... Additional parameters to supply to [httr::POST]
-#' @return A `data.frame`
+#' @return A tibble data frame
 #' @examples
 #' # R's versions and release dates:
-#' sparql_query <- 'SELECT DISTINCT
+#' sparql_query <- "SELECT
 #'   ?softwareVersion ?publicationDate
 #' WHERE {
 #'   BIND(wd:Q206904 AS ?R)
 #'   ?R p:P348 [
 #'     ps:P348 ?softwareVersion;
 #'     pq:P577 ?publicationDate
 #'   ] .
-#' }'
+#' }"
 #' query_wikidata(sparql_query)
+#'
+#' \dontrun{
+#' # "smart" format converts all datetime columns to POSIXct
+#' query_wikidata(sparql_query, format = "smart")
+#' }
 #' @section Query limits:
 #' There is a hard query deadline configured which is set to 60 seconds. There
 #' are also following limits:
 #' - One client (user agent + IP) is allowed 60 seconds of processing time each
 #'   60 seconds
 #' - One client is allowed 30 error queries per minute
 #' See [query limits section](https://www.mediawiki.org/wiki/Wikidata_Query_Service/User_Manual#Query_limits)
 #' in the WDQS user manual for more information.
 #' @seealso [get_example]
 #' @export
-query_wikidata <- function(sparql_query, format = c("simple", "smart"), ...) {
-  if (!format[1] %in% c("simple", "smart")) {
+query_wikidata <- function(sparql_query, format = c("simple", "smart")) {
+  format <- format[1]
+  if (!format %in% c("simple", "smart")) {
     stop("`format` must be either \"simple\" or \"smart\"")
   }
   output <- lapply(sparql_query, function(sparql_query) {
-    if (format[1] == "simple") {
-      response <- httr::POST(
-        url = "https://query.wikidata.org/sparql",
-        query = list(query = sparql_query),
-        httr::add_headers(Accept = "text/csv"),
-        httr::user_agent("https://github.com/bearloga/WikidataQueryServiceR"),
-        ...
-      )
+    rate_limited_query <- wdqs_requester()
+    if (format == "simple") {
+      response <- rate_limited_query(sparql_query, httr::add_headers(Accept = "text/csv"))
       httr::stop_for_status(response)
       if (httr::http_type(response) == "text/csv") {
-        con <- textConnection(httr::content(response, as = "text", encoding = "UTF-8"))
-        df <- utils::read.csv(con, header = TRUE, stringsAsFactors = FALSE)
-        message(nrow(df), " rows were returned by WDQS")
-        return(df)
+        content <- httr::content(response, as = "text", encoding = "UTF-8")
+        return(readr::read_csv(content))
       } else {
         stop("returned response is not formatted as a CSV")
       }
    } else {
-      response <- httr::GET(
-        url = "https://query.wikidata.org/sparql",
-        query = list(query = sparql_query),
-        format = "json",
-        httr::user_agent("https://github.com/bearloga/WikidataQueryServiceR"),
-        ...
-      )
+      response <- rate_limited_query(sparql_query, httr::add_headers(Accept = "application/sparql-results+json"))
       httr::stop_for_status(response)
       if (httr::http_type(response) == "application/sparql-results+json") {
-        temp <- jsonlite::fromJSON(httr::content(response, as = "text", encoding = "UTF-8"), simplifyVector = FALSE)
+        content <- httr::content(response, as = "text", encoding = "UTF-8")
+        temp <- jsonlite::fromJSON(content, simplifyVector = FALSE)
       }
       if (length(temp$results$bindings) > 0) {
-        df <- as.data.frame(dplyr::bind_rows(lapply(temp$results$bindings, function(x) {
-          return(lapply(x, function(y) { return(y$value) }))
-        })))
-        datetime_cols <- vapply(temp$results$bindings[[1]], function(x) {
-          if ("datatype" %in% names(x)) {
-            return(x$datatype == "http://www.w3.org/2001/XMLSchema#dateTime")
+        data_frame <- purrr::map_dfr(temp$results$bindings, function(binding) {
+          return(purrr::map_chr(binding, ~ .x$value))
+        })
+        datetime_columns <- purrr::map_lgl(temp$results$bindings[[1]], function(binding) {
+          if ("datatype" %in% names(binding)) {
+            return(binding[["datatype"]] == "http://www.w3.org/2001/XMLSchema#dateTime")
          } else {
            return(FALSE)
          }
-        }, FALSE)
-        if (any(datetime_cols)) {
-          for (datetime_col in which(datetime_cols)) {
-            df[[datetime_col]] <- as.POSIXct(df[[datetime_col]], format = "%Y-%m-%dT%H:%M:%SZ", tz = "GMT")
-          }
-        }
-        message(nrow(df), " rows were returned by WDQS")
-        return(df)
+        })
+        data_frame <- dplyr::mutate_if(
+          .tbl = data_frame,
+          .predicate = datetime_columns,
+          .funs = as.POSIXct,
+          format = "%Y-%m-%dT%H:%M:%SZ", tz = "GMT"
+        )
      } else {
        message("0 rows were returned by WDQS")
-        return(data.frame(matrix(character(), nrow = 0, ncol = length(temp$head$vars),
-                                 dimnames = list(c(), unlist(temp$head$vars))),
-                          stringsAsFactors = FALSE))
+        data_frame <- dplyr::as_tibble(
+          matrix(
+            character(),
+            nrow = 0, ncol = length(temp$head$vars),
+            dimnames = list(c(), unlist(temp$head$vars))
+          )
+        )
      }
+      return(data_frame)
    }
  })
  if (length(output) == 1) {
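With this rewrite, both output formats go through the shared rate-limited requester and differ only in the `Accept` header (`text/csv` parsed by readr vs `application/sparql-results+json` parsed by jsonlite). A usage sketch, reusing the query from the roxygen example above (result column types are assumptions, not verified output):

```R
library(WikidataQueryServiceR)

# R's versions and release dates
sparql_query <- "SELECT ?softwareVersion ?publicationDate WHERE {
  wd:Q206904 p:P348 [ ps:P348 ?softwareVersion; pq:P577 ?publicationDate ] .
}"

# "simple": CSV response read with readr::read_csv()
simple <- query_wikidata(sparql_query)

# "smart": JSON response; dateTime-typed columns converted to POSIXct
smart <- query_wikidata(sparql_query, format = "smart")
```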
23 changes: 15 additions & 8 deletions R/utils.R
@@ -26,13 +26,20 @@ get_example <- function(example_name) {
     page_name = "Wikidata:SPARQL query service/queries/examples",
     as_wikitext = TRUE
   )
-  wiki <- strsplit(content$parse$wikitext$`*`, "\n")[[1]]
-  wiki <- wiki[wiki != ""]
-  return(vapply(example_name, function(example_name) {
-    heading_line <- which(grepl(paste0("^===\\s?", example_name, "\\s?===$"), wiki, fixed = FALSE))
-    start_line <- which(grepl("{{SPARQL", wiki[(heading_line + 1):length(wiki)], fixed = TRUE))[1]
-    end_line <- which(grepl("}}", wiki[(heading_line + start_line + 1):length(wiki)], fixed = TRUE))[1]
-    query <- paste0(wiki[(heading_line + start_line):(heading_line + start_line + end_line - 1)], collapse = "\n")
+  wikitext <- strsplit(content$parse$wikitext$`*`, "\n")[[1]]
+  wikitext <- wikitext[wikitext != ""]
+  examples <- purrr::map(example_name, function(example_name) {
+    regex <- paste0(
+      "^={2,}\\s?(<translate><!--T:[0-9]+-->)?\\s?",
+      rex::escape(example_name),
+      "\\s?(</translate>)?\\s?={2,}$"
+    )
+    heading_line <- which(grepl(regex, wikitext, fixed = FALSE))
+    start_line <- which(grepl("{{SPARQL", wikitext[(heading_line + 1):length(wikitext)], fixed = TRUE))[1]
+    end_line <- which(grepl("}}", wikitext[(heading_line + start_line + 1):length(wikitext)], fixed = TRUE))[1]
+    query <- paste0(wikitext[(heading_line + start_line):(heading_line + start_line + end_line - 1)], collapse = "\n")
     return(sub("^\\s*\\{\\{SPARQL2?\\n?\\|query\\=", "", query))
-  }, ""))
+  })
+  names(examples) <- example_name
+  return(examples)
 }
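The old code anchored on a bare `=== Heading ===`, so headings wrapped in the translation system's `<translate><!--T:NN-->` markers never matched, which is the "fixed example retrieval" item in NEWS. The new pattern makes those markers optional and escapes the example name with rex. A quick check of the pattern's behavior (the heading strings are illustrative, not taken from the examples page):

```R
regex <- paste0(
  "^={2,}\\s?(<translate><!--T:[0-9]+-->)?\\s?",
  rex::escape("Cats"),
  "\\s?(</translate>)?\\s?={2,}$"
)

grepl(regex, "=== Cats ===")                                     # TRUE
grepl(regex, "===<translate><!--T:123--> Cats</translate> ===") # TRUE
grepl(regex, "=== Dogs ===")                                     # FALSE
```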
3 changes: 3 additions & 0 deletions R/zzz.R
@@ -0,0 +1,3 @@
+.onAttach <- function(libname, pkgname) {
+  packageStartupMessage("See ?WDQS for resources on Wikidata Query Service and SPARQL")
+}
17 changes: 9 additions & 8 deletions README.Rmd
@@ -17,7 +17,7 @@ library(printr)
 [![CRAN Total Downloads](https://cranlogs.r-pkg.org/badges/grand-total/WikidataQueryServiceR)](https://cran.r-project.org/package=WikidataQueryServiceR)
 [![License: MIT](https://img.shields.io/badge/license-MIT-blue.svg)](https://opensource.org/licenses/MIT)
 
-This is an R wrapper for the [Wikidata Query Service (WDQS)](https://www.mediawiki.org/wiki/Wikidata_query_service) which provides a way for tools to query [Wikidata](https://www.wikidata.org/wiki/Wikidata:Main_Page) via [SPARQL](https://en.wikipedia.org/wiki/SPARQL) (see the beta at https://query.wikidata.org/). It is written in and for R, and was inspired by Oliver Keyes' [WikipediR](https://github.com/Ironholds/WikipediR) and [WikidataR](https://github.com/Ironholds/WikidataR) packages.
+This is an R wrapper for the [Wikidata Query Service (WDQS)](https://www.mediawiki.org/wiki/Wikidata_query_service) which provides a way for tools to query [Wikidata](https://www.wikidata.org/wiki/Wikidata:Main_Page) via [SPARQL](https://en.wikipedia.org/wiki/SPARQL) (see the beta at https://query.wikidata.org/). It is written in and for R, and was inspired by Os Keyes' [WikipediR](https://github.com/Ironholds/WikipediR) and [WikidataR](https://github.com/Ironholds/WikidataR) packages.
 
 __Author:__ Mikhail Popov (Wikimedia Foundation)<br/>
 __License:__ [MIT](http://opensource.org/licenses/MIT)<br/>
@@ -32,8 +32,8 @@ install.packages("WikidataQueryServiceR")
 To install the development version:
 
 ```R
-# install.packages(c("devtools", "httr", "dplyr", "jsonlite"))
-devtools::install_github("bearloga/WikidataQueryServiceR")
+# install.packages("remotes")
+remotes::install_github("bearloga/WikidataQueryServiceR")
 ```
 
 ## Usage
@@ -68,20 +68,21 @@ For more example SPARQL queries, see [this page](https://www.wikidata.org/wiki/W
 The package provides a [WikipediR](https://github.com/Ironholds/WikipediR/)-based function for getting SPARQL queries from the [WDQS examples page](https://www.wikidata.org/wiki/Wikidata:SPARQL_query_service/queries/examples).
 
 ```{r get_examples, cache=TRUE}
-sparql_query <- get_example(c("Cats", "Horses", "Largest cities with female mayor"))
+sparql_query <- get_example(c("Cats", "How many states this US state borders"))
 ```
 ```{r, eval=FALSE}
-sparql_query[["Largest cities with female mayor"]]
+sparql_query[["How many states this US state borders"]]
 ```
 ```{r, echo=FALSE, results='asis'}
-cat("```SPARQL\n", sparql_query[["Largest cities with female mayor"]], "\n```")
+cat("```SPARQL\n", sparql_query[["How many states this US state borders"]], "\n```")
 ```
 
-Now we can run all three extracted SPARQL queries and get back three data.frames:
+Now we can run all extracted SPARQL queries:
 
 ```{r run_examples, cache=TRUE, dependson='get_examples'}
 results <- query_wikidata(sparql_query)
-results$`Largest cities with female mayor`[, c("cityLabel", "mayorLabel")]
+lapply(results, dim)
+head(results$`How many states this US state borders`)
 ```
 
 ## Links for learning SPARQL
