working on fixing functions using GNR
Zachary Foster committed Oct 23, 2024
1 parent 78d0d72 commit 5d2b66c
Showing 43 changed files with 75,585 additions and 505 deletions.
88 changes: 39 additions & 49 deletions R/gni_parse.R
@@ -1,55 +1,45 @@
#' Parse scientific names using EOL's name parser.
#'
#' THIS FUNCTION IS DEFUNCT.
#'
#' @export
#' @param names A vector of length 1 or more of taxonomic names
#' @param ... Curl options passed on to [crul::verb-GET]
#' @return A data.frame with results, the submitted names, and the
#' parsed names with additional information.
#' @seealso [gbif_parse()], [gn_parse()]
#' @references http://gni.globalnames.org/
#' @examples \dontrun{
#' gni_parse("Cyanistes caeruleus")
#' gni_parse("Plantago minor")
#' gni_parse("Plantago minor minor")
#' gni_parse(c("Plantago minor minor","Helianthus annuus texanus"))
#'
#' # pass on curl options
#' gni_parse("Cyanistes caeruleus", verbose = TRUE)
#' }
#' @keywords internal
gni_parse <- function(names, ...) {
names <- paste0(names, collapse = "|")
cli <- crul::HttpClient$new(paste0(gni_base(), "parsers.json"),
headers = tx_ual, opts = list(...))
tt <- cli$get(query = list(names = names))
tt$raise_for_status()
out <- jsonlite::fromJSON(tt$parse("UTF-8"), FALSE)
dt2df(lapply(out, gni_parser), idcol = FALSE)
.Defunct("ncbi_searcher", "traits",
msg = "This function is defunct. See gn_parse()")
# names <- paste0(names, collapse = "|")
# url <- paste0("https://parser.globalnames.org/api/v1/", names)
# cli <- crul::HttpClient$new(url, headers = tx_ual, opts = list(...))
# tt <- cli$get(query = list(cultivars = cultivars, csv = TRUE))
# tt$raise_for_status()
# out <- jsonlite::fromJSON(tt$parse("UTF-8"), FALSE)
# dt2df(lapply(out, gni_parser), idcol = FALSE)
}

gni_parser <- function(x) {
positions_names <- vapply(x$scientificName$positions, function(y)
paste("position_", y[[1]], sep = ""), "", USE.NAMES = FALSE)
nums <- vapply(x$scientificName$positions, function(y) y[[2]], 1,
USE.NAMES = FALSE)
pv <- data.frame(as.list(setNames(nums, positions_names)),
stringsAsFactors = FALSE)

nmz <- c("verbatim","canonical", "normalized","hybrid","parsed")
singles <- data.frame(x$scientificName[names(x$scientificName) %in% nmz],
stringsAsFactors = FALSE)

details2 <- data.frame()
if (x$scientificName$parsed) {
details_ <- x$scientificName$details[[1]]
details_ <- details_[!names(details_) %in% 'status']
details <- dt2df(Map(function(x, y) data.frame(y, x,
stringsAsFactors = FALSE), details_, names(details_)),
idcol = FALSE)[,-3]
details2 <- as.data.frame(t(data.frame(details[,2])))
names(details2) <- details[,1]
row.names(details2) <- NULL
}

data.frame(Filter(NROW, list(details2, singles, pv)),
stringsAsFactors = FALSE)
}
# gni_parser <- function(x) {
# positions_names <- vapply(x$scientificName$positions, function(y)
# paste("position_", y[[1]], sep = ""), "", USE.NAMES = FALSE)
# nums <- vapply(x$scientificName$positions, function(y) y[[2]], 1,
# USE.NAMES = FALSE)
# pv <- data.frame(as.list(setNames(nums, positions_names)),
# stringsAsFactors = FALSE)
#
# nmz <- c("verbatim","canonical", "normalized","hybrid","parsed")
# singles <- data.frame(x$scientificName[names(x$scientificName) %in% nmz],
# stringsAsFactors = FALSE)
#
# details2 <- data.frame()
# if (x$scientificName$parsed) {
# details_ <- x$scientificName$details[[1]]
# details_ <- details_[!names(details_) %in% 'status']
# details <- dt2df(Map(function(x, y) data.frame(y, x,
# stringsAsFactors = FALSE), details_, names(details_)),
# idcol = FALSE)[,-3]
# details2 <- as.data.frame(t(data.frame(details[,2])))
# names(details2) <- details[,1]
# row.names(details2) <- NULL
# }
#
# data.frame(Filter(NROW, list(details2, singles, pv)),
# stringsAsFactors = FALSE)
# }
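
For reference, a minimal sketch of hitting the new parser service directly, following the commented-out draft above. The endpoint path, the percent-encoding, and the response shape are assumptions and have not been tested against the live service:

```r
# Sketch: call the new Global Names parser with pipe-separated names in
# the URL path, as the commented-out draft above suggests. Untested; the
# endpoint and response shape are assumptions.
library(crul)
library(jsonlite)

gn_parse_sketch <- function(names, ...) {
  names <- paste0(names, collapse = "|")
  url <- paste0("https://parser.globalnames.org/api/v1/",
    utils::URLencode(names))  # percent-encodes spaces and "|"
  cli <- crul::HttpClient$new(url, opts = list(...))
  res <- cli$get()
  res$raise_for_status()
  jsonlite::fromJSON(res$parse("UTF-8"), simplifyVector = FALSE)
}

# gn_parse_sketch(c("Plantago minor minor", "Helianthus annuus texanus"))
```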
114 changes: 75 additions & 39 deletions R/scrapenames.r
@@ -1,38 +1,41 @@
#' @title Resolve names using Global Names Recognition and Discovery.
#'
#' @description Uses the Global Names Recognition and Discovery service, see
#' http://gnrd.globalnames.org/
#'
#' Note: this function sometimes returns data and sometimes does not. The API
#' that this function uses is extremely buggy.
#'
#' @export
#' @param url An encoded URL for a web page, PDF, Microsoft Office document, or
#' image file, see examples
#' @param file When using multipart/form-data as the content-type, a file may
#' be sent. This should be a path to your file on your machine.
#' @param text Type: string. Text content; best used with a POST request, see
#' examples
#' @param engine (optional) (integer) Default: 0. Either 1 for TaxonFinder,
#' 2 for NetiNeti, or 0 for both. If absent, both engines are used.
#' @param unique (optional) (logical) If `TRUE` (default), response has
#' unique names without offsets.
#' @param verbatim (optional) Type: boolean, If `TRUE` (default to
#' `FALSE`), response excludes verbatim strings.
#' @param detect_language (optional) Type: boolean, When `TRUE` (default),
#' NetiNeti is not used if the language of incoming text is determined not to
#' be English. When `FALSE`, NetiNeti will be used if requested.
#' @param all_data_sources (optional) Type: boolean. Resolve found names
#' against all available Data Sources.
#' @param data_source_ids (optional) Type: string. Pipe separated list of
#' data source ids to resolve found names against. See list of Data Sources
#' http://resolver.globalnames.org/data_sources
#' @param return_content (logical) return OCR'ed text. returns text
#' string in `x$meta$content` slot. Default: `FALSE`
#' @param url Defunct. Use the `text` input for URLs as well as text strings.
#' @param file When using multipart/form-data as the content-type, a file may be
#' sent. This should be a path to your file on your machine.
#' @param text A text (or URL pointing to a text) for name detection.
#' @param engine (optional) (integer) Defunct. The API used no longer supports
#' this option.
#' @param unique Defunct. See the `unique_names` option.
#' @param unique_names (optional) (logical) If `TRUE` (the default), the output
#' returns unique names, instead of all name occurrences, without position
#' information of a name in the text.
#' @param verbatim (optional) Defunct. The API used no longer supports this
#' option.
#' @param detect_language (optional) Defunct. See the `language` option.
#' @param language The language of the text. Language value is used for
#' calculation of Bayesian odds. If this parameter is not given, eng is used
#' by default. Currently only English and German languages are supported.
#' Valid values are: `eng`, `deu`, `detect`.
#' @param all_data_sources (optional) Defunct. The API used no longer supports
#' this option.
#' @param data_source_ids (optional) Defunct. See the `sources` option.
#' @param sources (optional) Vector of data source ids to resolve found names
#'   against; the ids are collapsed into a pipe-separated list for the API. See
#'   the list of Data Sources: http://resolver.globalnames.org/data_sources
#' @param return_content (logical) return OCR'ed text. returns text string in
#' `x$meta$content` slot. Default: `FALSE`
#' @param ... Further args passed to [crul::verb-GET]
#' @author Scott Chamberlain
#' @return A list of length two, first is metadata, second is the data as a
#'   data.frame.
#' @details One of file or text must be specified - and only one of them.
#' @examples \dontrun{
#' # Get data from a website using its URL
@@ -49,7 +52,7 @@
#'
#' # With arguments
#' scrapenames(text = 'https://www.mapress.com/zootaxa/2012/f/z03372p265f.pdf',
#'   unique_names=TRUE)
#' scrapenames(text = 'https://en.wikipedia.org/wiki/Spider',
#'   sources=c(1, 169))
#'
@@ -69,24 +72,56 @@
#' scrapenames(text='https://www.mapress.com/zootaxa/2012/f/z03372p265f.pdf',
#' return_content = TRUE)
#' }
scrapenames <- function(url = NULL, file = NULL, text = NULL, engine = NULL,
unique = NULL, verbatim = NULL, detect_language = NULL,
all_data_sources = NULL, data_source_ids = NULL,
return_content = FALSE, ...) {

scrapenames <- function(
url = NULL,
file = NULL,
text = NULL,
engine = NULL,
unique = NULL,
unique_names = NULL,
verbatim = NULL,
detect_language = NULL,
language = NULL,
all_data_sources = NULL,
data_source_ids = NULL,
sources = NULL,
return_content = FALSE,
...
) {

# Error if defunct parameters are used.
if (!is.null(url)) {
stop(call. = FALSE, 'The `url` option is defunct. Use the `text` option for URLs as well as text strings.')
}
if (!is.null(unique)) {
stop(call. = FALSE, 'The `unique` option is defunct. See the `unique_names` option.')
}
if (!is.null(engine)) {
stop(call. = FALSE, 'The `engine` option is defunct. The API no longer supports this option.')
}
if (!is.null(verbatim)) {
stop(call. = FALSE, 'The `verbatim` option is defunct. The API no longer supports this option.')
}
if (!is.null(detect_language)) {
stop(call. = FALSE, 'The `detect_language` option is defunct. See the `language` option.')
}
if (!is.null(all_data_sources)) {
stop(call. = FALSE, 'The `all_data_sources` option is defunct. The API no longer supports this option.')
}
if (!is.null(data_source_ids)) {
stop(call. = FALSE, 'The `data_source_ids` option is defunct. See the `sources` option.')
}

method <- tc(list(url = url, file = file, text = text))
if (length(method) > 1) {
stop("Only one of url, file, or text can be used", call. = FALSE)
}

base <- "http://gnrd.globalnames.org/name_finder.json"
base <- "http://gnrd.globalnames.org/api/v1/find"
if (!is.null(sources))
sources <- paste0(sources, collapse = "|")
args <- tc(list(url = url, text = text, engine = engine, unique = unique,
verbatim = verbatim, detect_language = detect_language,
all_data_sources = all_data_sources,
data_source_ids = data_source_ids,
return_content = as_l(return_content)))
args <- tc(list(
text = text,
unique_names = unique_names,
language = language,
sources = sources,
return_content = as_l(return_content)
))
cli <- crul::HttpClient$new(base, headers = tx_ual, opts = list(...))
if (names(method) == 'text') {
tt <- cli$get(query = args)
@@ -116,3 +151,4 @@ scrapenames <- function(url = NULL, file = NULL, text = NULL, engine = NULL,
meta <- datout[!names(datout) %in% c("names")]
list(meta = meta, data = nmslwr(datout$names))
}
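
To sanity-check the new request format while the function is untested, the call it builds can be reproduced directly. This mirrors the GET-with-query pattern in the code above; the response structure is an assumption until the function is tested against the live service:

```r
# Reproduce the request the updated scrapenames() sends to the new
# GNRD/GNfinder endpoint. Mirrors the in-progress code above; the
# response structure is an assumption until the function is tested.
library(crul)
library(jsonlite)

cli <- crul::HttpClient$new("http://gnrd.globalnames.org/api/v1/find")
res <- cli$get(query = list(
  text = "The blue tit (Cyanistes caeruleus) is a small passerine bird.",
  unique_names = TRUE,
  language = "eng"
))
res$raise_for_status()
out <- jsonlite::fromJSON(res$parse("UTF-8"))
str(out, max.level = 1)
```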

13 changes: 13 additions & 0 deletions log.Rmd
@@ -10,6 +10,19 @@ clipr::write_clip(paste0('## ', format(Sys.time(), "%F (%A %B %e)\n\n")))
# Date and time:
clipr::write_clip(paste0('## ', format(Sys.time(), "%F %X %Z (%A %B %e)\n\n")))
```
## 2024-10-16 (Wednesday October 16)

Working on updating `scrapenames`.
The options have been updated, but it has not been tested.

## 2024-10-04 (Friday October 4)

It seems some of the functions that use the Global Names APIs no longer work.
There seem to be multiple APIs (and corresponding command line tools) with multiple endpoints, so it's hard to tell which are the new versions of the presumably deprecated APIs.
I will look into each of the APIs/tools.

* GNFinder: searches for Latin names in text. Seems intended to provide metadata about species mentioned in old publications. Results can be passed to GNVerifier.
* GNVerifier: Used to provide the currently accepted name for a species. A rough sketch of chaining the two is below.
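
A rough sketch of chaining the two services from R, to record the intended pipeline. The endpoint URLs and JSON field names here are assumptions based on the services' docs and are untested:

```r
# Find names in text with GNfinder, then verify them with GNverifier.
# Endpoints and payload field names are assumptions, untested here.
library(crul)
library(jsonlite)

txt <- "Both Cyanistes caeruleus and Parus major were recorded."

# 1. Detect candidate scientific names in the text
finder <- crul::HttpClient$new("https://finder.globalnames.org")
res1 <- finder$post("api/v1/find", body = list(text = txt), encode = "json")
res1$raise_for_status()
found <- jsonlite::fromJSON(res1$parse("UTF-8"))

# 2. Verify the detected names against name data sources
verifier <- crul::HttpClient$new("https://verifier.globalnames.org")
res2 <- verifier$post("api/v1/verifications",
  body = list(nameStrings = found$names$name), encode = "json")
res2$raise_for_status()
verified <- jsonlite::fromJSON(res2$parse("UTF-8"))
str(verified, max.level = 1)
```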

## 2021-10-13 (Wednesday October 13)
