diff --git a/NAMESPACE b/NAMESPACE index 683736c..1ab94b9 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -11,6 +11,8 @@ export(get_park_taxon_citations) export(get_park_taxon_refs) export(get_park_taxon_url) export(get_ref_info) +export(get_ref_list) +export(get_refs_info) export(get_unit_code) export(get_unit_code_info) export(get_unit_info) @@ -22,6 +24,7 @@ export(load_domains) export(load_pkg_metadata) export(map_wkt) export(rm_local_packages) +export(summarize_packages) export(validate_data_package) importFrom(lifecycle,deprecated) importFrom(magrittr,"%>%") diff --git a/NEWS.md b/NEWS.md index 84ba16e..4cc1136 100644 --- a/NEWS.md +++ b/NEWS.md @@ -1,5 +1,8 @@ # NPSutils 0.3.3 (under development) +## 2024-12-19 + * updated `load_pkg_metadata` to be simpler and essentially call `DPchecker::load_metadata` but with a preset default directory structure that works well with the default settings for `get_data_package`. + * Add meta-analysis functions for finding and producing summary statistics multiple data packages including `get_ref_list`, `get_refs_info()`, and `summarize_packages`. ## 2024-10-24 * fix how `get_data_package` aliases `get_data_packages`, specifically now allows users to adjust parameters to non-default settings. ## 2024-10-21 diff --git a/R/load_data_package.R b/R/load_data_package.R index 5ae4073..6affde1 100644 --- a/R/load_data_package.R +++ b/R/load_data_package.R @@ -11,7 +11,7 @@ #' #' @examples #' \dontrun{ -#' load_data_package(2272461) +#' load_data_package_deprecated(2272461) #' } load_data_package_deprecated <- function(reference_id) { data_package_directory <- paste("data/", reference_id, sep = "") diff --git a/R/load_data_packages.R b/R/load_data_packages.R index 308150c..803267f 100644 --- a/R/load_data_packages.R +++ b/R/load_data_packages.R @@ -197,4 +197,4 @@ extract_tbl <- function(x) { if (!is.list(x)) return(NULL) unlist(lapply(x, extract_tbl), FALSE) -} \ No newline at end of file +} diff --git a/R/load_pgk_metadata.R b/R/load_pgk_metadata.R index 0ad5c0e..531b1a4 100644 --- a/R/load_pgk_metadata.R +++ b/R/load_pgk_metadata.R @@ -1,12 +1,9 @@ -#' Read contents of data package file and construct a data frame based on the -#' metadata file summarizing the fields and their types/definitions. +#' Loads EML-formatted metadata into R for inspection and/or editing +#' +#' @description `load_pkg_metadata()` is essentially a wrapper around `DPchecker::load_metadata` with the directory structure pre-set to work well the default location that `get_data_package` stores downloaded data packages. If you did not use the default settings for `get_data_package` (or downloaded a data package manually) you may find it easier to adjust the directory structure pointing to your data package and load the metadata using `DPchecker::load_metadata()`. Much like `load_metadata`, `load_pkg_metadata` requires that there be a single .xml file in the data package directory, that the metadata file name end in *_metadata.xml, and that the file contain schema-valid EML metadata. #' -#' @description `load_pkg_metadata()` reads the metadata file from a previously -#' downloaded package and loads a list of fields and their attributes into a -#' dataframe. -#' -#' @param holding_id is a 6-7 digit number corresponding to the holding ID of the data package zip file. -#' @param directory String. Path to the data package +#' @param holding_id is a 6-7 digit number corresponding to the holding ID of the data package zip file. Your data should be in a directory that that has the holding ID as its name. +#' @param directory String. Path to the data package directory, defaults to "data". #' #' @return one data frame to the global environment. #' @@ -16,105 +13,10 @@ #' \dontrun{ #' load_pgk_metadata(2266200) #' } -load_pkg_metadata <- function(holding_id, directory = here::here("data")) { - data_package_directory <- paste(directory, "/", holding_id, sep = "") - - metadata_file <- list.files( - path = data_package_directory, - pattern = "metadata.xml" - ) - - # Look for a metadatafile and let the user know about the results of the search. - if (length(metadata_file) == 0) { - cli::cli_abort(c( - "No metadata file found in: {.path {data_package_directory}}.", - "i" = "The filename must end in _metadata.xml")) - return(invisible()) - } - if (length(metadata_file) > 1) { - cli::cli_abort(c( - "Multiple metadata files found.", - "i" = "{.path {data_package_directory}} can contain only one - {.file *_metadata.xml}.")) - return(invisible()) - } +load_pkg_metadata <- function(holding_id, directory = "data") { - meta_location <- paste0(data_package_directory, "/", metadata_file) - if (!file.exists(meta_location)) { - cli::cli_abort(c( - "The data package for: {.var {holding_id}} was not found.", - "i" = "Make sure {.path {data_package_directory}} is the correct location", - "i" = "Make sure you downloaded the correct data package using {.fn get_data_package}." - )) - return(invisible()) - } - - #load metadata - eml_object <- EML::read_eml(meta_location, from = "xml") - #attributeList <- EML::get_attributes(eml_object) - attribute_list <- eml_object$dataset$dataTable$attributeList - attributes <- attribute_list$attributes - factors <- attribute_list$factors - - # Figure out column classes based on attribute table (character, numeric, integer, logical, or complex) - attributes$columnclass <- "character" - if (!"numberType" %in% colnames(attributes)) { - attributes$numberType <- as.character(NA) - } - if (!"formatString" %in% colnames(attributes)) { - attributes$formatString <- as.character(NA) - } - attributes$columnclass <- ifelse(attributes$storageType == "float" & attributes$numberType == "natural", "integer", attributes$columnclass) - attributes$columnclass <- ifelse(attributes$storageType == "float" & attributes$numberType == "whole", "integer", attributes$columnclass) - attributes$columnclass <- ifelse(attributes$storageType == "float" & attributes$numberType == "integer", "integer", attributes$columnclass) - attributes$columnclass <- ifelse(attributes$storageType == "float" & attributes$numberType == "real", "numeric", attributes$columnclass) - attributes$columnclass <- ifelse(attributes$storageType == "date" & attributes$formatString == "YYYY-MM-DD", "Date", attributes$columnclass) - - # return the field table to the workspace. - return(attributes) - -if (metaformat == "fgdc") { - # xmlFilename <- metalocation - workingXMLfile <- EML::read_eml(metalocation, from = "xml") - - # Build attributes table from the xml file - attributes <- data.frame( - id = numeric(), - attribute = character(), - attributeDefinition = character(), - attributeType = character(), - attributeFactors = numeric(), - stringsAsFactors = FALSE - ) - for (i in 1:length(workingXMLfile$ea$detailed$attr)) { - attributes <- rbind( - attributes, - cbind( - id = i, - attribute = workingXMLfile$ea$detailed$attr[[i]]$attrlabl, - attributeDefinition = workingXMLfile$ea$detailed$attr[[i]]$attrdef, - attributeType = workingXMLfile$ea$detailed$attr[[i]]$attrtype, - attributeFactors = length(workingXMLfile$ea$detailed$attr[[i]]$attrdomv) - ) - ) - } - - attributes$id <- as.integer(as.character(attributes$id)) - attributes$attribute <- as.character(attributes$attribute) - attributes$attributeDefinition <- as.character(attributes$attributeDefinition) - # attributes$attributeType<-as.character(attributes$attributeType) - attributes$attributeFactors <- as.integer(as.character(attributes$attributeFactors)) - - attributes$columnclass <- "character" - # attributes$columnclass<-ifelse(attributes$attributeType=="OID","integer",attributes$columnclass) - # attributes$columnclass<-ifelse(attributes$attributeType=="Date","Date",attributes$columnclass) - # attributes$columnclass<-ifelse(attributes$attributeType=="Double","numeric",attributes$columnclass) - - cat("Found ", crayon::blue$bold(nrow(attributes)), " fields.", sep = "") + meta <- DPchecker::load_metadata(directory = here::here("data", holding_id)) + + return(invisible(meta)) +} - # return the field table to the workspace. - return(attributes) - } else { - print("data/metadata format combination not supported") - } -} diff --git a/R/meta_analyses.R b/R/meta_analyses.R new file mode 100644 index 0000000..2cf901e --- /dev/null +++ b/R/meta_analyses.R @@ -0,0 +1,250 @@ +#' Get a list of reference codes from DataStore +#' +#' `get_ref_list` will return a list of the DataStore reference codes associated with a given reference type. Where "All" might be a bit generous: I would not expect more than the number given by "no_of_entries" as that is technically the number of entries per page and the function defaults to returning just one page (not entirely sure what a "page" is in this context). +#' +#' +#' @param reference_type String. The reference type to to query data store for. Defaults to data package ("dataPackage"). +#' @param no_of_entries Integer. The number of entries to return per page (where only one "page" of results is returned by default). Defaults to 500. +#' @param secure Logical. Defaults to FALSE for external users. Setting secure = TRUE will, with the proper credentials, return DataStore references with visibility set to both Public and Restricted. +#' +#' @return A List of reference IDs +#' @export +#' +#' @examples +#' \dontrun{ +#' get_ref_list() +#' } +get_ref_list <- function (reference_type = "dataPackage", + no_of_entries = 500, + secure = FALSE) { + server <- NULL + if (secure == TRUE) { + server <- "https://irmaservices.nps.gov/datastore-secure/v7/rest/" + } + if (secure == FALSE) { + server <- "https://irmaservices.nps.gov/datastore/v7/rest/" + } + + url <- paste0(server, + "ReferenceTypeSearch/", + reference_type, + "?top=", + no_of_entries, + "&page=1") + ref_list <- httr::content(httr::GET(url, + httr::authenticate(":", ":", "ntlm"))) + DS_reference_list <- NULL + + for (i in 1:length(seq_along(ref_list[[1]]))) { + DS_reference_list <- append(DS_reference_list, + ref_list[[1]][[i]][["referenceId"]]) + } + return(DS_reference_list) +} + +#' Return Basic information about a list of DataStore References +#' +#' The function will return a data frame containing information about a given number of references within a reference type. The data returned includes the reference ID (referenceId), the date the references was activated on DataStore (dateOfIssue), the references visibility (visibility), the number of files associated with the reference (fileCount), the access level of the files (fileAccess), the reference title (title), the abbreviated citation (citation), the URL for the DataStore reference (referenceUrl), the group-type for the reference (referenceGroupType), the type of reference (typeName), whether the reference has a DOI associated with it (isDOI), whether their is a newer version of the reference (newVersion) and what the most recent version of the reference is (mostRecentReference). +#' +#' @param reference_type String. The reference type to to query data store for. Defaults to data package ("dataPackage"). +#' @param no_of_entries Integer. The number of entries to return per page (where only one "page" of results is returned by default). Defaults to 500. +#' @param secure Logical. Defaults to FALSE for external users. Setting secure = TRUE will, with the proper credentials, return DataStore references with visibility set to both Public and Restricted. +#' +#' @return a data frame +#' @export +#' +#' @examples +#' \dontrun{ +#' get_ref_info() +#' } +#' +get_refs_info <- function (reference_type = "dataPackage", + no_of_entries = 500, + secure = FALSE) { + server <- NULL + if (secure == TRUE) { + server <- "https://irmaservices.nps.gov/datastore-secure/v7/rest/" + } + if (secure == FALSE) { + server <- "https://irmaservices.nps.gov/datastore/v7/rest/" + } + + url <- paste0(server, + "ReferenceTypeSearch/", + reference_type, + "?top=", + no_of_entries, + "&page=1") + ref_list <- httr::content(httr::GET(url, + httr::authenticate(":", ":", "ntlm"))) + DS_reference_list <- data.frame(referenceId =integer(), + referenceType = character(), + dateOfIssue = as.Date(character()), + visibility = factor(), + fileCount = integer(), + fileAccess = character(), + title = character(), + citation = character(), + referenceUrl = character(), + referenceGroupType = character(), + typeName = character(), + isDOI = logical(), + newVersion = character(), + mostRecentVersion = character() + ) + + for (i in 1:length(seq_along(ref_list[[1]]))) { + + if (is.null(ref_list[[1]][[i]][["newVersion"]])) { + newVersion <- NA + } else { + newVersion <- ref_list[[1]][[i]][["newVersion"]] + } + + if (is.null(ref_list[[1]][[i]][["mostRecentVersion"]])) { + mostRecentVersion <- NA + } else { + mostRecentVersion <- ref_list[[1]][[i]][["mostRecentVersion"]] + } + + ref <- c(ref_list[[1]][[i]][["referenceId"]], + ref_list[[1]][[i]][["referenceType"]], + ref_list[[1]][[i]][["dateOfIssue"]], + ref_list[[1]][[i]][["visibility"]], + ref_list[[1]][[i]][["fileCount"]], + ref_list[[1]][[i]][["fileAccess"]], + ref_list[[1]][[i]][["title"]], + ref_list[[1]][[i]][["citation"]], + ref_list[[1]][[i]][["referenceUrl"]], + ref_list[[1]][[i]][["referenceGroupType"]], + ref_list[[1]][[i]][["typeName"]], + ref_list[[1]][[i]][["isDOI"]], + newVersion, + mostRecentVersion + ) + + ref <- t(ref) + colnames(ref) <- colnames(DS_reference_list) + + DS_reference_list <- rbind(DS_reference_list, ref) + + } + return(DS_reference_list) +} + +#' Collect summary statistics on data packages +#' +#' Given a list of data package references from DataStore the function will download the indicated data packages (using creating the folders /data/reference for each data package; see `get_data_packages` for details), load them into R, and then collect some summary statistics on the data packages. +#' +#' If a data package fails to download (or load) into R, the function will return NAs instead of summary data about the data package as well as a message about the package status ("Loads", "Error") in the dataframe that the function returns. The function will ignore files that fall outside the data package specifications (one or more .csv files and a single .xml file ending in *_metadata.xml). +#' +#' When `check_metadata` is set to the default `FALSE`, the function will attempt to and load any .csv, regardless of the contents. Data packages with restricted access can produce false positives if you do not have the appropriate permissions to download the data as the function will still download the files, but they will be populated with unhelpful hmtl rather than the intended data. Functions that fail to load into R likely violate the data package specifications in some fundamental way (e.g. .CSV file instead of .csv or no .csv files at all). +#' +#' When `check_metadata` is set to `TRUE`, additional checks and tests are run on the data package and load errors may occur for all of the above reasons and also if there are multiple .xml files, if the metadata file name does not end in "*_metadata.xml", if there is no metadata file, or if the metadata file is EML schema-invalid. +#' +#' If you have access to restricted DataStore references (e.g. in an NPS office or logged in to an NPS VPN), you can set secure = TRUE. This will give you access to restricted (internal to NPS) references but if a reference is restricted to a named list of individuals you must be on that named list to access the reference. +#' +#' @param ref_list list or string of data package reference IDs from DataStore (potentially generated via `get_references_list`. +#' @param secure logical. Defaults to TRUE to access secure DataStore server and restricted data packages. Set to FALSE to to access only public references. +#' @param check_metadata Logical. Defaults to FALSE. In this case, metadata will not be checked or loaded. Any load errors will occur due to problems with .csv files (for instance if they don't exist). To test whether the metadata meets minimal requirements (is schema-valid), set check_metadata = TRUE. +#' +#' @return data frame +#' @export +#' +#' @examples +#' \dontrun{ +#' x <- get_ref_list() +#' get_ref_info(x[[1]]) +#' } +summarize_packages <- function(ref_list, + secure = TRUE, + check_metadata = FALSE) { + #setup a dataframe to return data to: + df <- data.frame(pkgid = character(), + status = character(), + fileNumber = integer(), + colNumber = integer(), + cellNumber = integer(), + fileSize = integer()) + + #get data from each data package + for (i in 1:length(seq_along(ref_list))) { + # places to add data to for each package + file_number <- 0 + col_number <- 0 + cell_number <- 0 + file_size <- 0 + + #This is where the data package will be downloaded to: + destination_dir <- paste("data/", ref_list[i], sep = "") + + #only download if the file/directory does not already exist + #caution: partially downloaded data packages WILL cause issues here! + if (!file.exists(destination_dir)) { + pkg_download <- tryCatch( + NPSutils::get_data_package(ref_list[i], + secure = secure, + force = TRUE), + error = function(e) e) + + if(inherits(pkg_download, "error")) { + dat <- data.frame(ref_list[i], "Error", NA, NA, NA, NA) + colnames(dat) <- colnames(df) + df <- rbind(df, dat) + next + } + } + + #tryCatch to load the package + pkg <- tryCatch( + if (check_metadata == FALSE) { + NPSutils::load_data_package(ref_list[i]) + } else { + NPSutils::load_data_package(ref_list[i], assign_attributes = TRUE) + + }, + error = function(e) e) + #if loading fails, put in a bunch of NAs instead of data: + if (inherits(pkg, "error")) { + dat <- data.frame(ref_list[i], "Error", NA, NA, NA, NA) + colnames(dat) <- colnames(df) + df <- rbind(df, dat) + } else { + #if loading is successful, get some basic info about the data package: + #number of data files: + file_number <- length(seq_along(pkg)) + + #number of columns of data + for (j in 1:length(seq_along(pkg))) { + col_number <- col_number + ncol(pkg[[j]]) + } + + #number of cells of data: + for(j in 1:length(seq_along(pkg))) { + cell_number <- + cell_number + ncol(pkg[[j]])*nrow(pkg[[j]]) + } + #total data (not metadata) file size of the data package + for (j in 1:file_number) { + file_size <- file_size + file.size( + list.files(here::here("data", + ref_list[i]), + full.names = TRUE)[j]) + } + #put all the info into a dataframe + dat <- data.frame(ref_list[i], + "Loads", + file_number, + col_number, + cell_number, + file_size) + colnames(dat) <- colnames(df) + #append the package-specific info to the overall dataframe: + df <- rbind(df, dat) + } + #remove the package specific data frame so that it can be re-written + rm(dat) + } + #return the dataframe for the entire set of data packages: + return(df) +} \ No newline at end of file diff --git a/docs/news/index.html b/docs/news/index.html index e149388..fcef3ea 100644 --- a/docs/news/index.html +++ b/docs/news/index.html @@ -53,8 +53,10 @@

Changelog

-

2024-10-24

-