Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

fix load_pkg_metadata; add metanalysis functions #62

Merged
merged 16 commits into from
Dec 20, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions NAMESPACE
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,8 @@ export(get_park_taxon_citations)
export(get_park_taxon_refs)
export(get_park_taxon_url)
export(get_ref_info)
export(get_ref_list)
export(get_refs_info)
export(get_unit_code)
export(get_unit_code_info)
export(get_unit_info)
Expand All @@ -22,6 +24,7 @@ export(load_domains)
export(load_pkg_metadata)
export(map_wkt)
export(rm_local_packages)
export(summarize_packages)
export(validate_data_package)
importFrom(lifecycle,deprecated)
importFrom(magrittr,"%>%")
3 changes: 3 additions & 0 deletions NEWS.md
Original file line number Diff line number Diff line change
@@ -1,5 +1,8 @@
# NPSutils 0.3.3 (under development)

## 2024-12-19
* updated `load_pkg_metadata` to be simpler and essentially call `DPchecker::load_metadata` but with a preset default directory structure that works well with the default settings for `get_data_package`.
* Add meta-analysis functions for finding multiple data packages and producing summary statistics on them, including `get_ref_list()`, `get_refs_info()`, and `summarize_packages()`.
## 2024-10-24
* fix how `get_data_package` aliases `get_data_packages`, specifically now allows users to adjust parameters to non-default settings.
## 2024-10-21
Expand Down
2 changes: 1 addition & 1 deletion R/load_data_package.R
Original file line number Diff line number Diff line change
Expand Up @@ -11,7 +11,7 @@
#'
#' @examples
#' \dontrun{
#' load_data_package(2272461)
#' load_data_package_deprecated(2272461)
#' }
load_data_package_deprecated <- function(reference_id) {
data_package_directory <- paste("data/", reference_id, sep = "")
Expand Down
2 changes: 1 addition & 1 deletion R/load_data_packages.R
Original file line number Diff line number Diff line change
Expand Up @@ -197,4 +197,4 @@ extract_tbl <- function(x) {
if (!is.list(x))
return(NULL)
unlist(lapply(x, extract_tbl), FALSE)
}
}
118 changes: 10 additions & 108 deletions R/load_pgk_metadata.R
Original file line number Diff line number Diff line change
@@ -1,12 +1,9 @@
#' Read contents of data package file and construct a data frame based on the
#' metadata file summarizing the fields and their types/definitions.
#' Loads EML-formatted metadata into R for inspection and/or editing
#'
#' @description `load_pkg_metadata()` is essentially a wrapper around `DPchecker::load_metadata` with the directory structure pre-set to work well with the default location where `get_data_package` stores downloaded data packages. If you did not use the default settings for `get_data_package` (or downloaded a data package manually) you may find it easier to adjust the directory structure pointing to your data package and load the metadata using `DPchecker::load_metadata()`. Much like `load_metadata`, `load_pkg_metadata` requires that there be a single .xml file in the data package directory, that the metadata file name end in *_metadata.xml, and that the file contain schema-valid EML metadata.
#'
#' @description `load_pkg_metadata()` reads the metadata file from a previously
#' downloaded package and loads a list of fields and their attributes into a
#' dataframe.
#'
#' @param holding_id is a 6-7 digit number corresponding to the holding ID of the data package zip file.
#' @param directory String. Path to the data package
#' @param holding_id is a 6-7 digit number corresponding to the holding ID of the data package zip file. Your data should be in a directory that has the holding ID as its name.
#' @param directory String. Path to the data package directory, defaults to "data".
#'
#' @return one data frame to the global environment.
#'
Expand All @@ -16,105 +13,10 @@
#' \dontrun{
#' load_pkg_metadata(2266200)
#' }
load_pkg_metadata <- function(holding_id, directory = here::here("data")) {
data_package_directory <- paste(directory, "/", holding_id, sep = "")

metadata_file <- list.files(
path = data_package_directory,
pattern = "metadata.xml"
)

# Look for a metadatafile and let the user know about the results of the search.
if (length(metadata_file) == 0) {
cli::cli_abort(c(
"No metadata file found in: {.path {data_package_directory}}.",
"i" = "The filename must end in _metadata.xml"))
return(invisible())
}
if (length(metadata_file) > 1) {
cli::cli_abort(c(
"Multiple metadata files found.",
"i" = "{.path {data_package_directory}} can contain only one
{.file *_metadata.xml}."))
return(invisible())
}
load_pkg_metadata <- function(holding_id, directory = "data") {

meta_location <- paste0(data_package_directory, "/", metadata_file)
if (!file.exists(meta_location)) {
cli::cli_abort(c(
"The data package for: {.var {holding_id}} was not found.",
"i" = "Make sure {.path {data_package_directory}} is the correct location",
"i" = "Make sure you downloaded the correct data package using {.fn get_data_package}."
))
return(invisible())
}

#load metadata
eml_object <- EML::read_eml(meta_location, from = "xml")
#attributeList <- EML::get_attributes(eml_object)
attribute_list <- eml_object$dataset$dataTable$attributeList
attributes <- attribute_list$attributes
factors <- attribute_list$factors

# Figure out column classes based on attribute table (character, numeric, integer, logical, or complex)
attributes$columnclass <- "character"
if (!"numberType" %in% colnames(attributes)) {
attributes$numberType <- as.character(NA)
}
if (!"formatString" %in% colnames(attributes)) {
attributes$formatString <- as.character(NA)
}
attributes$columnclass <- ifelse(attributes$storageType == "float" & attributes$numberType == "natural", "integer", attributes$columnclass)
attributes$columnclass <- ifelse(attributes$storageType == "float" & attributes$numberType == "whole", "integer", attributes$columnclass)
attributes$columnclass <- ifelse(attributes$storageType == "float" & attributes$numberType == "integer", "integer", attributes$columnclass)
attributes$columnclass <- ifelse(attributes$storageType == "float" & attributes$numberType == "real", "numeric", attributes$columnclass)
attributes$columnclass <- ifelse(attributes$storageType == "date" & attributes$formatString == "YYYY-MM-DD", "Date", attributes$columnclass)

# return the field table to the workspace.
return(attributes)

if (metaformat == "fgdc") {
# xmlFilename <- metalocation
workingXMLfile <- EML::read_eml(metalocation, from = "xml")

# Build attributes table from the xml file
attributes <- data.frame(
id = numeric(),
attribute = character(),
attributeDefinition = character(),
attributeType = character(),
attributeFactors = numeric(),
stringsAsFactors = FALSE
)
for (i in 1:length(workingXMLfile$ea$detailed$attr)) {
attributes <- rbind(
attributes,
cbind(
id = i,
attribute = workingXMLfile$ea$detailed$attr[[i]]$attrlabl,
attributeDefinition = workingXMLfile$ea$detailed$attr[[i]]$attrdef,
attributeType = workingXMLfile$ea$detailed$attr[[i]]$attrtype,
attributeFactors = length(workingXMLfile$ea$detailed$attr[[i]]$attrdomv)
)
)
}

attributes$id <- as.integer(as.character(attributes$id))
attributes$attribute <- as.character(attributes$attribute)
attributes$attributeDefinition <- as.character(attributes$attributeDefinition)
# attributes$attributeType<-as.character(attributes$attributeType)
attributes$attributeFactors <- as.integer(as.character(attributes$attributeFactors))

attributes$columnclass <- "character"
# attributes$columnclass<-ifelse(attributes$attributeType=="OID","integer",attributes$columnclass)
# attributes$columnclass<-ifelse(attributes$attributeType=="Date","Date",attributes$columnclass)
# attributes$columnclass<-ifelse(attributes$attributeType=="Double","numeric",attributes$columnclass)

cat("Found ", crayon::blue$bold(nrow(attributes)), " fields.", sep = "")
meta <- DPchecker::load_metadata(directory = here::here("data", holding_id))

return(invisible(meta))
}

# return the field table to the workspace.
return(attributes)
} else {
print("data/metadata format combination not supported")
}
}
Loading
Loading