diff --git a/paper.bib b/paper.bib index 29f6a38..24fe33a 100644 --- a/paper.bib +++ b/paper.bib @@ -1,11 +1,21 @@ -@article{Wickham2019, doi = {10.21105/joss.01686}, url = {https://doi.org/10.21105/joss.01686}, year = {2019}, publisher = {The Open Journal}, volume = {4}, number = {43}, pages = {1686}, author = {Hadley Wickham and Mara Averick and Jennifer Bryan and Winston Chang and Lucy D'Agostino McGowan and Romain François and Garrett Grolemund and Alex Hayes and Lionel Henry and Jim Hester and Max Kuhn and Thomas Lin Pedersen and Evan Miller and Stephan Milton Bache and Kirill Müller and Jeroen Ooms and David Robinson and Dana Paige Seidel and Vitalie Spinu and Kohske Takahashi and Davis Vaughan and Claus Wilke and Kara Woo and Hiroaki Yutani}, title = {Welcome to the Tidyverse}, journal = {Journal of Open Source Software} } +@article{Wickham2019, + doi = {10.21105/joss.01686}, + url = {https://doi.org/10.21105/joss.01686}, + year = {2019}, + publisher = {The Open Journal}, + volume = {4}, + number = {43}, + pages = {1686}, + author = {Hadley Wickham and Mara Averick and Jennifer Bryan and Winston Chang and Lucy D'Agostino McGowan and Romain François and Garrett Grolemund and Alex Hayes and Lionel Henry and Jim Hester and Max Kuhn and Thomas Lin Pedersen and Evan Miller and Stephan Milton Bache and Kirill Müller and Jeroen Ooms and David Robinson and Dana Paige Seidel and Vitalie Spinu and Kohske Takahashi and Davis Vaughan and Claus Wilke and Kara Woo and Hiroaki Yutani}, title = {Welcome to the Tidyverse}, journal = {Journal of Open Source Software} +} @misc{OpenData2019, url = {https://www.congress.gov/bill/115th-congress/house-bill/4174}, howpublished = {H.R.4174 - 115th Congress}, journal = {law}, - title = {"H.R. 4171 - OPEN Government Data Act"}, - year = {2019}} + title = {H.R. 
4174}, + year = {2019} + } @manual{nsf2015, url = {https://www.nsf.gov/publications/pub_summ.jsp?ods_key=nsf15094}, @@ -18,14 +28,16 @@ @manual{nsf2015 @misc{Wiley2022, url = {https://authorservices.wiley.com/author-resources/Journal-Authors/open-access/data-sharing-citation/data-sharing-policy.html}, - title = {Wiley's data sharing policies}, + author = {Wiley}, + title = {Wiley's Data Sharing Policies}, year = {2022} } @misc{Springer2023, url = {https://www.springer.com/gp/editorial-policies/data-availability-statement?srsltid=AfmBOoq9OGxFR-H9UXUfYx_Nl1fRgfnBfCIFl3nbUqkNcRey1oaTBNqn}, + author = {Springer}, title = {Data Availability Statement}, - year = {} + year = {2023} } @article{Federer2018, @@ -81,7 +93,8 @@ @article{EML2019 url = {https://eml.ecoinformatics.org}, DOI={10.5063/f11834t2}, publisher = {KNB Data Repository}, author = {Jones, Matthew and O’Brien, Margaret and Mecum, Bryce and Boettiger, Carl and Schildhauer, Mark and Maier, Mitchell and Whiteaker, Timothy and Earl, Stevan and Chong, Steven}, - year = {2019} } + year = {2019} + } @article{Nelson2022, title = {Memorandum for the heads of executive departments and agencies: Ensuring free, immediate, and equitable access to federally funded research}, @@ -147,5 +160,61 @@ @article{Wilkinson2016 volume = {3}, pages = {160018}, doi = {10.1038/sdata.2016.18}, -url = {https://doi.org/10.1038/sdata.2016.18} +url = {https://doi.org/10.1038/sdata.2016.18}, +} + +@Manual{Boettiger2024, + title = {EML: Read and Write Ecological Metadata Language Files}, + author = {Carl Boettiger and Matthew B. 
Jones}, + year = {2024}, + note = {R package version 2.0.6, https://github.com/ropensci/EML/}, + url = {https://docs.ropensci.org/EML/}, +} + +@Manual{Smith2022, + title = {EMLassemblyline: A tool kit for building EML metadata workflows}, + author = {Colin Smith}, + year = {2022}, + note = {https://github.com/EDIorg/EMLassemblyline, +https://ediorg.github.io/EMLassemblyline/}, +} + +@Manual{Baker_QCkit2024, + title = {QCkit: NPS Inventory and Monitoring Quality Control Toolkit}, + author = {Robert Baker and Judd Patterson and Joe DeVivo and Issac Quevedo and Sarah Wright}, + year = {2024}, + note = {R package version 0.1.7}, + url = {https://github.com/nationalparkservice/QCkit/}, +} + +@Manual{Baker_NPSdataverse2024, + title = {NPSdataverse: Tools and Packages for Data and Metadata Manipulation}, + author = {Robert Baker and Judd Patterson and Joe DeVivo}, + year = {2024}, + note = {R package version 0.1.0}, + url = {https://github.com/nationalparkservice/NPSdataverse}, +} + +@Manual{Baker_EMLeditor2024, + title = {EMLeditor: View and Edit EML Metadata}, + author = {Robert Baker and Judd Patterson}, + year = {2024}, + note = {R package version 0.1.6}, + url = {https://github.com/nationalparkservice/EMLeditor}, +} + +@Manual{Baker_DPchecker2024, + title = {DPchecker: Checks Data Packages for Congruence}, + author = {Rob Baker and Sarah E. 
Wright}, + year = {2024}, + note = {R package version 0.3.4}, + url = {https://nationalparkservice.github.io/DPchecker/}, +} + +@Manual{Baker_NPSutils2024, + title = {NPSutils: Collection of Functions to read and manipulate information from the NPS DataStore}, + author = {Robert Baker and Joe DeVivo and Judd Patterson}, + year = {2024}, + note = {R package version 0.3.1}, + url = {https://github.com/nationalparkservice/NPSutils}, } diff --git a/paper.md b/paper.md index 0869b70..bd10e37 100644 --- a/paper.md +++ b/paper.md @@ -15,8 +15,10 @@ tags: - data package - data publication - data access -date: "19 August 2024" -output: pdf_document +date: "12 September 2024" +output: + word_document: default + pdf_document: default authors: - name: Robert L. Baker orcid: "0000-0001-7591-5035" @@ -68,48 +70,53 @@ affiliations: # Summary -The [NPSdataverse](https://nationalparkservice.github.io/NPSdataverse/) is a suite of R packages modeled off of the tidyverse concept of several packages built with a common goal [@Wickham2019]. The overarching theme of the NPSdataverse packages is creating, publishing, and accessing open, machine-readable data and metadata. NPSdataverse supports Ecological Metadata Language (EML) metadata and .csv data files. Some of the constituent packages ([R/EML](https://docs.ropensci.org/EML/) and [R/EMLassemblyline](https://ediorg.github.io/EMLassemblyline/)) are general-use and aimed at authoring EML documents. Other packages ([R/QCkit](https://nationalparkservice.github.io/QCkit/), [R/EMLeditor](https://nationalparkservice.github.io/EMLeditor/), [R/DPchecker](https://nationalparkservice.github.io/DPchecker/) and [R/NPSutils](https://nationalparkservice.github.io/NPSutils/)) are designed and maintained by the National Park Service (NPS). Although many functions within the NPSdataverse packages are NPS-specific (particularly some API calls), all of the functions are written so that they can also be used by the general public. 
Anyone interested applying for research permits or conducting research on NPS Units can reference and utilize the NPSdataverse packages. Additionally, the packages will be useful for data management plans in wide variety of grant proposals and for anyone that needs to create open data and machine readable metadata. Finally, the swiftly and easily ability to author, edit, and check Ecological Metadata Language (EML) metadata in a reproducible fashion will be useful for data publication at any number of repositories or data journals. +The [NPSdataverse](https://nationalparkservice.github.io/NPSdataverse/) is a suite of R packages modeled off of the tidyverse concept of several packages built with a common goal [@Wickham2019]. The overarching theme of the NPSdataverse packages is creating, publishing, and accessing open, machine-readable data and metadata. The NPSdataverse supports Ecological Metadata Language (EML) metadata and .csv data files. Some of the constituent R packages ([R/EML](https://docs.ropensci.org/EML/) and [R/EMLassemblyline](https://ediorg.github.io/EMLassemblyline/)) are general-use and aimed at authoring EML documents. Other R packages ([R/QCkit](https://nationalparkservice.github.io/QCkit/), [R/EMLeditor](https://nationalparkservice.github.io/EMLeditor/), [R/DPchecker](https://nationalparkservice.github.io/DPchecker/) and [R/NPSutils](https://nationalparkservice.github.io/NPSutils/)) are designed and maintained by the National Park Service (NPS). Although many functions within the NPSdataverse packages are NPS-specific (particularly some API calls), whenever possible the functions are written so that they can also be used by the general public. Anyone interested in applying for research permits or conducting research on NPS units can reference and utilize the NPSdataverse. 
Additionally, the packages will be useful for data management plans in a wide variety of grant proposals and for anyone that needs to create open data and machine readable metadata. Finally, the ability to swiftly and easily author, edit, and check Ecological Metadata Language (EML) metadata in a reproducible fashion will be useful for data publication at any number of repositories or data journals. # Statement of Need -Following a long-term movement for transparency and data accessibility, the U.S. implemented an Open Data Memorandum in 2013 (OMB M-13-13) and the federal OPEN Government Data Act of 2019 [@OpenData2019]. The Open Data Act mandated that federal agencies provide data in open formats with metadata. Subsequently, many funding agencies such as the National Science Foundation have required grant awardees to make data public, often including metadata ([@nsf2015]). Multiple publishers have followed suit ([@Wiley2022], [@Springer2023])) and require data availability statements upon publication. +Following a long-term movement for transparency in scientific research and data accessibility, the U.S. implemented an Open Data Memorandum in 2013 (OMB M-13-13) and the federal OPEN Government Data Act of 2019 [@OpenData2019]. The Open Data Act mandates that federal agencies provide data in open formats with metadata. Subsequently, many funding agencies such as the National Science Foundation have required grant awardees make data public, often including metadata [@nsf2015]. Multiple publishers have followed suit [@Wiley2022; @Springer2023] and require data availability statements upon publication. -One goal of open science, and requirement of the recent "Nelson Memo" is to make data FAIR: findable, inter-operable, accessible, and reuseable ([@Nelson2022], [@Wilkinson2016]). These goals are often achieved by including structured, machine-readable metadata that conforms to a defined schema along with the data. 
Ecological Metadata Language Metadata (EML) is one metadata standard that is particularly amenable to studies with rich taxonomy ([@Jones2006], [@EML2019]). It has been adopted by multiple research organizations including the Ecological Data Initiative (EDI), the National Ecological Observatory Network (NEON), the Global Biodiversity Information Facility (GBIF), Swedish Biodiversity Data Infrastructure (SBDI), the French Biodiversity Hub ("Pole National de Donnees de Biodiversite"), the U.S. National Park Service, and others. +One goal of open science, and a requirement of the recent "Nelson Memo" from the U.S. Office of Science and Technology Policy [@Nelson2022], is to make data FAIR: findable, inter-operable, accessible, and reusable [@Wilkinson2016]. These goals are often achieved by including structured, machine-readable metadata that conforms to a defined schema along with the data. Ecological Metadata Language (EML) is one metadata standard that is particularly amenable to studies with rich taxonomy [@Jones2006; @EML2019]. It has been adopted by multiple research organizations including the Environmental Data Initiative (EDI), the National Ecological Observatory Network (NEON), the Global Biodiversity Information Facility (GBIF), the Swedish Biodiversity Data Infrastructure (SBDI), the French Biodiversity Hub ("Pole National de Donnees de Biodiversite"), the U.S. National Park Service, and others. -Nevertheless, actual availability of data varies ([@Federer2018, @Tedersoo2021], perhaps because there is a need for more infrastructure and tools to meet the goals of open data and open science ([@Huston2019]). Multiple solutions have been presented, including ezEML, a workflow for authoring metadata in Ecological Metadata Language and publishing data and metadata to a repository ([@Vanderbilt2022]). ezEML is has an intuitive graphical user interface with a relatively low learning curve; however, it does have some drawbacks. 
For instance, ezEML is not scriptable, which makes repeated deployments of the same or similar workflows challenging. And, ezEML requires the user upload their data to an external site for processing, which may not be suitable for sensitive data. Here we introduce the NPSdataverse, a series of R-based packages for authoring, editing, and checking EML metadata locally in a scriptable fashion. Packages within the NPSdataverse leverage earlier work using R to create and manipulate XML based EML files ([@Boettiger2019]). Building upon that framework, we add user-friendly EML creation workflows; integration with taxonomic databases; fast, easy editing of existing metadata; congruence checks to test correspondence between data and metadata; and integration with public repositories such as the National Park Service's [DataStore](https://irma.nps.gov/DataStore/). Packages within the NPSdataverse also include data munging and data access/download functions that leverage the rich EML associated with the data. +Nevertheless, actual availability of data varies [@Federer2018; @Tedersoo2021], perhaps because there is a need for more infrastructure and tools to meet the goals of open data and open science [@Huston2019]. Multiple solutions have been presented, including ezEML, a tool for authoring metadata in Ecological Metadata Language and publishing data and metadata to a repository [@Vanderbilt2022]. ezEML has an intuitive graphical user interface with a relatively low learning curve; however, it does have some drawbacks. For instance, ezEML is not scriptable, which makes repeated deployments of the same or similar workflows challenging. And, ezEML requires that the user upload their data to an external site for processing, which may not be suitable for sensitive data. Here we introduce the NPSdataverse, a series of R-based packages for authoring, editing, and checking EML metadata locally in a robust, repeatable, and scriptable fashion. 
R Packages within the NPSdataverse leverage earlier work using R to create and manipulate XML based EML files [@Boettiger2019]. Building upon that framework, we add user-friendly EML creation workflows; integration with taxonomic databases; fast, easy editing of existing metadata; congruence checks to test correspondence between data and metadata; and integration with public repositories such as the National Park Service's [DataStore](https://irma.nps.gov/DataStore/). The EML metadata file in .xml format along with the .csv data files it describes comprise a "data package". R Packages within the NPSdataverse also include data munging and data package access/download functions that leverage the rich EML associated with the data. # NPSdataverse R package -The [NPSdataverse](https://nationalparkservice.github.io/NPSdataverse/) package is a meta-package that loads packages within the NPSdataverse into R. It provides a convenient way to download many of the packages needed to create and access data packages consisting of rich Ecological Metadata Language metadata and .csv data files: +The [NPSdataverse](https://nationalparkservice.github.io/NPSdataverse/) package is a meta-package that loads packages within the NPSdataverse into R [@Baker_NPSdataverse2024]. It provides a convenient way to download, install, and load many of the R packages needed to create and access data packages consisting of rich Ecological Metadata Language metadata and .csv data files: -```{r install_NPSdataverse, eval = FALSE} +``` pak::pkg_install("nationalparkservice/NPSdataverse") +library(NPSdataverse) ``` -NPSdataverse will automatically check that the latest version of the main development branch on GitHub is being loaded. If updates are indicated, the user will be alerted and given instructions on how to update the relevant packages. 
To prevent API limits on GitHub.com, the package only checks for updates from an interactive R session and will skip checks when the system is not on-line or GitHub.com is not responding. + +The NPSdataverse will automatically check that the latest version of each R package is being loaded: either from the main development branch on GitHub.com or the latest version on CRAN. If updates are indicated, the user will be alerted and given instructions on how to update the relevant packages. To prevent API limits at GitHub.com, the package only checks for updates from an interactive R session and will skip checks when the system is not on-line or GitHub.com is not responding. # QCkit R package -[QCkit](https://nationalparkservice.github.io/QCkit/) is primarily a data munging package designed to prepare data for metadata creation and publication. QCkit includes functions that can help manage date-time formatting, can check data files for threatened or endangered species, and can help increase inter-operability by suggesting appropriate [Darwin Core](https://dwc.tdwg.org/) standards for naming data. Additional functions allow users to convert between decimal latitude and longitude and UTMs, check whether GPS coordinates fall within specific National Park Service unit boundaries, add elevation based on GPS locations via a USGS API, and help deal with "missing values". QCkit also facilitates documenting data munging by generating DataStore references based on GitHub.com releases. The DataStore references can hold processing scripts or code packages and have DOIs attached to them. +[QCkit](https://nationalparkservice.github.io/QCkit/) is primarily a data munging package designed to prepare data for metadata creation and publication [@Baker_QCkit2024]. 
QCkit includes functions that can help manage date-time formatting, can check data files for threatened or endangered species, and can help increase inter-operability by suggesting appropriate [Darwin Core](https://dwc.tdwg.org/) standards for naming data. Additional functions allow users to convert between decimal latitude and longitude and UTMs, check whether GPS coordinates fall within specific National Park Service unit boundaries, add elevation based on GPS locations via a USGS API, and help deal with "missing values". QCkit also facilitates documenting data munging by generating DataStore references based on GitHub.com releases. The DataStore references can hold processing scripts, code, or packages and have DOIs attached to them that are registered with DataCite once the DataStore reference is activated. # EML R package -The R/[EML](https://docs.ropensci.org/EML/) package is a fundamental package that allows for importing .xml files, creating and validating validating EML within R, and writing R objects back out to .xml files. R/EML allows for creating fully fledged Ecological Metadata Language Metadata files using nested S3 lists within R while relying on the R/[emld](https://docs.ropensci.org/emld/) package [@Boettiger2019_emld]. +The R/[EML](https://docs.ropensci.org/EML/) package is a fundamental package that allows for importing .xml files, creating and validating EML within R, and writing R objects back out to .xml files [@Boettiger2024]. R/EML allows for creating fully fledged Ecological Metadata Language metadata files using nested S3 lists within R while relying on the R/[emld](https://docs.ropensci.org/emld/) package [@Boettiger2019_emld]. # EMLassemblyline R package -The [EMLassemblyline](https://ediorg.github.io/EMLassemblyline/) (EAL) package builds upon R/EML and adds substantial functionality. For instance, EAL allows the user to supply .csv files, which are used to generate template .txt files. 
Users can adjust the template files as needed and use the `EMLassemblyline::make_eml()` function to generate an R-object that can be exported via R/EML as an EML-fomatted .xml file. EAL includes the ability generate entire taxonomic backbones from lists of scientific names via API calls to ITIS, GBIF, or Worms. EAL will validate the R object against the EML schema and provide helpful hints on what might have gone wrong during the `EMLassemblyline::make_eml()` process. EAL provides an efficient bridge between data and EML metadata for users who are familiar with R but may not be experts on the EML schema or the detailed nested lists needed to create EML within R via R/EML. Products from the EAL pipeline are suitable for publication at multiple repositories including the Environmental Data Initiative. +The [EMLassemblyline](https://ediorg.github.io/EMLassemblyline/) (EAL) package builds upon R/EML and adds substantial functionality [@Smith2022]. For instance, EAL allows the user to supply .csv files, which are used to generate template .txt files. Users can adjust the template files as needed and use the `EMLassemblyline::make_eml()` function to generate an R object that can be exported via R/EML as an EML-formatted .xml file. EAL includes the ability to generate entire taxonomic backbones from lists of scientific names via API calls to ITIS, GBIF, or WoRMS. EAL will validate the R object against the EML schema and provide helpful hints on what might have gone wrong during the `EMLassemblyline::make_eml()` process. EAL provides an efficient bridge between .csv data and EML metadata for users who are familiar with R but may not be experts on the EML schema or the detailed nested lists needed to create EML within R via R/EML. Products from the EAL pipeline are suitable for publication at multiple repositories including the Environmental Data Initiative. 
# EMLeditor R package -The [EMLeditor](https://nationalparkservice.github.io/EMLeditor/) package allows users to quickly and easily view components of metadata in R and make on-the-fly edits to metadata without having to re-run the EAL steps (EAL can be time consuming, especially if there are many taxa that need to be resolved). EMLeditor includes the ability to pick specific licenses (CC0, CC-BY, etc), add [ORCIDs](https://orcid.org/), include organizations as authors, and much more. EMLeditor also adds specific content necessary to be compliant with NPS's DataStore. With the proper permissions, EMLeditor can be used to generate draft references and reserve DOIs on DataStore as well as upload data and metadata files to DataStore. Finally, EMLeditor contains a .rmd template file that is accessible in Rstudio under Files > New File > R markdown. The template provides an editable script that walks the user through using EAL, EMLeditor, and DPchecker to create and validate EML metadata in R. +The [EMLeditor](https://nationalparkservice.github.io/EMLeditor/) package allows users to quickly and easily view components of metadata in R and make on-the-fly edits to metadata [@Baker_EMLeditor2024]. Edits made to EML using EMLeditor do not require re-running the EAL functions to make EML. This is a significant improvement because running the EAL functions can be time consuming, especially if there are many taxa that need to be resolved. EMLeditor includes the ability to pick specific licenses (CC0, CC-BY, etc), add [ORCIDs](https://orcid.org/), include organizations as authors, and much more. EMLeditor also adds specific content necessary to be compliant with NPS's DataStore. With the proper permissions, EMLeditor can be used to generate draft references and reserve DOIs on DataStore as well as upload data and metadata files to DataStore. 
Finally, EMLeditor contains a .rmd template file that, after loading the package, is accessible in RStudio under Files > New File > R Markdown. The template provides an editable script that walks the user through using EAL, EMLeditor, and DPchecker to create and validate EML metadata in R. + +EMLeditor "set" class functions (which include all functions that begin with "set_" such as "`EMLeditor::set_abstract()`") will add several NPS-specific items to the metadata using their default settings. For instance, these functions will set NPS as the publisher, Fort Collins as the publication location, and will add a "for or by NPS = TRUE" statement to the metadata. To invoke these functions without adding the NPS-specific metadata elements, set the parameter `NPS = FALSE` when calling each "set_" class function. Non-NPS publisher information can be added using the `EMLeditor::set_publisher()` function with the parameters `for_or_by_NPS` and `NPS` set to `FALSE`: -EMLeditor "set" class functions (which all begin with "set_" such as "`EMLeditor::set_abstract()`") will add several NPS-specific items to metadata using their default settings. For instance, these functions will set NPS as the publisher, Fort Collins as the location, and will add a "for or by NPS = TRUE" statement to the metadata. To invoke these functions without adding the NPS-specific metadata elements, set the parameter `NPS = FALSE`. 
Non-NPS publisher information can be added using the `EMLeditor::set_publisher()` function with the parameters `for_or_by_NPS` and `NPS` set to `FALSE`: +``` +#example of how to set the abstract and publisher for non-NPS metadata: -```{r non-NPS-example, eval=FALSE} new_metadata1 <- set_abstract(eml_object = old_metadata, abstract = "This is example/test abstract text", NPS = FALSE) + new_metadata2 <- set_publisher(eml_object = new_metadata1, org_name = "My Institution", street_address = "1234 Sesame St.", @@ -117,21 +124,22 @@ new_metadata2 <- set_publisher(eml_object = new_metadata1, State = "Delaware", zip_code = "12345", country = "USA", - URL = "https://www.MyInstitution.us", + URL = "https://www.myinstitution.us", email = "publisher@myinstitution.us", ror_id = "", for_or_by_NPS = FALSE, NPS = FALSE) ``` -) # DPchecker R Package -The [DPchecker](https://nationalparkservice.github.io/DPchecker/) package provides detailed feedback on data-metadata congruence for use by either data package authors and reviewers. DPchecker goes beyond validating EML objects in R against the EML schema. Using the `DPchecker::run_congruence_checks` function, DPchecker will conduct a series of 46 checks. These are divided into several categories: 1) Metadata to ensure that metadata are well formatted (file names are not duplicated, files specify the field delimiter, data files have URLs, the proper delimiter and header row numbers are present, etc. 2) Metadata elements necessary for DataStore automated extraction are present: creators have valid surnames, publication date is present and in the correct format, keywords are present, abstract and methods are present and well formatted, license is present, attributes have definitions, etc. 
3) Recommended EML elements are present including ORCiDs and a notes section 4) Metadata and data are in congruence including all files listed in metadata and all metadata file names refer to data files, the columns in the metadata match the columns in the data files, missing fields in data files are properly documented in metadata, columns indicated as numeric in metadata are numeric in the data files, the date format in the metadata matches the date format in the data files, and dates in data files fall within the date ranges given in the metadata and 5) data and metadata compliance including tests for information that should not be released to the public such as non-.gov emails and GPS coordinates if the data package is not set to public. For each test, the data package may fail with an error, fail with a warning, or pass. When warnings and errors are generated, the user is pointed towards the appropriate EMLeditor function to address the problem. DPchecker will often throw a warning even if an item exists and is properly formatted but could by improved to increase the FAIR characteristics of the metadata. For instance, DPchecker will throw a warning if an abstract is less than 20 words long as it is unlikely the creator is able to meaningfully describe the data collection and processing in less than 20 words. +The [DPchecker](https://nationalparkservice.github.io/DPchecker/) package provides detailed feedback on data-metadata congruence [@Baker_DPchecker2024]. Here, a "data package" consists of the EML metadata file with a filename that ends in *_metadata.xml and one or more data files in .csv format, all of which are in a single directory (and the directory contains no extraneous .csv or .xml files). DPchecker is useful for both data package authors and reviewers. DPchecker goes beyond validating EML objects in R against the EML schema. Using the `DPchecker::run_congruence_checks` function, DPchecker will conduct a series of 46 tests. 
These are divided into several categories: 1) Metadata are well formatted (file names are not duplicated, files specify the field delimiter, data files have URLs, the proper delimiter and header row numbers are present, etc.); 2) Metadata elements necessary for DataStore automated extraction are present: creators have valid surnames, publication date is present and in the correct format, keywords are present, abstract and methods are present and well formatted, license is present, attributes have definitions, etc.; 3) Recommended EML elements are present including ORCIDs and a notes section; 4) Metadata and data are in congruence including all files listed in metadata and all metadata file names refer to data files, the columns in the metadata match the columns in the data files, missing fields in data files are properly documented in metadata, columns indicated as numeric in metadata are numeric in the data files, the date format in the metadata matches the date format in the data files, and dates in data files fall within the date ranges given in the metadata; and 5) data and metadata compliance including tests for information that should not be released to the public such as non-.gov emails and GPS coordinates if the data package is not set to public. For each test, the data package may fail with an error, fail with a warning, or pass. When possible, if warnings or errors are generated, the user is pointed towards the appropriate EMLeditor function to address the problem. + +DPchecker will often throw a warning even if an EML element exists and is properly formatted but could be improved to increase the FAIR characteristics of the metadata. For instance, DPchecker will throw a warning if an abstract is fewer than 20 words long as it is unlikely the creator is able to meaningfully describe the data collection and processing in fewer than 20 words. 
# NPSutils R Package -The [NPSutils](https://nationalparkservice.github.io/NPSutils/) package serves primarily as a way to access data. NPSutils provides avenues for directly downloading data from DataStore using R. NPSutils can also import data downloaded from any repository into R and take advantage of rich EML metadata to call column types. NPSutils provides some basic meta-analysis capability, assuming certain inter-operabilty standards are met (such as consistently naming columns with species or GPS coordinates). NPSutils can also be used to import data and metadata into common data visualization tools such as PowerBI. +The [NPSutils](https://nationalparkservice.github.io/NPSutils/) package serves primarily as a way to access data [@Baker_NPSutils2024]. NPSutils provides avenues for directly downloading data from DataStore using R. NPSutils can also import data downloaded from any repository into R and take advantage of rich EML metadata to call column types. NPSutils provides some basic meta-analysis capability, assuming certain inter-operability standards are met (such as consistently naming columns with species or GPS coordinates). NPSutils can also be used to import data and metadata into common data visualization tools such as Power BI. # Acknowledgements