diff --git a/NAMESPACE b/NAMESPACE index c5e27a6..7620a2f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,5 @@ # Generated by roxygen2: do not edit by hand export("%>%") -export(get_md_cr) +export(md_cr_snaps_m) importFrom(magrittr,"%>%") diff --git a/R/cr.R b/R/cr.R index ae304b4..047264e 100644 --- a/R/cr.R +++ b/R/cr.R @@ -1,26 +1,70 @@ # version ==== -#' Get the metadata for a Crossref snapshot +#' Metadata for Crossref monthly Snapshots #' #' @description +#' Crossref releases dumps of their database on the 5th of every month. +#' These dumps sometimes change after the fact. +#' This metadata, stored in this package, +#' locks down the versioning of the dumps for our analyses. +#' +#' @details +#' To lock down the reproducibility of the metadata dumps, +#' we store their checksums in this package, +#' instead of merely relying on their *release date*. +#' (They have sometimes been updated in the past *after* release.) +#' +#' Computing a checksum ourselves would be quite expensive. #' Gets metadata from the AWS S3 [response header](https://docs.aws.amazon.com/AmazonS3/latest/API/RESTCommonResponseHeaders.html). #' #' Happily, this works without authorisation for Metadata Plus. #' -#' @examples -#' get_md_cr() +#' Current monthly snapshots: #' -#' @param url -#' The url to the snapshot as documented by Crossref. -#' Passed on to [httr::HEAD()]. 
+#' ```{r} +#' knitr::kable(md_cr_snaps_m()) +#' ``` #' #' @family version #' @family cr #' @family lake #' #' @export -get_md_cr <- function(url = "https://api.crossref.org/snapshots/monthly/latest/all.json.tar.gz") { - res <- httr::HEAD(url) %>% +md_cr_snaps_m <- function() { + dget(file = system.file("checksums", "cr_snaps_md.R", package = "leine")) +} + +#' @describeIn md_cr_snaps_m +#' Update the metadata of monthly snapshots stored in leine +update_md_cr_snaps_m <- function() { + res <- get_md_cr_snaps_m() + # TODO use datapasta for prettier output here again + # https://github.com/subugoe/leine/issues/16 + # datapasta::tribble_construct(res) + dput( + res, + file = system.file("checksums", "cr_snaps_md.R", package = "leine") + ) + invisible(res) +} + +#' @describeIn md_cr_snaps_m +#' Get metadata for currently available monthly snapshots +get_md_cr_snaps_m <- function() { + purrr::map_dfr( + get_md_cr_snaps_m_urls(), + get_md_cr_snap_m, + .id = "period" + ) +} + +#' Get the metadata from the AWS S3 header response for *one* snapshot +#' @param url +#' The url to the snapshot as documented by Crossref. +#' Passed on to [httr::HEAD()]. +#' @noRd +get_md_cr_snap_m <- function(url = "https://api.crossref.org/snapshots/monthly/latest/all.json.tar.gz") { + res <- httr::RETRY(verb = "HEAD", url) %>% httr::stop_for_status() %>% httr::headers() if (res$server != "AmazonS3") { @@ -29,16 +73,45 @@ get_md_cr <- function(url = "https://api.crossref.org/snapshots/monthly/latest/a i = "Perhaps Crossref has changed their internal storage implementation." 
)) } - parse_date_time_aws <- purrr::partial( - lubridate::parse_date_time, - orders = "a d b Y HMS", - tz = "GMT" - ) - list( - date_retrieved = parse_date_time_aws(res$date), + url = url, date_modified = parse_date_time_aws(res[["last-modified"]]), etag = stringr::str_extract(res$etag, '(?<=\").*?(?=\")'), size = structure(as.numeric(res[["content-length"]]), class = "object_size") ) } + +#' Translate datetime from AWS S3 header response to R +#' @noRd +parse_date_time_aws <- purrr::partial( + lubridate::parse_date_time, + orders = "a d b Y HMS", + tz = "GMT" +) + +#' Find all URLs to monthly cr snapshots up to now +#' @noRd +get_md_cr_snaps_m_urls <- function() { + start <- lubridate::ymd("2018-04-06") # always supposed to come out on the 5th + end <- lubridate::today() + n_months_completed <- lubridate::interval(start, end) %/% months(1) - 1 + months_completed <- rep(start, n_months_completed) + lubridate::month(months_completed) <- + lubridate::month(start) + c(1:n_months_completed) + names(months_completed) <- format(months_completed, "%Y-%m") + purrr::map_chr( + months_completed, + function(x) ym2cr_url(lubridate::year(x), format(x, "%Om")) + ) +} + +#' Construct URLs to monthly cr snapshots +#' @param y,m year and month as character strings +#' @noRd +ym2cr_url <- function(y = "2018", m = "04") { + base_url <- httr::parse_url("https://api.crossref.org/") + base_url$path <- paste( + "snapshots", "monthly", y, m, "all.json.tar.gz", sep = "/" + ) + httr::build_url(base_url) +} diff --git a/inst/checksums/cr_snaps_md.R b/inst/checksums/cr_snaps_md.R new file mode 100644 index 0000000..da44039 --- /dev/null +++ b/inst/checksums/cr_snaps_md.R @@ -0,0 +1,75 @@ +structure(list(period = c("2018-05", "2018-06", "2018-07", "2018-08", +"2018-09", "2018-10", "2018-11", "2018-12", "2019-01", "2019-02", +"2019-03", "2019-04", "2019-05", "2019-06", "2019-07", "2019-08", +"2019-09", "2019-10", "2019-11", "2019-12", "2020-01", "2020-02", +"2020-03", "2020-04", 
"2020-05", "2020-06", "2020-07", "2020-08", +"2020-09", "2020-10", "2020-11", "2020-12", "2021-01", "2021-02", +"2021-03"), url = c("https://api.crossref.org/snapshots/monthly/2018/05/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2018/06/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2018/07/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2018/08/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2018/09/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2018/10/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2018/11/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2018/12/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2019/01/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2019/02/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2019/03/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2019/04/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2019/05/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2019/06/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2019/07/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2019/08/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2019/09/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2019/10/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2019/11/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2019/12/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2020/01/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2020/02/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2020/03/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2020/04/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2020/05/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2020/06/all.json.tar.gz", 
+"https://api.crossref.org/snapshots/monthly/2020/07/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2020/08/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2020/09/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2020/10/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2020/11/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2020/12/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2021/01/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2021/02/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2021/03/all.json.tar.gz" +), date_modified = structure(c(1528489927, 1532637202, 1533410124, +1536005052, 1539286122, 1541282824, 1543874532, 1546651488, 1582139958, +1551711146, 1554300962, 1556925364, 1559650565, 1563033950, 1564897564, +1567583164, 1570192960, 1573652540, 1575566164, 1578343159, 1582141208, +1583403669, 1585989069, 1589197961, 1591277469, 1593865868, 1596558664, +1599327273, 1603833389, 1604556067, 1607160669, 1610469968, 1612544468, +1614969069, 1617643868), tzone = "GMT", class = c("POSIXct", +"POSIXt")), etag = c("5fdf4c3e01a713c4391fbc193391a948-5803", +"66daedd99bbac1b0406588af39654199-5913", "8b9581b551ac55e255efffed7553dc78-5937", +"3250e1960d390c879cacf7ae8fe8e4e0-6006", "48dd8175cf4ef52f8452745629310e79-6153", +"2de16d1ccc8fa906aa26f49d2fe752e7-6235", "123a2132ede3991a4bee12d96e4ae51c-6169", +"250e3e72cc5bd4ab5dbec899b8db7580-6267", "6305e97bb00c764b1102d22c2be6d663-8495", +"dd03bf567aed0e41b3736d9e9f607e85-6647", "fe2cd8dd764503640f74806e34248781-6840", +"160dc0f8d12b3c91c6ed2ad0df87b4e4-7170", "1eb34559b0e80b6ad778c7a40d1dd45e-7623", +"586e1ace0577ca44c422a0029960098b-7773", "31af31381ff92756beb7ab7559c33291-7858", +"17782fc0071ee11ff5db519446e1edf0-7945", "196ea8897b159c0378dc5086d6fca861-8043", +"5eef0f01562f7bc31d5a016fbd6042f7-5933", "3e417701f575d2a9a19582d2606231d2-8307", +"66db92cdd582f8cf93b03e9b3ade167d-8424", 
"6305e97bb00c764b1102d22c2be6d663-8495", +"cf5dbcfb9f8c4e01d7dd0d718429b642-8558", "47b39e33efd9824c7c5c68f06021ba88-8658", +"104df834ffb8ff0dfd20e121a6318618-8823", "aeafd8a98336812226affb0bedc973cd-8908", +"f5fbc47585eb76f74887decf4b4d36e7-9013", "1e56d469216fc77d07d103a15d9d7427-9120", +"5cb71554733ac30fcb95c0b99d4c7138-9230", "1936b0825dec4b11db705f8053b7da78-9415", +"ab52c1f7a2235692faa34d431751badb-9451", "7216bfd6eafcd0ff83894acece592c29-9556", +"8f0977e27c695a954f843503a6120946-6367", "242406bc4cce046eaae133f4cf648978-6482", +"e259608e23b932b6aed7f7c67b5256b9-6564", "9a730fb1461b0f170940845fbb93dd2f-6642" +), size = structure(c(48673539319, 49595814726, 49800224394, +50374663277, 51606812435, 52296866610, 51747932929, 52564041493, +71258741599, 55758917898, 57377765958, 60145344184, 63942201992, +65197539697, 65912914209, 66646812608, 67464925959, 99530559599, +69683786041, 70662719003, 71258741599, 71782804006, 72624782825, +74010463525, 74724383102, 75604365177, 76498078549, 77423432433, +78973357479, 79275265379, 80153941136, 106818152559, 108733400373, +110120778006, 111419433519), fallback_class = "object_size", class = "vctrs:::common_class_fallback")), row.names = c(NA, +-35L), class = c("tbl_df", "tbl", "data.frame")) diff --git a/man/get_md_cr.Rd b/man/get_md_cr.Rd deleted file mode 100644 index ad1d0f3..0000000 --- a/man/get_md_cr.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/cr.R -\name{get_md_cr} -\alias{get_md_cr} -\title{Get the metadata for a Crossref snapshot} -\usage{ -get_md_cr( - url = "https://api.crossref.org/snapshots/monthly/latest/all.json.tar.gz" -) -} -\arguments{ -\item{url}{The url to the snapshot as documented by Crossref. -Passed on to \code{\link[httr:HEAD]{httr::HEAD()}}.} -} -\description{ -Gets metadata from the AWS S3 \href{https://docs.aws.amazon.com/AmazonS3/latest/API/RESTCommonResponseHeaders.html}{response header}. 
- -Happily, this works without authorisation for Metadata Plus. -} -\examples{ -get_md_cr() - -} -\concept{cr} -\concept{lake} -\concept{version} diff --git a/man/md_cr_snaps_m.Rd b/man/md_cr_snaps_m.Rd new file mode 100644 index 0000000..f4db28b --- /dev/null +++ b/man/md_cr_snaps_m.Rd @@ -0,0 +1,83 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cr.R +\name{md_cr_snaps_m} +\alias{md_cr_snaps_m} +\alias{update_md_cr_snaps_m} +\alias{get_md_cr_snaps_m} +\title{Metadata for Crossref monthly Snapshots} +\usage{ +md_cr_snaps_m() + +update_md_cr_snaps_m() + +get_md_cr_snaps_m() +} +\description{ +Crossref releases dumps of their database on the 5th of every months. +These dumps sometimes change after the fact. +This metadata, stored in this package, +locks down the versioning of the dumps for our analyses. +} +\details{ +To lock down the reproducibility of the metadata dumps, +we store their checksums in this package, +instead of merely relying on their \emph{release date}. +(They have sometimes been updated in the past \emph{after} release.) + +Computing a checksum ourselves would be quite expensive +Gets metadata from the AWS S3 \href{https://docs.aws.amazon.com/AmazonS3/latest/API/RESTCommonResponseHeaders.html}{response header}. + +Happily, this works without authorisation for Metadata Plus. + +Current monthly snapshots:\if{html}{\out{