diff --git a/NAMESPACE b/NAMESPACE index c5e27a6..7620a2f 100644 --- a/NAMESPACE +++ b/NAMESPACE @@ -1,5 +1,5 @@ # Generated by roxygen2: do not edit by hand export("%>%") -export(get_md_cr) +export(md_cr_snaps_m) importFrom(magrittr,"%>%") diff --git a/R/cr.R b/R/cr.R index ae304b4..047264e 100644 --- a/R/cr.R +++ b/R/cr.R @@ -1,26 +1,70 @@ # version ==== -#' Get the metadata for a Crossref snapshot +#' Metadata for Crossref monthly Snapshots #' #' @description +#' Crossref releases dumps of their database on the 5th of every month. +#' These dumps sometimes change after the fact. +#' This metadata, stored in this package, +#' locks down the versioning of the dumps for our analyses. +#' +#' @details +#' To lock down the reproducibility of the metadata dumps, +#' we store their checksums in this package, +#' instead of merely relying on their *release date*. +#' (They have sometimes been updated in the past *after* release.) +#' +#' Computing a checksum ourselves would be quite expensive. #' Gets metadata from the AWS S3 [response header](https://docs.aws.amazon.com/AmazonS3/latest/API/RESTCommonResponseHeaders.html). #' #' Happily, this works without authorisation for Metadata Plus. #' -#' @examples -#' get_md_cr() +#' Current monthly snapshots: #' -#' @param url -#' The url to the snapshot as documented by Crossref. -#' Passed on to [httr::HEAD()]. 
+#' ```{r} +#' knitr::kable(md_cr_snaps_m()) +#' ``` #' #' @family version #' @family cr #' @family lake #' #' @export -get_md_cr <- function(url = "https://api.crossref.org/snapshots/monthly/latest/all.json.tar.gz") { - res <- httr::HEAD(url) %>% +md_cr_snaps_m <- function() { + dget(file = system.file("checksums", "cr_snaps_md.R", package = "leine")) +} + +#' @describeIn md_cr_snaps_m +#' Update the metadata of monthly snapshots stored in leine +update_md_cr_snaps_m <- function() { + res <- get_md_cr_snaps_m() + # TODO use datapasta for prettier output here again + # https://github.com/subugoe/leine/issues/16 + # datapasta::tribble_construct(res) + dput( + res, + file = system.file("checksums", "cr_snaps_md.R", package = "leine") + ) + invisible(res) +} + +#' @describeIn md_cr_snaps_m +#' Get metadata for currently available monthly snapshots +get_md_cr_snaps_m <- function() { + purrr::map_dfr( + get_md_cr_snaps_m_urls(), + get_md_cr_snap_m, + .id = "period" + ) +} + +#' Get the metadata from the AWS S3 header response for *one* snapshot +#' @param url +#' The url to the snapshot as documented by Crossref. +#' Passed on to [httr::HEAD()]. +#' @noRd +get_md_cr_snap_m <- function(url = "https://api.crossref.org/snapshots/monthly/latest/all.json.tar.gz") { + res <- httr::RETRY(verb = "HEAD", url) %>% httr::stop_for_status() %>% httr::headers() if (res$server != "AmazonS3") { @@ -29,16 +73,45 @@ get_md_cr <- function(url = "https://api.crossref.org/snapshots/monthly/latest/a i = "Perhaps Crossref has changed their internal storage implementation." 
)) } - parse_date_time_aws <- purrr::partial( - lubridate::parse_date_time, - orders = "a d b Y HMS", - tz = "GMT" - ) - list( - date_retrieved = parse_date_time_aws(res$date), + url = url, date_modified = parse_date_time_aws(res[["last-modified"]]), etag = stringr::str_extract(res$etag, '(?<=\").*?(?=\")'), size = structure(as.numeric(res[["content-length"]]), class = "object_size") ) } + +#' Translate datetime from AWS S3 header response to R +#' @noRd +parse_date_time_aws <- purrr::partial( + lubridate::parse_date_time, + orders = "a d b Y HMS", + tz = "GMT" +) + +#' Find all URLs to monthly cr snapshots up to now +#' @noRd +get_md_cr_snaps_m_urls <- function() { + start <- lubridate::ymd("2018-04-06") # always supposed to come out on the 5th + end <- lubridate::today() + n_months_completed <- lubridate::interval(start, end) %/% months(1) - 1 + months_completed <- rep(start, n_months_completed) + lubridate::month(months_completed) <- + lubridate::month(start) + c(1:n_months_completed) + names(months_completed) <- format(months_completed, "%Y-%m") + purrr::map_chr( + months_completed, + function(x) ym2cr_url(lubridate::year(x), format(x, "%Om")) + ) +} + +#' Construct URLs to monthly cr snapshots +#' @param y,m year and month as character strings +#' @noRd +ym2cr_url <- function(y = "2018", m = "04") { + base_url <- httr::parse_url("https://api.crossref.org/") + base_url$path <- paste( + "snapshots", "monthly", y, m, "all.json.tar.gz", sep = "/" + ) + httr::build_url(base_url) +} diff --git a/inst/checksums/cr_snaps_md.R b/inst/checksums/cr_snaps_md.R new file mode 100644 index 0000000..da44039 --- /dev/null +++ b/inst/checksums/cr_snaps_md.R @@ -0,0 +1,75 @@ +structure(list(period = c("2018-05", "2018-06", "2018-07", "2018-08", +"2018-09", "2018-10", "2018-11", "2018-12", "2019-01", "2019-02", +"2019-03", "2019-04", "2019-05", "2019-06", "2019-07", "2019-08", +"2019-09", "2019-10", "2019-11", "2019-12", "2020-01", "2020-02", +"2020-03", "2020-04", 
"2020-05", "2020-06", "2020-07", "2020-08", +"2020-09", "2020-10", "2020-11", "2020-12", "2021-01", "2021-02", +"2021-03"), url = c("https://api.crossref.org/snapshots/monthly/2018/05/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2018/06/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2018/07/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2018/08/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2018/09/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2018/10/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2018/11/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2018/12/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2019/01/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2019/02/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2019/03/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2019/04/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2019/05/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2019/06/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2019/07/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2019/08/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2019/09/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2019/10/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2019/11/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2019/12/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2020/01/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2020/02/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2020/03/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2020/04/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2020/05/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2020/06/all.json.tar.gz", 
+"https://api.crossref.org/snapshots/monthly/2020/07/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2020/08/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2020/09/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2020/10/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2020/11/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2020/12/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2021/01/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2021/02/all.json.tar.gz", +"https://api.crossref.org/snapshots/monthly/2021/03/all.json.tar.gz" +), date_modified = structure(c(1528489927, 1532637202, 1533410124, +1536005052, 1539286122, 1541282824, 1543874532, 1546651488, 1582139958, +1551711146, 1554300962, 1556925364, 1559650565, 1563033950, 1564897564, +1567583164, 1570192960, 1573652540, 1575566164, 1578343159, 1582141208, +1583403669, 1585989069, 1589197961, 1591277469, 1593865868, 1596558664, +1599327273, 1603833389, 1604556067, 1607160669, 1610469968, 1612544468, +1614969069, 1617643868), tzone = "GMT", class = c("POSIXct", +"POSIXt")), etag = c("5fdf4c3e01a713c4391fbc193391a948-5803", +"66daedd99bbac1b0406588af39654199-5913", "8b9581b551ac55e255efffed7553dc78-5937", +"3250e1960d390c879cacf7ae8fe8e4e0-6006", "48dd8175cf4ef52f8452745629310e79-6153", +"2de16d1ccc8fa906aa26f49d2fe752e7-6235", "123a2132ede3991a4bee12d96e4ae51c-6169", +"250e3e72cc5bd4ab5dbec899b8db7580-6267", "6305e97bb00c764b1102d22c2be6d663-8495", +"dd03bf567aed0e41b3736d9e9f607e85-6647", "fe2cd8dd764503640f74806e34248781-6840", +"160dc0f8d12b3c91c6ed2ad0df87b4e4-7170", "1eb34559b0e80b6ad778c7a40d1dd45e-7623", +"586e1ace0577ca44c422a0029960098b-7773", "31af31381ff92756beb7ab7559c33291-7858", +"17782fc0071ee11ff5db519446e1edf0-7945", "196ea8897b159c0378dc5086d6fca861-8043", +"5eef0f01562f7bc31d5a016fbd6042f7-5933", "3e417701f575d2a9a19582d2606231d2-8307", +"66db92cdd582f8cf93b03e9b3ade167d-8424", 
"6305e97bb00c764b1102d22c2be6d663-8495", +"cf5dbcfb9f8c4e01d7dd0d718429b642-8558", "47b39e33efd9824c7c5c68f06021ba88-8658", +"104df834ffb8ff0dfd20e121a6318618-8823", "aeafd8a98336812226affb0bedc973cd-8908", +"f5fbc47585eb76f74887decf4b4d36e7-9013", "1e56d469216fc77d07d103a15d9d7427-9120", +"5cb71554733ac30fcb95c0b99d4c7138-9230", "1936b0825dec4b11db705f8053b7da78-9415", +"ab52c1f7a2235692faa34d431751badb-9451", "7216bfd6eafcd0ff83894acece592c29-9556", +"8f0977e27c695a954f843503a6120946-6367", "242406bc4cce046eaae133f4cf648978-6482", +"e259608e23b932b6aed7f7c67b5256b9-6564", "9a730fb1461b0f170940845fbb93dd2f-6642" +), size = structure(c(48673539319, 49595814726, 49800224394, +50374663277, 51606812435, 52296866610, 51747932929, 52564041493, +71258741599, 55758917898, 57377765958, 60145344184, 63942201992, +65197539697, 65912914209, 66646812608, 67464925959, 99530559599, +69683786041, 70662719003, 71258741599, 71782804006, 72624782825, +74010463525, 74724383102, 75604365177, 76498078549, 77423432433, +78973357479, 79275265379, 80153941136, 106818152559, 108733400373, +110120778006, 111419433519), fallback_class = "object_size", class = "vctrs:::common_class_fallback")), row.names = c(NA, +-35L), class = c("tbl_df", "tbl", "data.frame")) diff --git a/man/get_md_cr.Rd b/man/get_md_cr.Rd deleted file mode 100644 index ad1d0f3..0000000 --- a/man/get_md_cr.Rd +++ /dev/null @@ -1,26 +0,0 @@ -% Generated by roxygen2: do not edit by hand -% Please edit documentation in R/cr.R -\name{get_md_cr} -\alias{get_md_cr} -\title{Get the metadata for a Crossref snapshot} -\usage{ -get_md_cr( - url = "https://api.crossref.org/snapshots/monthly/latest/all.json.tar.gz" -) -} -\arguments{ -\item{url}{The url to the snapshot as documented by Crossref. -Passed on to \code{\link[httr:HEAD]{httr::HEAD()}}.} -} -\description{ -Gets metadata from the AWS S3 \href{https://docs.aws.amazon.com/AmazonS3/latest/API/RESTCommonResponseHeaders.html}{response header}. 
- -Happily, this works without authorisation for Metadata Plus. -} -\examples{ -get_md_cr() - -} -\concept{cr} -\concept{lake} -\concept{version} diff --git a/man/md_cr_snaps_m.Rd b/man/md_cr_snaps_m.Rd new file mode 100644 index 0000000..f4db28b --- /dev/null +++ b/man/md_cr_snaps_m.Rd @@ -0,0 +1,83 @@ +% Generated by roxygen2: do not edit by hand +% Please edit documentation in R/cr.R +\name{md_cr_snaps_m} +\alias{md_cr_snaps_m} +\alias{update_md_cr_snaps_m} +\alias{get_md_cr_snaps_m} +\title{Metadata for Crossref monthly Snapshots} +\usage{ +md_cr_snaps_m() + +update_md_cr_snaps_m() + +get_md_cr_snaps_m() +} +\description{ +Crossref releases dumps of their database on the 5th of every month. +These dumps sometimes change after the fact. +This metadata, stored in this package, +locks down the versioning of the dumps for our analyses. +} +\details{ +To lock down the reproducibility of the metadata dumps, +we store their checksums in this package, +instead of merely relying on their \emph{release date}. +(They have sometimes been updated in the past \emph{after} release.) + +Computing a checksum ourselves would be quite expensive. +Gets metadata from the AWS S3 \href{https://docs.aws.amazon.com/AmazonS3/latest/API/RESTCommonResponseHeaders.html}{response header}. + +Happily, this works without authorisation for Metadata Plus. + +Current monthly snapshots:\if{html}{\out{<div class="r">}}\preformatted{knitr::kable(md_cr_snaps_m()) +}\if{html}{\out{</div>
}}\tabular{llllr}{ + period \tab url \tab date_modified \tab etag \tab size \cr + 2018-05 \tab https://api.crossref.org/snapshots/monthly/2018/05/all.json.tar.gz \tab 2018-06-08 20:32:07 \tab 5fdf4c3e01a713c4391fbc193391a948-5803 \tab 48673539319 \cr + 2018-06 \tab https://api.crossref.org/snapshots/monthly/2018/06/all.json.tar.gz \tab 2018-07-26 20:33:22 \tab 66daedd99bbac1b0406588af39654199-5913 \tab 49595814726 \cr + 2018-07 \tab https://api.crossref.org/snapshots/monthly/2018/07/all.json.tar.gz \tab 2018-08-04 19:15:24 \tab 8b9581b551ac55e255efffed7553dc78-5937 \tab 49800224394 \cr + 2018-08 \tab https://api.crossref.org/snapshots/monthly/2018/08/all.json.tar.gz \tab 2018-09-03 20:04:12 \tab 3250e1960d390c879cacf7ae8fe8e4e0-6006 \tab 50374663277 \cr + 2018-09 \tab https://api.crossref.org/snapshots/monthly/2018/09/all.json.tar.gz \tab 2018-10-11 19:28:42 \tab 48dd8175cf4ef52f8452745629310e79-6153 \tab 51606812435 \cr + 2018-10 \tab https://api.crossref.org/snapshots/monthly/2018/10/all.json.tar.gz \tab 2018-11-03 22:07:04 \tab 2de16d1ccc8fa906aa26f49d2fe752e7-6235 \tab 52296866610 \cr + 2018-11 \tab https://api.crossref.org/snapshots/monthly/2018/11/all.json.tar.gz \tab 2018-12-03 22:02:12 \tab 123a2132ede3991a4bee12d96e4ae51c-6169 \tab 51747932929 \cr + 2018-12 \tab https://api.crossref.org/snapshots/monthly/2018/12/all.json.tar.gz \tab 2019-01-05 01:24:48 \tab 250e3e72cc5bd4ab5dbec899b8db7580-6267 \tab 52564041493 \cr + 2019-01 \tab https://api.crossref.org/snapshots/monthly/2019/01/all.json.tar.gz \tab 2020-02-19 19:19:18 \tab 6305e97bb00c764b1102d22c2be6d663-8495 \tab 71258741599 \cr + 2019-02 \tab https://api.crossref.org/snapshots/monthly/2019/02/all.json.tar.gz \tab 2019-03-04 14:52:26 \tab dd03bf567aed0e41b3736d9e9f607e85-6647 \tab 55758917898 \cr + 2019-03 \tab https://api.crossref.org/snapshots/monthly/2019/03/all.json.tar.gz \tab 2019-04-03 14:16:02 \tab fe2cd8dd764503640f74806e34248781-6840 \tab 57377765958 \cr + 2019-04 \tab 
https://api.crossref.org/snapshots/monthly/2019/04/all.json.tar.gz \tab 2019-05-03 23:16:04 \tab 160dc0f8d12b3c91c6ed2ad0df87b4e4-7170 \tab 60145344184 \cr + 2019-05 \tab https://api.crossref.org/snapshots/monthly/2019/05/all.json.tar.gz \tab 2019-06-04 12:16:05 \tab 1eb34559b0e80b6ad778c7a40d1dd45e-7623 \tab 63942201992 \cr + 2019-06 \tab https://api.crossref.org/snapshots/monthly/2019/06/all.json.tar.gz \tab 2019-07-13 16:05:50 \tab 586e1ace0577ca44c422a0029960098b-7773 \tab 65197539697 \cr + 2019-07 \tab https://api.crossref.org/snapshots/monthly/2019/07/all.json.tar.gz \tab 2019-08-04 05:46:04 \tab 31af31381ff92756beb7ab7559c33291-7858 \tab 65912914209 \cr + 2019-08 \tab https://api.crossref.org/snapshots/monthly/2019/08/all.json.tar.gz \tab 2019-09-04 07:46:04 \tab 17782fc0071ee11ff5db519446e1edf0-7945 \tab 66646812608 \cr + 2019-09 \tab https://api.crossref.org/snapshots/monthly/2019/09/all.json.tar.gz \tab 2019-10-04 12:42:40 \tab 196ea8897b159c0378dc5086d6fca861-8043 \tab 67464925959 \cr + 2019-10 \tab https://api.crossref.org/snapshots/monthly/2019/10/all.json.tar.gz \tab 2019-11-13 13:42:20 \tab 5eef0f01562f7bc31d5a016fbd6042f7-5933 \tab 99530559599 \cr + 2019-11 \tab https://api.crossref.org/snapshots/monthly/2019/11/all.json.tar.gz \tab 2019-12-05 17:16:04 \tab 3e417701f575d2a9a19582d2606231d2-8307 \tab 69683786041 \cr + 2019-12 \tab https://api.crossref.org/snapshots/monthly/2019/12/all.json.tar.gz \tab 2020-01-06 20:39:19 \tab 66db92cdd582f8cf93b03e9b3ade167d-8424 \tab 70662719003 \cr + 2020-01 \tab https://api.crossref.org/snapshots/monthly/2020/01/all.json.tar.gz \tab 2020-02-19 19:40:08 \tab 6305e97bb00c764b1102d22c2be6d663-8495 \tab 71258741599 \cr + 2020-02 \tab https://api.crossref.org/snapshots/monthly/2020/02/all.json.tar.gz \tab 2020-03-05 10:21:09 \tab cf5dbcfb9f8c4e01d7dd0d718429b642-8558 \tab 71782804006 \cr + 2020-03 \tab https://api.crossref.org/snapshots/monthly/2020/03/all.json.tar.gz \tab 2020-04-04 08:31:09 \tab 
47b39e33efd9824c7c5c68f06021ba88-8658 \tab 72624782825 \cr + 2020-04 \tab https://api.crossref.org/snapshots/monthly/2020/04/all.json.tar.gz \tab 2020-05-11 11:52:41 \tab 104df834ffb8ff0dfd20e121a6318618-8823 \tab 74010463525 \cr + 2020-05 \tab https://api.crossref.org/snapshots/monthly/2020/05/all.json.tar.gz \tab 2020-06-04 13:31:09 \tab aeafd8a98336812226affb0bedc973cd-8908 \tab 74724383102 \cr + 2020-06 \tab https://api.crossref.org/snapshots/monthly/2020/06/all.json.tar.gz \tab 2020-07-04 12:31:08 \tab f5fbc47585eb76f74887decf4b4d36e7-9013 \tab 75604365177 \cr + 2020-07 \tab https://api.crossref.org/snapshots/monthly/2020/07/all.json.tar.gz \tab 2020-08-04 16:31:04 \tab 1e56d469216fc77d07d103a15d9d7427-9120 \tab 76498078549 \cr + 2020-08 \tab https://api.crossref.org/snapshots/monthly/2020/08/all.json.tar.gz \tab 2020-09-05 17:34:33 \tab 5cb71554733ac30fcb95c0b99d4c7138-9230 \tab 77423432433 \cr + 2020-09 \tab https://api.crossref.org/snapshots/monthly/2020/09/all.json.tar.gz \tab 2020-10-27 21:16:29 \tab 1936b0825dec4b11db705f8053b7da78-9415 \tab 78973357479 \cr + 2020-10 \tab https://api.crossref.org/snapshots/monthly/2020/10/all.json.tar.gz \tab 2020-11-05 06:01:07 \tab ab52c1f7a2235692faa34d431751badb-9451 \tab 79275265379 \cr + 2020-11 \tab https://api.crossref.org/snapshots/monthly/2020/11/all.json.tar.gz \tab 2020-12-05 09:31:09 \tab 7216bfd6eafcd0ff83894acece592c29-9556 \tab 80153941136 \cr + 2020-12 \tab https://api.crossref.org/snapshots/monthly/2020/12/all.json.tar.gz \tab 2021-01-12 16:46:08 \tab 8f0977e27c695a954f843503a6120946-6367 \tab 106818152559 \cr + 2021-01 \tab https://api.crossref.org/snapshots/monthly/2021/01/all.json.tar.gz \tab 2021-02-05 17:01:08 \tab 242406bc4cce046eaae133f4cf648978-6482 \tab 108733400373 \cr + 2021-02 \tab https://api.crossref.org/snapshots/monthly/2021/02/all.json.tar.gz \tab 2021-03-05 18:31:09 \tab e259608e23b932b6aed7f7c67b5256b9-6564 \tab 110120778006 \cr + 2021-03 \tab 
https://api.crossref.org/snapshots/monthly/2021/03/all.json.tar.gz \tab 2021-04-05 17:31:08 \tab 9a730fb1461b0f170940845fbb93dd2f-6642 \tab 111419433519 \cr +} +} +\section{Related Functions and Methods}{ +\subsection{Functions}{ +\itemize{ +\item \code{update_md_cr_snaps_m}: Update the metadata of monthly snapshots stored in leine +} +\itemize{ +\item \code{get_md_cr_snaps_m}: Get metadata for currently available monthly snapshots +}}} + +\concept{cr} +\concept{lake} +\concept{version} diff --git a/tests/testthat/test-cr.R b/tests/testthat/test-cr.R index e9dc647..9e95e32 100644 --- a/tests/testthat/test-cr.R +++ b/tests/testthat/test-cr.R @@ -1,4 +1,11 @@ test_that("getting metadata works", { - expect_error(get_md_cr(url = "https://www.google.com")) - expect_type(get_md_cr(), "list") + expect_error(get_md_cr_snap_m(url = "https://www.google.com")) + expect_type(get_md_cr_snap_m(), "list") +}) + +test_that("stored metadata is still up to date", { + expect_equal( + md_cr_snaps_m(), + get_md_cr_snaps_m() + ) })