Skip to content

Commit

Permalink
lock down cr md snaps #15 opens #16
Browse files Browse the repository at this point in the history
  • Loading branch information
maxheld83 committed Apr 19, 2021
1 parent ab098b6 commit b7e4107
Show file tree
Hide file tree
Showing 6 changed files with 256 additions and 44 deletions.
2 changes: 1 addition & 1 deletion NAMESPACE
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
# Generated by roxygen2: do not edit by hand

export("%>%")
export(get_md_cr)
export(md_cr_snaps_m)
importFrom(magrittr,"%>%")
103 changes: 88 additions & 15 deletions R/cr.R
Original file line number Diff line number Diff line change
@@ -1,26 +1,70 @@
# version ====

#' Get the metadata for a Crossref snapshot
#' Metadata for Crossref monthly Snapshots
#'
#' @description
#' Crossref releases dumps of their database on the 5th of every month.
#' These dumps sometimes change after the fact.
#' This metadata, stored in this package,
#' locks down the versioning of the dumps for our analyses.
#'
#' @details
#' To lock down the reproducibility of the metadata dumps,
#' we store their checksums in this package,
#' instead of merely relying on their *release date*.
#' (They have sometimes been updated in the past *after* release.)
#'
#' Computing a checksum ourselves would be quite expensive
#' Gets metadata from the AWS S3 [response header](https://docs.aws.amazon.com/AmazonS3/latest/API/RESTCommonResponseHeaders.html).
#'
#' Happily, this works without authorisation for Metadata Plus.
#'
#' @examples
#' get_md_cr()
#' Current monthly snapshots:
#'
#' @param url
#' The url to the snapshot as documented by Crossref.
#' Passed on to [httr::HEAD()].
#' ```{r}
#' knitr::kable(md_cr_snaps_m())
#' ```
#'
#' @family version
#' @family cr
#' @family lake
#'
#' @export
get_md_cr <- function(url = "https://api.crossref.org/snapshots/monthly/latest/all.json.tar.gz") {
res <- httr::HEAD(url) %>%
md_cr_snaps_m <- function() {
  # Locate the metadata file inside the installed leine package.
  snaps_md_path <- system.file(
    "checksums", "cr_snaps_md.R",
    package = "leine"
  )
  # Evaluate the R expression stored in that file and return the result.
  dget(file = snaps_md_path)
}

#' @describeIn md_cr_snaps_m
#' Update the metadata of monthly snapshots stored in leine.
#' Fetches the current metadata and overwrites the stored file with it;
#' invisibly returns the freshly fetched metadata.
#' Errors if the stored file cannot be located in the installed package.
update_md_cr_snaps_m <- function() {
  # Resolve the target before doing any work: `system.file()` returns ""
  # when nothing is found, and `dput(file = "")` would then merely print
  # to the console instead of updating the stored metadata.
  path <- system.file("checksums", "cr_snaps_md.R", package = "leine")
  if (!nzchar(path)) {
    stop(
      "Could not find 'checksums/cr_snaps_md.R' in the installed ",
      "leine package; nothing was updated.",
      call. = FALSE
    )
  }
  res <- get_md_cr_snaps_m()
  # TODO use datapasta for prettier output here again
  # https://github.com/subugoe/leine/issues/16
  # datapasta::tribble_construct(res)
  dput(res, file = path)
  invisible(res)
}

#' @describeIn md_cr_snaps_m
#' Fetch the metadata of all currently available monthly snapshots
get_md_cr_snaps_m <- function() {
  # Named vector of snapshot URLs; the names identify the period.
  snap_urls <- get_md_cr_snaps_m_urls()
  # Query every URL and row-bind the results,
  # recording each element's name in the `period` column.
  purrr::map_dfr(.x = snap_urls, .f = get_md_cr_snap_m, .id = "period")
}

#' Get the metadata from the AWS S3 header response for *one* snapshot
#' @param url
#' The url to the snapshot as documented by Crossref.
#' Passed on to [httr::HEAD()].
#' @noRd
get_md_cr_snap_m <- function(url = "https://api.crossref.org/snapshots/monthly/latest/all.json.tar.gz") {
res <- httr::RETRY(verb = "HEAD", url) %>%
httr::stop_for_status() %>%
httr::headers()
if (res$server != "AmazonS3") {
Expand All @@ -29,16 +73,45 @@ get_md_cr <- function(url = "https://api.crossref.org/snapshots/monthly/latest/a
i = "Perhaps Crossref has changed their internal storage implementation."
))
}
parse_date_time_aws <- purrr::partial(
lubridate::parse_date_time,
orders = "a d b Y HMS",
tz = "GMT"
)

list(
date_retrieved = parse_date_time_aws(res$date),
url = url,
date_modified = parse_date_time_aws(res[["last-modified"]]),
etag = stringr::str_extract(res$etag, '(?<=\").*?(?=\")'),
size = structure(as.numeric(res[["content-length"]]), class = "object_size")
)
}

#' Parse a datetime from an AWS S3 header response into an R datetime
#'
#' Thin wrapper around [lubridate::parse_date_time()]
#' with the `orders` and `tz` arguments fixed.
#' @param ... Passed on to [lubridate::parse_date_time()].
#' @noRd
parse_date_time_aws <- function(...) {
  lubridate::parse_date_time(..., orders = "a d b Y HMS", tz = "GMT")
}

#' Find all URLs to monthly cr snapshots up to now
#'
#' @return
#' A character vector of snapshot URLs, named by period (`"YYYY-MM"`).
#' Empty if no period is available yet.
#' @noRd
get_md_cr_snaps_m_urls <- function() {
  start <- lubridate::ymd("2018-04-06") # always supposed to come out on the 5th
  end <- lubridate::today()
  # whole months elapsed between `start` and today, less one
  n_months_completed <- lubridate::interval(start, end) %/% months(1) - 1
  # guard the degenerate case so the sequence below is never built from 0
  if (n_months_completed < 1) {
    return(character(0))
  }
  months_completed <- rep(start, n_months_completed)
  # shift the i-th copy of `start` forward by i months
  lubridate::month(months_completed) <-
    lubridate::month(start) + seq_len(n_months_completed)
  names(months_completed) <- format(months_completed, "%Y-%m")
  purrr::map_chr(
    months_completed,
    # plain "%m" (not "%Om"): the URL needs ASCII digits in every locale,
    # matching the "%Y-%m" used for the names above
    function(x) ym2cr_url(lubridate::year(x), format(x, "%m"))
  )
}

#' Build the URL of a monthly cr snapshot
#' @param y,m year and month as character strings
#' @noRd
ym2cr_url <- function(y = "2018", m = "04") {
  # path below the API host: snapshots/monthly/<y>/<m>/all.json.tar.gz
  snapshot_path <- paste(
    "snapshots", "monthly", y, m, "all.json.tar.gz",
    sep = "/"
  )
  url_parts <- httr::parse_url("https://api.crossref.org/")
  url_parts$path <- snapshot_path
  httr::build_url(url_parts)
}
75 changes: 75 additions & 0 deletions inst/checksums/cr_snaps_md.R
Original file line number Diff line number Diff line change
@@ -0,0 +1,75 @@
structure(list(period = c("2018-05", "2018-06", "2018-07", "2018-08",
"2018-09", "2018-10", "2018-11", "2018-12", "2019-01", "2019-02",
"2019-03", "2019-04", "2019-05", "2019-06", "2019-07", "2019-08",
"2019-09", "2019-10", "2019-11", "2019-12", "2020-01", "2020-02",
"2020-03", "2020-04", "2020-05", "2020-06", "2020-07", "2020-08",
"2020-09", "2020-10", "2020-11", "2020-12", "2021-01", "2021-02",
"2021-03"), url = c("https://api.crossref.org/snapshots/monthly/2018/05/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2018/06/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2018/07/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2018/08/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2018/09/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2018/10/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2018/11/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2018/12/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2019/01/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2019/02/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2019/03/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2019/04/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2019/05/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2019/06/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2019/07/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2019/08/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2019/09/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2019/10/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2019/11/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2019/12/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2020/01/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2020/02/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2020/03/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2020/04/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2020/05/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2020/06/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2020/07/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2020/08/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2020/09/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2020/10/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2020/11/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2020/12/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2021/01/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2021/02/all.json.tar.gz",
"https://api.crossref.org/snapshots/monthly/2021/03/all.json.tar.gz"
), date_modified = structure(c(1528489927, 1532637202, 1533410124,
1536005052, 1539286122, 1541282824, 1543874532, 1546651488, 1582139958,
1551711146, 1554300962, 1556925364, 1559650565, 1563033950, 1564897564,
1567583164, 1570192960, 1573652540, 1575566164, 1578343159, 1582141208,
1583403669, 1585989069, 1589197961, 1591277469, 1593865868, 1596558664,
1599327273, 1603833389, 1604556067, 1607160669, 1610469968, 1612544468,
1614969069, 1617643868), tzone = "GMT", class = c("POSIXct",
"POSIXt")), etag = c("5fdf4c3e01a713c4391fbc193391a948-5803",
"66daedd99bbac1b0406588af39654199-5913", "8b9581b551ac55e255efffed7553dc78-5937",
"3250e1960d390c879cacf7ae8fe8e4e0-6006", "48dd8175cf4ef52f8452745629310e79-6153",
"2de16d1ccc8fa906aa26f49d2fe752e7-6235", "123a2132ede3991a4bee12d96e4ae51c-6169",
"250e3e72cc5bd4ab5dbec899b8db7580-6267", "6305e97bb00c764b1102d22c2be6d663-8495",
"dd03bf567aed0e41b3736d9e9f607e85-6647", "fe2cd8dd764503640f74806e34248781-6840",
"160dc0f8d12b3c91c6ed2ad0df87b4e4-7170", "1eb34559b0e80b6ad778c7a40d1dd45e-7623",
"586e1ace0577ca44c422a0029960098b-7773", "31af31381ff92756beb7ab7559c33291-7858",
"17782fc0071ee11ff5db519446e1edf0-7945", "196ea8897b159c0378dc5086d6fca861-8043",
"5eef0f01562f7bc31d5a016fbd6042f7-5933", "3e417701f575d2a9a19582d2606231d2-8307",
"66db92cdd582f8cf93b03e9b3ade167d-8424", "6305e97bb00c764b1102d22c2be6d663-8495",
"cf5dbcfb9f8c4e01d7dd0d718429b642-8558", "47b39e33efd9824c7c5c68f06021ba88-8658",
"104df834ffb8ff0dfd20e121a6318618-8823", "aeafd8a98336812226affb0bedc973cd-8908",
"f5fbc47585eb76f74887decf4b4d36e7-9013", "1e56d469216fc77d07d103a15d9d7427-9120",
"5cb71554733ac30fcb95c0b99d4c7138-9230", "1936b0825dec4b11db705f8053b7da78-9415",
"ab52c1f7a2235692faa34d431751badb-9451", "7216bfd6eafcd0ff83894acece592c29-9556",
"8f0977e27c695a954f843503a6120946-6367", "242406bc4cce046eaae133f4cf648978-6482",
"e259608e23b932b6aed7f7c67b5256b9-6564", "9a730fb1461b0f170940845fbb93dd2f-6642"
), size = structure(c(48673539319, 49595814726, 49800224394,
50374663277, 51606812435, 52296866610, 51747932929, 52564041493,
71258741599, 55758917898, 57377765958, 60145344184, 63942201992,
65197539697, 65912914209, 66646812608, 67464925959, 99530559599,
69683786041, 70662719003, 71258741599, 71782804006, 72624782825,
74010463525, 74724383102, 75604365177, 76498078549, 77423432433,
78973357479, 79275265379, 80153941136, 106818152559, 108733400373,
110120778006, 111419433519), fallback_class = "object_size", class = "vctrs:::common_class_fallback")), row.names = c(NA,
-35L), class = c("tbl_df", "tbl", "data.frame"))
26 changes: 0 additions & 26 deletions man/get_md_cr.Rd

This file was deleted.

83 changes: 83 additions & 0 deletions man/md_cr_snaps_m.Rd

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

11 changes: 9 additions & 2 deletions tests/testthat/test-cr.R
Original file line number Diff line number Diff line change
@@ -1,4 +1,11 @@
test_that("getting metadata works", {
expect_error(get_md_cr(url = "https://www.google.com"))
expect_type(get_md_cr(), "list")
expect_error(get_md_cr_snap_m(url = "https://www.google.com"))
expect_type(get_md_cr_snap_m(), "list")
})

test_that("stored metadata is still up to date", {
  # metadata shipped with the package
  stored_md <- md_cr_snaps_m()
  # metadata fetched right now from the snapshot URLs
  live_md <- get_md_cr_snaps_m()
  expect_equal(stored_md, live_md)
})

0 comments on commit b7e4107

Please sign in to comment.