Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

[resource indexer] include dataset redirects in index #791

Merged
merged 4 commits into from
Feb 7, 2024
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
37 changes: 36 additions & 1 deletion resourceIndexer/coreUrlRemapping.js
Original file line number Diff line number Diff line change
@@ -1,7 +1,9 @@
import { readFileSync } from "fs";
import path from 'path';
import { fileURLToPath } from 'url';
import { ResourceIndexerError } from "./errors.js";
import { convertManifestJsonToAvailableDatasetList } from "../src/endpoints/charon/parseManifest.js";
import { datasetRedirects } from "../src/redirects.js";

const __filename = fileURLToPath(import.meta.url);
const __dirname = path.dirname(__filename);
Expand Down Expand Up @@ -29,19 +31,29 @@ export const remapCoreUrl = (urlPath) => {


/**
* The server contains a map of partial paths and their next (default) path,
* The server redirects core dataset URLs via two (complementary) approaches.
*
* First, there is a set of hardcoded redirects (src/redirects.js), as exposed by
* the `datasetRedirects` function.
*
* Secondly, we use our own "manifest JSON" to compute a map of partial paths
* and their next (default) path,
* e.g. flu → seasonal
* flu/seasonal → h3n2
* etc
* and uses an iterative approach to resolve the final path. Here we want to
* resolve the entire path for every partial path, for instance
* flu → flu/seasonal/h3n2/h2/2y
* flu/seasonal → flu/seasonal/h3n2/h2/2y
*
* This function combines those approaches and returns a complete list of
* URL paths and the final URL path they would be redirected to.
* @param {Object} defaults
* @returns {Object}
*/
function resolveAll(defaults) {
const urlMap = {};

Object.entries(defaults).forEach(([partialPath, nextPathPart]) => {
if (partialPath==='') return; // manifest has a base default of zika!
let resolvedPath = `${partialPath}/${nextPathPart}`;
Expand All @@ -50,5 +62,28 @@ function resolveAll(defaults) {
}
urlMap[partialPath] = resolvedPath;
})

/**
* Process server-hardcoded dataset redirects after the above defaults, as the
* path we redirect to may itself be further redirected by the defaults.
*/
const redirects = datasetRedirects().map((paths) => paths.map(_removeSlashes));
for (const [requestUrlPath, redirectUrlPath] of redirects) {
// The datasetRedirects are simple paths, with no version descriptors or
// queries or patterns/regex like behaviour. Nonetheless it's prudent
// to add some basic checks for these
if (requestUrlPath.includes('@') || requestUrlPath.includes(':') || requestUrlPath.includes('?')) {
throw new ResourceIndexerError(`Hardcoded server redirect '${requestUrlPath}' is invalid.`);
}
urlMap[requestUrlPath] = urlMap[redirectUrlPath] || redirectUrlPath;
}

return urlMap;
}

/**
* Remove any leading & trailing slashes
*/
function _removeSlashes(path) {
return path.replace(/^\/+/, "").replace(/\/+$/, "");
}
83 changes: 57 additions & 26 deletions src/redirects.js
Original file line number Diff line number Diff line change
Expand Up @@ -22,32 +22,18 @@ export const setup = (app) => {
res.redirect(`/groups${req.originalUrl}`);
});

/** prior to June 2021 our core nCoV builds were available at
* /ncov/global, ncov/asia etc. These used GISAID data exclusively.
* We now have GISAID builds and GenBank builds, and so the URLs
* (i.e. names on s3://nextstrain-data) have changed. We add redirects
* for the old URLs to point to the new GISAID URLs.
* The timestamped URLs (e.g. /ncov/:region/YYYY-MM-DD) which currently exist
* will not be redirected, but new URLs will be of the format
* /ncov/gisaid/:region/YYYY-MM-DD.
*/
app.route('/ncov/:region((global|asia|oceania|north-america|south-america|europe|africa))')
.get((req, res) => res.redirect(
url.format({
pathname: `/ncov/gisaid/${req.params.region}`,
query: req.query
})
));

/**
* We shifted from using 'monkeypox' to 'mpox', as per WHO naming
* recommendations. Note that monkeypox YYYY-MM-DD URLs remain,
* e.g. /monkeypox/hmpxv1/2022-09-04
*/
app.route('/monkeypox').get((req, res) => res.redirect('/mpox'));
app.route('/monkeypox/mpxv').get((req, res) => res.redirect('/mpox/all-clades'));
app.route('/monkeypox/hmpxv1').get((req, res) => res.redirect('/mpox/clade-IIb'));
app.route('/monkeypox/hmpxv1/big').get((req, res) => res.redirect('/mpox/lineage-B.1'));
/* handle redirects for dataset paths which have changed name & preserve any queries */
for (const [requestPath, redirectPath] of datasetRedirects()) {
app.route(requestPath).get((req, res) =>
res.redirect(url.format({pathname: redirectPath, query: req.query}))
)
/* redirect versioned requests as well. The format or existence of the
version isn't checked here, but it will be when the (redirected) request is
handled */
app.route(`${requestPath}@:version`).get((req, res) =>
res.redirect(url.format({pathname: `${redirectPath}@${req.params.version}`, query: req.query}))
)
}

/*
* Redirect to translations of narratives if the client has
Expand Down Expand Up @@ -141,3 +127,48 @@ export const setup = (app) => {
}

};


/**
* Produces a list of dataset redirects. These are defined in a stand-alone
* function so that they can be easily reused by the resource indexer to
* associate "old" filenames with their new dataset path.
Comment on lines +133 to +135
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

non-blocking

Why a function and not just a static list?

Copy link
Member Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

A few reasons:

  • I like the organizational structure it imposes. This includes using a docstring which is clear that it applies to all contents (and thus harder to miss).
  • It's easy to add checks later on if we want to - e.g. paths must start with /, or must not end with / etc.
  • More generally (although not really applicable in this case), I prefer deferred execution rather than execution upon import. Writ large we pay a sizeable performance cost.

Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Hmm. I don't find those compelling in this case, FWIW.

  • You can use a JSDoc comment with plain old variables.
  • Checks can be done on a plain old variable too, done immediately after. (Or in the example of slashes handling like you give, the values can just be canonicalized with a .map() instead of erroring.)
  • Deferred execution can be nice, but as you say is not applicable here at all.

*
* Express routes can include patterns and regexes but these are hard to parse
* outside of express so for these redirects you must use plain strings.
*
* @returns {[string,string][]} for each entry, [0]: request URL path, [1]:
* redirect URL path
*/
export function datasetRedirects() {

/** prior to June 2021 our core nCoV builds were available at
* /ncov/global, ncov/asia etc. These used GISAID data exclusively.
* We now have GISAID builds and GenBank builds, and so the URLs
* (i.e. names on s3://nextstrain-data) have changed. We add redirects
* for the old URLs to point to the new GISAID URLs.
* The timestamped URLs (e.g. /ncov/:region/YYYY-MM-DD) which currently exist
* will not be redirected, but new URLs will be of the format
* /ncov/gisaid/:region/YYYY-MM-DD.
*/
const ncovRegionRedirects =
['global', 'asia', 'oceania', 'north-america', 'south-america', 'europe', 'africa']
.map((region) => [`/ncov/${region}`, `/ncov/gisaid/${region}`]);

/**
* We shifted from using 'monkeypox' to 'mpox', as per WHO naming
* recommendations. Note that monkeypox YYYY-MM-DD URLs remain,
* e.g. /monkeypox/hmpxv1/2022-09-04
*/
const monkeypoxRedirects = [
['/monkeypox', '/mpox'],
['/monkeypox/mpxv', '/mpox/all-clades'],
['/monkeypox/hmpxv1', '/mpox/clade-IIb'],
['/monkeypox/hmpxv1/big', '/mpox/lineage-B.1'],
];

return [
...ncovRegionRedirects,
...monkeypoxRedirects,
]
}
7 changes: 4 additions & 3 deletions src/resourceIndex.js
Original file line number Diff line number Diff line change
Expand Up @@ -96,12 +96,12 @@ async function updateResourceVersions() {
if (RESOURCE_INDEX.startsWith("s3://")) {
const parts = RESOURCE_INDEX.match(/^s3:\/\/(.+?)\/(.+)$/);
const [Bucket, Key] = [parts[1], parts[2]];
utils.verbose(`Updating available resources index from s3://${Bucket}/${Key}`);
utils.verbose(`[RESOURCE INDEX UPDATE S3] Fetching index from s3://${Bucket}/${Key}`);
try {
const newETag = (await fetch(await signedUrl({bucket:Bucket, key:Key, method: 'HEAD'}), {method: 'HEAD'}))
.headers.get('etag'); // value is enclosed in double quotes, but that doesn't matter for our purposes
if (newETag && newETag === eTag) {
utils.verbose("Skipping available resource update as eTag hasn't changed");
utils.verbose(`[RESOURCE INDEX UPDATE S3] Skipping as eTag hasn't changed: ${eTag}`);
return;
}
const res = await fetch(await signedUrl({bucket:Bucket, key:Key, method: 'GET'}), {method: 'GET'})
Expand All @@ -110,8 +110,9 @@ async function updateResourceVersions() {
}
const newResources = JSON.parse(await gunzip(await res.buffer()));
[resources, eTag] = [newResources, newETag];
utils.verbose(`[RESOURCE INDEX UPDATE S3] Updated. New eTag: ${eTag}`);
} catch (err) {
utils.warn(`Resource updating failed: ${err.message}`)
utils.warn(`[RESOURCE INDEX UPDATE S3] updating failed: ${err.message}`)
}
return;
}
Expand Down
20 changes: 20 additions & 0 deletions test/date_descriptor.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -349,6 +349,26 @@ describe("Paths redirect with version descriptors", () => {
})


const redirectsRestOnly = [
/* Following paths are redirected early on in our routing stack and doesn't involve
any checking of the version provided */
["monkeypox/mpxv", "mpox/all-clades"],
["monkeypox/mpxv@anything", "mpox/all-clades@anything"]
]

describe("Paths redirect according to hardcoded top-level redirects", () => {
redirectsRestOnly.forEach(([fromUrl, toUrl]) => {
(['html', 'json']).forEach((type) => {
it(`REST API: ${fromUrl} goes to ${toUrl} (${type})`, async () => {
const res = await fetchType(`${BASE_URL}/${fromUrl}`, type);
expect(res.status).toEqual(302); // res.redirect() has a default of 302
expect(res.headers.get('location')).toEqual(`${BASE_URL}/${toUrl}`);
})
})
})
})


/**
* Map the sidecar names we use day-to-day to their content types
* used within the server
Expand Down