From 5a75ac0868c5ceea3b293f792549c1d53d81f013 Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Tue, 21 Jun 2022 13:48:09 -0700 Subject: [PATCH 1/3] Move run() and setupConsole() from scripts/migrate-group to the utils/scripts module Two generic functions that I wrote with reusability in mind, and now I have a need to re-use them. --- scripts/migrate-group | 63 +------------------------------------------ src/utils/scripts.js | 63 +++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 64 insertions(+), 62 deletions(-) diff --git a/scripts/migrate-group b/scripts/migrate-group index 227a9ba38..2701f2ff8 100755 --- a/scripts/migrate-group +++ b/scripts/migrate-group @@ -2,16 +2,13 @@ const {ArgumentParser} = require("argparse"); const IAM = require("@aws-sdk/client-iam"); const S3 = require("@aws-sdk/client-s3"); -const {spawn} = require("child_process"); -const {Console} = require("console"); const fs = require("fs"); const os = require("os"); const {basename, relative: relativePath, parse: parsePath} = require("path"); const process = require("process"); -const {Transform} = require("stream"); const {Group} = require("../src/groups"); -const {reportUnhandledRejectionsAtExit} = require("../src/utils/scripts"); +const {reportUnhandledRejectionsAtExit, run, setupConsole} = require("../src/utils/scripts"); const AWS_ACCOUNT_ID = process.env.AWS_ACCOUNT_ID; @@ -379,64 +376,6 @@ async function diff(...args) { } -/** - * Run a command with stdout and stderr sent to `console.log()` and - * `console.error()`. - * - * @param {string[]} argv - * @returns {{code, signal, argv}} - */ -async function run(argv) { - return new Promise((resolve, reject) => { - const proc = spawn(argv[0], argv.slice(1), {stdio: ["ignore", "pipe", "pipe"]}); - - proc.stdout.on("data", data => console.log(data.toString().replace(/\n$/, ""))); - proc.stderr.on("data", data => console.error(data.toString().replace(/\n$/, ""))); - - proc.on("close", (code, signal) => { - const result = code !== 0 || signal != null - ? reject - : resolve; - return result({code, signal, argv}); - }); - }); -} - - -/** - * Set up the global `console` object to prefix all output lines with an - * indication of the state of *dryRun*. - * - * @param {{dryRun: boolean}} - */ -function setupConsole({dryRun = true}) { - if (!dryRun) return; - - const LinePrefixer = class extends Transform { - constructor(prefix) { - super(); - this.prefix = prefix; - } - _transform(chunk, encoding, callback) { - // Prefix the beginning of the string and every internal newline, but not - // the last trailing newline. - this.push(chunk.toString().replace(/^|(?<=\n)(?!$)/g, this.prefix)); - callback(); - } - }; - - const stdout = new LinePrefixer("DRY RUN | "); - const stderr = new LinePrefixer("DRY RUN | "); - - stdout.pipe(process.stdout); - stderr.pipe(process.stderr); - - console = new Console({stdout, stderr}); - process.stdout = stdout; - process.stderr = stderr; -} - - /** * Read file at *path* as JSON. * diff --git a/src/utils/scripts.js b/src/utils/scripts.js index 8550854dc..f54a809eb 100644 --- a/src/utils/scripts.js +++ b/src/utils/scripts.js @@ -1,4 +1,41 @@ +const {Console} = require("console"); const process = require("process"); +const {spawn} = require("child_process"); +const {Transform} = require("stream"); + + +/** + * Set up the global `console` object to prefix all output lines with an + * indication of the state of *dryRun*. + * + * @param {{dryRun: boolean}} + */ +function setupConsole({dryRun = true}) { + if (!dryRun) return; + + const LinePrefixer = class extends Transform { + constructor(prefix) { + super(); + this.prefix = prefix; + } + _transform(chunk, encoding, callback) { + // Prefix the beginning of the string and every internal newline, but not + // the last trailing newline. + this.push(chunk.toString().replace(/^|(?<=\n)(?!$)/g, this.prefix)); + callback(); + } + }; + + const stdout = new LinePrefixer("DRY RUN | "); + const stderr = new LinePrefixer("DRY RUN | "); + + stdout.pipe(process.stdout); + stderr.pipe(process.stderr); + + console = new Console({stdout, stderr}); // eslint-disable-line no-global-assign + process.stdout = stdout; + process.stderr = stderr; +} /** @@ -40,6 +77,32 @@ function reportUnhandledRejectionsAtExit() { } +/** + * Run a command with stdout and stderr sent to `console.log()` and + * `console.error()`. + * + * @param {string[]} argv + * @returns {{code, signal, argv}} + */ +async function run(argv) { + return new Promise((resolve, reject) => { + const proc = spawn(argv[0], argv.slice(1), {stdio: ["ignore", "pipe", "pipe"]}); + + proc.stdout.on("data", data => console.log(data.toString().replace(/\n$/, ""))); + proc.stderr.on("data", data => console.error(data.toString().replace(/\n$/, ""))); + + proc.on("close", (code, signal) => { + const result = code !== 0 || signal != null + ? reject + : resolve; + return result({code, signal, argv}); + }); + }); +} + + module.exports = { + setupConsole, reportUnhandledRejectionsAtExit, + run, }; From d89c75f7653dcab8b8b774ab2c7213d4cc4cb67e Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Thu, 10 Mar 2022 12:25:51 -0800 Subject: [PATCH 2/3] Adjust layout of multi-tenant groups bucket MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bucket now stores datasets and narratives separately under a fixed prefix within each group's prefix so the user-provided names/paths of the datasets and narratives can't conflict with control objects we store at the root of the group's prefix. Besides making security easier to reason about here with the user-provided names, this will also, in time, allow us to eliminate a few special-cased exclusions (like excluding group-overview.md from a list of *.md narratives). As part of deploying this, we'll need to perform a one-time manual layout switch behind the scenes using scripts/migrate-groups-layout. Though migrate-groups-layout is intended to be run only twice: 1. Immediately before deploy of the layout change. 2. Immediately after deploy of the layout change, with --delete-after-copy. …it is designed to be safe to run repeatedly. However, as it is a single-use program, it's not designed to live past those two primary invocations and we should delete it after it's served its purpose. --- .../NextstrainDotOrgServerInstanceDev.json | 8 +- scripts/migrate-group | 45 ++++- scripts/migrate-groups-layout | 160 ++++++++++++++++++ src/sources/groups.js | 44 ++++- 4 files changed, 244 insertions(+), 13 deletions(-) create mode 100755 scripts/migrate-groups-layout diff --git a/aws/iam/policy/NextstrainDotOrgServerInstanceDev.json b/aws/iam/policy/NextstrainDotOrgServerInstanceDev.json index 1c8798599..fa44a0d92 100644 --- a/aws/iam/policy/NextstrainDotOrgServerInstanceDev.json +++ b/aws/iam/policy/NextstrainDotOrgServerInstanceDev.json @@ -50,11 +50,11 @@ "arn:aws:s3:::nextstrain-groups" ], "Condition": { - "StringEquals": { + "StringLike": { "s3:prefix": [ - "blab/", - "test/", - "test-private/" + "blab/*", + "test/*", + "test-private/*" ] } } diff --git a/scripts/migrate-group b/scripts/migrate-group index 2701f2ff8..5e6e648cd 100755 --- a/scripts/migrate-group +++ b/scripts/migrate-group @@ -90,6 +90,48 @@ async function migrate({group, dryRun = true}) { async function syncData({dryRun = true, group}) { + console.group(`\nSyncing S3 data`); + + // Datasets + await s3Sync({ + dryRun, + group, + prefix: "datasets/", + filters: [ + "--exclude=*", + "--include=*.json", + ] + }); + + // Narratives + await s3Sync({ + dryRun, + group, + prefix: "narratives/", + filters: [ + "--exclude=*", + "--include=*.md", + "--exclude=group-overview.md", + ] + }); + + // Control/customization files + await s3Sync({ + dryRun, + group, + prefix: "", + filters: [ + "--exclude=*", + "--include=group-overview.md", + "--include=group-logo.png", + ] + }); + + console.groupEnd(); +} + + +async function s3Sync({dryRun = true, group, prefix = "", filters = []}) { const argv = [ "aws", "s3", "sync", ...(dryRun @@ -97,7 +139,8 @@ async function syncData({dryRun = true, group}) { : []), "--delete", `s3://${group.bucket}/`, - `s3://nextstrain-groups/${group.name}/`, + `s3://nextstrain-groups/${group.name}/${prefix}`, + ...filters, ]; console.group(`\nRunning ${argv.join(" ")}`); await run(argv); diff --git a/scripts/migrate-groups-layout b/scripts/migrate-groups-layout new file mode 100755 index 000000000..ca16751a3 --- /dev/null +++ b/scripts/migrate-groups-layout @@ -0,0 +1,160 @@ +#!/usr/bin/env node +const {ArgumentParser} = require("argparse"); +const S3 = require("@aws-sdk/client-s3"); +const process = require("process"); + +const {reportUnhandledRejectionsAtExit, run, setupConsole} = require("../src/utils/scripts"); + +const BUCKET = "nextstrain-groups"; + + +function parseArgs() { + const argparser = new ArgumentParser({ + usage: `%(prog)s [--dry-run | --wet-run] [--delete-after-copy]`, + description: ` + Migrate layout of new multi-tenant bucket for Nextstrain Groups from old + layout to the new layout. + + This program is designed to be idempotent if run multiple times. In + practice, it likely only needs to be run once before deploy of the + layout change and once again (this time with --delete-after-copy) after + deploy. + `, + }); + + argparser.addArgument("--dry-run", { + help: "Go through the motions locally but don't actually make any changes on S3. This is the default.", + dest: "dryRun", + action: "storeTrue", + defaultValue: true, + }); + argparser.addArgument("--wet-run", { + help: "Actually make changes on S3.", + dest: "dryRun", + action: "storeFalse", + }); + + argparser.addArgument("--delete-after-copy", { + help: "Delete objects in the old layout after copying them to the new layout.", + dest: "deleteAfterCopy", + action: "storeTrue", + defaultValue: false, + }); + + return argparser.parseArgs(); +} + + +function main({dryRun = true, deleteAfterCopy = false}) { + setupConsole({dryRun}); + + console.log(`Migrating layout of multi-tenant bucket`); + + migrate({dryRun, deleteAfterCopy}) + .then(counts => { + console.log(`\nMigration complete: %o`, counts); + }) + .catch(error => { + console.error("\n\n%s\n", error); + console.error("Migration FAILED. See above for details. It's typically safe to re-run this program after fixing the issue."); + process.exitCode = 1; + }); +} + + +async function migrate({dryRun = true, deleteAfterCopy = false}) { + const s3 = new S3.S3Client(); + + console.log("\nDiscovering objects…"); + let objects = []; + + for await (const page of S3.paginateListObjectsV2({client: s3}, {Bucket: BUCKET})) { + objects = objects.concat(page.Contents); + } + + const existingKeys = new Map(objects.map(o => [o.Key, o])); + + console.group(`\n${deleteAfterCopy ? "Moving" : "Copying"} objects…`); + + const counts = {copied: 0, updated: 0, existed: 0}; + + for (const object of objects) { + const oldKey = object.Key; + const newKey = newKeyFor(oldKey); + + if (!newKey) continue; + + let status; + const existingCopy = existingKeys.get(newKey); + if (existingCopy) { + if (existingCopy.LastModified >= object.LastModified) { + status = "existed"; + } else { + status = "updated"; + } + } else { + status = "copied"; + } + + if (status !== "existed") { + console.log(`copying: ${oldKey} → ${newKey}`); + + if (!dryRun) { + await s3.send(new S3.CopyObjectCommand({ + CopySource: `${BUCKET}/${oldKey}`, + Bucket: BUCKET, + Key: newKey, + })); + } + } + + if (!dryRun && deleteAfterCopy) { + console.log(`deleting: ${oldKey}`); + + await s3.send(new S3.DeleteObjectCommand({ + Bucket: BUCKET, + Key: oldKey, + })); + } + + counts[status]++; + } + + console.groupEnd(); + + return counts; +} + + +function newKeyFor(key) { + const {groupName, subKey} = parseKey(key); + + if (!shouldCopy(subKey)) return; + + const subPrefix = + subKey.endsWith(".json") ? "datasets" : + subKey.endsWith(".md") ? "narratives" : + undefined ; + + if (!subPrefix) throw new Error(`unrecognized key: ${key}`); + + return `${groupName}/${subPrefix}/${subKey}`; +} + + +function parseKey(key) { + const [groupName, ...rest] = key.split("/"); + return {groupName, subKey: rest.join("/")}; +} + + +function shouldCopy(subKey) { + return !subKey.startsWith("datasets/") + && !subKey.startsWith("narratives/") + && subKey !== "group-overview.md" + && subKey !== "group-logo.png"; +} + + +reportUnhandledRejectionsAtExit(); +main(parseArgs()); diff --git a/src/sources/groups.js b/src/sources/groups.js index 706215027..af6f26500 100644 --- a/src/sources/groups.js +++ b/src/sources/groups.js @@ -7,7 +7,7 @@ const authz = require("../authz"); const {fetch} = require("../fetch"); const {Group} = require("../groups"); const utils = require("../utils"); -const {Source} = require("./models"); +const {Source, Dataset, Narrative} = require("./models"); const S3 = new AWS.S3(); @@ -62,6 +62,13 @@ class GroupSource extends Source { : ""; } + dataset(pathParts) { + return new GroupDataset(this, pathParts); + } + narrative(pathParts) { + return new GroupNarrative(this, pathParts); + } + async urlFor(path, method = 'GET', headers = {}) { const normalizedHeaders = utils.normalizeHeaders(headers); const action = { @@ -86,10 +93,12 @@ class GroupSource extends Source { ...action[method].params, }); } - async _listFiles() { + async _listFiles(listPrefix = "") { + const prefix = this.prefix + listPrefix; + return new Promise((resolve, reject) => { let files = []; - S3.listObjectsV2({Bucket: this.bucket, Prefix: this.prefix}).eachPage((err, data, done) => { + S3.listObjectsV2({Bucket: this.bucket, Prefix: prefix}).eachPage((err, data, done) => { if (err) { utils.warn(`Could not list S3 objects for group '${this.group.name}'\n${err.message}`); return reject(err); @@ -106,22 +115,27 @@ class GroupSource extends Source { files = files.concat( data.Contents .map(object => object.Key) - .filter(key => key.startsWith(this.prefix)) - .map(key => key.slice(this.prefix.length)) - .filter(subKey => !subKey.startsWith("datasets/") && !subKey.startsWith("narratives/")) + .filter(key => key.startsWith(prefix)) + .map(key => key.slice(prefix.length)) ); return done(); }); }); } async availableDatasets() { - const files = await this._listFiles(); + const prefix = this.bucket === MULTI_TENANT_BUCKET + ? "datasets/" + : ""; + const files = await this._listFiles(prefix); const pathnames = utils.getDatasetsFromListOfFilenames(files); return pathnames; } async availableNarratives() { // Walking logic borrowed from auspice's cli/server/getAvailable.js - const files = await this._listFiles(); + const prefix = this.bucket === MULTI_TENANT_BUCKET + ? "narratives/" + : ""; + const files = await this._listFiles(prefix); return files .filter((file) => file !== 'group-overview.md') .filter((file) => file.endsWith(".md")) @@ -223,6 +237,20 @@ class GroupSource extends Source { } +class GroupDataset extends Dataset { + get baseName() { + return `datasets/${super.baseName}`; + } +} + + +class GroupNarrative extends Narrative { + get baseName() { + return `narratives/${super.baseName}`; + } +} + + /** * Generate the authorization policy for a given Nextstrain Group. * From c42ee783854ff640b7833854d9922ce9f11f56da Mon Sep 17 00:00:00 2001 From: Thomas Sibley Date: Tue, 21 Jun 2022 12:42:57 -0700 Subject: [PATCH 3/3] migrate-group: Flag un-migrated objects for manual review I'd been doing this manually while reviewing the previous sync logs, as before the layout change the sync just migrated everything even though some objects would be no longer accessible. Nicer to have it built-in as part of the rest of the manual steps checklist. --- scripts/migrate-group | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/scripts/migrate-group b/scripts/migrate-group index 5e6e648cd..647251b95 100755 --- a/scripts/migrate-group +++ b/scripts/migrate-group @@ -127,7 +127,17 @@ async function syncData({dryRun = true, group}) { ] }); + // Discover files to consider for manual review + const unsynced = (await s3ListObjects({group})).filter( + key => !key.endsWith(".json") + && !key.endsWith(".md") + && key !== "group-overview.md" + && key !== "group-logo.png" + ); + console.groupEnd(); + + return unsynced.map(key => `Investigate unsynced object s3://${group.bucket}/${key}`); } @@ -148,6 +158,16 @@ async function s3Sync({dryRun = true, group, prefix = "", filters = []}) { } +async function s3ListObjects({group}) { + const client = new S3.S3Client(); + + return await collate( + S3.paginateListObjectsV2({client}, {Bucket: group.bucket}), + page => page.Contents.map(object => object.Key), + ); +} + + async function updateServerPolicies({dryRun = true, oldBucket}) { const policyFiles = [ "aws/iam/policy/NextstrainDotOrgServerInstance.json",