diff --git a/.changelog/24023.txt b/.changelog/24023.txt new file mode 100644 index 00000000000..d75f68a7574 --- /dev/null +++ b/.changelog/24023.txt @@ -0,0 +1,3 @@ +```release-note:improvement +cli: Added redaction options to operator snapshot commands +``` diff --git a/command/commands.go b/command/commands.go index 7a46c08d484..51fa81db3be 100644 --- a/command/commands.go +++ b/command/commands.go @@ -849,6 +849,11 @@ func Commands(metaPtr *Meta, agentUi cli.Ui) map[string]cli.CommandFactory { Meta: meta, }, nil }, + "operator snapshot redact": func() (cli.Command, error) { + return &OperatorSnapshotRedactCommand{ + Meta: meta, + }, nil + }, "plan": func() (cli.Command, error) { return &JobPlanCommand{ diff --git a/command/operator_snapshot_redact.go b/command/operator_snapshot_redact.go new file mode 100644 index 00000000000..4dea7ad4895 --- /dev/null +++ b/command/operator_snapshot_redact.go @@ -0,0 +1,95 @@ +// Copyright (c) HashiCorp, Inc. +// SPDX-License-Identifier: BUSL-1.1 + +package command + +import ( + "fmt" + "io" + "os" + "strings" + + "github.com/hashicorp/nomad/helper/raftutil" + "github.com/posener/complete" +) + +type OperatorSnapshotRedactCommand struct { + Meta +} + +func (c *OperatorSnapshotRedactCommand) Help() string { + helpText := ` +Usage: nomad operator snapshot redact [options] <file> + + Removes key material from an existing snapshot file created by the operator + snapshot save command, when using the AEAD keyring provider. When using a KMS + keyring provider, no cleartext key material is stored in snapshots and this + command is not necessary. Note that this command requires loading the entire + snapshot into memory locally and overwrites the existing snapshot. + + This is useful for situations where you need to transmit a snapshot without + exposing key material. 
+ +General Options: + + ` + generalOptionsUsage(usageOptsDefault|usageOptsNoNamespace) + + return strings.TrimSpace(helpText) +} + +func (c *OperatorSnapshotRedactCommand) AutocompleteFlags() complete.Flags { + return complete.Flags{} +} + +func (c *OperatorSnapshotRedactCommand) AutocompleteArgs() complete.Predictor { + return complete.PredictFiles("*") +} + +func (c *OperatorSnapshotRedactCommand) Synopsis() string { + return "Redacts an existing snapshot of Nomad server state" +} + +func (c *OperatorSnapshotRedactCommand) Name() string { return "operator snapshot redact" } + +func (c *OperatorSnapshotRedactCommand) Run(args []string) int { + if len(args) != 1 { + c.Ui.Error("This command takes one argument: <file>") + c.Ui.Error(commandErrorText(c)) + return 1 + } + + path := args[0] + f, err := os.Open(path) + if err != nil { + c.Ui.Error(fmt.Sprintf("Error opening snapshot file: %s", err)) + return 1 + } + defer f.Close() + + tmpFile, err := os.Create(path + ".tmp") + if err != nil { + c.Ui.Error(fmt.Sprintf("Failed to create temporary file: %v", err)) + return 1 + } + + _, err = io.Copy(tmpFile, f) + if err != nil { + c.Ui.Error(fmt.Sprintf("Failed to copy snapshot to temporary file: %v", err)) + return 1 + } + + err = raftutil.RedactSnapshot(tmpFile) + if err != nil { + c.Ui.Error(fmt.Sprintf("Failed to redact snapshot: %v", err)) + return 1 + } + + err = os.Rename(tmpFile.Name(), path) + if err != nil { + c.Ui.Error(fmt.Sprintf("Failed to finalize snapshot file: %v", err)) + return 1 + } + + c.Ui.Output("Snapshot redacted") + return 0 +} diff --git a/command/operator_snapshot_save.go b/command/operator_snapshot_save.go index 9dacf8d7792..d004563d63f 100644 --- a/command/operator_snapshot_save.go +++ b/command/operator_snapshot_save.go @@ -11,6 +11,7 @@ import ( "time" "github.com/hashicorp/nomad/api" + "github.com/hashicorp/nomad/helper/raftutil" "github.com/posener/complete" ) @@ -48,8 +49,14 @@ General Options: Snapshot Save Options: - -stale=[true|false] - The 
-stale argument defaults to "false" which means the leader provides the + -redact + The -redact option will locally edit the snapshot to remove any cleartext key + material from the root keyring. Only the AEAD keyring provider has cleartext + key material in Raft. Note that this operation requires loading the snapshot + into memory locally. + + -stale + The -stale option defaults to "false" which means the leader provides the result. If the cluster is in an outage state without a leader, you may need to set -stale to "true" to get the configuration from a non-leader server. ` @@ -74,12 +81,14 @@ func (c *OperatorSnapshotSaveCommand) Synopsis() string { func (c *OperatorSnapshotSaveCommand) Name() string { return "operator snapshot save" } func (c *OperatorSnapshotSaveCommand) Run(args []string) int { - var stale bool + var stale, redact bool flags := c.Meta.FlagSet(c.Name(), FlagSetClient) flags.Usage = func() { c.Ui.Output(c.Help()) } flags.BoolVar(&stale, "stale", false, "") + flags.BoolVar(&redact, "redact", false, "") + if err := flags.Parse(args); err != nil { c.Ui.Error(fmt.Sprintf("Failed to parse args: %v", err)) return 1 @@ -141,6 +150,15 @@ func (c *OperatorSnapshotSaveCommand) Run(args []string) int { return 1 } + if redact { + c.Ui.Info("Redacting key material from snapshot") + err := raftutil.RedactSnapshot(tmpFile) + if err != nil { + c.Ui.Error(fmt.Sprintf("Could not redact snapshot: %v", err)) + return 1 + } + } + err = os.Rename(tmpFile.Name(), filename) if err != nil { c.Ui.Error(fmt.Sprintf("Failed to finalize snapshot file: %v", err)) diff --git a/command/operator_snapshot_state.go b/command/operator_snapshot_state.go index 6580859652c..6eb0374a044 100644 --- a/command/operator_snapshot_state.go +++ b/command/operator_snapshot_state.go @@ -85,7 +85,7 @@ func (c *OperatorSnapshotStateCommand) Run(args []string) int { } defer f.Close() - state, meta, err := raftutil.RestoreFromArchive(f, filter) + _, state, meta, err := 
raftutil.RestoreFromArchive(f, filter) if err != nil { c.Ui.Error(fmt.Sprintf("Failed to read archive file: %s", err)) return 1 diff --git a/helper/raftutil/fsm.go b/helper/raftutil/fsm.go index d3525f9e866..1cff0b0da9d 100644 --- a/helper/raftutil/fsm.go +++ b/helper/raftutil/fsm.go @@ -14,6 +14,7 @@ import ( "github.com/hashicorp/go-memdb" "github.com/hashicorp/nomad/nomad" "github.com/hashicorp/nomad/nomad/state" + "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/raft" raftboltdb "github.com/hashicorp/raft-boltdb/v2" ) @@ -209,6 +210,7 @@ func StateAsMap(store *state.StateStore) map[string][]interface{} { "Jobs": toArray(store.Jobs(nil, state.SortDefault)), "Nodes": toArray(store.Nodes(nil)), "PeriodicLaunches": toArray(store.PeriodicLaunches(nil)), + "RootKeys": rootKeyMeta(store), "SITokenAccessors": toArray(store.SITokenAccessors(nil)), "ScalingEvents": toArray(store.ScalingEvents(nil)), "ScalingPolicies": toArray(store.ScalingPolicies(nil)), @@ -265,3 +267,27 @@ func toArray(iter memdb.ResultIterator, err error) []interface{} { return r } + +// rootKeyMeta allows displaying keys without their key material +func rootKeyMeta(store *state.StateStore) []any { + + iter, err := store.RootKeys(nil) + if err != nil { + return []any{err} + } + + keyMeta := []any{} + for { + raw := iter.Next() + if raw == nil { + break + } + k := raw.(*structs.RootKey) + if k == nil { + break + } + keyMeta = append(keyMeta, k.Meta()) + } + + return keyMeta +} diff --git a/helper/raftutil/snapshot.go b/helper/raftutil/snapshot.go index 62d351ee6a4..78e6b38dd8e 100644 --- a/helper/raftutil/snapshot.go +++ b/helper/raftutil/snapshot.go @@ -6,21 +6,22 @@ package raftutil import ( "fmt" "io" + "os" "github.com/hashicorp/go-hclog" - "github.com/hashicorp/raft" - "github.com/hashicorp/nomad/helper/snapshot" "github.com/hashicorp/nomad/nomad" "github.com/hashicorp/nomad/nomad/state" + "github.com/hashicorp/nomad/nomad/structs" + "github.com/hashicorp/raft" ) -func 
RestoreFromArchive(archive io.Reader, filter *nomad.FSMFilter) (*state.StateStore, *raft.SnapshotMeta, error) { +func RestoreFromArchive(archive io.Reader, filter *nomad.FSMFilter) (raft.FSM, *state.StateStore, *raft.SnapshotMeta, error) { logger := hclog.L() fsm, err := dummyFSM(logger) if err != nil { - return nil, nil, fmt.Errorf("failed to create FSM: %w", err) + return nil, nil, nil, fmt.Errorf("failed to create FSM: %w", err) } // r is closed by RestoreFiltered, w is closed by CopySnapshot @@ -40,13 +41,67 @@ func RestoreFromArchive(archive io.Reader, filter *nomad.FSMFilter) (*state.Stat err = fsm.RestoreWithFilter(r, filter) if err != nil { - return nil, nil, fmt.Errorf("failed to restore from snapshot: %w", err) + return nil, nil, nil, fmt.Errorf("failed to restore from snapshot: %w", err) } select { case err := <-errCh: - return nil, nil, err + return nil, nil, nil, err case meta := <-metaCh: - return fsm.State(), meta, nil + return fsm, fsm.State(), meta, nil + } +} + +func RedactSnapshot(srcFile *os.File) error { + srcFile.Seek(0, 0) + fsm, store, meta, err := RestoreFromArchive(srcFile, nil) + if err != nil { + return fmt.Errorf("Failed to load snapshot from archive: %w", err) + } + + iter, err := store.RootKeys(nil) + if err != nil { + return fmt.Errorf("Failed to query for root keys: %v", err) + } + + for { + raw := iter.Next() + if raw == nil { + break + } + rootKey := raw.(*structs.RootKey) + if rootKey == nil { + break + } + if len(rootKey.WrappedKeys) > 0 { + rootKey.KeyID = rootKey.KeyID + " [REDACTED]" + rootKey.WrappedKeys = nil + } + msg, err := structs.Encode(structs.WrappedRootKeysUpsertRequestType, + &structs.KeyringUpsertWrappedRootKeyRequest{ + WrappedRootKeys: rootKey, + }) + if err != nil { + return fmt.Errorf("Could not re-encode redacted key: %v", err) + } + + fsm.Apply(&raft.Log{ + Type: raft.LogCommand, + Data: msg, + }) + } + + snap, err := snapshot.NewFromFSM(hclog.Default(), fsm, meta) + if err != nil { + return 
fmt.Errorf("Failed to create redacted snapshot: %v", err) + } + + srcFile.Truncate(0) + srcFile.Seek(0, 0) + + _, err = io.Copy(srcFile, snap) + if err != nil { + return fmt.Errorf("Failed to copy snapshot to temporary file: %v", err) } + return nil } diff --git a/helper/snapshot/snapshot.go b/helper/snapshot/snapshot.go index 3896f68a6df..0c5bb004cf1 100644 --- a/helper/snapshot/snapshot.go +++ b/helper/snapshot/snapshot.go @@ -43,6 +43,49 @@ func New(logger hclog.Logger, r *raft.Raft) (*Snapshot, error) { if err != nil { return nil, fmt.Errorf("failed to open snapshot: %v:", err) } + + return writeSnapshot(logger, metadata, snap) +} + +// NewFromFSM takes a state snapshot of the given FSM (for when we don't have a +// Raft instance setup) into a temporary file and returns an object that gives +// access to the file as an io.Reader. You must arrange to call Close() on the +// returned object or else you will leak a temporary file. +func NewFromFSM(logger hclog.Logger, fsm raft.FSM, meta *raft.SnapshotMeta) (*Snapshot, error) { + _, trans := raft.NewInmemTransport("") + snapshotStore := raft.NewInmemSnapshotStore() + + fsmSnap, err := fsm.Snapshot() + if err != nil { + return nil, err + } + + sink, err := snapshotStore.Create(meta.Version, meta.Index, meta.Term, + meta.Configuration, meta.ConfigurationIndex, trans) + if err != nil { + return nil, err + } + err = fsmSnap.Persist(sink) + if err != nil { + return nil, err + } + + err = sink.Close() + if err != nil { + return nil, err + } + + snapshotID := sink.ID() + metadata, snap, err := snapshotStore.Open(snapshotID) + if err != nil { + return nil, err + } + + return writeSnapshot(logger, metadata, snap) +} + +func writeSnapshot(logger hclog.Logger, metadata *raft.SnapshotMeta, snap io.ReadCloser) (*Snapshot, error) { + defer func() { if err := snap.Close(); err != nil { logger.Error("Failed to close Raft snapshot", "error", err) diff --git a/helper/snapshot/snapshot_test.go b/helper/snapshot/snapshot_test.go index 
e17b10bf6ef..b4f03605481 100644 --- a/helper/snapshot/snapshot_test.go +++ b/helper/snapshot/snapshot_test.go @@ -17,8 +17,10 @@ import ( "github.com/hashicorp/consul/sdk/testutil" "github.com/hashicorp/go-msgpack/v2/codec" + "github.com/hashicorp/nomad/helper/uuid" "github.com/hashicorp/nomad/nomad/structs" "github.com/hashicorp/raft" + "github.com/shoenig/test/must" "github.com/stretchr/testify/require" ) @@ -350,3 +352,72 @@ func TestSnapshot_BadRestore(t *testing.T) { } } } + +func TestSnapshot_FromFSM(t *testing.T) { + dir := testutil.TempDir(t, "snapshot") + defer os.RemoveAll(dir) + + // Make a Raft and populate it with some data. We tee everything we + // apply off to a buffer for checking post-snapshot. + var expected []bytes.Buffer + entries := 64 * 1024 + before, fsm := makeRaft(t, filepath.Join(dir, "before")) + defer before.Shutdown() + for i := 0; i < entries; i++ { + var log bytes.Buffer + var copy bytes.Buffer + both := io.MultiWriter(&log, &copy) + _, err := io.CopyN(both, rand.Reader, 256) + must.NoError(t, err) + future := before.Apply(log.Bytes(), time.Second) + must.NoError(t, future.Error()) + expected = append(expected, copy) + } + + // Take a snapshot. + logger := testutil.Logger(t) + snap, err := NewFromFSM(logger, fsm, &raft.SnapshotMeta{ + Version: 1, + ID: uuid.Generate(), + Index: uint64(entries) + 2, + Term: 2, + Peers: []byte{}, + Configuration: raft.Configuration{}, + }) + must.NoError(t, err) + defer snap.Close() + + // Verify the snapshot. We have to rewind it after for the restore. + metadata, err := Verify(snap) + must.NoError(t, err) + _, err = snap.file.Seek(0, 0) + must.NoError(t, err) + must.Eq(t, entries+2, int(metadata.Index)) + + // Make a new, independent Raft. + after, fsm := makeRaft(t, filepath.Join(dir, "after")) + defer after.Shutdown() + + // Put some initial data in there that the snapshot should overwrite. 
+ for i := 0; i < 16; i++ { + var log bytes.Buffer + _, err := io.CopyN(&log, rand.Reader, 256) + must.NoError(t, err) + future := after.Apply(log.Bytes(), time.Second) + must.NoError(t, future.Error()) + } + + // Restore the snapshot. + must.NoError(t, Restore(logger, snap, after)) + + // Compare the contents. + fsm.Lock() + defer fsm.Unlock() + must.Len(t, len(expected), fsm.logs) + + for i := range fsm.logs { + if !bytes.Equal(fsm.logs[i], expected[i].Bytes()) { + t.Fatalf("bad: log %d doesn't match", i) + } + } +} diff --git a/nomad/fsm.go b/nomad/fsm.go index 325075f0e14..3996708d9fc 100644 --- a/nomad/fsm.go +++ b/nomad/fsm.go @@ -1836,16 +1836,17 @@ func (n *nomadFSM) restoreImpl(old io.ReadCloser, filter *FSMFilter) error { if err := dec.Decode(keyMeta); err != nil { return err } + if filter.Include(keyMeta) { + wrappedKeys := structs.NewRootKey(keyMeta) + if err := restore.RootKeyRestore(wrappedKeys); err != nil { + return err + } - wrappedKeys := structs.NewRootKey(keyMeta) - if err := restore.RootKeyRestore(wrappedKeys); err != nil { - return err - } - - if n.encrypter != nil { - // only decrypt the key if we're running in a real server and - // not the 'operator snapshot' command context - go n.encrypter.AddWrappedKey(n.encrypter.srv.shutdownCtx, wrappedKeys) + if n.encrypter != nil { + // only decrypt the key if we're running in a real server and + // not the 'operator snapshot' command context + go n.encrypter.AddWrappedKey(n.encrypter.srv.shutdownCtx, wrappedKeys) + } } case RootKeySnapshot: @@ -1853,15 +1854,16 @@ func (n *nomadFSM) restoreImpl(old io.ReadCloser, filter *FSMFilter) error { if err := dec.Decode(wrappedKeys); err != nil { return err } + if filter.Include(wrappedKeys) { + if err := restore.RootKeyRestore(wrappedKeys); err != nil { + return err + } - if err := restore.RootKeyRestore(wrappedKeys); err != nil { - return err - } - - if n.encrypter != nil { - // only decrypt the key if we're running in a real server and - // not the 
'operator snapshot' command context - go n.encrypter.AddWrappedKey(n.encrypter.srv.shutdownCtx, wrappedKeys) + if n.encrypter != nil { + // only decrypt the key if we're running in a real server and + // not the 'operator snapshot' command context + go n.encrypter.AddWrappedKey(n.encrypter.srv.shutdownCtx, wrappedKeys) + } } case ACLRoleSnapshot: @@ -2344,8 +2346,11 @@ func (n *nomadFSM) applyRootKeyMetaUpsert(msgType structs.MessageType, buf []byt return err } - // start a task to decrypt the key material - go n.encrypter.AddWrappedKey(n.encrypter.srv.shutdownCtx, wrappedRootKeys) + if n.encrypter != nil { + // start a task to decrypt the key material if we're running in a real + // server and not the 'operator snapshot' command context + go n.encrypter.AddWrappedKey(n.encrypter.srv.shutdownCtx, wrappedRootKeys) + } return nil } @@ -2363,8 +2368,11 @@ func (n *nomadFSM) applyWrappedRootKeysUpsert(msgType structs.MessageType, buf [ return err } - // start a task to decrypt the key material - go n.encrypter.AddWrappedKey(n.encrypter.srv.shutdownCtx, req.WrappedRootKeys) + if n.encrypter != nil { + // start a task to decrypt the key material if we're running in a real + // server and not the 'operator snapshot' command context + go n.encrypter.AddWrappedKey(n.encrypter.srv.shutdownCtx, req.WrappedRootKeys) + } return nil } @@ -2382,7 +2390,9 @@ func (n *nomadFSM) applyWrappedRootKeysDelete(msgType structs.MessageType, buf [ return err } - n.encrypter.RemoveKey(req.KeyID) + if n.encrypter != nil { + n.encrypter.RemoveKey(req.KeyID) + } return nil } diff --git a/scheduler/benchmarks/helpers_test.go b/scheduler/benchmarks/helpers_test.go index e41c4186d04..956846e112f 100644 --- a/scheduler/benchmarks/helpers_test.go +++ b/scheduler/benchmarks/helpers_test.go @@ -73,7 +73,7 @@ func NewHarnessFromSnapshot(t testing.TB, snapshotPath string) (*scheduler.Harne } defer f.Close() - state, _, err := raftutil.RestoreFromArchive(f, nil) + _, state, _, err := 
raftutil.RestoreFromArchive(f, nil) if err != nil { return nil, err } diff --git a/website/content/docs/commands/operator/snapshot/redact.mdx b/website/content/docs/commands/operator/snapshot/redact.mdx new file mode 100644 index 00000000000..b6a3fca4cd3 --- /dev/null +++ b/website/content/docs/commands/operator/snapshot/redact.mdx @@ -0,0 +1,36 @@ +--- +layout: docs +page_title: 'Commands: operator snapshot redact' +description: | + Redacts snapshot of Nomad server state +--- + +# Command: operator snapshot redact + + +The `operator snapshot redact` command removes key material from an existing +snapshot file created by the `operator snapshot save` command, when using the +AEAD keyring provider. + +This is useful for situations where you need to transmit a snapshot without +exposing key material. + + + +When using a [KMS keyring provider][], no cleartext key material is stored in +snapshots and this command is not necessary. Note that this command requires +loading the entire snapshot into memory locally and overwrites the existing +snapshot. + +Snapshots made before Nomad 1.9.0 will not include the keyrings. + + + +## Usage + +```plaintext +nomad operator snapshot redact +``` + + +[KMS keyring provider]: /nomad/docs/configuration/keyring diff --git a/website/content/docs/commands/operator/snapshot/save.mdx b/website/content/docs/commands/operator/snapshot/save.mdx index d1e4a4a5e72..03a564aedec 100644 --- a/website/content/docs/commands/operator/snapshot/save.mdx +++ b/website/content/docs/commands/operator/snapshot/save.mdx @@ -16,12 +16,14 @@ snapshot operations. -This command only saves a Raft snapshot. This snapshot does not include -keyrings. You must back up keyrings separately. +This command includes Nomad's keyring in the snapshot. If you are not using a +[KMS provider][] to secure the keyring, you should use the `-redact` flag to +remove key material before transmitting the snapshot to HashiCorp Support. 
-If you use this snapshot to recover a cluster, you also need to restore the -keyring onto at least one server. Refer to the Key Management's [Restoring the -Keyring from Backup][restore the keyring] section for instructions. +Snapshots made before Nomad 1.9.0 will not include the keyrings. If you use +older snapshots to recover a cluster, you also need to restore the keyring onto +at least one server. Refer to the Key Management's [Restoring the Keyring from +Backup][restore the keyring] section for instructions. @@ -54,10 +56,16 @@ nomad operator snapshot save [options] ## Snapshot Save Options -- `-stale`: The stale argument defaults to `false`, which means the leader +- `-redact`: The redact option will locally edit the snapshot to remove any + cleartext key material from the root keyring. Only the AEAD keyring provider + has cleartext key material in Raft. Note that this operation requires loading + the snapshot into memory locally. + +- `-stale`: The stale option defaults to `false`, which means the leader provides the result. If the cluster is in an outage state without a leader, you may need to set `-stale` to `true` to get the configuration from a non-leader server. [outage recovery]: /nomad/tutorials/manage-clusters/outage-recovery [restore the keyring]: /nomad/docs/operations/key-management#restoring-the-keyring-from-backup +[KMS provider]: /nomad/docs/configuration/keyring diff --git a/website/data/docs-nav-data.json b/website/data/docs-nav-data.json index 8d60dcecddd..1bb4393d1c8 100644 --- a/website/data/docs-nav-data.json +++ b/website/data/docs-nav-data.json @@ -937,6 +937,10 @@ "title": "inspect", "path": "commands/operator/snapshot/inspect" }, + { + "title": "redact", + "path": "commands/operator/snapshot/redact" + }, { "title": "restore", "path": "commands/operator/snapshot/restore"