Skip to content

Commit

Permalink
keyring in raft
Browse files Browse the repository at this point in the history
In Nomad 1.4, we implemented a root keyring to support encrypting Variables and
signing Workload Identities. The keyring was originally stored with the
AEAD-wrapped DEKs and the KEK together in a JSON keystore file on disk. We
recently added support for using an external KMS for the KEK to improve the
security model for the keyring. But we've encountered multiple instances of the
keystore files not getting backed up separately from the Raft snapshot,
resulting in failure to restore clusters from backup.

Move Nomad's root keyring into Raft (encrypted with a KMS/Vault where available)
in order to eliminate operational problems with the separate on-disk keystore.

Fixes: #23665
Ref: https://hashicorp.atlassian.net/browse/NET-10523
  • Loading branch information
tgross committed Sep 19, 2024
1 parent 85e8774 commit 1c1704d
Show file tree
Hide file tree
Showing 20 changed files with 938 additions and 371 deletions.
3 changes: 3 additions & 0 deletions .changelog/23977.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,3 @@
```release-note:improvement
keyring: Stored wrapped data encryption keys in Raft
```
2 changes: 1 addition & 1 deletion go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -128,6 +128,7 @@ require (
go.etcd.io/bbolt v1.3.9
go.uber.org/goleak v1.2.1
golang.org/x/crypto v0.27.0
golang.org/x/exp v0.0.0-20240613232115-7f521ea00fb8
golang.org/x/sync v0.8.0
golang.org/x/sys v0.25.0
golang.org/x/time v0.3.0
Expand Down Expand Up @@ -291,7 +292,6 @@ require (
github.com/vmware/govmomi v0.18.0 // indirect
github.com/yusufpapurcu/wmi v1.2.3 // indirect
go.opencensus.io v0.24.0 // indirect
golang.org/x/exp v0.0.0-20240613232115-7f521ea00fb8 // indirect
golang.org/x/mod v0.18.0 // indirect
golang.org/x/net v0.26.0 // indirect
golang.org/x/oauth2 v0.18.0 // indirect
Expand Down
33 changes: 33 additions & 0 deletions helper/backoff.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
package helper

import (
"context"
"fmt"
"time"
)

Expand All @@ -29,3 +31,34 @@ func Backoff(backoffBase time.Duration, backoffLimit time.Duration, attempt uint

return deadline
}

// WithBackoffFunc is a helper that runs fn repeatedly until it succeeds or the
// context is done, waiting between attempts with a geometric backoff + a small
// jitter, capped at maxBackoff.
//
// It returns nil as soon as fn returns nil. If the context is done first, it
// returns an error wrapping the most recent error from fn. If fn has not
// returned an error yet at that point (the context was done before the first
// attempt completed), the returned error wraps ctx.Err() instead.
func WithBackoffFunc(ctx context.Context, minBackoff, maxBackoff time.Duration, fn func() error) error {
	var err error
	backoff := minBackoff

	// The timer starts with a zero duration so the first attempt is not
	// delayed by the backoff interval.
	t, stop := NewSafeTimer(0)
	defer stop()

	for {
		select {
		case <-ctx.Done():
			if err == nil {
				// fn has not failed yet, so there is no function error to
				// report. Wrapping a nil error with %w would produce a
				// malformed message and an error that unwraps to nothing,
				// so surface the context's own error instead.
				return fmt.Errorf("operation cancelled: %w", ctx.Err())
			}
			return fmt.Errorf("operation cancelled: %w", err)
		case <-t.C:
		}

		err = fn()
		if err == nil {
			return nil
		}

		// Double the delay and add jitter derived from minBackoff/10, never
		// exceeding maxBackoff. Once the cap is reached the delay stays there.
		if backoff < maxBackoff {
			backoff = backoff*2 + RandomStagger(minBackoff/10)
			if backoff > maxBackoff {
				backoff = maxBackoff
			}
		}

		t.Reset(backoff)
	}
}
5 changes: 3 additions & 2 deletions helper/raftutil/msgtypes.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

86 changes: 71 additions & 15 deletions nomad/core_sched.go
Original file line number Diff line number Diff line change
Expand Up @@ -899,6 +899,19 @@ func (c *CoreScheduler) expiredACLTokenGC(eval *structs.Evaluation, global bool)
// rootKeyRotateOrGC is used to rotate or garbage collect root keys
func (c *CoreScheduler) rootKeyRotateOrGC(eval *structs.Evaluation) error {

// migration sends updates to the leader so our view of state is no longer
// valid. we ack this core job and will pick up again at the next
// interval.
//
// COMPAT(1.12.0): remove this block in 1.12.0 LTS
wasMigrated, err := c.rootKeyMigrate(eval)
if err != nil {
return err
}
if wasMigrated {
return nil
}

// a rotation will be sent to the leader so our view of state
// is no longer valid. we ack this core job and will pick up
// the GC work on the next interval
Expand All @@ -915,7 +928,7 @@ func (c *CoreScheduler) rootKeyRotateOrGC(eval *structs.Evaluation) error {
func (c *CoreScheduler) rootKeyGC(eval *structs.Evaluation, now time.Time) error {

ws := memdb.NewWatchSet()
iter, err := c.snap.RootKeyMetas(ws)
iter, err := c.snap.WrappedRootKeys(ws)
if err != nil {
return err
}
Expand All @@ -931,7 +944,7 @@ func (c *CoreScheduler) rootKeyGC(eval *structs.Evaluation, now time.Time) error
if raw == nil {
break
}
keyMeta := raw.(*structs.RootKeyMeta)
keyMeta := raw.(*structs.WrappedRootKeys)
if !keyMeta.IsInactive() {
continue // never GC keys we're still using
}
Expand Down Expand Up @@ -970,24 +983,67 @@ func (c *CoreScheduler) rootKeyGC(eval *structs.Evaluation, now time.Time) error
return nil
}

// rootKeyRotate checks if the active key is old enough that we need to kick off
// a rotation. It prepublishes a key first and only promotes that prepublished
// key to active once the rotation threshold has expired
func (c *CoreScheduler) rootKeyRotate(eval *structs.Evaluation, now time.Time) (bool, error) {
// rootKeyMigrate checks if the cluster is fully upgraded and migrates all the
// legacy root key metadata to the new wrapped key format.
//
// For every entry in the snapshot that has no wrapped keys yet, it fetches the
// key material from the local encrypter and sends a Keyring.Update RPC with
// it. The returned bool is true when at least one key was sent successfully,
// which tells the caller that its snapshot is stale. If not all servers meet
// minVersionKeyringInRaft, nothing is migrated and it returns (false, nil).
//
// COMPAT(1.12.0): remove this function in 1.12.0 LTS
func (c *CoreScheduler) rootKeyMigrate(eval *structs.Evaluation) (bool, error) {
	if !ServersMeetMinimumVersion(
		c.srv.serf.Members(), c.srv.Region(), minVersionKeyringInRaft, true) {
		return false, nil
	}

	ws := memdb.NewWatchSet()
	// The loop below type-asserts *structs.WrappedRootKeys, so the iterator
	// must come from WrappedRootKeys.
	iter, err := c.snap.WrappedRootKeys(ws)
	if err != nil {
		return false, err
	}
	wasMigrated := false
	for raw := iter.Next(); raw != nil; raw = iter.Next() {
		wrappedKeys := raw.(*structs.WrappedRootKeys)
		if len(wrappedKeys.WrappedKeys) > 0 {
			continue // already migrated
		}
		rootKey, err := c.srv.encrypter.GetKey(wrappedKeys.KeyID)
		if err != nil {
			return wasMigrated, err
		}
		req := &structs.KeyringUpdateRootKeyRequest{
			RootKey: rootKey,
			WriteRequest: structs.WriteRequest{
				Region:    c.srv.config.Region,
				AuthToken: eval.LeaderACL,
			},
		}

		if err := c.srv.RPC("Keyring.Update",
			req, &structs.KeyringUpdateRootKeyResponse{}); err != nil {
			c.logger.Error("migrating legacy key material failed",
				"error", err, "key_id", wrappedKeys.KeyID)
			return false, err
		}
		wasMigrated = true
	}

	return wasMigrated, nil
}

// rootKeyRotate checks if the active key is old enough that we need to kick off
// a rotation. It prepublishes a key first and only promotes that prepublished
// key to active once the rotation threshold has expired
func (c *CoreScheduler) rootKeyRotate(eval *structs.Evaluation, now time.Time) (bool, error) {
var (
activeKey *structs.RootKeyMeta
prepublishedKey *structs.RootKeyMeta
activeKey *structs.WrappedRootKeys
prepublishedKey *structs.WrappedRootKeys
)

ws := memdb.NewWatchSet()
iter, err := c.snap.WrappedRootKeys(ws)
if err != nil {
return false, err
}
for raw := iter.Next(); raw != nil; raw = iter.Next() {
key := raw.(*structs.RootKeyMeta)
key := raw.(*structs.WrappedRootKeys)
switch key.State {
case structs.RootKeyStateActive:
activeKey = key
Expand Down Expand Up @@ -1083,7 +1139,7 @@ func (c *CoreScheduler) rootKeyRotate(eval *structs.Evaluation, now time.Time) (
func (c *CoreScheduler) variablesRekey(eval *structs.Evaluation) error {

ws := memdb.NewWatchSet()
iter, err := c.snap.RootKeyMetas(ws)
iter, err := c.snap.WrappedRootKeys(ws)
if err != nil {
return err
}
Expand All @@ -1093,11 +1149,11 @@ func (c *CoreScheduler) variablesRekey(eval *structs.Evaluation) error {
if raw == nil {
break
}
keyMeta := raw.(*structs.RootKeyMeta)
if !keyMeta.IsRekeying() {
wrappedKeys := raw.(*structs.WrappedRootKeys)
if !wrappedKeys.IsRekeying() {
continue
}
varIter, err := c.snap.GetVariablesByKeyID(ws, keyMeta.KeyID)
varIter, err := c.snap.GetVariablesByKeyID(ws, wrappedKeys.KeyID)
if err != nil {
return err
}
Expand All @@ -1106,7 +1162,7 @@ func (c *CoreScheduler) variablesRekey(eval *structs.Evaluation) error {
return err
}

rootKey, err := c.srv.encrypter.GetKey(keyMeta.KeyID)
rootKey, err := c.srv.encrypter.GetKey(wrappedKeys.KeyID)
if err != nil {
return fmt.Errorf("rotated key does not exist in keyring: %w", err)
}
Expand Down
Loading

0 comments on commit 1c1704d

Please sign in to comment.