Skip to content

Commit

Permalink
keyring in raft
Browse files Browse the repository at this point in the history
In Nomad 1.4, we implemented a root keyring to support encrypting Variables and
signing Workload Identities. The keyring was originally stored with the
AEAD-wrapped DEKs and the KEK together in a JSON keystore file on disk. We
recently added support for using an external KMS for the KEK to improve the
security model for the keyring. But we've encountered multiple instances of the
keystore files not getting backed up separately from the Raft snapshot,
resulting in failure to restore clusters from backup.

Move Nomad's root keyring into Raft (encrypted with a KMS/Vault where available)
in order to eliminate operational problems with the separate on-disk keystore.

Fixes: #23665
Ref: https://hashicorp.atlassian.net/browse/NET-10523
  • Loading branch information
tgross committed Sep 17, 2024
1 parent a2b1985 commit 428667f
Show file tree
Hide file tree
Showing 17 changed files with 933 additions and 369 deletions.
33 changes: 33 additions & 0 deletions helper/backoff.go
Original file line number Diff line number Diff line change
Expand Up @@ -4,6 +4,8 @@
package helper

import (
"context"
"fmt"
"time"
)

Expand All @@ -29,3 +31,34 @@ func Backoff(backoffBase time.Duration, backoffLimit time.Duration, attempt uint

return deadline
}

// WithBackoffFunc calls fn repeatedly until it succeeds or ctx is done, waiting
// between attempts with a geometric backoff plus a small jitter.
//
// After every failed attempt the delay is doubled, RandomStagger(minBackoff/10)
// is added as jitter, and the result is capped at maxBackoff. Because the delay
// is doubled before it is first used, the first retry waits roughly twice
// minBackoff.
//
// It returns nil as soon as fn returns nil. If ctx is done first, it returns an
// "operation cancelled" error that wraps the last error returned by fn, or
// ctx.Err() when fn has not run yet, so the result always unwraps to a real
// error.
func WithBackoffFunc(ctx context.Context, minBackoff, maxBackoff time.Duration, fn func() error) error {
	var err error
	backoff := minBackoff

	// The timer is created with a zero duration; presumably NewSafeTimer makes
	// it fire right away so the first attempt is not delayed.
	t, stop := NewSafeTimer(0)
	defer stop()

	for {
		select {
		case <-ctx.Done():
			if err == nil {
				// fn has not produced an error (it never ran), so there is
				// nothing of its to wrap. Report why the context ended rather
				// than formatting a nil error with %w.
				err = ctx.Err()
			}
			return fmt.Errorf("operation cancelled: %w", err)
		case <-t.C:
		}

		err = fn()
		if err == nil {
			return nil
		}

		// Grow the delay geometrically until it reaches the cap.
		if backoff < maxBackoff {
			backoff = backoff*2 + RandomStagger(minBackoff/10)
			if backoff > maxBackoff {
				backoff = maxBackoff
			}
		}

		t.Reset(backoff)
	}
}
5 changes: 3 additions & 2 deletions helper/raftutil/msgtypes.go

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

86 changes: 71 additions & 15 deletions nomad/core_sched.go
Original file line number Diff line number Diff line change
Expand Up @@ -899,6 +899,19 @@ func (c *CoreScheduler) expiredACLTokenGC(eval *structs.Evaluation, global bool)
// rootKeyRotateOrGC is used to rotate or garbage collect root keys
func (c *CoreScheduler) rootKeyRotateOrGC(eval *structs.Evaluation) error {

// migration sends updates to the leader so our view of state is no longer
// valid. we ack this core job and will pick up again at the next
// interval.
//
// COMPAT(1.12.0): remove this block in 1.12.0 LTS
wasMigrated, err := c.rootKeyMigrate(eval)
if err != nil {
return err
}
if wasMigrated {
return nil
}

// a rotation will be sent to the leader so our view of state
// is no longer valid. we ack this core job and will pick up
// the GC work on the next interval
Expand All @@ -915,7 +928,7 @@ func (c *CoreScheduler) rootKeyRotateOrGC(eval *structs.Evaluation) error {
func (c *CoreScheduler) rootKeyGC(eval *structs.Evaluation, now time.Time) error {

ws := memdb.NewWatchSet()
iter, err := c.snap.RootKeyMetas(ws)
iter, err := c.snap.WrappedRootKeys(ws)
if err != nil {
return err
}
Expand All @@ -931,7 +944,7 @@ func (c *CoreScheduler) rootKeyGC(eval *structs.Evaluation, now time.Time) error
if raw == nil {
break
}
keyMeta := raw.(*structs.RootKeyMeta)
keyMeta := raw.(*structs.WrappedRootKeys)
if !keyMeta.IsInactive() {
continue // never GC keys we're still using
}
Expand Down Expand Up @@ -970,24 +983,67 @@ func (c *CoreScheduler) rootKeyGC(eval *structs.Evaluation, now time.Time) error
return nil
}

// rootKeyRotate checks if the active key is old enough that we need to kick off
// a rotation. It prepublishes a key first and only promotes that prepublished
// key to active once the rotation threshold has expired
func (c *CoreScheduler) rootKeyRotate(eval *structs.Evaluation, now time.Time) (bool, error) {
// rootKeyMigrate checks if the cluster is fully upgraded and migrates all the
// legacy root meta keys to the new wrapped key format
//
// It returns true when at least one key was submitted successfully through the
// Keyring.Update RPC during this pass. It returns (false, nil) when the servers
// do not yet meet minVersionKeyringInRaft or when every key in the snapshot
// already carries wrapped key material.
//
// COMPAT(1.12.0): remove this function in 1.12.0 LTS
func (c *CoreScheduler) rootKeyMigrate(eval *structs.Evaluation) (bool, error) {
// Skip migration entirely until the serf members for this region satisfy the
// minimum version check.
if !ServersMeetMinimumVersion(
c.srv.serf.Members(), c.srv.Region(), minVersionKeyringInRaft, true) {
return false, nil
}

ws := memdb.NewWatchSet()
// NOTE(review): the next two lines both declare iter and err with ":=". This
// looks like the pre-change and post-change lines of a diff rendered together;
// the loop below asserts *structs.WrappedRootKeys, which matches the second
// one — confirm against the committed file.
iter, err := c.snap.RootKeyMetas(ws)
iter, err := c.snap.WrappedRootKeys(ws)
if err != nil {
return false, err
}
wasMigrated := false
for raw := iter.Next(); raw != nil; raw = iter.Next() {
wrappedKeys := raw.(*structs.WrappedRootKeys)
// An entry that already has wrapped key material needs no work.
if len(wrappedKeys.WrappedKeys) > 0 {
continue // already migrated
}
// Fetch the key for this ID from the server's encrypter so it can be
// resubmitted in the update request below.
rootKey, err := c.srv.encrypter.GetKey(wrappedKeys.KeyID)
if err != nil {
return wasMigrated, err
}
// The write request targets the server's configured region and is
// authenticated with the evaluation's LeaderACL.
req := &structs.KeyringUpdateRootKeyRequest{
RootKey: rootKey,
WriteRequest: structs.WriteRequest{
Region: c.srv.config.Region,
AuthToken: eval.LeaderACL,
},
}

if err := c.srv.RPC("Keyring.Update",
req, &structs.KeyringUpdateRootKeyResponse{}); err != nil {
c.logger.Error("migrating legacy key material failed",
"error", err, "key_id", wrappedKeys.KeyID)
// NOTE(review): this path returns false even if earlier iterations
// migrated keys, whereas the GetKey error path above returns
// wasMigrated — confirm whether the difference is intentional.
return false, err
}
wasMigrated = true
}

return wasMigrated, nil
}

// rootKeyRotate checks if the active key is old enough that we need to kick off
// a rotation. It prepublishes a key first and only promotes that prepublished
// key to active once the rotation threshold has expired
func (c *CoreScheduler) rootKeyRotate(eval *structs.Evaluation, now time.Time) (bool, error) {
var (
activeKey *structs.RootKeyMeta
prepublishedKey *structs.RootKeyMeta
activeKey *structs.WrappedRootKeys
prepublishedKey *structs.WrappedRootKeys
)

ws := memdb.NewWatchSet()
iter, err := c.snap.WrappedRootKeys(ws)
if err != nil {
return false, err
}
for raw := iter.Next(); raw != nil; raw = iter.Next() {
key := raw.(*structs.RootKeyMeta)
key := raw.(*structs.WrappedRootKeys)
switch key.State {
case structs.RootKeyStateActive:
activeKey = key
Expand Down Expand Up @@ -1083,7 +1139,7 @@ func (c *CoreScheduler) rootKeyRotate(eval *structs.Evaluation, now time.Time) (
func (c *CoreScheduler) variablesRekey(eval *structs.Evaluation) error {

ws := memdb.NewWatchSet()
iter, err := c.snap.RootKeyMetas(ws)
iter, err := c.snap.WrappedRootKeys(ws)
if err != nil {
return err
}
Expand All @@ -1093,11 +1149,11 @@ func (c *CoreScheduler) variablesRekey(eval *structs.Evaluation) error {
if raw == nil {
break
}
keyMeta := raw.(*structs.RootKeyMeta)
if !keyMeta.IsRekeying() {
wrappedKeys := raw.(*structs.WrappedRootKeys)
if !wrappedKeys.IsRekeying() {
continue
}
varIter, err := c.snap.GetVariablesByKeyID(ws, keyMeta.KeyID)
varIter, err := c.snap.GetVariablesByKeyID(ws, wrappedKeys.KeyID)
if err != nil {
return err
}
Expand All @@ -1106,7 +1162,7 @@ func (c *CoreScheduler) variablesRekey(eval *structs.Evaluation) error {
return err
}

rootKey, err := c.srv.encrypter.GetKey(keyMeta.KeyID)
rootKey, err := c.srv.encrypter.GetKey(wrappedKeys.KeyID)
if err != nil {
return fmt.Errorf("rotated key does not exist in keyring: %w", err)
}
Expand Down
60 changes: 30 additions & 30 deletions nomad/core_sched_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -2623,7 +2623,7 @@ func TestCoreScheduler_RootKeyRotate(t *testing.T) {

// active key, will never be GC'd
store := srv.fsm.State()
key0, err := store.GetActiveRootKeyMeta(nil)
key0, err := store.GetActiveRootKey(nil)
must.NotNil(t, key0, must.Sprint("expected keyring to be bootstapped"))
must.NoError(t, err)

Expand All @@ -2648,11 +2648,11 @@ func TestCoreScheduler_RootKeyRotate(t *testing.T) {
must.NoError(t, err)
must.True(t, rotated, must.Sprint("key should rotate"))

var key1 *structs.RootKeyMeta
iter, err := store.RootKeyMetas(nil)
var key1 *structs.WrappedRootKeys
iter, err := store.WrappedRootKeys(nil)
must.NoError(t, err)
for raw := iter.Next(); raw != nil; raw = iter.Next() {
k := raw.(*structs.RootKeyMeta)
k := raw.(*structs.WrappedRootKeys)
if k.KeyID == key0.KeyID {
must.True(t, k.IsActive(), must.Sprint("expected original key to be active"))
} else {
Expand All @@ -2675,10 +2675,10 @@ func TestCoreScheduler_RootKeyRotate(t *testing.T) {
c.snap, _ = store.Snapshot()
rotated, err = c.rootKeyRotate(eval, now)

iter, err = store.RootKeyMetas(nil)
iter, err = store.WrappedRootKeys(nil)
must.NoError(t, err)
for raw := iter.Next(); raw != nil; raw = iter.Next() {
k := raw.(*structs.RootKeyMeta)
k := raw.(*structs.WrappedRootKeys)
switch k.KeyID {
case key0.KeyID:
must.True(t, k.IsActive(), must.Sprint("original key should still be active"))
Expand All @@ -2694,10 +2694,10 @@ func TestCoreScheduler_RootKeyRotate(t *testing.T) {
now = time.Unix(0, key1.PublishTime+(time.Minute*10).Nanoseconds())
rotated, err = c.rootKeyRotate(eval, now)

iter, err = store.RootKeyMetas(nil)
iter, err = store.WrappedRootKeys(nil)
must.NoError(t, err)
for raw := iter.Next(); raw != nil; raw = iter.Next() {
k := raw.(*structs.RootKeyMeta)
k := raw.(*structs.WrappedRootKeys)
switch k.KeyID {
case key0.KeyID:
must.True(t, k.IsInactive(), must.Sprint("original key should be inactive"))
Expand Down Expand Up @@ -2725,22 +2725,22 @@ func TestCoreScheduler_RootKeyGC(t *testing.T) {

// active key, will never be GC'd
store := srv.fsm.State()
key0, err := store.GetActiveRootKeyMeta(nil)
key0, err := store.GetActiveRootKey(nil)
must.NotNil(t, key0, must.Sprint("expected keyring to be bootstapped"))
must.NoError(t, err)

now := key0.CreateTime
yesterday := now - (24 * time.Hour).Nanoseconds()

// insert an "old" inactive key
key1 := structs.NewRootKeyMeta().MakeInactive()
key1 := structs.NewWrappedRootKeys(structs.NewRootKeyMeta()).MakeInactive()
key1.CreateTime = yesterday
must.NoError(t, store.UpsertRootKeyMeta(600, key1, false))
must.NoError(t, store.UpsertWrappedRootKeys(600, key1, false))

// insert an "old" and inactive key with a variable that's using it
key2 := structs.NewRootKeyMeta().MakeInactive()
key2 := structs.NewWrappedRootKeys(structs.NewRootKeyMeta()).MakeInactive()
key2.CreateTime = yesterday
must.NoError(t, store.UpsertRootKeyMeta(700, key2, false))
must.NoError(t, store.UpsertWrappedRootKeys(700, key2, false))

variable := mock.VariableEncrypted()
variable.KeyID = key2.KeyID
Expand All @@ -2752,9 +2752,9 @@ func TestCoreScheduler_RootKeyGC(t *testing.T) {
must.NoError(t, setResp.Error)

// insert an "old" key that's inactive but being used by an alloc
key3 := structs.NewRootKeyMeta().MakeInactive()
key3 := structs.NewWrappedRootKeys(structs.NewRootKeyMeta()).MakeInactive()
key3.CreateTime = yesterday
must.NoError(t, store.UpsertRootKeyMeta(800, key3, false))
must.NoError(t, store.UpsertWrappedRootKeys(800, key3, false))

// insert the allocation using key3
alloc := mock.Alloc()
Expand All @@ -2764,9 +2764,9 @@ func TestCoreScheduler_RootKeyGC(t *testing.T) {
structs.MsgTypeTestSetup, 850, []*structs.Allocation{alloc}))

// insert an "old" key that's inactive but being used by an alloc
key4 := structs.NewRootKeyMeta().MakeInactive()
key4 := structs.NewWrappedRootKeys(structs.NewRootKeyMeta()).MakeInactive()
key4.CreateTime = yesterday
must.NoError(t, store.UpsertRootKeyMeta(900, key4, false))
must.NoError(t, store.UpsertWrappedRootKeys(900, key4, false))

// insert the dead allocation using key4
alloc2 := mock.Alloc()
Expand All @@ -2777,14 +2777,14 @@ func TestCoreScheduler_RootKeyGC(t *testing.T) {
structs.MsgTypeTestSetup, 950, []*structs.Allocation{alloc2}))

// insert an inactive key older than RootKeyGCThreshold but not RootKeyRotationThreshold
key5 := structs.NewRootKeyMeta().MakeInactive()
key5 := structs.NewWrappedRootKeys(structs.NewRootKeyMeta()).MakeInactive()
key5.CreateTime = now - (15 * time.Minute).Nanoseconds()
must.NoError(t, store.UpsertRootKeyMeta(1500, key5, false))
must.NoError(t, store.UpsertWrappedRootKeys(1500, key5, false))

// prepublishing key should never be GC'd no matter how old
key6 := structs.NewRootKeyMeta().MakePrepublished(yesterday)
key6 := structs.NewWrappedRootKeys(structs.NewRootKeyMeta()).MakePrepublished(yesterday)
key6.CreateTime = yesterday
must.NoError(t, store.UpsertRootKeyMeta(1600, key6, false))
must.NoError(t, store.UpsertWrappedRootKeys(1600, key6, false))

// run the core job
snap, err := store.Snapshot()
Expand All @@ -2795,31 +2795,31 @@ func TestCoreScheduler_RootKeyGC(t *testing.T) {
must.NoError(t, c.rootKeyGC(eval, time.Now()))

ws := memdb.NewWatchSet()
key, err := store.RootKeyMetaByID(ws, key0.KeyID)
key, err := store.WrappedRootKeysByID(ws, key0.KeyID)
must.NoError(t, err)
must.NotNil(t, key, must.Sprint("active key should not have been GCd"))

key, err = store.RootKeyMetaByID(ws, key1.KeyID)
key, err = store.WrappedRootKeysByID(ws, key1.KeyID)
must.NoError(t, err)
must.Nil(t, key, must.Sprint("old and unused inactive key should have been GCd"))

key, err = store.RootKeyMetaByID(ws, key2.KeyID)
key, err = store.WrappedRootKeysByID(ws, key2.KeyID)
must.NoError(t, err)
must.NotNil(t, key, must.Sprint("old key should not have been GCd if still in use"))

key, err = store.RootKeyMetaByID(ws, key3.KeyID)
key, err = store.WrappedRootKeysByID(ws, key3.KeyID)
must.NoError(t, err)
must.NotNil(t, key, must.Sprint("old key used to sign a live alloc should not have been GCd"))

key, err = store.RootKeyMetaByID(ws, key4.KeyID)
key, err = store.WrappedRootKeysByID(ws, key4.KeyID)
must.NoError(t, err)
must.Nil(t, key, must.Sprint("old key used to sign a terminal alloc should have been GCd"))

key, err = store.RootKeyMetaByID(ws, key5.KeyID)
key, err = store.WrappedRootKeysByID(ws, key5.KeyID)
must.NoError(t, err)
must.NotNil(t, key, must.Sprint("key newer than GC+rotation threshold should not have been GCd"))

key, err = store.RootKeyMetaByID(ws, key6.KeyID)
key, err = store.WrappedRootKeysByID(ws, key6.KeyID)
must.NoError(t, err)
must.NotNil(t, key, must.Sprint("prepublishing key should not have been GCd"))
}
Expand All @@ -2835,7 +2835,7 @@ func TestCoreScheduler_VariablesRekey(t *testing.T) {
testutil.WaitForKeyring(t, srv.RPC, "global")

store := srv.fsm.State()
key0, err := store.GetActiveRootKeyMeta(nil)
key0, err := store.GetActiveRootKey(nil)
must.NotNil(t, key0, must.Sprint("expected keyring to be bootstapped"))
must.NoError(t, err)

Expand Down Expand Up @@ -2883,7 +2883,7 @@ func TestCoreScheduler_VariablesRekey(t *testing.T) {
}
}

originalKey, _ := store.RootKeyMetaByID(nil, key0.KeyID)
originalKey, _ := store.WrappedRootKeysByID(nil, key0.KeyID)
return originalKey.IsInactive()
}),
), must.Sprint("variable rekey should be complete"))
Expand Down
Loading

0 comments on commit 428667f

Please sign in to comment.