From 3d97bd25aac017d4a204c0df38659a3aaa16c2ad Mon Sep 17 00:00:00 2001 From: Tim Ross Date: Mon, 28 Oct 2024 11:50:37 -0400 Subject: [PATCH] Prevent overwriting existing host_uuid file In some circumstances, multiple Teleport processes may be trying to write the host_uuid to the same data directory at the same time. The last of the writers would win, and any process using a host UUID that did not match what was on disc could get into a perpertual state of being unable to connect to the cluster. To avoid any potential raciness, the writing of the file no longer truncates and overwrites any existing data. In the even that writes fail due to this change, the value is attempted to be read again and returned. --- lib/utils/utils.go | 81 ++++++++++++++++++++++++++++------------- lib/utils/utils_test.go | 28 ++++++++++++++ 2 files changed, 84 insertions(+), 25 deletions(-) diff --git a/lib/utils/utils.go b/lib/utils/utils.go index b1931e2ae8cf4..f8452b67ba03d 100644 --- a/lib/utils/utils.go +++ b/lib/utils/utils.go @@ -496,9 +496,23 @@ func ReadHostUUID(dataDir string) (string, error) { return id, nil } -// WriteHostUUID writes host UUID into a file +// WriteHostUUID writes host UUID into a file. The file +// is not truncated to prevent clobbering an operating value. func WriteHostUUID(dataDir string, id string) error { - err := os.WriteFile(GetHostUUIDPath(dataDir), []byte(id), os.ModeExclusive|0400) + f, err := os.OpenFile(GetHostUUIDPath(dataDir), os.O_WRONLY|os.O_CREATE|os.O_EXCL, os.ModeExclusive|0o400) + if err != nil { + if errors.Is(err, fs.ErrPermission) { + //do not convert to system error as this loses the ability to compare that it is a permission error + return err + } + return trace.ConvertSystemError(err) + } + + _, err = f.Write([]byte(id)) + if err1 := f.Close(); err1 != nil && err == nil { + err = err1 + } + if err != nil { if errors.Is(err, fs.ErrPermission) { //do not convert to system error as this loses the ability to compare that it is a permission error @@ -509,32 +523,49 @@ func WriteHostUUID(dataDir string, id string) error { return nil } -// ReadOrMakeHostUUID looks for a hostid file in the data dir. If present, -// returns the UUID from it, otherwise generates one +// ReadOrMakeHostUUID looks for a host_uuid file in the data dir. If present, +// returns the UUID from it, otherwise attempts to generate one. Any contention +// creating the file is attempted to be handled gracefully, but may result in +// a [trace.LimitExceeded] error if contention is not resolved. func ReadOrMakeHostUUID(dataDir string) (string, error) { - id, err := ReadHostUUID(dataDir) - if err == nil { + // Attempt to get the host uuid a few times to allow for scenarios + // where two processes may be trying to generate the file at the same time. + // While in most cases this should never happen, but if it does, the contents + // of the file should not be allowed to be overwritten by the second process. + const iterationLimit = 3 + for i := 0; i < iterationLimit; i++ { + id, err := ReadHostUUID(dataDir) + if err == nil { + return id, nil + } + if !trace.IsNotFound(err) { + return "", trace.Wrap(err) + } + + // Checking error instead of the usual uuid.New() in case uuid generation + // fails due to not enough randomness. It's been known to happen when + // Teleport starts very early in the node initialization cycle and /dev/urandom + // isn't ready yet. + rawID, err := uuid.NewRandom() + if err != nil { + return "", trace.BadParameter("" + + "Teleport failed to generate host UUID. " + + "This may happen if randomness source is not fully initialized when the node is starting up. " + + "Please try restarting Teleport again.") + } + + id = rawID.String() + if err = WriteHostUUID(dataDir, id); err != nil { + if trace.IsAlreadyExists(err) || errors.Is(err, fs.ErrPermission) { + continue + } + + return "", trace.Wrap(err) + } return id, nil } - if !trace.IsNotFound(err) { - return "", trace.Wrap(err) - } - // Checking error instead of the usual uuid.New() in case uuid generation - // fails due to not enough randomness. It's been known to happen happen when - // Teleport starts very early in the node initialization cycle and /dev/urandom - // isn't ready yet. - rawID, err := uuid.NewRandom() - if err != nil { - return "", trace.BadParameter("" + - "Teleport failed to generate host UUID. " + - "This may happen if randomness source is not fully initialized when the node is starting up. " + - "Please try restarting Teleport again.") - } - id = rawID.String() - if err = WriteHostUUID(dataDir, id); err != nil { - return "", trace.Wrap(err) - } - return id, nil + + return "", trace.LimitExceeded("failed to retrieve the host UUID within %v iterations", iterationLimit) } // StringSliceSubset returns true if b is a subset of a. diff --git a/lib/utils/utils_test.go b/lib/utils/utils_test.go index 1ff85e1ff8d31..41934ff122cc1 100644 --- a/lib/utils/utils_test.go +++ b/lib/utils/utils_test.go @@ -23,6 +23,7 @@ import ( "fmt" "os" "path/filepath" + "slices" "strings" "testing" "time" @@ -31,6 +32,7 @@ import ( "github.com/gravitational/trace" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" + "golang.org/x/sync/errgroup" "github.com/gravitational/teleport/api/utils/keys" "github.com/gravitational/teleport/lib/fixtures" @@ -42,6 +44,32 @@ func TestMain(m *testing.M) { os.Exit(m.Run()) } +func TestReadOrMakeHostUUID(t *testing.T) { + t.Parallel() + + dir := t.TempDir() + + var wg errgroup.Group + concurrency := 4 + ids := make([]string, concurrency) + barrier := make(chan struct{}) + + for i := 0; i < concurrency; i++ { + wg.Go(func() error { + <-barrier + id, err := ReadOrMakeHostUUID(dir) + ids[i] = id + return err + }) + } + + close(barrier) + + require.NoError(t, wg.Wait()) + require.Equal(t, slices.Repeat([]string{ids[0]}, concurrency), ids) + require.Equal(t, ids[0], ids[1]) +} + func TestHostUUIDIdempotent(t *testing.T) { t.Parallel()