Skip to content

Commit

Permalink
Prevent overwriting existing host_uuid file
Browse files Browse the repository at this point in the history
In some circumstances, multiple Teleport processes may be trying
to write the host_uuid to the same data directory at the same time.
The last of the writers would win, and any process using a host
UUID that did not match what was on disc could get into a perpertual
state of being unable to connect to the cluster.

To avoid any potential raciness, the writing of the file no longer
truncates and overwrites any existing data. In the even that writes
fail due to this change, the value is attempted to be read again and
returned.
  • Loading branch information
rosstimothy committed Oct 28, 2024
1 parent ac2f8dc commit 3d97bd2
Show file tree
Hide file tree
Showing 2 changed files with 84 additions and 25 deletions.
81 changes: 56 additions & 25 deletions lib/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -496,9 +496,23 @@ func ReadHostUUID(dataDir string) (string, error) {
return id, nil
}

// WriteHostUUID writes host UUID into a file
// WriteHostUUID writes host UUID into a file. The file
// is not truncated to prevent clobbering an operating value.
func WriteHostUUID(dataDir string, id string) error {
err := os.WriteFile(GetHostUUIDPath(dataDir), []byte(id), os.ModeExclusive|0400)
f, err := os.OpenFile(GetHostUUIDPath(dataDir), os.O_WRONLY|os.O_CREATE|os.O_EXCL, os.ModeExclusive|0o400)
if err != nil {
if errors.Is(err, fs.ErrPermission) {
//do not convert to system error as this loses the ability to compare that it is a permission error
return err
}
return trace.ConvertSystemError(err)
}

_, err = f.Write([]byte(id))
if err1 := f.Close(); err1 != nil && err == nil {
err = err1
}

if err != nil {
if errors.Is(err, fs.ErrPermission) {
//do not convert to system error as this loses the ability to compare that it is a permission error
Expand All @@ -509,32 +523,49 @@ func WriteHostUUID(dataDir string, id string) error {
return nil
}

// ReadOrMakeHostUUID looks for a hostid file in the data dir. If present,
// returns the UUID from it, otherwise generates one
// ReadOrMakeHostUUID looks for a host_uuid file in the data dir. If present,
// returns the UUID from it, otherwise attempts to generate one. Any contention
// creating the file is attempted to be handled gracefully, but may result in
// a [trace.LimitExceeded] error if contention is not resolved.
func ReadOrMakeHostUUID(dataDir string) (string, error) {
id, err := ReadHostUUID(dataDir)
if err == nil {
// Attempt to get the host uuid a few times to allow for scenarios
// where two processes may be trying to generate the file at the same time.
// While in most cases this should never happen, but if it does, the contents
// of the file should not be allowed to be overwritten by the second process.
const iterationLimit = 3
for i := 0; i < iterationLimit; i++ {
id, err := ReadHostUUID(dataDir)
if err == nil {
return id, nil
}
if !trace.IsNotFound(err) {
return "", trace.Wrap(err)
}

// Checking error instead of the usual uuid.New() in case uuid generation
// fails due to not enough randomness. It's been known to happen when
// Teleport starts very early in the node initialization cycle and /dev/urandom
// isn't ready yet.
rawID, err := uuid.NewRandom()
if err != nil {
return "", trace.BadParameter("" +
"Teleport failed to generate host UUID. " +
"This may happen if randomness source is not fully initialized when the node is starting up. " +
"Please try restarting Teleport again.")
}

id = rawID.String()
if err = WriteHostUUID(dataDir, id); err != nil {
if trace.IsAlreadyExists(err) || errors.Is(err, fs.ErrPermission) {
continue
}

return "", trace.Wrap(err)
}
return id, nil
}
if !trace.IsNotFound(err) {
return "", trace.Wrap(err)
}
// Checking error instead of the usual uuid.New() in case uuid generation
// fails due to not enough randomness. It's been known to happen happen when
// Teleport starts very early in the node initialization cycle and /dev/urandom
// isn't ready yet.
rawID, err := uuid.NewRandom()
if err != nil {
return "", trace.BadParameter("" +
"Teleport failed to generate host UUID. " +
"This may happen if randomness source is not fully initialized when the node is starting up. " +
"Please try restarting Teleport again.")
}
id = rawID.String()
if err = WriteHostUUID(dataDir, id); err != nil {
return "", trace.Wrap(err)
}
return id, nil

return "", trace.LimitExceeded("failed to retrieve the host UUID within %v iterations", iterationLimit)
}

// StringSliceSubset returns true if b is a subset of a.
Expand Down
28 changes: 28 additions & 0 deletions lib/utils/utils_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -23,6 +23,7 @@ import (
"fmt"
"os"
"path/filepath"
"slices"
"strings"
"testing"
"time"
Expand All @@ -31,6 +32,7 @@ import (
"github.com/gravitational/trace"
"github.com/stretchr/testify/assert"
"github.com/stretchr/testify/require"
"golang.org/x/sync/errgroup"

"github.com/gravitational/teleport/api/utils/keys"
"github.com/gravitational/teleport/lib/fixtures"
Expand All @@ -42,6 +44,32 @@ func TestMain(m *testing.M) {
os.Exit(m.Run())
}

func TestReadOrMakeHostUUID(t *testing.T) {
t.Parallel()

dir := t.TempDir()

var wg errgroup.Group
concurrency := 4
ids := make([]string, concurrency)
barrier := make(chan struct{})

for i := 0; i < concurrency; i++ {
wg.Go(func() error {
<-barrier
id, err := ReadOrMakeHostUUID(dir)
ids[i] = id
return err
})
}

close(barrier)

require.NoError(t, wg.Wait())
require.Equal(t, slices.Repeat([]string{ids[0]}, concurrency), ids)
require.Equal(t, ids[0], ids[1])
}

func TestHostUUIDIdempotent(t *testing.T) {
t.Parallel()

Expand Down

0 comments on commit 3d97bd2

Please sign in to comment.