From 0bea31ef7fd883ac341b8660332acf714f7a412e Mon Sep 17 00:00:00 2001
From: Travis Nielsen
Date: Wed, 10 Apr 2024 15:48:49 -0600
Subject: [PATCH] pool: return error if device class update fails

Updating the device class for a pool swallowed any error from the
update. The error was not even logged, so we couldn't troubleshoot why
the new crush rule was not applied. Log the error for troubleshooting,
and also fail the pool reconcile since the desired configuration was
not applied.

Signed-off-by: Travis Nielsen
---
 PendingReleaseNotes.md               |  1 +
 pkg/daemon/ceph/client/pool.go       | 34 ++++++++++++++--------------
 pkg/daemon/ceph/client/pool_test.go  | 11 ++++++++-
 pkg/operator/ceph/pool/controller.go |  4 ++--
 4 files changed, 30 insertions(+), 20 deletions(-)

diff --git a/PendingReleaseNotes.md b/PendingReleaseNotes.md
index b5b24eda3d54..0cf183144e02 100644
--- a/PendingReleaseNotes.md
+++ b/PendingReleaseNotes.md
@@ -4,5 +4,6 @@
 
 - Updating Ceph COSI driver images, this impact existing COSI `Buckets` and `BucketAccesses`,
   please update the `BucketClass` and `BucketAccessClass` for resolving refer [here](https://github.com/rook/rook/discussions/14297)
+- During CephBlockPool updates, return an error if an invalid device class is specified. Pools with invalid device classes may start failing reconcile until the correct device class is specified. See #14057.
 
 ## Features
diff --git a/pkg/daemon/ceph/client/pool.go b/pkg/daemon/ceph/client/pool.go
index 95d778ec8077..82eae5e77eb9 100644
--- a/pkg/daemon/ceph/client/pool.go
+++ b/pkg/daemon/ceph/client/pool.go
@@ -253,16 +253,16 @@ func DeletePool(context *clusterd.Context, clusterInfo *ClusterInfo, name string
 
 	logger.Infof("purging pool %q (id=%d)", name, pool.Number)
 	args := []string{"osd", "pool", "delete", name, name, reallyConfirmFlag}
-	_, err = NewCephCommand(context, clusterInfo, args).Run()
+	output, err := NewCephCommand(context, clusterInfo, args).Run()
 	if err != nil {
-		return errors.Wrapf(err, "failed to delete pool %q", name)
+		return errors.Wrapf(err, "failed to delete pool %q. %s", name, string(output))
 	}
 
 	// remove the crush rule for this pool and ignore the error in case the rule is still in use or not found
 	args = []string{"osd", "crush", "rule", "rm", name}
-	_, err = NewCephCommand(context, clusterInfo, args).Run()
+	output, err = NewCephCommand(context, clusterInfo, args).Run()
 	if err != nil {
-		logger.Errorf("failed to delete crush rule %q. %v", name, err)
+		logger.Errorf("failed to delete crush rule %q. %v. %s", name, err, string(output))
 	}
 
 	logger.Infof("purge completed for pool %q", name)
@@ -280,9 +280,9 @@ func givePoolAppTag(context *clusterd.Context, clusterInfo *ClusterInfo, poolNam
 	}
 
 	args := []string{"osd", "pool", "application", "enable", poolName, appName, confirmFlag}
-	_, err = NewCephCommand(context, clusterInfo, args).Run()
+	output, err := NewCephCommand(context, clusterInfo, args).Run()
 	if err != nil {
-		return errors.Wrapf(err, "failed to enable application %q on pool %q", appName, poolName)
+		return errors.Wrapf(err, "failed to enable application %q on pool %q. %s", appName, poolName, string(output))
 	}
 
 	return nil
@@ -456,7 +456,7 @@ func createReplicatedPoolForApp(context *clusterd.Context, clusterInfo *ClusterI
 
 	if checkFailureDomain || pool.PoolSpec.DeviceClass != "" {
 		if err = updatePoolCrushRule(context, clusterInfo, clusterSpec, pool); err != nil {
-			return nil
+			return errors.Wrapf(err, "failed to update crush rule for pool %q", pool.Name)
 		}
 	}
 	return nil
@@ -561,9 +561,9 @@ func extractPoolDetails(rule ruleSpec) (string, string) {
 
 func setCrushRule(context *clusterd.Context, clusterInfo *ClusterInfo, poolName, crushRule string) error {
 	args := []string{"osd", "pool", "set", poolName, "crush_rule", crushRule}
-	_, err := NewCephCommand(context, clusterInfo, args).Run()
+	output, err := NewCephCommand(context, clusterInfo, args).Run()
 	if err != nil {
-		return errors.Wrapf(err, "failed to set crush rule %q", crushRule)
+		return errors.Wrapf(err, "failed to set crush rule %q. %s", crushRule, string(output))
 	}
 	return nil
 }
@@ -714,9 +714,9 @@ func createReplicationCrushRule(context *clusterd.Context, clusterInfo *ClusterI
 		args = append(args, deviceClass)
 	}
 
-	_, err := NewCephCommand(context, clusterInfo, args).Run()
+	output, err := NewCephCommand(context, clusterInfo, args).Run()
 	if err != nil {
-		return errors.Wrapf(err, "failed to create crush rule %s", ruleName)
+		return errors.Wrapf(err, "failed to create crush rule %s. %s", ruleName, string(output))
 	}
 
 	return nil
@@ -726,9 +726,9 @@ func createReplicationCrushRule(context *clusterd.Context, clusterInfo *ClusterI
 func SetPoolProperty(context *clusterd.Context, clusterInfo *ClusterInfo, name, propName, propVal string) error {
 	args := []string{"osd", "pool", "set", name, propName, propVal}
 	logger.Infof("setting pool property %q to %q on pool %q", propName, propVal, name)
-	_, err := NewCephCommand(context, clusterInfo, args).Run()
+	output, err := NewCephCommand(context, clusterInfo, args).Run()
 	if err != nil {
-		return errors.Wrapf(err, "failed to set pool property %q on pool %q", propName, name)
+		return errors.Wrapf(err, "failed to set pool property %q on pool %q. %s", propName, name, string(output))
 	}
 	return nil
 }
@@ -737,9 +737,9 @@ func SetPoolProperty(context *clusterd.Context, clusterInfo *ClusterInfo, name,
 func setPoolQuota(context *clusterd.Context, clusterInfo *ClusterInfo, poolName, quotaType, quotaVal string) error {
 	args := []string{"osd", "pool", "set-quota", poolName, quotaType, quotaVal}
 	logger.Infof("setting quota %q=%q on pool %q", quotaType, quotaVal, poolName)
-	_, err := NewCephCommand(context, clusterInfo, args).Run()
+	output, err := NewCephCommand(context, clusterInfo, args).Run()
 	if err != nil {
-		return errors.Wrapf(err, "failed to set %q quota on pool %q", quotaType, poolName)
+		return errors.Wrapf(err, "failed to set %q quota on pool %q. %s", quotaType, poolName, string(output))
 	}
 	return nil
 }
@@ -752,9 +752,9 @@ func SetPoolReplicatedSizeProperty(context *clusterd.Context, clusterInfo *Clust
 		args = append(args, "--yes-i-really-mean-it")
 	}
 
-	_, err := NewCephCommand(context, clusterInfo, args).Run()
+	output, err := NewCephCommand(context, clusterInfo, args).Run()
 	if err != nil {
-		return errors.Wrapf(err, "failed to set pool property %q on pool %q", propName, poolName)
+		return errors.Wrapf(err, "failed to set pool property %q on pool %q. %s", propName, poolName, string(output))
 	}
 
 	return nil
diff --git a/pkg/daemon/ceph/client/pool_test.go b/pkg/daemon/ceph/client/pool_test.go
index 28bacae2109a..3afe66b820ca 100644
--- a/pkg/daemon/ceph/client/pool_test.go
+++ b/pkg/daemon/ceph/client/pool_test.go
@@ -179,6 +179,11 @@ func testCreateReplicaPool(t *testing.T, failureDomain, crushRoot, deviceClass,
 				assert.Equal(t, "12345", args[8])
 				return "", nil
 			}
+			if args[2] == "get" {
+				assert.Equal(t, "mypool", args[3])
+				assert.Equal(t, "all", args[4])
+				return `{"pool":"replicapool","pool_id":2,"size":1,"min_size":1,"crush_rule":"replicapool_osd"}`, nil
+			}
 			if args[2] == "set" {
 				assert.Equal(t, "mypool", args[3])
 				if args[4] == "size" {
@@ -203,8 +208,12 @@ func testCreateReplicaPool(t *testing.T, failureDomain, crushRoot, deviceClass,
 		if args[1] == "crush" {
 			crushRuleCreated = true
 			assert.Equal(t, "rule", args[2])
+			if args[3] == "dump" {
+				assert.Equal(t, "replicapool_osd", args[4])
+				return `{"rule_id": 3,"rule_name": "replicapool_osd","type": 1}`, nil
+			}
 			assert.Equal(t, "create-replicated", args[3])
-			assert.Equal(t, "mypool", args[4])
+			assert.Contains(t, args[4], "mypool")
 			if crushRoot == "" {
 				assert.Equal(t, "cluster-crush-root", args[5])
 			} else {
diff --git a/pkg/operator/ceph/pool/controller.go b/pkg/operator/ceph/pool/controller.go
index 58d0f13379cf..24d080ebf644 100644
--- a/pkg/operator/ceph/pool/controller.go
+++ b/pkg/operator/ceph/pool/controller.go
@@ -366,7 +366,7 @@ func (r *ReconcileCephBlockPool) reconcileCreatePool(clusterInfo *cephclient.Clu
 	poolSpec := cephBlockPool.ToNamedPoolSpec()
 	err := createPool(r.context, clusterInfo, cephCluster, &poolSpec)
 	if err != nil {
-		return opcontroller.ImmediateRetryResult, errors.Wrapf(err, "failed to create pool %q.", cephBlockPool.GetName())
+		return opcontroller.ImmediateRetryResult, errors.Wrapf(err, "failed to configure pool %q.", cephBlockPool.GetName())
 	}
 
 	// Let's return here so that on the initial creation we don't check for update right away
@@ -382,7 +382,7 @@ func createPool(context *clusterd.Context, clusterInfo *cephclient.ClusterInfo,
 	// create the pool
 	logger.Infof("creating pool %q in namespace %q", p.Name, clusterInfo.Namespace)
 	if err := cephclient.CreatePool(context, clusterInfo, clusterSpec, *p); err != nil {
-		return errors.Wrapf(err, "failed to create pool %q", p.Name)
+		return errors.Wrapf(err, "failed to configure pool %q", p.Name)
 	}
 
 	if p.Application != poolApplicationNameRBD {
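
The same pattern runs through every hunk above: keep the command output that was previously discarded and surface it in the wrapped error (or log message), so a failed ceph invocation can actually be diagnosed. Below is a minimal standalone sketch of that pattern, using os/exec and github.com/pkg/errors directly rather than Rook's NewCephCommand wrapper; runCephCommand and the example pool and rule names are illustrative only, not code from this patch.

package main

import (
	"fmt"
	"os/exec"

	"github.com/pkg/errors"
)

// runCephCommand is a hypothetical stand-in for Rook's NewCephCommand(...).Run();
// it returns the combined stdout/stderr so callers can include it in their errors.
func runCephCommand(args ...string) ([]byte, error) {
	return exec.Command("ceph", args...).CombinedOutput()
}

// setCrushRule mirrors the error handling introduced by this patch: the command
// output is captured and appended to the wrapped error instead of being dropped.
func setCrushRule(poolName, crushRule string) error {
	output, err := runCephCommand("osd", "pool", "set", poolName, "crush_rule", crushRule)
	if err != nil {
		return errors.Wrapf(err, "failed to set crush rule %q. %s", crushRule, string(output))
	}
	return nil
}

func main() {
	// Example invocation; the error now carries both the root cause and the CLI output.
	if err := setCrushRule("replicapool", "replicapool_osd"); err != nil {
		fmt.Println(err)
	}
}

The second half of the change complements this: createReplicatedPoolForApp now propagates a crush rule update failure instead of returning nil, so the CephBlockPool reconcile fails visibly when the requested device class cannot be applied.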