Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Allow cross cell promotion in PRS #16461

Merged
merged 5 commits into from
Jul 25, 2024
Merged
Show file tree
Hide file tree
Changes from 3 commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions changelog/21.0/21.0.0/summary.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
- **[Traffic Mirroring](#traffic-mirroring)**
- **[New VTGate Shutdown Behavior](#new-vtgate-shutdown-behavior)**
- **[Tablet Throttler: Multi-Metric support](#tablet-throttler)**
- **[Allow Cross Cell Promotion in PRS](#allow-cross-cell)**

## <a id="major-changes"/>Major Changes

Expand Down Expand Up @@ -96,3 +97,8 @@ Each metric has a factory threshold which can be overridden by the `UpdateThrott
The throttler also supports the catch-all `"all"` app name, and it is thus possible to assign metrics to _all_ apps. Explicit app to metric assignments will override the catch-all configuration.

Metrics are assigned a default _scope_, which could be `self` (isolated to the tablet) or `shard` (max, aka _worst_ value among shard tablets). It is further possible to require a different scope for each metric.

### <a id="allow-cross-cell"/>Allow Cross Cell Promotion in PRS
Until now, users who wanted `PlannedReparentShard` to promote a replica in a different cell from the current primary had to name the new primary explicitly with the `--new-primary` flag.

We have added a new flag, `--allow-cross-cell-promotion`, that lets `PlannedReparentShard` choose a primary in a different cell even when no new primary is provided explicitly. The flag defaults to `false`.
3 changes: 3 additions & 0 deletions go/cmd/vtctldclient/command/reparents.go
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,7 @@ var plannedReparentShardOptions = struct {
AvoidPrimaryAliasStr string
WaitReplicasTimeout time.Duration
TolerableReplicationLag time.Duration
AllowCrossCellPromotion bool
}{}

func commandPlannedReparentShard(cmd *cobra.Command, args []string) error {
Expand Down Expand Up @@ -223,6 +224,7 @@ func commandPlannedReparentShard(cmd *cobra.Command, args []string) error {
AvoidPrimary: avoidPrimaryAlias,
WaitReplicasTimeout: protoutil.DurationToProto(plannedReparentShardOptions.WaitReplicasTimeout),
TolerableReplicationLag: protoutil.DurationToProto(plannedReparentShardOptions.TolerableReplicationLag),
AllowCrossCellPromotion: plannedReparentShardOptions.AllowCrossCellPromotion,
})
if err != nil {
return err
Expand Down Expand Up @@ -297,6 +299,7 @@ func init() {
PlannedReparentShard.Flags().DurationVar(&plannedReparentShardOptions.TolerableReplicationLag, "tolerable-replication-lag", 0, "Amount of replication lag that is considered acceptable for a tablet to be eligible for promotion when Vitess makes the choice of a new primary.")
PlannedReparentShard.Flags().StringVar(&plannedReparentShardOptions.NewPrimaryAliasStr, "new-primary", "", "Alias of a tablet that should be the new primary.")
PlannedReparentShard.Flags().StringVar(&plannedReparentShardOptions.AvoidPrimaryAliasStr, "avoid-primary", "", "Alias of a tablet that should not be the primary; i.e. \"reparent to any other tablet if this one is the primary\".")
PlannedReparentShard.Flags().BoolVar(&plannedReparentShardOptions.AllowCrossCellPromotion, "allow-cross-cell-promotion", false, "Allow cross cell promotion")
Root.AddCommand(PlannedReparentShard)

Root.AddCommand(ReparentTablet)
Expand Down
34 changes: 20 additions & 14 deletions go/test/endtoend/reparent/plannedreparent/reparent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,14 +125,8 @@ func TestReparentReplicaOffline(t *testing.T) {
require.Error(t, err)

// Assert that PRS failed
if clusterInstance.VtctlMajorVersion <= 17 {
assert.True(t, utils.SetReplicationSourceFailed(tablets[3], out))
utils.CheckPrimaryTablet(t, clusterInstance, tablets[1])
} else {
assert.Contains(t, out, "rpc error: code = DeadlineExceeded desc")
utils.CheckPrimaryTablet(t, clusterInstance, tablets[0])
}

assert.Contains(t, out, "rpc error: code = DeadlineExceeded desc")
utils.CheckPrimaryTablet(t, clusterInstance, tablets[0])
}

func TestReparentAvoid(t *testing.T) {
Expand All @@ -155,20 +149,32 @@ func TestReparentAvoid(t *testing.T) {
require.NoError(t, err)
utils.ValidateTopology(t, clusterInstance, false)

// tablets[1 is in the same cell and tablets[3] is in a different cell, so we must land on tablets[1
// tablets[1] is in the same cell and tablets[3] is in a different cell, so we must land on tablets[1]
utils.CheckPrimaryTablet(t, clusterInstance, tablets[1])

// If we kill the tablet in the same cell as primary then reparent --avoid_tablet will fail.
utils.StopTablet(t, tablets[0], true)
out, err := utils.PrsAvoid(t, clusterInstance, tablets[1])
require.Error(t, err)
if clusterInstance.VtctlMajorVersion <= 17 {
assert.Contains(t, out, "cannot find a tablet to reparent to in the same cell as the current primary")
} else {
assert.Contains(t, out, "rpc error: code = DeadlineExceeded desc = latest balancer error")
}
assert.Contains(t, out, "rpc error: code = DeadlineExceeded desc = latest balancer error")
utils.ValidateTopology(t, clusterInstance, false)
utils.CheckPrimaryTablet(t, clusterInstance, tablets[1])

t.Run("Allow cross cell promotion", func(t *testing.T) {
if clusterInstance.VtctlMajorVersion <= 20 {
t.Skip("Allow Cross Cell Promotion was added in v21")
}
utils.DeleteTablet(t, clusterInstance, tablets[0])
// Perform a graceful reparent operation and verify it fails because we have no replicas in the same cell as the primary.
out, err = utils.PrsAvoid(t, clusterInstance, tablets[1])
require.Error(t, err)
assert.Contains(t, out, "is not in the same cell as the previous primary")

// If we run PRS with allow cross cell promotion then it should succeed and should promote the replica in another cell.
_, err = utils.PrsAvoid(t, clusterInstance, tablets[1], "--allow-cross-cell-promotion")
require.NoError(t, err)
utils.CheckPrimaryTablet(t, clusterInstance, tablets[3])
})
}

func TestReparentFromOutside(t *testing.T) {
Expand Down
11 changes: 6 additions & 5 deletions go/test/endtoend/reparent/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -293,17 +293,17 @@ func execute(t *testing.T, conn *mysql.Conn, query string) *sqltypes.Result {
// region ers, prs

// Prs runs PRS
func Prs(t *testing.T, clusterInstance *cluster.LocalProcessCluster, tab *cluster.Vttablet) (string, error) {
return PrsWithTimeout(t, clusterInstance, tab, false, "", "")
func Prs(t *testing.T, clusterInstance *cluster.LocalProcessCluster, tab *cluster.Vttablet, extraArgs ...string) (string, error) {
return PrsWithTimeout(t, clusterInstance, tab, false, "", "", extraArgs...)
}

// PrsAvoid runs PRS
func PrsAvoid(t *testing.T, clusterInstance *cluster.LocalProcessCluster, tab *cluster.Vttablet) (string, error) {
return PrsWithTimeout(t, clusterInstance, tab, true, "", "")
func PrsAvoid(t *testing.T, clusterInstance *cluster.LocalProcessCluster, tab *cluster.Vttablet, extraArgs ...string) (string, error) {
return PrsWithTimeout(t, clusterInstance, tab, true, "", "", extraArgs...)
}

// PrsWithTimeout runs PRS
func PrsWithTimeout(t *testing.T, clusterInstance *cluster.LocalProcessCluster, tab *cluster.Vttablet, avoid bool, actionTimeout, waitTimeout string) (string, error) {
func PrsWithTimeout(t *testing.T, clusterInstance *cluster.LocalProcessCluster, tab *cluster.Vttablet, avoid bool, actionTimeout, waitTimeout string, extraArgs ...string) (string, error) {
args := []string{
"PlannedReparentShard",
fmt.Sprintf("%s/%s", KeyspaceName, ShardName)}
Expand All @@ -319,6 +319,7 @@ func PrsWithTimeout(t *testing.T, clusterInstance *cluster.LocalProcessCluster,
args = append(args, "--new-primary")
}
args = append(args, tab.Alias)
args = append(args, extraArgs...)
out, err := clusterInstance.VtctldClientProcess.ExecuteCommandWithOutput(args...)
return out, err
}
Expand Down
Loading
Loading