Skip to content

Commit

Permalink
Allow cross cell promotion in PRS (#16461)
Browse files Browse the repository at this point in the history
Signed-off-by: Manan Gupta <manan@planetscale.com>
  • Loading branch information
GuptaManan100 authored Jul 25, 2024
1 parent f481a77 commit 88d7033
Show file tree
Hide file tree
Showing 14 changed files with 1,056 additions and 895 deletions.
6 changes: 6 additions & 0 deletions changelog/21.0/21.0.0/summary.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,7 @@
- **[Traffic Mirroring](#traffic-mirroring)**
- **[New VTGate Shutdown Behavior](#new-vtgate-shutdown-behavior)**
- **[Tablet Throttler: Multi-Metric support](#tablet-throttler)**
- **[Allow Cross Cell Promotion in PRS](#allow-cross-cell)**

## <a id="major-changes"/>Major Changes

Expand Down Expand Up @@ -96,3 +97,8 @@ Each metric has a factory threshold which can be overridden by the `UpdateThrott
The throttler also supports the catch-all `"all"` app name, and it is thus possible to assign metrics to _all_ apps. Explicit app to metric assignments will override the catch-all configuration.

Metrics are assigned a default _scope_, which could be `self` (isolated to the tablet) or `shard` (max, aka _worst_ value among shard tablets). It is further possible to require a different scope for each metric.

### <a id="allow-cross-cell"/>Allow Cross Cell Promotion in PRS
Until now, users who wanted `PlannedReparentShard` to promote a replica in a different cell from the current primary had to specify the new primary explicitly with the `--new-primary` flag.

We have now added a new flag `--allow-cross-cell-promotion` that lets `PlannedReparentShard` choose a primary in a different cell even if no new primary is provided explicitly.
3 changes: 3 additions & 0 deletions go/cmd/vtctldclient/command/reparents.go
Original file line number Diff line number Diff line change
Expand Up @@ -187,6 +187,7 @@ var plannedReparentShardOptions = struct {
AvoidPrimaryAliasStr string
WaitReplicasTimeout time.Duration
TolerableReplicationLag time.Duration
AllowCrossCellPromotion bool
}{}

func commandPlannedReparentShard(cmd *cobra.Command, args []string) error {
Expand Down Expand Up @@ -223,6 +224,7 @@ func commandPlannedReparentShard(cmd *cobra.Command, args []string) error {
AvoidPrimary: avoidPrimaryAlias,
WaitReplicasTimeout: protoutil.DurationToProto(plannedReparentShardOptions.WaitReplicasTimeout),
TolerableReplicationLag: protoutil.DurationToProto(plannedReparentShardOptions.TolerableReplicationLag),
AllowCrossCellPromotion: plannedReparentShardOptions.AllowCrossCellPromotion,
})
if err != nil {
return err
Expand Down Expand Up @@ -297,6 +299,7 @@ func init() {
PlannedReparentShard.Flags().DurationVar(&plannedReparentShardOptions.TolerableReplicationLag, "tolerable-replication-lag", 0, "Amount of replication lag that is considered acceptable for a tablet to be eligible for promotion when Vitess makes the choice of a new primary.")
PlannedReparentShard.Flags().StringVar(&plannedReparentShardOptions.NewPrimaryAliasStr, "new-primary", "", "Alias of a tablet that should be the new primary.")
PlannedReparentShard.Flags().StringVar(&plannedReparentShardOptions.AvoidPrimaryAliasStr, "avoid-primary", "", "Alias of a tablet that should not be the primary; i.e. \"reparent to any other tablet if this one is the primary\".")
PlannedReparentShard.Flags().BoolVar(&plannedReparentShardOptions.AllowCrossCellPromotion, "allow-cross-cell-promotion", false, "Allow cross cell promotion")
Root.AddCommand(PlannedReparentShard)

Root.AddCommand(ReparentTablet)
Expand Down
34 changes: 20 additions & 14 deletions go/test/endtoend/reparent/plannedreparent/reparent_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -125,14 +125,8 @@ func TestReparentReplicaOffline(t *testing.T) {
require.Error(t, err)

// Assert that PRS failed
if clusterInstance.VtctlMajorVersion <= 17 {
assert.True(t, utils.SetReplicationSourceFailed(tablets[3], out))
utils.CheckPrimaryTablet(t, clusterInstance, tablets[1])
} else {
assert.Contains(t, out, "rpc error: code = DeadlineExceeded desc")
utils.CheckPrimaryTablet(t, clusterInstance, tablets[0])
}

assert.Contains(t, out, "rpc error: code = DeadlineExceeded desc")
utils.CheckPrimaryTablet(t, clusterInstance, tablets[0])
}

func TestReparentAvoid(t *testing.T) {
Expand All @@ -155,20 +149,32 @@ func TestReparentAvoid(t *testing.T) {
require.NoError(t, err)
utils.ValidateTopology(t, clusterInstance, false)

// tablets[1 is in the same cell and tablets[3] is in a different cell, so we must land on tablets[1
// tablets[1] is in the same cell and tablets[3] is in a different cell, so we must land on tablets[1]
utils.CheckPrimaryTablet(t, clusterInstance, tablets[1])

// If we kill the tablet in the same cell as the primary, then reparent with --avoid-primary will fail.
utils.StopTablet(t, tablets[0], true)
out, err := utils.PrsAvoid(t, clusterInstance, tablets[1])
require.Error(t, err)
if clusterInstance.VtctlMajorVersion <= 17 {
assert.Contains(t, out, "cannot find a tablet to reparent to in the same cell as the current primary")
} else {
assert.Contains(t, out, "rpc error: code = DeadlineExceeded desc = latest balancer error")
}
assert.Contains(t, out, "rpc error: code = DeadlineExceeded desc = latest balancer error")
utils.ValidateTopology(t, clusterInstance, false)
utils.CheckPrimaryTablet(t, clusterInstance, tablets[1])

t.Run("Allow cross cell promotion", func(t *testing.T) {
if clusterInstance.VtctlMajorVersion <= 20 {
t.Skip("Allow Cross Cell Promotion was added in v21")
}
utils.DeleteTablet(t, clusterInstance, tablets[0])
// Perform a graceful reparent operation and verify it fails because we have no replicas in the same cell as the primary.
out, err = utils.PrsAvoid(t, clusterInstance, tablets[1])
require.Error(t, err)
assert.Contains(t, out, "is not in the same cell as the previous primary")

// If we run PRS with allow cross cell promotion then it should succeed and should promote the replica in another cell.
_, err = utils.PrsAvoid(t, clusterInstance, tablets[1], "--allow-cross-cell-promotion")
require.NoError(t, err)
utils.CheckPrimaryTablet(t, clusterInstance, tablets[3])
})
}

func TestReparentFromOutside(t *testing.T) {
Expand Down
11 changes: 6 additions & 5 deletions go/test/endtoend/reparent/utils/utils.go
Original file line number Diff line number Diff line change
Expand Up @@ -293,17 +293,17 @@ func execute(t *testing.T, conn *mysql.Conn, query string) *sqltypes.Result {
// region ers, prs

// Prs runs PRS
func Prs(t *testing.T, clusterInstance *cluster.LocalProcessCluster, tab *cluster.Vttablet) (string, error) {
return PrsWithTimeout(t, clusterInstance, tab, false, "", "")
func Prs(t *testing.T, clusterInstance *cluster.LocalProcessCluster, tab *cluster.Vttablet, extraArgs ...string) (string, error) {
return PrsWithTimeout(t, clusterInstance, tab, false, "", "", extraArgs...)
}

// PrsAvoid runs PRS in avoid mode for the given tablet (it passes avoid=true to PrsWithTimeout)
func PrsAvoid(t *testing.T, clusterInstance *cluster.LocalProcessCluster, tab *cluster.Vttablet) (string, error) {
return PrsWithTimeout(t, clusterInstance, tab, true, "", "")
func PrsAvoid(t *testing.T, clusterInstance *cluster.LocalProcessCluster, tab *cluster.Vttablet, extraArgs ...string) (string, error) {
return PrsWithTimeout(t, clusterInstance, tab, true, "", "", extraArgs...)
}

// PrsWithTimeout runs PRS
func PrsWithTimeout(t *testing.T, clusterInstance *cluster.LocalProcessCluster, tab *cluster.Vttablet, avoid bool, actionTimeout, waitTimeout string) (string, error) {
func PrsWithTimeout(t *testing.T, clusterInstance *cluster.LocalProcessCluster, tab *cluster.Vttablet, avoid bool, actionTimeout, waitTimeout string, extraArgs ...string) (string, error) {
args := []string{
"PlannedReparentShard",
fmt.Sprintf("%s/%s", KeyspaceName, ShardName)}
Expand All @@ -319,6 +319,7 @@ func PrsWithTimeout(t *testing.T, clusterInstance *cluster.LocalProcessCluster,
args = append(args, "--new-primary")
}
args = append(args, tab.Alias)
args = append(args, extraArgs...)
out, err := clusterInstance.VtctldClientProcess.ExecuteCommandWithOutput(args...)
return out, err
}
Expand Down
Loading

0 comments on commit 88d7033

Please sign in to comment.