Skip to content

Improve tablet types to wait #17622

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
wants to merge 10 commits into from
11 changes: 11 additions & 0 deletions changelog/22.0/22.0.0/summary.md
Original file line number Diff line number Diff line change
Expand Up @@ -42,6 +42,8 @@
- [ACL enforcement and reloading](#reloading-vttablet-acl)
- **[VTAdmin](#vtadmin)**
- [Updated to node v22.13.1](#updated-node)
- **[VTGate Flags](#vtgate-flags)**
- [vtgate behavior for `--tablet_types_to_wait`](#vtgate-init-tabletgw-wait-tablet-type)

## <a id="major-changes"/>Major Changes</a>

Expand Down Expand Up @@ -371,3 +373,12 @@ When a tablet is started with `--enforce-tableacl-config` it will exit with an e

Building `vtadmin-web` now requires node >= v22.13.0 (LTS). Breaking changes from v20 to v22 can be found at https://nodejs.org/en/blog/release/v22.13.0 -- with no known issues that apply to VTAdmin.
Full details on the node v22.13.1 release can be found at https://nodejs.org/en/blog/release/v22.13.1.

## <a id="vtgate-flags"/>VTGate Flags

### <a id="vtgate-init-tabletgw-wait-tablet-type"/> vtgate behavior for `--tablet_types_to_wait`
Previously, if waiting for tablets took longer than the time specified in `--gateway_initial_tablet_timeout`, `vtgate`
would log a warning and start serving queries. Any queries received by `vtgate` for the tablet types that were not
available would fail. In this release, this behavior has been changed: if the wait times out, `vtgate` logs a warning and
retries the wait until it succeeds. This means `vtgate` does not start serving queries until it has received health checks
for all of the tablet types it was told to wait for, preventing query errors at the expense of potentially taking longer to start serving.
6 changes: 4 additions & 2 deletions examples/region_sharding/101_initial_cluster.sh
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,10 @@ fi
CELL=zone1 ../common/scripts/vtctld-up.sh

# start unsharded keyspace and tablet
CELL=zone1 TABLET_UID=100 ../common/scripts/mysqlctl-up.sh
SHARD=0 CELL=zone1 KEYSPACE=main TABLET_UID=100 ../common/scripts/vttablet-up.sh
for T_UID in 100 101; do
CELL=zone1 TABLET_UID="${T_UID}" ../common/scripts/mysqlctl-up.sh
SHARD=0 CELL=zone1 KEYSPACE=main TABLET_UID="${T_UID}" ../common/scripts/vttablet-up.sh
done

# set the correct durability policy for the keyspace
vtctldclient --server localhost:15999 SetKeyspaceDurabilityPolicy --durability-policy=none main || fail "Failed to set keyspace durability policy on the main keyspace"
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -19,85 +19,19 @@ package loadkeyspace
import (
"os"
"path"
"strings"
"testing"
"time"

"github.com/stretchr/testify/require"

"vitess.io/vitess/go/test/endtoend/utils"

"vitess.io/vitess/go/test/endtoend/cluster"
)

// Package-level fixtures shared by the tests in this file.
var (
// clusterInstance is the local process cluster under test; it is
// (re)assigned by TestLoadKeyspaceWithNoTablet via cluster.NewCluster.
clusterInstance *cluster.LocalProcessCluster
// hostname and cell are the arguments passed to cluster.NewCluster.
hostname = "localhost"
// keyspaceName is the name given to the keyspace created by the test.
keyspaceName = "ks"
cell = "zone1"
// sqlSchema is used as the SchemaSQL of the test keyspace. It defines
// three tables: vt_user, main and test_table.
sqlSchema = `
create table vt_user (
id bigint,
name varchar(64),
primary key (id)
) Engine=InnoDB;

create table main (
id bigint,
val varchar(128),
primary key(id)
) Engine=InnoDB;

create table test_table (
id bigint,
val varchar(128),
primary key(id)
) Engine=InnoDB;
`
)

// TestLoadKeyspaceWithNoTablet starts a cluster with a single unsharded
// keyspace, tears down every vttablet of that keyspace, and then starts vtgate
// with --schema_change_signal. The test passes once vtgate's stderr log
// contains "Unable to get initial schema reload".
func TestLoadKeyspaceWithNoTablet(t *testing.T) {
	clusterInstance = cluster.NewCluster(cell, hostname)
	defer clusterInstance.Teardown()

	// Bring up the topo server first.
	require.NoError(t, clusterInstance.StartTopo())

	// Create the unsharded keyspace, with the schema change signal enabled
	// on the vttablets.
	ks := cluster.Keyspace{
		Name:      keyspaceName,
		SchemaSQL: sqlSchema,
	}
	clusterInstance.VtTabletExtraArgs = append(clusterInstance.VtTabletExtraArgs, "--queryserver-config-schema-change-signal")
	require.NoError(t, clusterInstance.StartUnshardedKeyspace(ks, 0, false))

	// Tear down every vttablet and wait until each one reports an empty status.
	for _, tablet := range clusterInstance.Keyspaces[0].Shards[0].Vttablets {
		require.NoError(t, tablet.VttabletProcess.TearDown())
		isDown := func() bool {
			return tablet.VttabletProcess.GetStatus() == ""
		}
		utils.TimeoutAction(t, 1*time.Minute, "timeout - teardown of VTTablet", isDown)
	}

	// Start vtgate with the schema_change_signal flag.
	clusterInstance.VtGateExtraArgs = append(clusterInstance.VtGateExtraArgs, "--schema_change_signal")
	require.NoError(t, clusterInstance.StartVtgate())

	// After starting VTGate we need to leave enough time for
	// resolveAndLoadKeyspace to reach the schema tracking timeout (5 seconds),
	// so keep checking the stderr log for the expected message.
	sawReloadFailure := func() bool {
		stderrFile := path.Join(clusterInstance.VtgateProcess.LogDir, "vtgate-stderr.txt")
		// The read error is discarded: an unreadable file yields contents
		// that do not include the message, so this check reports false.
		contents, _ := os.ReadFile(stderrFile)
		return strings.Contains(string(contents), "Unable to get initial schema reload")
	}
	utils.TimeoutAction(t, 5*time.Minute, "timeout - could not find 'Unable to get initial schema reload' in 'vtgate-stderr.txt'", sawReloadFailure)
}

func TestNoInitialKeyspace(t *testing.T) {
var err error

Expand Down
3 changes: 1 addition & 2 deletions go/vt/vtgate/tabletgateway.go
Original file line number Diff line number Diff line change
Expand Up @@ -207,9 +207,8 @@ func (gw *TabletGateway) WaitForTablets(ctx context.Context, tabletTypesToWait [
case context.DeadlineExceeded:
// In this scenario, we were able to reach the
// topology service, but some tablets may not be
// ready. We just warn and keep going.
// ready.
log.Warningf("Timeout waiting for all keyspaces / shards to have healthy tablets of types %v, may be in degraded mode", tabletTypesToWait)
err = nil
}
}()

Expand Down
17 changes: 15 additions & 2 deletions go/vt/vtgate/vtgate.go
Original file line number Diff line number Diff line change
Expand Up @@ -321,8 +321,21 @@ func Init(
// TabletGateway can create its own healthcheck
gw := NewTabletGateway(ctx, hc, serv, cell)
gw.RegisterStats()
if err := gw.WaitForTablets(ctx, tabletTypesToWait); err != nil {
log.Fatalf("tabletGateway.WaitForTablets failed: %v", err)

// Retry loop for potential time-outs waiting for all tablets.
OuterLoop:
for {
err := gw.WaitForTablets(ctx, tabletTypesToWait)
switch {
case err == nil:
break OuterLoop
case errors.Is(err, context.DeadlineExceeded):
log.Warning("TabletGateway timed out waiting for tablets to become available - retrying.")

continue
default:
log.Fatalf("tabletGateway.WaitForTablets failed: %v", err)
}
}

dynamicConfig := NewDynamicViperConfig()
Expand Down
Loading