Skip to content

Commit 9c64cbd

Browse files
committed
Add unit test
Signed-off-by: Matt Lord <mattalord@gmail.com>
1 parent f86320e commit 9c64cbd

File tree

3 files changed

+151
-6
lines changed

3 files changed

+151
-6
lines changed

go/vt/vtgate/vstream_manager.go

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -555,18 +555,19 @@ func (vs *vstream) streamFromTablet(ctx context.Context, sgtid *binlogdatapb.Sha
555555
case ctx.Err() != nil:
556556
err = fmt.Errorf("context has ended")
557557
case shr == nil || shr.RealtimeStats == nil || shr.Target == nil:
558-
err = fmt.Errorf("health check failed")
558+
err = fmt.Errorf("health check failed on %s", topoproto.TabletAliasString(tablet.Alias))
559559
case vs.tabletType != shr.Target.TabletType:
560-
err = fmt.Errorf("tablet type has changed from %s to %s, restarting vstream",
561-
vs.tabletType, shr.Target.TabletType)
560+
err = fmt.Errorf("tablet %s type has changed from %s to %s, restarting vstream",
561+
topoproto.TabletAliasString(tablet.Alias), vs.tabletType, shr.Target.TabletType)
562562
case shr.RealtimeStats.HealthError != "":
563563
err = fmt.Errorf("tablet %s is no longer healthy: %s, restarting vstream",
564-
tablet.Alias, shr.RealtimeStats.HealthError)
564+
topoproto.TabletAliasString(tablet.Alias), shr.RealtimeStats.HealthError)
565565
case shr.RealtimeStats.ReplicationLagSeconds > uint32(discovery.GetLowReplicationLag().Seconds()):
566566
err = fmt.Errorf("tablet %s has a replication lag of %d seconds which is beyond the value provided in --discovery_low_replication_lag of %s so the tablet is no longer considered healthy, restarting vstream",
567567
topoproto.TabletAliasString(tablet.Alias), shr.RealtimeStats.ReplicationLagSeconds, discovery.GetLowReplicationLag())
568568
}
569569
if err != nil {
570+
log.Warningf("Tablet state changed: %s, attempting to restart", err)
570571
errCh <- err
571572
return err
572573
}
@@ -597,7 +598,6 @@ func (vs *vstream) streamFromTablet(ctx context.Context, sgtid *binlogdatapb.Sha
597598
case <-ctx.Done():
598599
return ctx.Err()
599600
case streamErr := <-errCh:
600-
log.Warningf("Tablet state changed: %s, attempting to restart", streamErr)
601601
return vterrors.New(vtrpcpb.Code_UNAVAILABLE, streamErr.Error())
602602
case <-journalDone:
603603
// Unreachable.

go/vt/vtgate/vstream_manager_test.go

Lines changed: 131 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -32,12 +32,16 @@ import (
3232
"vitess.io/vitess/go/stats"
3333
"vitess.io/vitess/go/test/utils"
3434
"vitess.io/vitess/go/vt/discovery"
35+
"vitess.io/vitess/go/vt/log"
36+
"vitess.io/vitess/go/vt/logutil"
3537
"vitess.io/vitess/go/vt/srvtopo"
3638
"vitess.io/vitess/go/vt/topo"
39+
"vitess.io/vitess/go/vt/topo/topoproto"
3740
"vitess.io/vitess/go/vt/vterrors"
3841
"vitess.io/vitess/go/vt/vttablet/sandboxconn"
3942

4043
binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata"
44+
querypb "vitess.io/vitess/go/vt/proto/query"
4145
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
4246
vtgatepb "vitess.io/vitess/go/vt/proto/vtgate"
4347
vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc"
@@ -1559,6 +1563,133 @@ func TestKeyspaceHasBeenSharded(t *testing.T) {
15591563
}
15601564
}
15611565

1566+
// TestVStreamManagerHealthcheckResponseHandling tests the handling of healthcheck responses by
1567+
// the vstream manager to confirm that we are correctly restarting the vstream when we should.
1568+
func TestVStreamManagerHealthcheckResponseHandling(t *testing.T) {
1569+
ctx := utils.LeakCheckContext(t)
1570+
1571+
// Capture the vstream warning log. Otherwise we need to re-implement the vstream error
1572+
// handling in SandboxConn's implementation and then we're not actually testing the
1573+
// production code.
1574+
logger := logutil.NewMemoryLogger()
1575+
log.Warningf = logger.Warningf
1576+
1577+
cell := "aa"
1578+
ks := "TestVStream"
1579+
shard := "0"
1580+
tabletType := topodatapb.TabletType_REPLICA
1581+
_ = createSandbox(ks)
1582+
healthChan := make(chan *discovery.TabletHealth)
1583+
hc := discovery.NewFakeHealthCheck(healthChan)
1584+
st := getSandboxTopo(ctx, cell, ks, []string{shard})
1585+
vsm := newTestVStreamManager(ctx, hc, st, cell)
1586+
vgtid := &binlogdatapb.VGtid{
1587+
ShardGtids: []*binlogdatapb.ShardGtid{{
1588+
Keyspace: ks,
1589+
Shard: shard,
1590+
}},
1591+
}
1592+
source := hc.AddTestTablet(cell, "1.1.1.1", 1001, ks, shard, tabletType, true, 0, nil)
1593+
tabletAlias := topoproto.TabletAliasString(source.Tablet().Alias)
1594+
addTabletToSandboxTopo(t, ctx, st, ks, shard, source.Tablet())
1595+
target := &querypb.Target{
1596+
Cell: cell,
1597+
Keyspace: ks,
1598+
Shard: shard,
1599+
TabletType: tabletType,
1600+
}
1601+
highLag := uint32(discovery.GetLowReplicationLag().Seconds()) + 1
1602+
1603+
type testcase struct {
1604+
name string
1605+
hcRes *querypb.StreamHealthResponse
1606+
wantErr string
1607+
}
1608+
testcases := []testcase{
1609+
{
1610+
name: "all healthy", // Will hit the context timeout
1611+
},
1612+
{
1613+
name: "failure",
1614+
hcRes: &querypb.StreamHealthResponse{
1615+
TabletAlias: source.Tablet().Alias,
1616+
Target: nil, // This is seen as a healthcheck stream failure
1617+
},
1618+
wantErr: fmt.Sprintf("health check failed on %s", tabletAlias),
1619+
},
1620+
{
1621+
name: "tablet type changed",
1622+
hcRes: &querypb.StreamHealthResponse{
1623+
TabletAlias: source.Tablet().Alias,
1624+
Target: &querypb.Target{
1625+
Cell: cell,
1626+
Keyspace: ks,
1627+
Shard: shard,
1628+
TabletType: topodatapb.TabletType_PRIMARY,
1629+
},
1630+
PrimaryTermStartTimestamp: time.Now().Unix(),
1631+
RealtimeStats: &querypb.RealtimeStats{},
1632+
},
1633+
wantErr: fmt.Sprintf("tablet %s type has changed from %s to %s",
1634+
tabletAlias, tabletType, topodatapb.TabletType_PRIMARY.String()),
1635+
},
1636+
{
1637+
name: "unhealthy",
1638+
hcRes: &querypb.StreamHealthResponse{
1639+
TabletAlias: source.Tablet().Alias,
1640+
Target: target,
1641+
RealtimeStats: &querypb.RealtimeStats{
1642+
HealthError: "unhealthy",
1643+
},
1644+
},
1645+
wantErr: fmt.Sprintf("tablet %s is no longer healthy", tabletAlias),
1646+
},
1647+
{
1648+
name: "replication lag too high",
1649+
hcRes: &querypb.StreamHealthResponse{
1650+
TabletAlias: source.Tablet().Alias,
1651+
Target: target,
1652+
RealtimeStats: &querypb.RealtimeStats{
1653+
ReplicationLagSeconds: highLag,
1654+
},
1655+
},
1656+
wantErr: fmt.Sprintf("%s has a replication lag of %d seconds which is beyond the value provided",
1657+
tabletAlias, highLag),
1658+
},
1659+
}
1660+
1661+
warningIdx := 0
1662+
for _, tc := range testcases {
1663+
t.Run(tc.name, func(t *testing.T) {
1664+
done := make(chan struct{})
1665+
go func() {
1666+
sctx, cancel := context.WithTimeout(ctx, 3*time.Second)
1667+
defer cancel()
1668+
defer close(done)
1669+
// SandboxConn's VStream implementation always waits for the context to timeout.
1670+
err := vsm.VStream(sctx, tabletType, vgtid, nil, nil, func(events []*binlogdatapb.VEvent) error {
1671+
require.Fail(t, "unexpected event", "Receieved unexpected events: %v", events)
1672+
return nil
1673+
})
1674+
if tc.wantErr != "" { // Otherwise we simply expect the context to timeout
1675+
if len(logger.Events) < warningIdx+1 {
1676+
require.Fail(t, "expected vstream error", "vstream did not encounter a healthstream error, expected: %s", tc.wantErr)
1677+
}
1678+
loggedErr := logger.Events[warningIdx]
1679+
warningIdx++
1680+
if loggedErr == nil || !strings.Contains(loggedErr.String(), tc.wantErr) {
1681+
require.Fail(t, "unexpected vstream error", "vstream ended with error: %v, which did not contain: %s", err, tc.wantErr)
1682+
}
1683+
}
1684+
}()
1685+
if tc.wantErr != "" {
1686+
source.SetStreamHealthResponse(tc.hcRes)
1687+
}
1688+
<-done
1689+
})
1690+
}
1691+
}
1692+
15621693
func newTestVStreamManager(ctx context.Context, hc discovery.HealthCheck, serv srvtopo.Server, cell string) *vstreamManager {
15631694
gw := NewTabletGateway(ctx, hc, serv, cell)
15641695
srvResolver := srvtopo.NewResolver(serv, gw, cell)

go/vt/vttablet/sandboxconn/sandboxconn.go

Lines changed: 15 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -133,6 +133,8 @@ type SandboxConn struct {
133133
getSchemaResult []SchemaResult
134134

135135
parser *sqlparser.Parser
136+
137+
streamHealthResponse *querypb.StreamHealthResponse
136138
}
137139

138140
type SchemaResult struct {
@@ -475,8 +477,20 @@ func (sbc *SandboxConn) MessageAck(ctx context.Context, target *querypb.Target,
475477
// SandboxSQRowCount is the default number of fake splits returned.
476478
var SandboxSQRowCount = int64(10)
477479

478-
// StreamHealth always mocks a "healthy" result.
480+
func (sbc *SandboxConn) SetStreamHealthResponse(res *querypb.StreamHealthResponse) {
481+
sbc.mapMu.Lock()
482+
defer sbc.mapMu.Unlock()
483+
sbc.streamHealthResponse = res
484+
}
485+
486+
// StreamHealth always mocks a "healthy" result by default. If you want to overrid this behavior you
487+
// call SetStreamHealthResponse.
479488
func (sbc *SandboxConn) StreamHealth(ctx context.Context, callback func(*querypb.StreamHealthResponse) error) error {
489+
sbc.mapMu.Lock()
490+
defer sbc.mapMu.Unlock()
491+
if sbc.streamHealthResponse != nil {
492+
return callback(sbc.streamHealthResponse)
493+
}
480494
return nil
481495
}
482496

0 commit comments

Comments
 (0)