@@ -32,12 +32,16 @@ import (
32
32
"vitess.io/vitess/go/stats"
33
33
"vitess.io/vitess/go/test/utils"
34
34
"vitess.io/vitess/go/vt/discovery"
35
+ "vitess.io/vitess/go/vt/log"
36
+ "vitess.io/vitess/go/vt/logutil"
35
37
"vitess.io/vitess/go/vt/srvtopo"
36
38
"vitess.io/vitess/go/vt/topo"
39
+ "vitess.io/vitess/go/vt/topo/topoproto"
37
40
"vitess.io/vitess/go/vt/vterrors"
38
41
"vitess.io/vitess/go/vt/vttablet/sandboxconn"
39
42
40
43
binlogdatapb "vitess.io/vitess/go/vt/proto/binlogdata"
44
+ querypb "vitess.io/vitess/go/vt/proto/query"
41
45
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
42
46
vtgatepb "vitess.io/vitess/go/vt/proto/vtgate"
43
47
vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc"
@@ -1559,6 +1563,133 @@ func TestKeyspaceHasBeenSharded(t *testing.T) {
1559
1563
}
1560
1564
}
1561
1565
1566
+ // TestVStreamManagerHealthcheckResponseHandling tests the handling of healthcheck responses by
1567
+ // the vstream manager to confirm that we are correctly restarting the vstream when we should.
1568
+ func TestVStreamManagerHealthcheckResponseHandling (t * testing.T ) {
1569
+ ctx := utils .LeakCheckContext (t )
1570
+
1571
+ // Capture the vstream warning log. Otherwise we need to re-implement the vstream error
1572
+ // handling in SandboxConn's implementation and then we're not actually testing the
1573
+ // production code.
1574
+ logger := logutil .NewMemoryLogger ()
1575
+ log .Warningf = logger .Warningf
1576
+
1577
+ cell := "aa"
1578
+ ks := "TestVStream"
1579
+ shard := "0"
1580
+ tabletType := topodatapb .TabletType_REPLICA
1581
+ _ = createSandbox (ks )
1582
+ healthChan := make (chan * discovery.TabletHealth )
1583
+ hc := discovery .NewFakeHealthCheck (healthChan )
1584
+ st := getSandboxTopo (ctx , cell , ks , []string {shard })
1585
+ vsm := newTestVStreamManager (ctx , hc , st , cell )
1586
+ vgtid := & binlogdatapb.VGtid {
1587
+ ShardGtids : []* binlogdatapb.ShardGtid {{
1588
+ Keyspace : ks ,
1589
+ Shard : shard ,
1590
+ }},
1591
+ }
1592
+ source := hc .AddTestTablet (cell , "1.1.1.1" , 1001 , ks , shard , tabletType , true , 0 , nil )
1593
+ tabletAlias := topoproto .TabletAliasString (source .Tablet ().Alias )
1594
+ addTabletToSandboxTopo (t , ctx , st , ks , shard , source .Tablet ())
1595
+ target := & querypb.Target {
1596
+ Cell : cell ,
1597
+ Keyspace : ks ,
1598
+ Shard : shard ,
1599
+ TabletType : tabletType ,
1600
+ }
1601
+ highLag := uint32 (discovery .GetLowReplicationLag ().Seconds ()) + 1
1602
+
1603
+ type testcase struct {
1604
+ name string
1605
+ hcRes * querypb.StreamHealthResponse
1606
+ wantErr string
1607
+ }
1608
+ testcases := []testcase {
1609
+ {
1610
+ name : "all healthy" , // Will hit the context timeout
1611
+ },
1612
+ {
1613
+ name : "failure" ,
1614
+ hcRes : & querypb.StreamHealthResponse {
1615
+ TabletAlias : source .Tablet ().Alias ,
1616
+ Target : nil , // This is seen as a healthcheck stream failure
1617
+ },
1618
+ wantErr : fmt .Sprintf ("health check failed on %s" , tabletAlias ),
1619
+ },
1620
+ {
1621
+ name : "tablet type changed" ,
1622
+ hcRes : & querypb.StreamHealthResponse {
1623
+ TabletAlias : source .Tablet ().Alias ,
1624
+ Target : & querypb.Target {
1625
+ Cell : cell ,
1626
+ Keyspace : ks ,
1627
+ Shard : shard ,
1628
+ TabletType : topodatapb .TabletType_PRIMARY ,
1629
+ },
1630
+ PrimaryTermStartTimestamp : time .Now ().Unix (),
1631
+ RealtimeStats : & querypb.RealtimeStats {},
1632
+ },
1633
+ wantErr : fmt .Sprintf ("tablet %s type has changed from %s to %s" ,
1634
+ tabletAlias , tabletType , topodatapb .TabletType_PRIMARY .String ()),
1635
+ },
1636
+ {
1637
+ name : "unhealthy" ,
1638
+ hcRes : & querypb.StreamHealthResponse {
1639
+ TabletAlias : source .Tablet ().Alias ,
1640
+ Target : target ,
1641
+ RealtimeStats : & querypb.RealtimeStats {
1642
+ HealthError : "unhealthy" ,
1643
+ },
1644
+ },
1645
+ wantErr : fmt .Sprintf ("tablet %s is no longer healthy" , tabletAlias ),
1646
+ },
1647
+ {
1648
+ name : "replication lag too high" ,
1649
+ hcRes : & querypb.StreamHealthResponse {
1650
+ TabletAlias : source .Tablet ().Alias ,
1651
+ Target : target ,
1652
+ RealtimeStats : & querypb.RealtimeStats {
1653
+ ReplicationLagSeconds : highLag ,
1654
+ },
1655
+ },
1656
+ wantErr : fmt .Sprintf ("%s has a replication lag of %d seconds which is beyond the value provided" ,
1657
+ tabletAlias , highLag ),
1658
+ },
1659
+ }
1660
+
1661
+ warningIdx := 0
1662
+ for _ , tc := range testcases {
1663
+ t .Run (tc .name , func (t * testing.T ) {
1664
+ done := make (chan struct {})
1665
+ go func () {
1666
+ sctx , cancel := context .WithTimeout (ctx , 3 * time .Second )
1667
+ defer cancel ()
1668
+ defer close (done )
1669
+ // SandboxConn's VStream implementation always waits for the context to timeout.
1670
+ err := vsm .VStream (sctx , tabletType , vgtid , nil , nil , func (events []* binlogdatapb.VEvent ) error {
1671
+ require .Fail (t , "unexpected event" , "Receieved unexpected events: %v" , events )
1672
+ return nil
1673
+ })
1674
+ if tc .wantErr != "" { // Otherwise we simply expect the context to timeout
1675
+ if len (logger .Events ) < warningIdx + 1 {
1676
+ require .Fail (t , "expected vstream error" , "vstream did not encounter a healthstream error, expected: %s" , tc .wantErr )
1677
+ }
1678
+ loggedErr := logger .Events [warningIdx ]
1679
+ warningIdx ++
1680
+ if loggedErr == nil || ! strings .Contains (loggedErr .String (), tc .wantErr ) {
1681
+ require .Fail (t , "unexpected vstream error" , "vstream ended with error: %v, which did not contain: %s" , err , tc .wantErr )
1682
+ }
1683
+ }
1684
+ }()
1685
+ if tc .wantErr != "" {
1686
+ source .SetStreamHealthResponse (tc .hcRes )
1687
+ }
1688
+ <- done
1689
+ })
1690
+ }
1691
+ }
1692
+
1562
1693
func newTestVStreamManager (ctx context.Context , hc discovery.HealthCheck , serv srvtopo.Server , cell string ) * vstreamManager {
1563
1694
gw := NewTabletGateway (ctx , hc , serv , cell )
1564
1695
srvResolver := srvtopo .NewResolver (serv , gw , cell )
0 commit comments