Skip to content

Commit

Permalink
If we run out of tablets, surface the previous error
Browse files: browse the repository at this point in the history.
Signed-off-by: twthorn <thomaswilliamthornton@gmail.com>
  • Loading branch information
twthorn committed Aug 6, 2024
1 parent fe75145 commit c303404
Show file tree
Hide file tree
Showing 2 changed files with 14 additions and 13 deletions.
21 changes: 9 additions & 12 deletions go/vt/vtgate/vstream_manager.go
Original file line number Diff line number Diff line change
Expand Up @@ -511,6 +511,7 @@ func (vs *vstream) streamFromTablet(ctx context.Context, sgtid *binlogdatapb.Sha
// It will be closed when all journal events converge.
var journalDone chan struct{}
ignoreTablets := make([]*topodatapb.TabletAlias, 0)
var prevErr error

backoffIndex := 0
for {
Expand Down Expand Up @@ -552,6 +553,12 @@ func (vs *vstream) streamFromTablet(ctx context.Context, sgtid *binlogdatapb.Sha
if err != nil {
return tabletPickerErr(err)
}
if len(tp.GetMatchingTablets(ctx)) == 0 {
tperr := vterrors.Wrapf(prevErr, "zero matching tablets for %s tablet for VStream in %s/%s within the %s cell(s)",
vs.tabletType.String(), sgtid.GetKeyspace(), sgtid.GetShard(), strings.Join(cells, ","))
log.Errorf("%v", tperr)
return tperr
}
// Create a child context with a stricter timeout when picking a tablet.
// This will prevent hanging in the case no tablets are found.
tpCtx, tpCancel := context.WithTimeout(ctx, tabletPickerContextTimeout)
Expand Down Expand Up @@ -738,11 +745,9 @@ func (vs *vstream) streamFromTablet(ctx context.Context, sgtid *binlogdatapb.Sha
}

vs.lastError.Record(err)
prevErr = err

if shouldFailNow(err) {
log.Errorf("VStream for %s/%s error: %v", sgtid.Keyspace, sgtid.Shard, err)
return err
} else if vs.lastError.ShouldRetry() {
if vs.lastError.ShouldRetry() {
log.Infof("Retrying tablet, count: %d, alias: %v, hostname: %s", backoffIndex, tablet.GetAlias(), tablet.GetHostname())
retryDelay := vs.backoffStrategy.Backoff(backoffIndex)
backoffIndex++
Expand All @@ -758,14 +763,6 @@ func (vs *vstream) streamFromTablet(ctx context.Context, sgtid *binlogdatapb.Sha
}
}

// shouldFailNow reports whether err should end the stream immediately
// instead of being retried. It returns true only when the error's vterrors
// code is Code_UNKNOWN and its message contains the "not all journaling
// participants are in the stream" text; every other error yields false.
func shouldFailNow(err error) bool {
	// Message fragment identifying the journaling-participants failure.
	const journalParticipantsMsg = "not all journaling participants are in the stream"

	// Check the code first so err.Error() is only consulted for UNKNOWN errors.
	if vterrors.Code(err) != vtrpcpb.Code_UNKNOWN {
		return false
	}
	return strings.Contains(err.Error(), journalParticipantsMsg)
}

// sendAll sends a group of events together while holding the lock.
func (vs *vstream) sendAll(ctx context.Context, sgtid *binlogdatapb.ShardGtid, eventss [][]*binlogdatapb.VEvent) error {
vs.mu.Lock()
Expand Down
6 changes: 5 additions & 1 deletion go/vt/vtgate/vstream_manager_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -460,7 +460,7 @@ func TestVStreamRetriableErrors(t *testing.T) {
if tcase.shouldSwitchTablets {
// Retry just once before trying another tablet.
vsm.maxTimeInError = 1 * time.Nanosecond
vsm.baseRetryDelay = 1 * time.Millisecond
vsm.baseRetryDelay = 1 * time.Nanosecond
} else {
// Retry at least once on the same tablet.
vsm.maxTimeInError = 1 * time.Second
Expand Down Expand Up @@ -937,6 +937,8 @@ func TestVStreamJournalPartialMatch(t *testing.T) {
hc := discovery.NewFakeHealthCheck(nil)
st := getSandboxTopo(ctx, cell, ks, []string{"-20", "-10", "10-20"})
vsm := newTestVStreamManager(ctx, hc, st, "aa")
vsm.maxTimeInError = 1 * time.Nanosecond
vsm.baseRetryDelay = 1 * time.Nanosecond
sbc1 := hc.AddTestTablet("aa", "1.1.1.1", 1002, ks, "-10", topodatapb.TabletType_PRIMARY, true, 1, nil)
addTabletToSandboxTopo(t, ctx, st, ks, "-10", sbc1.Tablet())
sbc2 := hc.AddTestTablet("aa", "1.1.1.1", 1003, ks, "10-20", topodatapb.TabletType_PRIMARY, true, 1, nil)
Expand Down Expand Up @@ -1584,6 +1586,8 @@ func TestVStreamManagerHealthCheckResponseHandling(t *testing.T) {
hc := discovery.NewFakeHealthCheck(nil)
st := getSandboxTopo(ctx, cell, ks, []string{shard})
vsm := newTestVStreamManager(ctx, hc, st, cell)
vsm.maxTimeInError = 1 * time.Nanosecond
vsm.baseRetryDelay = 1 * time.Nanosecond
vgtid := &binlogdatapb.VGtid{
ShardGtids: []*binlogdatapb.ShardGtid{{
Keyspace: ks,
Expand Down

0 comments on commit c303404

Please sign in to comment.