Skip to content

Commit e299b70

Browse files
authored
Merge branch 'slack-15.0' into txthrottler_backport_14693
2 parents 7d18296 + d7f4675 commit e299b70

File tree

11 files changed

+508
-97
lines changed

11 files changed

+508
-97
lines changed

go/vt/discovery/tablet_picker.go

Lines changed: 51 additions & 50 deletions
Original file line numberDiff line numberDiff line change
@@ -17,28 +17,26 @@ limitations under the License.
1717
package discovery
1818

1919
import (
20+
"context"
2021
"fmt"
22+
"io"
2123
"math/rand"
2224
"sort"
2325
"strings"
2426
"sync"
2527
"time"
2628

2729
"vitess.io/vitess/go/stats"
28-
30+
"vitess.io/vitess/go/vt/grpcclient"
31+
"vitess.io/vitess/go/vt/log"
32+
"vitess.io/vitess/go/vt/topo"
2933
"vitess.io/vitess/go/vt/topo/topoproto"
30-
31-
vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc"
32-
34+
"vitess.io/vitess/go/vt/vterrors"
3335
"vitess.io/vitess/go/vt/vttablet/tabletconn"
3436

35-
"vitess.io/vitess/go/vt/log"
36-
37-
"context"
38-
37+
querypb "vitess.io/vitess/go/vt/proto/query"
3938
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
40-
"vitess.io/vitess/go/vt/topo"
41-
"vitess.io/vitess/go/vt/vterrors"
39+
vtrpcpb "vitess.io/vitess/go/vt/proto/vtrpc"
4240
)
4341

4442
type TabletPickerCellPreference int
@@ -291,13 +289,12 @@ func (tp *TabletPicker) orderByTabletType(candidates []*topo.TabletInfo) []*topo
291289
return candidates
292290
}
293291

294-
// PickForStreaming picks an available tablet.
292+
// PickForStreaming picks a tablet that is healthy and serving.
295293
// Selection is based on CellPreference.
296294
// See prioritizeTablets for prioritization logic.
297295
func (tp *TabletPicker) PickForStreaming(ctx context.Context) (*topodatapb.Tablet, error) {
298-
rand.Seed(time.Now().UnixNano())
299-
// keep trying at intervals (tabletPickerRetryDelay) until a tablet is found
300-
// or the context is canceled
296+
// Keep trying at intervals (tabletPickerRetryDelay) until a healthy
297+
// serving tablet is found or the context is cancelled.
301298
for {
302299
select {
303300
case <-ctx.Done():
@@ -330,15 +327,15 @@ func (tp *TabletPicker) PickForStreaming(ctx context.Context) (*topodatapb.Table
330327
} else if tp.inOrder {
331328
candidates = tp.orderByTabletType(candidates)
332329
} else {
333-
// Randomize candidates
330+
// Randomize candidates.
334331
rand.Shuffle(len(candidates), func(i, j int) {
335332
candidates[i], candidates[j] = candidates[j], candidates[i]
336333
})
337334
}
338335
if len(candidates) == 0 {
339-
// if no candidates were found, sleep and try again
336+
// If no viable candidates were found, sleep and try again.
340337
tp.incNoTabletFoundStat()
341-
log.Infof("No tablet found for streaming, shard %s.%s, cells %v, tabletTypes %v, sleeping for %.3f seconds",
338+
log.Infof("No healthy serving tablet found for streaming, shard %s.%s, cells %v, tabletTypes %v, sleeping for %.3f seconds.",
342339
tp.keyspace, tp.shard, tp.cells, tp.tabletTypes, float64(GetTabletPickerRetryDelay().Milliseconds())/1000.0)
343340
timer := time.NewTimer(GetTabletPickerRetryDelay())
344341
select {
@@ -349,34 +346,24 @@ func (tp *TabletPicker) PickForStreaming(ctx context.Context) (*topodatapb.Table
349346
}
350347
continue
351348
}
352-
for _, ti := range candidates {
353-
// try to connect to tablet
354-
if conn, err := tabletconn.GetDialer()(ti.Tablet, true); err == nil {
355-
// OK to use ctx here because it is not actually used by the underlying Close implementation
356-
_ = conn.Close(ctx)
357-
log.Infof("tablet picker found tablet %s", ti.Tablet.String())
358-
return ti.Tablet, nil
359-
}
360-
// err found
361-
log.Warningf("unable to connect to tablet for alias %v", ti.Alias)
362-
}
363-
// Got here? Means we iterated all tablets and did not find a healthy one
364-
tp.incNoTabletFoundStat()
349+
log.Infof("Tablet picker found a healthy serving tablet for streaming: %s", candidates[0].Tablet.String())
350+
return candidates[0].Tablet, nil
365351
}
366352
}
367353

368-
// GetMatchingTablets returns a list of TabletInfo for tablets
369-
// that match the cells, keyspace, shard and tabletTypes for this TabletPicker
354+
// GetMatchingTablets returns a list of TabletInfo for healthy
355+
// serving tablets that match the cells, keyspace, shard and
356+
// tabletTypes for this TabletPicker.
370357
func (tp *TabletPicker) GetMatchingTablets(ctx context.Context) []*topo.TabletInfo {
371-
// Special handling for PRIMARY tablet type
372-
// Since there is only one primary, we ignore cell and find the primary
358+
// Special handling for PRIMARY tablet type: since there is only
359+
// one primary per shard, we ignore cell and find the primary.
373360
aliases := make([]*topodatapb.TabletAlias, 0)
374361
if len(tp.tabletTypes) == 1 && tp.tabletTypes[0] == topodatapb.TabletType_PRIMARY {
375362
shortCtx, cancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout)
376363
defer cancel()
377364
si, err := tp.ts.GetShard(shortCtx, tp.keyspace, tp.shard)
378365
if err != nil {
379-
log.Errorf("error getting shard %s/%s: %s", tp.keyspace, tp.shard, err.Error())
366+
log.Errorf("Error getting shard %s/%s: %v", tp.keyspace, tp.shard, err)
380367
return nil
381368
}
382369
if _, ignore := tp.ignoreTablets[si.PrimaryAlias.String()]; !ignore {
@@ -385,37 +372,37 @@ func (tp *TabletPicker) GetMatchingTablets(ctx context.Context) []*topo.TabletIn
385372
} else {
386373
actualCells := make([]string, 0)
387374
for _, cell := range tp.cells {
388-
// check if cell is actually an alias
389-
// non-blocking read so that this is fast
375+
// Check if cell is actually an alias; using a
376+
// non-blocking read so that this is fast.
390377
shortCtx, cancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout)
391378
defer cancel()
392379
_, err := tp.ts.GetCellInfo(shortCtx, cell, false)
393380
if err != nil {
394-
// not a valid cell, check whether it is a cell alias
381+
// Not a valid cell, check whether it is a cell alias...
395382
shortCtx, cancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout)
396383
defer cancel()
397384
alias, err := tp.ts.GetCellsAlias(shortCtx, cell, false)
398-
// if we get an error, either cellAlias doesn't exist or it isn't a cell alias at all. Ignore and continue
385+
// If we get an error, either cellAlias doesn't exist or
386+
// it isn't a cell alias at all; ignore and continue.
399387
if err == nil {
400388
actualCells = append(actualCells, alias.Cells...)
401389
} else {
402390
log.Infof("Unable to resolve cell %s, ignoring", cell)
403391
}
404392
} else {
405-
// valid cell, add it to our list
393+
// Valid cell, add it to our list.
406394
actualCells = append(actualCells, cell)
407395
}
408396
}
409397

410398
for _, cell := range actualCells {
411399
shortCtx, cancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout)
412400
defer cancel()
413-
// match cell, keyspace and shard
401+
// Match cell, keyspace, and shard.
414402
sri, err := tp.ts.GetShardReplication(shortCtx, cell, tp.keyspace, tp.shard)
415403
if err != nil {
416404
continue
417405
}
418-
419406
for _, node := range sri.Nodes {
420407
if _, ignore := tp.ignoreTablets[node.TabletAlias.String()]; !ignore {
421408
aliases = append(aliases, node.TabletAlias)
@@ -427,33 +414,47 @@ func (tp *TabletPicker) GetMatchingTablets(ctx context.Context) []*topo.TabletIn
427414
if len(aliases) == 0 {
428415
return nil
429416
}
417+
430418
shortCtx, cancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout)
431419
defer cancel()
432420
tabletMap, err := tp.ts.GetTabletMap(shortCtx, aliases, nil)
433421
if err != nil {
434-
log.Warningf("error fetching tablets from topo: %v", err)
435-
// If we get a partial result we can still use it, otherwise return
422+
log.Warningf("Error fetching tablets from topo: %v", err)
423+
// If we get a partial result we can still use it, otherwise return.
436424
if len(tabletMap) == 0 {
437425
return nil
438426
}
439427
}
428+
440429
tablets := make([]*topo.TabletInfo, 0, len(aliases))
441430
for _, tabletAlias := range aliases {
442431
tabletInfo, ok := tabletMap[topoproto.TabletAliasString(tabletAlias)]
443432
if !ok {
444-
// Either tablet disappeared on us, or we got a partial result (GetTabletMap ignores
445-
// topo.ErrNoNode). Just log a warning
446-
log.Warningf("failed to load tablet %v", tabletAlias)
433+
// Either tablet disappeared on us, or we got a partial result
434+
// (GetTabletMap ignores topo.ErrNoNode); just log a warning.
435+
log.Warningf("Tablet picker failed to load tablet %v", tabletAlias)
447436
} else if topoproto.IsTypeInList(tabletInfo.Type, tp.tabletTypes) {
448-
tablets = append(tablets, tabletInfo)
437+
// Try to connect to the tablet and confirm that it's usable.
438+
if conn, err := tabletconn.GetDialer()(tabletInfo.Tablet, grpcclient.FailFast(true)); err == nil {
439+
// Ensure that the tablet is healthy and serving.
440+
shortCtx, cancel := context.WithTimeout(ctx, topo.RemoteOperationTimeout)
441+
defer cancel()
442+
if err := conn.StreamHealth(shortCtx, func(shr *querypb.StreamHealthResponse) error {
443+
if shr != nil && shr.Serving && shr.RealtimeStats != nil && shr.RealtimeStats.HealthError == "" {
444+
return io.EOF // End the stream
445+
}
446+
return vterrors.New(vtrpcpb.Code_INTERNAL, "tablet is not healthy and serving")
447+
}); err == nil || err == io.EOF {
448+
tablets = append(tablets, tabletInfo)
449+
}
450+
_ = conn.Close(ctx)
451+
}
449452
}
450453
}
451454
return tablets
452455
}
453456

454457
func init() {
455-
// TODO(sougou): consolidate this call to be once per process.
456-
rand.Seed(time.Now().UnixNano())
457458
globalTPStats = newTabletPickerStats()
458459
}
459460

go/vt/discovery/tablet_picker_test.go

Lines changed: 54 additions & 11 deletions
Original file line numberDiff line numberDiff line change
@@ -22,10 +22,11 @@ import (
2222
"github.com/stretchr/testify/require"
2323
"google.golang.org/protobuf/proto"
2424

25-
querypb "vitess.io/vitess/go/vt/proto/query"
26-
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
2725
"vitess.io/vitess/go/vt/topo"
2826
"vitess.io/vitess/go/vt/topo/memorytopo"
27+
28+
querypb "vitess.io/vitess/go/vt/proto/query"
29+
topodatapb "vitess.io/vitess/go/vt/proto/topodata"
2930
)
3031

3132
func TestPickPrimary(t *testing.T) {
@@ -503,6 +504,45 @@ func TestPickErrorOnlySpecified(t *testing.T) {
503504
require.Greater(t, globalTPStats.noTabletFoundError.Counts()["cell.ks.0.replica"], int64(0))
504505
}
505506

507+
// TestPickFallbackType tests that when providing a list of tablet types to
508+
// pick from, with the list in preference order, that when the primary/first
509+
// type has no available healthy serving tablets that we select a healthy
510+
// serving tablet from the secondary/second type.
511+
func TestPickFallbackType(t *testing.T) {
512+
cells := []string{"cell1", "cell2"}
513+
localCell := cells[0]
514+
tabletTypes := "replica,primary"
515+
options := TabletPickerOptions{
516+
TabletOrder: "InOrder",
517+
}
518+
te := newPickerTestEnv(t, cells)
519+
520+
// This one should be selected even though it's the secondary type
521+
// as it is healthy and serving.
522+
primaryTablet := addTablet(te, 100, topodatapb.TabletType_PRIMARY, localCell, true, true)
523+
defer deleteTablet(t, te, primaryTablet)
524+
525+
// Replica tablet should not be selected as it is unhealthy.
526+
replicaTablet := addTablet(te, 200, topodatapb.TabletType_REPLICA, localCell, false, false)
527+
defer deleteTablet(t, te, replicaTablet)
528+
529+
ctx, cancel := context.WithTimeout(context.Background(), 200*time.Millisecond)
530+
defer cancel()
531+
_, err := te.topoServ.UpdateShardFields(ctx, te.keyspace, te.shard, func(si *topo.ShardInfo) error {
532+
si.PrimaryAlias = primaryTablet.Alias
533+
return nil
534+
})
535+
require.NoError(t, err)
536+
537+
tp, err := NewTabletPicker(context.Background(), te.topoServ, cells, localCell, te.keyspace, te.shard, tabletTypes, options)
538+
require.NoError(t, err)
539+
ctx2, cancel2 := context.WithTimeout(context.Background(), 1*time.Second)
540+
defer cancel2()
541+
tablet, err := tp.PickForStreaming(ctx2)
542+
require.NoError(t, err)
543+
assert.True(t, proto.Equal(primaryTablet, tablet), "Pick: %v, want %v", tablet, primaryTablet)
544+
}
545+
506546
type pickerTestEnv struct {
507547
t *testing.T
508548
keyspace string
@@ -551,18 +591,21 @@ func addTablet(te *pickerTestEnv, id int, tabletType topodatapb.TabletType, cell
551591
err := te.topoServ.CreateTablet(context.Background(), tablet)
552592
require.NoError(te.t, err)
553593

594+
shr := &querypb.StreamHealthResponse{
595+
Serving: serving,
596+
Target: &querypb.Target{
597+
Keyspace: te.keyspace,
598+
Shard: te.shard,
599+
TabletType: tabletType,
600+
},
601+
RealtimeStats: &querypb.RealtimeStats{HealthError: "tablet is unhealthy"},
602+
}
554603
if healthy {
555-
_ = createFixedHealthConn(tablet, &querypb.StreamHealthResponse{
556-
Serving: serving,
557-
Target: &querypb.Target{
558-
Keyspace: te.keyspace,
559-
Shard: te.shard,
560-
TabletType: tabletType,
561-
},
562-
RealtimeStats: &querypb.RealtimeStats{HealthError: ""},
563-
})
604+
shr.RealtimeStats.HealthError = ""
564605
}
565606

607+
_ = createFixedHealthConn(tablet, shr)
608+
566609
return tablet
567610
}
568611

go/vt/vterrors/last_error.go

Lines changed: 88 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,88 @@
1+
/*
2+
Copyright 2022 The Vitess Authors.
3+
4+
Licensed under the Apache License, Version 2.0 (the "License");
5+
you may not use this file except in compliance with the License.
6+
You may obtain a copy of the License at
7+
8+
http://www.apache.org/licenses/LICENSE-2.0
9+
10+
Unless required by applicable law or agreed to in writing, software
11+
distributed under the License is distributed on an "AS IS" BASIS,
12+
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
See the License for the specific language governing permissions and
14+
limitations under the License.
15+
*/
16+
17+
package vterrors
18+
19+
import (
20+
"sync"
21+
"time"
22+
23+
"vitess.io/vitess/go/vt/log"
24+
)
25+
26+
/*
27+
* LastError tracks the most recent error for any ongoing process and how long it has persisted.
28+
* The err field should be a vterror to ensure we have meaningful error codes, causes, stack
29+
* traces, etc.
30+
*/
31+
type LastError struct {
32+
name string
33+
err error
34+
firstSeen time.Time
35+
lastSeen time.Time
36+
mu sync.Mutex
37+
maxTimeInError time.Duration // if error persists for this long, shouldRetry() will return false
38+
}
39+
40+
func NewLastError(name string, maxTimeInError time.Duration) *LastError {
41+
return &LastError{
42+
name: name,
43+
maxTimeInError: maxTimeInError,
44+
}
45+
}
46+
47+
func (le *LastError) Record(err error) {
48+
le.mu.Lock()
49+
defer le.mu.Unlock()
50+
if err == nil {
51+
le.err = nil
52+
le.firstSeen = time.Time{}
53+
le.lastSeen = time.Time{}
54+
return
55+
}
56+
if !Equals(err, le.err) {
57+
le.firstSeen = time.Now()
58+
le.lastSeen = time.Now()
59+
le.err = err
60+
} else {
61+
// same error seen
62+
if time.Since(le.lastSeen) > le.maxTimeInError {
63+
// reset firstSeen, since it has been long enough since the last time we saw this error
64+
log.Infof("Resetting firstSeen for %s, since it is too long since the last one", le.name)
65+
le.firstSeen = time.Now()
66+
}
67+
le.lastSeen = time.Now()
68+
}
69+
}
70+
71+
func (le *LastError) ShouldRetry() bool {
72+
le.mu.Lock()
73+
defer le.mu.Unlock()
74+
if le.maxTimeInError == 0 {
75+
// The value of 0 means "no time limit"
76+
return true
77+
}
78+
if le.firstSeen.IsZero() {
79+
return true
80+
}
81+
if time.Since(le.firstSeen) <= le.maxTimeInError {
82+
// within the max time range
83+
return true
84+
}
85+
log.Errorf("%s: the same error was encountered continuously since %s, it is now assumed to be unrecoverable; any affected operations will need to be manually restarted once error '%s' has been addressed",
86+
le.name, le.firstSeen.UTC(), le.err)
87+
return false
88+
}

0 commit comments

Comments
 (0)