Skip to content

Commit 7019af3

Browse files
committed
Fix another bug in cancel: reverting the denied tables list
Signed-off-by: Matt Lord <mattalord@gmail.com>
1 parent 45adff7 commit 7019af3

File tree

4 files changed

+48
-18
lines changed

4 files changed

+48
-18
lines changed

go/test/endtoend/vreplication/vreplication_test.go

Lines changed: 14 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -918,16 +918,27 @@ func shardCustomer(t *testing.T, testReverse bool, cells []*Cell, sourceCellOrAl
918918
addTestRows()
919919
timeout := lagDuration * 2 // 6s
920920
// Use the default max-replication-lag-allowed value of 30s.
921-
out, err = vc.VtctldClient.ExecuteCommandWithOutput(workflowType, "--workflow", workflow, "--target-keyspace", targetKs,
922-
"SwitchTraffic", "--tablet-types=primary", "--timeout", timeout.String())
921+
// We run the command in a goroutine so that we can unblock things
922+
// after the timeout is reached -- as the vplayer query is blocking
923+
// on the table lock.
924+
wg := sync.WaitGroup{}
925+
wg.Add(1)
926+
go func() {
927+
defer wg.Done()
928+
out, err = vc.VtctldClient.ExecuteCommandWithOutput(workflowType, "--workflow", workflow, "--target-keyspace", targetKs,
929+
"SwitchTraffic", "--tablet-types=primary", "--timeout", timeout.String())
930+
}()
931+
time.Sleep(timeout)
932+
// Now we can unblock things and let it continue.
933+
unlockTargetTable()
934+
wg.Wait()
923935
// It should fail due to the command context timeout and we should
924936
// successfully cancel.
925937
require.Error(t, err)
926938
require.Contains(t, out, "failed to sync up replication between the source and target")
927939
require.NotContains(t, out, "cancel migration failed")
928940
// Confirm that queries still work fine.
929941
execVtgateQuery(t, vtgateConn, sourceKs, "select * from customer limit 1")
930-
unlockTargetTable()
931942
deleteTestRows()
932943
waitForTargetToCatchup()
933944
})

go/vt/vtctl/workflow/server.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2951,7 +2951,7 @@ func (s *Server) switchWrites(ctx context.Context, req *vtctldatapb.WorkflowSwit
29512951
}
29522952
}
29532953
if cerr := sw.cancelMigration(ctx, sm); cerr != nil {
2954-
err = vterrors.Wrap(err, fmt.Sprintf("(%v)", cerr))
2954+
err = vterrors.Errorf(vtrpcpb.Code_CANCELED, "%v\n\n%v", err, cerr)
29552955
}
29562956
return handleError(fmt.Sprintf("failed to stop the workflow streams in the %s keyspace", ts.SourceKeyspaceName()), err)
29572957
}
@@ -2963,7 +2963,7 @@ func (s *Server) switchWrites(ctx context.Context, req *vtctldatapb.WorkflowSwit
29632963
for cnt := 1; cnt <= lockTablesCycles; cnt++ {
29642964
if err := ts.executeLockTablesOnSource(ctx); err != nil {
29652965
if cerr := sw.cancelMigration(ctx, sm); cerr != nil {
2966-
err = vterrors.Wrap(err, fmt.Sprintf("(%v)", cerr))
2966+
err = vterrors.Errorf(vtrpcpb.Code_CANCELED, "%v\n\n%v", err, cerr)
29672967
}
29682968
return handleError(fmt.Sprintf("failed to execute LOCK TABLES (attempt %d of %d) on sources", cnt, lockTablesCycles), err)
29692969
}

go/vt/vtctl/workflow/stream_migrator.go

Lines changed: 11 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -203,12 +203,15 @@ func (sm *StreamMigrator) Templates() []*VReplicationStream {
203203
}
204204

205205
// CancelStreamMigrations cancels the stream migrations.
206-
func (sm *StreamMigrator) CancelStreamMigrations(ctx context.Context) {
206+
func (sm *StreamMigrator) CancelStreamMigrations(ctx context.Context) error {
207207
if sm.streams == nil {
208-
return
208+
return nil
209209
}
210+
errs := &concurrency.AllErrorRecorder{}
210211

211-
_ = sm.deleteTargetStreams(ctx)
212+
if err := sm.deleteTargetStreams(ctx); err != nil {
213+
errs.RecordError(err)
214+
}
212215

213216
// Restart the source streams, but leave the Reshard workflow's reverse
214217
// variant stopped.
@@ -221,8 +224,13 @@ func (sm *StreamMigrator) CancelStreamMigrations(ctx context.Context) {
221224
return err
222225
})
223226
if err != nil {
227+
errs.RecordError(err)
224228
sm.logger.Errorf("Cancel stream migrations failed: could not restart source streams: %v", err)
225229
}
230+
if errs.HasErrors() {
231+
return errs.AggrError(vterrors.Aggregate)
232+
}
233+
return nil
226234
}
227235

228236
// MigrateStreams migrates N streams

go/vt/vtctl/workflow/traffic_switcher.go

Lines changed: 21 additions & 10 deletions
Original file line numberDiff line numberDiff line change
@@ -754,7 +754,7 @@ func (ts *trafficSwitcher) changeShardsAccess(ctx context.Context, keyspace stri
754754

755755
func (ts *trafficSwitcher) allowTargetWrites(ctx context.Context) error {
756756
if ts.MigrationType() == binlogdatapb.MigrationType_TABLES {
757-
return ts.switchDeniedTables(ctx)
757+
return ts.switchDeniedTables(ctx, false)
758758
}
759759
return ts.changeShardsAccess(ctx, ts.TargetKeyspaceName(), ts.TargetShards(), allowWrites)
760760
}
@@ -1062,7 +1062,7 @@ func (ts *trafficSwitcher) waitForCatchup(ctx context.Context, filteredReplicati
10621062
func (ts *trafficSwitcher) stopSourceWrites(ctx context.Context) error {
10631063
var err error
10641064
if ts.MigrationType() == binlogdatapb.MigrationType_TABLES {
1065-
err = ts.switchDeniedTables(ctx)
1065+
err = ts.switchDeniedTables(ctx, false)
10661066
} else {
10671067
err = ts.changeShardsAccess(ctx, ts.SourceKeyspaceName(), ts.SourceShards(), disallowWrites)
10681068
}
@@ -1075,16 +1075,25 @@ func (ts *trafficSwitcher) stopSourceWrites(ctx context.Context) error {
10751075

10761076
// switchDeniedTables switches the denied tables rules for the traffic switch.
10771077
// They are added on the source side and removed on the target side.
1078-
func (ts *trafficSwitcher) switchDeniedTables(ctx context.Context) error {
1078+
// If backward is true, then we swap this logic, removing on the source side
1079+
// and adding on the target side. You would want to do that e.g. when canceling
1080+
// a failed (and currently partial) traffic switch as the source and target
1081+
// have already been switched in the trafficSwitcher.
1082+
func (ts *trafficSwitcher) switchDeniedTables(ctx context.Context, backward bool) error {
10791083
if ts.MigrationType() != binlogdatapb.MigrationType_TABLES {
10801084
return nil
10811085
}
10821086

1087+
rmsource, rmtarget := false, true
1088+
if backward {
1089+
rmsource, rmtarget = true, false
1090+
}
1091+
10831092
egrp, ectx := errgroup.WithContext(ctx)
10841093
egrp.Go(func() error {
10851094
return ts.ForAllSources(func(source *MigrationSource) error {
10861095
if _, err := ts.TopoServer().UpdateShardFields(ctx, ts.SourceKeyspaceName(), source.GetShard().ShardName(), func(si *topo.ShardInfo) error {
1087-
return si.UpdateDeniedTables(ectx, topodatapb.TabletType_PRIMARY, nil, false, ts.Tables())
1096+
return si.UpdateDeniedTables(ectx, topodatapb.TabletType_PRIMARY, nil, rmsource, ts.Tables())
10881097
}); err != nil {
10891098
return err
10901099
}
@@ -1107,7 +1116,7 @@ func (ts *trafficSwitcher) switchDeniedTables(ctx context.Context) error {
11071116
egrp.Go(func() error {
11081117
return ts.ForAllTargets(func(target *MigrationTarget) error {
11091118
if _, err := ts.TopoServer().UpdateShardFields(ectx, ts.TargetKeyspaceName(), target.GetShard().ShardName(), func(si *topo.ShardInfo) error {
1110-
return si.UpdateDeniedTables(ctx, topodatapb.TabletType_PRIMARY, nil, true, ts.Tables())
1119+
return si.UpdateDeniedTables(ctx, topodatapb.TabletType_PRIMARY, nil, rmtarget, ts.Tables())
11111120
}); err != nil {
11121121
return err
11131122
}
@@ -1153,12 +1162,12 @@ func (ts *trafficSwitcher) cancelMigration(ctx context.Context, sm *StreamMigrat
11531162
// canceled by the parent context.
11541163
wcCtx := context.WithoutCancel(ctx)
11551164
// Now we create a child context from that which has a timeout.
1156-
cmTimeout := 60 * time.Second
1165+
cmTimeout := 2 * time.Minute
11571166
cmCtx, cmCancel := context.WithTimeout(wcCtx, cmTimeout)
11581167
defer cmCancel()
11591168

11601169
if ts.MigrationType() == binlogdatapb.MigrationType_TABLES {
1161-
err = ts.switchDeniedTables(cmCtx)
1170+
err = ts.switchDeniedTables(cmCtx, true /* revert */)
11621171
} else {
11631172
err = ts.changeShardsAccess(cmCtx, ts.SourceKeyspaceName(), ts.SourceShards(), allowWrites)
11641173
}
@@ -1167,7 +1176,10 @@ func (ts *trafficSwitcher) cancelMigration(ctx context.Context, sm *StreamMigrat
11671176
ts.Logger().Errorf("Cancel migration failed: could not revert denied tables / shard access: %v", err)
11681177
}
11691178

1170-
sm.CancelStreamMigrations(cmCtx)
1179+
if err := sm.CancelStreamMigrations(cmCtx); err != nil {
1180+
cancelErrs.RecordError(fmt.Errorf("could not cancel stream migrations: %v", err))
1181+
ts.Logger().Errorf("Cancel migration failed: could not cancel stream migrations: %v", err)
1182+
}
11711183

11721184
err = ts.ForAllTargets(func(target *MigrationTarget) error {
11731185
query := fmt.Sprintf("update _vt.vreplication set state='Running', message='' where db_name=%s and workflow=%s",
@@ -1180,8 +1192,7 @@ func (ts *trafficSwitcher) cancelMigration(ctx context.Context, sm *StreamMigrat
11801192
ts.Logger().Errorf("Cancel migration failed: could not restart vreplication: %v", err)
11811193
}
11821194

1183-
err = ts.deleteReverseVReplication(cmCtx)
1184-
if err != nil {
1195+
if err := ts.deleteReverseVReplication(cmCtx); err != nil {
11851196
cancelErrs.RecordError(fmt.Errorf("could not delete reverse vreplication streams: %v", err))
11861197
ts.Logger().Errorf("Cancel migration failed: could not delete reverse vreplication streams: %v", err)
11871198
}

0 commit comments

Comments
 (0)