Skip to content

Commit e5017e8

Browse files
committed
Draft fix
Signed-off-by: Matt Lord <mattalord@gmail.com>
1 parent 37316cc commit e5017e8

File tree

3 files changed

+38
-8
lines changed

3 files changed

+38
-8
lines changed

go/vt/vtctl/workflow/server.go

Lines changed: 13 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -3174,6 +3174,19 @@ func (s *Server) switchWrites(ctx context.Context, req *vtctldatapb.WorkflowSwit
31743174
return 0, sw.logs(), nil
31753175
}
31763176

3177+
// We stop writes on the source before stopping the streams so that the catchup
3178+
// time is lessened and other workflows that we have to migrate such as
3179+
// materialize workflows that are within a single keyspace (source and target)
3180+
// also have a chance to catch up as well, as those are internally generated
3181+
// GTIDs within the shard. For materialization streams that we migrate where
3182+
// the source and target are the keyspace being resharded, we wait for those
3183+
// to catch up in the stopStreams path before we actually stop them.
3184+
ts.Logger().Infof("Stopping source writes")
3185+
if err := sw.stopSourceWrites(ctx); err != nil {
3186+
sw.cancelMigration(ctx, sm)
3187+
return handleError(fmt.Sprintf("failed to stop writes in the %s keyspace", ts.SourceKeyspaceName()), err)
3188+
}
3189+
31773190
ts.Logger().Infof("Stopping streams")
31783191
sourceWorkflows, err = sw.stopStreams(ctx, sm)
31793192
if err != nil {
@@ -3186,12 +3199,6 @@ func (s *Server) switchWrites(ctx context.Context, req *vtctldatapb.WorkflowSwit
31863199
return handleError("failed to stop the workflow streams", err)
31873200
}
31883201

3189-
ts.Logger().Infof("Stopping source writes")
3190-
if err := sw.stopSourceWrites(ctx); err != nil {
3191-
sw.cancelMigration(ctx, sm)
3192-
return handleError(fmt.Sprintf("failed to stop writes in the %s keyspace", ts.SourceKeyspaceName()), err)
3193-
}
3194-
31953202
if ts.MigrationType() == binlogdatapb.MigrationType_TABLES {
31963203
ts.Logger().Infof("Executing LOCK TABLES on source tables %d times", lockTablesCycles)
31973204
// Doing this twice with a pause in-between to catch any writes that may have raced in between

go/vt/vtctl/workflow/stream_migrator.go

Lines changed: 24 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -250,7 +250,7 @@ func (sm *StreamMigrator) LegacyStopStreams(ctx context.Context) ([]string, erro
250250
return sm.legacyVerifyStreamPositions(ctx, positions)
251251
}
252252

253-
// StopStreams stops streams
253+
// StopStreams stops streams.
254254
func (sm *StreamMigrator) StopStreams(ctx context.Context) ([]string, error) {
255255
if sm.streams == nil {
256256
return nil, nil
@@ -684,6 +684,26 @@ func (sm *StreamMigrator) stopSourceStreams(ctx context.Context) error {
684684
return nil
685685
}
686686

687+
// For materialize workflows where the source and target are both the keyspace
688+
// that is being resharded, we need to wait for those to catch up as well.
689+
// New writes have already been blocked on the source, but the materialization
690+
// workflow(s) may still need to catch up with writes that happened just before
691+
// writes were stopped on the source.
692+
for _, vrs := range tabletStreams {
693+
if vrs.WorkflowType == binlogdatapb.VReplicationWorkflowType_Materialize && vrs.BinlogSource.Keyspace == sm.ts.TargetKeyspaceName() {
694+
tablet := source.GetPrimary().Tablet
695+
pos, err := sm.ts.TabletManagerClient().PrimaryPosition(ctx, tablet)
696+
if err != nil {
697+
return err
698+
}
699+
sm.ts.Logger().Infof("Waiting for Materialization workflow %s on %v/%v to reach position %v, starting from position %s",
700+
vrs.Workflow, sm.ts.SourceKeyspaceName(), vrs.BinlogSource.Shard, pos, vrs.Position)
701+
if err := sm.ts.TabletManagerClient().VReplicationWaitForPos(ctx, tablet, vrs.ID, pos); err != nil {
702+
return err
703+
}
704+
}
705+
}
706+
687707
query := fmt.Sprintf("update _vt.vreplication set state='Stopped', message='for cutover' where id in %s", VReplicationStreams(tabletStreams).Values())
688708
_, err := sm.ts.TabletManagerClient().VReplicationExec(ctx, source.GetPrimary().Tablet, query)
689709
if err != nil {
@@ -925,6 +945,9 @@ func (sm *StreamMigrator) createTargetStreams(ctx context.Context, tmpl []*VRepl
925945
// 1 to 1 in this scenario so we use the target shard's name and primary
926946
// tablet's position for the source.
927947
vrs.BinlogSource.Shard = target.GetShard().ShardName()
948+
// TODO: the problem is that the materialize stream may still need GTIDs
949+
// from the OLD shards at this point... so we could miss writes that
950+
// occurred on the source table(s) just before the switch.
928951
vrs.Position, err = binlogplayer.DecodePosition(target.Position)
929952
if err != nil {
930953
return err

go/vt/vtctl/workflow/traffic_switcher.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -939,7 +939,7 @@ func (ts *trafficSwitcher) waitForCatchup(ctx context.Context, filteredReplicati
939939
}); err != nil {
940940
return err
941941
}
942-
// all targets have caught up, record their positions for setting up reverse workflows
942+
// All targets have caught up, record their positions for setting up reverse workflows.
943943
return ts.ForAllTargets(func(target *MigrationTarget) error {
944944
var err error
945945
target.Position, err = ts.TabletManagerClient().PrimaryPosition(ctx, target.GetPrimary().Tablet)

0 commit comments

Comments
 (0)