From 47bdf0fcb57d15bd2800a92bd286551d6561d1d5 Mon Sep 17 00:00:00 2001 From: shunki-fujita Date: Fri, 5 Jul 2024 08:31:25 +0000 Subject: [PATCH] issue-708: Set lock_wait_timeout to SetReadOnly --- clustering/operations.go | 30 ++++++++++++++++++++++++------ pkg/dbop/nop.go | 4 ++++ pkg/dbop/operator.go | 3 +++ pkg/dbop/replication.go | 7 +++++++ 4 files changed, 38 insertions(+), 6 deletions(-) diff --git a/clustering/operations.go b/clustering/operations.go index 4c149ced0..2d52ad81e 100644 --- a/clustering/operations.go +++ b/clustering/operations.go @@ -149,14 +149,32 @@ func (p *managerProcess) clone(ctx context.Context, ss *StatusSet) (bool, error) func (p *managerProcess) switchover(ctx context.Context, ss *StatusSet) error { log := logFromContext(ctx) log.Info("begin switchover the primary", "current", ss.Primary, "next", ss.Candidate) - pdb := ss.DBOps[ss.Primary] - if err := pdb.SetReadOnly(ctx, true); err != nil { - return fmt.Errorf("failed to make instance %d read-only: %w", ss.Primary, err) + + // SetReadOnly waits for a running DML. + // Therefore, if it waits for a long time, deleteGracePeriodSeconds may be reached. + // To avoid this, set lock_wait_timeout to a short time temporarily. + // If SetReadOnly fails, kill all processes and retry. + succeeded := false + if err := pdb.SetSessionLockWaitTimeout(ctx, 15); err != nil { + return fmt.Errorf("failed to set lock_wait_timeout: %w", err) + } + for i := 0; i < 2; i++ { + if err := pdb.SetReadOnly(ctx, true); err != nil { + log.Error(err, "failed to set read-only mode", "instance", ss.Primary) + } else { + succeeded = true + } + time.Sleep(100 * time.Millisecond) + if err := pdb.KillConnections(ctx); err != nil { + return fmt.Errorf("failed to kill connections in instance %d: %w", ss.Primary, err) + } + if succeeded { + break + } } - time.Sleep(100 * time.Millisecond) - if err := pdb.KillConnections(ctx); err != nil { - return fmt.Errorf("failed to kill connections in instance %d: %w", ss.Primary, err) + if !succeeded { + return fmt.Errorf("failed to set read-only mode in instance %d", ss.Primary) } pst, err := pdb.GetStatus(ctx) if err != nil { diff --git a/pkg/dbop/nop.go b/pkg/dbop/nop.go index a07191cf7..b19a84be5 100644 --- a/pkg/dbop/nop.go +++ b/pkg/dbop/nop.go @@ -51,6 +51,10 @@ func (o NopOperator) WaitForGTID(ctx context.Context, gtidSet string, timeoutSec return ErrNop } +func (o NopOperator) SetSessionLockWaitTimeout(ctx context.Context, timeoutSeconds int) error { + return ErrNop +} + func (o NopOperator) SetReadOnly(context.Context, bool) error { return ErrNop } diff --git a/pkg/dbop/operator.go b/pkg/dbop/operator.go index 855703373..5d4592d17 100644 --- a/pkg/dbop/operator.go +++ b/pkg/dbop/operator.go @@ -54,6 +54,9 @@ type Operator interface { // If `timeoutSeconds` is zero, this will not timeout. WaitForGTID(ctx context.Context, gtidSet string, timeoutSeconds int) error + // SetSessionLockWaitTimeout set @@session.lock_wait_timeout to `timeoutSeconds`. + SetSessionLockWaitTimeout(ctx context.Context, timeoutSeconds int) error + // SetReadOnly makes the instance super_read_only if `true` is passed. // Otherwise, this stops the replication and makes the instance writable. SetReadOnly(context.Context, bool) error diff --git a/pkg/dbop/replication.go b/pkg/dbop/replication.go index 999d67b53..cf7cb97c2 100644 --- a/pkg/dbop/replication.go +++ b/pkg/dbop/replication.go @@ -71,6 +71,13 @@ func (o *operator) WaitForGTID(ctx context.Context, gtid string, timeoutSeconds return nil } +func (o *operator) SetSessionLockWaitTimeout(ctx context.Context, timeoutSeconds int) error { + if _, err := o.db.ExecContext(ctx, "SET SESSION lock_wait_timeout=?", timeoutSeconds); err != nil { + return fmt.Errorf("failed to set lock_wait_timeout: %w", err) + } + return nil +} + func (o *operator) SetReadOnly(ctx context.Context, readOnly bool) error { if readOnly { if _, err := o.db.ExecContext(ctx, "SET GLOBAL super_read_only=1"); err != nil {