From 3873ddf57009bd22e87130789e9f38424a620648 Mon Sep 17 00:00:00 2001 From: shunki-fujita Date: Fri, 5 Jul 2024 08:31:25 +0000 Subject: [PATCH] issue-708: Set lock_wait_timeout to SetReadOnly --- clustering/operations.go | 34 ++++++++++++++++++++++++++++------ pkg/dbop/replication.go | 8 +++++++- 2 files changed, 35 insertions(+), 7 deletions(-) diff --git a/clustering/operations.go b/clustering/operations.go index 4c149ced..736a937c 100644 --- a/clustering/operations.go +++ b/clustering/operations.go @@ -149,14 +149,36 @@ func (p *managerProcess) clone(ctx context.Context, ss *StatusSet) (bool, error) func (p *managerProcess) switchover(ctx context.Context, ss *StatusSet) error { log := logFromContext(ctx) log.Info("begin switchover the primary", "current", ss.Primary, "next", ss.Candidate) - pdb := ss.DBOps[ss.Primary] - if err := pdb.SetReadOnly(ctx, true); err != nil { - return fmt.Errorf("failed to make instance %d read-only: %w", ss.Primary, err) + + // SetReadOnly waits for a running DML. + // Therefore, if it waits for a long time, deleteGracePeriodSeconds may be reached. + // To avoid this, set a timeout on SetReadOnly. + // If SetReadOnly fails, kill all processes and retry. + readOnly := false + for i := 0; i < 2; i++ { + ctxTimeout, cancel := context.WithTimeout(ctx, 15*time.Second) + defer cancel() + errCh := make(chan error) + go func() { + errCh <- pdb.SetReadOnly(ctxTimeout, true) + }() + err := <-errCh + if err != nil { + log.Error(err, "failed to set read-only mode", "instance", ss.Primary) + } else { + readOnly = true + } + time.Sleep(100 * time.Millisecond) + if err = pdb.KillConnections(ctx); err != nil { + return fmt.Errorf("failed to kill connections in instance %d: %w", ss.Primary, err) + } + if readOnly { + break + } } - time.Sleep(100 * time.Millisecond) - if err := pdb.KillConnections(ctx); err != nil { - return fmt.Errorf("failed to kill connections in instance %d: %w", ss.Primary, err) + if !readOnly { + return fmt.Errorf("failed to set read-only mode in instance %d", ss.Primary) } pst, err := pdb.GetStatus(ctx) if err != nil { diff --git a/pkg/dbop/replication.go b/pkg/dbop/replication.go index 999d67b5..c7672f58 100644 --- a/pkg/dbop/replication.go +++ b/pkg/dbop/replication.go @@ -73,8 +73,14 @@ func (o *operator) WaitForGTID(ctx context.Context, gtid string, timeoutSeconds func (o *operator) SetReadOnly(ctx context.Context, readOnly bool) error { if readOnly { + if _, err := o.db.ExecContext(ctx, "SET SESSION lock_wait_timeout=15"); err != nil { + return fmt.Errorf("failed to set @@SESSION.lock_wait_timeout: %w", err) + } if _, err := o.db.ExecContext(ctx, "SET GLOBAL super_read_only=1"); err != nil { - return fmt.Errorf("failed to set super_read_only=1: %w", err) + return fmt.Errorf("failed to set @@GLOBAL.super_read_only=1: %w", err) + } + if _, err := o.db.ExecContext(ctx, "SET SESSION lock_wait_timeout=@@GLOBAL.lock_wait_timeout"); err != nil { + return fmt.Errorf("failed to unset @@SESSION.lock_wait_timeout: %w", err) } return nil }