From c83b571ab6a43db5ae0fda640aa871eb12b0209f Mon Sep 17 00:00:00 2001 From: Otavio Salvador Date: Thu, 5 Mar 2026 13:28:50 -0300 Subject: [PATCH 1/5] fix(ssh): add WebSocket ping/pong to per-session revdial connections MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit In V1 (revdial) transport, the main control connection has bidirectional keepalive (ping/pong + JSON keep-alive), but per-session WebSocket connections created via dial-back have none. During idle SSH sessions, the only traffic on the per-session WebSocket is the agent's SSH keepalive (agent→server, one-way every 30s). The server→agent direction is completely silent, causing intermediaries (load balancers, NAT, firewalls) to detect a half-idle connection and close it. Call Ping() on the agent-side wsconnadapter in grabConn() to start sending WebSocket ping frames every 30 seconds. gorilla/websocket automatically responds to pings with pong frames during NextReader(), creating bidirectional traffic that keeps the connection alive through all intermediaries. Placing this on the agent side distributes the goroutine cost across agents rather than concentrating it on the server. Fixes: #5946 --- pkg/revdial/revdial.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pkg/revdial/revdial.go b/pkg/revdial/revdial.go index 8486b81c926..7ccd232020a 100644 --- a/pkg/revdial/revdial.go +++ b/pkg/revdial/revdial.go @@ -439,8 +439,11 @@ func (ln *Listener) grabConn(path string) { return } + c := wsconnadapter.New(wsConn) + c.Ping() + select { - case ln.connc <- wsconnadapter.New(wsConn): + case ln.connc <- c: case <-ln.donec: } } From e7546692dda80b6665cd76c9bec9afe52d4d4c54 Mon Sep 17 00:00:00 2001 From: Otavio Salvador Date: Tue, 10 Mar 2026 08:59:54 -0300 Subject: [PATCH 2/5] fix(ssh): fix adapter close race with sync.Once Fix race condition in wsconnadapter Close() where concurrent callers (e.g. pong timeout AfterFunc and normal teardown) could panic on send-to-closed-channel or double-close the WebSocket connection. Use sync.Once to guarantee both channel and connection cleanup happen exactly once. --- pkg/wsconnadapter/wsconnadapter.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/pkg/wsconnadapter/wsconnadapter.go b/pkg/wsconnadapter/wsconnadapter.go index e43c481ea97..5a615d5d878 100644 --- a/pkg/wsconnadapter/wsconnadapter.go +++ b/pkg/wsconnadapter/wsconnadapter.go @@ -31,6 +31,7 @@ type Adapter struct { reader io.Reader stopPingCh chan struct{} pongCh chan bool + closeOnce sync.Once Logger *log.Entry CreatedAt time.Time } @@ -182,19 +183,18 @@ func (a *Adapter) Write(b []byte) (int, error) { } func (a *Adapter) Close() error { - select { - case <-a.stopPingCh: - a.Logger.Debug("stop ping message received") - default: + var err error + + a.closeOnce.Do(func() { if a.stopPingCh != nil { - a.stopPingCh <- struct{}{} close(a.stopPingCh) - a.Logger.Debug("stop ping channel closed") } - } - return a.conn.Close() + err = a.conn.Close() + }) + + return err } func (a *Adapter) LocalAddr() net.Addr { From 92e27c95f2c14ac18f6be97fe971881bc0c28b16 Mon Sep 17 00:00:00 2001 From: Otavio Salvador Date: Fri, 6 Mar 2026 09:31:48 -0300 Subject: [PATCH 3/5] fix(ssh): make WebSocket adapter Ping() initialization concurrency-safe The previous nil-check guard on pongCh was racy: two concurrent callers could both see nil and create duplicate channels and goroutines, leaking the first set. Use sync.Once to guarantee initialization happens exactly once, consistent with the Close() fix in the previous commit. --- pkg/wsconnadapter/wsconnadapter.go | 75 ++++++++++++++---------------- 1 file changed, 36 insertions(+), 39 deletions(-) diff --git a/pkg/wsconnadapter/wsconnadapter.go b/pkg/wsconnadapter/wsconnadapter.go index 5a615d5d878..d8a62a8a4ed 100644 --- a/pkg/wsconnadapter/wsconnadapter.go +++ b/pkg/wsconnadapter/wsconnadapter.go @@ -31,6 +31,7 @@ type Adapter struct { reader io.Reader stopPingCh chan struct{} pongCh chan bool + pingOnce sync.Once closeOnce sync.Once Logger *log.Entry CreatedAt time.Time @@ -73,53 +74,49 @@ func New(conn *websocket.Conn, options ...Option) *Adapter { } func (a *Adapter) Ping() chan bool { - if a.pongCh != nil { - a.Logger.Debug("pong channel is not null") + a.pingOnce.Do(func() { + a.stopPingCh = make(chan struct{}) + a.pongCh = make(chan bool) - return a.pongCh - } - - a.stopPingCh = make(chan struct{}) - a.pongCh = make(chan bool) - - timeout := time.AfterFunc(pongTimeout, func() { - a.Logger.Debug("close connection due pong timeout") - - _ = a.Close() - }) + timeout := time.AfterFunc(pongTimeout, func() { + a.Logger.Debug("close connection due pong timeout") - a.conn.SetPongHandler(func(_ string) error { - timeout.Reset(pongTimeout) - a.Logger.Trace("pong timeout") + _ = a.Close() + }) - // non-blocking channel write - select { - case a.pongCh <- true: - a.Logger.Trace("write true to pong channel") - default: - } + a.conn.SetPongHandler(func(_ string) error { + timeout.Reset(pongTimeout) + a.Logger.Trace("pong timeout") - return nil - }) + // non-blocking channel write + select { + case a.pongCh <- true: + a.Logger.Trace("write true to pong channel") + default: + } - // ping loop - go func() { - ticker := time.NewTicker(pingInterval) - defer ticker.Stop() + return nil + }) - for { - select { - case <-ticker.C: - if err := a.conn.WriteControl(websocket.PingMessage, []byte{}, time.Now().Add(5*time.Second)); err != nil { - a.Logger.WithError(err).Error("failed to write ping message") + // ping loop + go func() { + ticker := time.NewTicker(pingInterval) + defer ticker.Stop() + + for { + select { + case <-ticker.C: + if err := a.conn.WriteControl(websocket.PingMessage, []byte{}, time.Now().Add(5*time.Second)); err != nil { + a.Logger.WithError(err).Error("failed to write ping message") + } + case <-a.stopPingCh: + a.Logger.Debug("stop ping message received") + + return } - case <-a.stopPingCh: - a.Logger.Debug("stop ping message received") - - return } - } - }() + }() + }) return a.pongCh } From b8e5f44f3fde4db25bca4634c2945f006148adce Mon Sep 17 00:00:00 2001 From: Otavio Salvador Date: Mon, 9 Mar 2026 15:14:43 -0300 Subject: [PATCH 4/5] test(ssh): increase timeouts in terminal window resize test The terminal_window_size_change test had a 2s per-attempt timeout and 5s overall deadline for reading stty output over a PTY. Under load, terminal I/O can exceed these tight limits, causing ~40% flakiness locally. Increase to 10s per-attempt and 30s overall. Verified with 20 consecutive runs (0 failures). --- tests/ssh_test.go | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/tests/ssh_test.go b/tests/ssh_test.go index ed835740d2b..f39eeeb1d88 100644 --- a/tests/ssh_test.go +++ b/tests/ssh_test.go @@ -981,7 +981,7 @@ func testSSHWithVersion(t *testing.T, connectionVersion int) { // interrupted, so each attempt runs in a goroutine with a // per-attempt timeout. On timeout we abort immediately // (the deferred sess/conn Close unblocks the reader). - deadline := time.Now().Add(5 * time.Second) + deadline := time.Now().Add(30 * time.Second) var lastOutput string matched := false for attempt := 0; !matched && time.Now().Before(deadline); attempt++ { @@ -1002,7 +1002,7 @@ func testSSHWithVersion(t *testing.T, connectionVersion int) { require.NoError(t, r.err) lastOutput = r.output matched = (r.output == expected) - case <-time.After(2 * time.Second): + case <-time.After(10 * time.Second): require.Fail(t, "timeout reading stty output", "marker=%s expected=%s", marker, expected) } From fc4113692221aefdaf0681ac552be649531cfdc4 Mon Sep 17 00:00:00 2001 From: Otavio Salvador Date: Mon, 9 Mar 2026 15:14:52 -0300 Subject: [PATCH 5/5] fix(ssh): return stored error from subsequent Close() calls The sync.Once Close() previously returned nil on repeated calls. Store the error from the first close so callers always receive it. --- pkg/wsconnadapter/wsconnadapter.go | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/pkg/wsconnadapter/wsconnadapter.go b/pkg/wsconnadapter/wsconnadapter.go index d8a62a8a4ed..c95915eb571 100644 --- a/pkg/wsconnadapter/wsconnadapter.go +++ b/pkg/wsconnadapter/wsconnadapter.go @@ -33,6 +33,7 @@ type Adapter struct { pongCh chan bool pingOnce sync.Once closeOnce sync.Once + closeErr error Logger *log.Entry CreatedAt time.Time } @@ -180,18 +181,16 @@ func (a *Adapter) Write(b []byte) (int, error) { } func (a *Adapter) Close() error { - var err error - a.closeOnce.Do(func() { if a.stopPingCh != nil { close(a.stopPingCh) a.Logger.Debug("stop ping channel closed") } - err = a.conn.Close() + a.closeErr = a.conn.Close() }) - return err + return a.closeErr } func (a *Adapter) LocalAddr() net.Addr {