From aa30a22ca70d3124eb5ae1158afa1bd011e562cf Mon Sep 17 00:00:00 2001 From: Nikhil Sonti Date: Thu, 26 Feb 2026 13:29:03 -0800 Subject: [PATCH 1/4] feat: cdp keep alive and exit strategy --- apps/server/src/api/routes/health.ts | 10 +- apps/server/src/api/server.ts | 2 +- apps/server/src/browser/backends/cdp.ts | 108 ++++++++++++++++++++-- apps/server/src/browser/browser.ts | 4 + packages/shared/src/constants/limits.ts | 1 + packages/shared/src/constants/timeouts.ts | 4 + 6 files changed, 118 insertions(+), 11 deletions(-) diff --git a/apps/server/src/api/routes/health.ts b/apps/server/src/api/routes/health.ts index 61471ee2..dee29ed2 100644 --- a/apps/server/src/api/routes/health.ts +++ b/apps/server/src/api/routes/health.ts @@ -5,9 +5,15 @@ */ import { Hono } from 'hono' +import type { Browser } from '../../browser/browser' -export function createHealthRoute() { +interface HealthDeps { + browser?: Browser +} + +export function createHealthRoute(deps: HealthDeps = {}) { return new Hono().get('/', (c) => { - return c.json({ status: 'ok' }) + const cdpConnected = deps.browser?.isCdpConnected() ?? true + return c.json({ status: 'ok', cdpConnected }) }) } diff --git a/apps/server/src/api/server.ts b/apps/server/src/api/server.ts index ea058c19..35fb2a80 100644 --- a/apps/server/src/api/server.ts +++ b/apps/server/src/api/server.ts @@ -73,7 +73,7 @@ export async function createHttpServer(config: HttpServerConfig) { const app = new Hono() .use('/*', cors(defaultCorsConfig)) - .route('/health', createHealthRoute()) + .route('/health', createHealthRoute({ browser })) .route( '/shutdown', createShutdownRoute({ onShutdown: onShutdown ?? (() => {}) }), diff --git a/apps/server/src/browser/backends/cdp.ts b/apps/server/src/browser/backends/cdp.ts index 54187d62..6f7caa81 100644 --- a/apps/server/src/browser/backends/cdp.ts +++ b/apps/server/src/browser/backends/cdp.ts @@ -13,6 +13,7 @@ import type { CdpTarget, CdpBackend as ICdpBackend } from './types' interface PendingRequest { resolve: (value: unknown) => void reject: (reason: Error) => void + timer: ReturnType } // biome-ignore lint/correctness/noUnusedVariables: declaration merging adds ProtocolApi properties to the class @@ -28,6 +29,7 @@ class CdpBackend implements ICdpBackend { private reconnecting = false private eventHandlers = new Map void)[]>() private sessionCache = new Map() + private keepaliveTimer: ReturnType | null = null constructor(config: { port: number }) { this.port = config.port @@ -44,6 +46,7 @@ class CdpBackend implements ICdpBackend { for (let attempt = 1; attempt <= maxRetries; attempt++) { try { await this.attemptConnect() + this.startKeepalive() return } catch (error) { const msg = error instanceof Error ? error.message : String(error) @@ -97,16 +100,82 @@ class CdpBackend implements ICdpBackend { }) } - private handleUnexpectedClose(): void { + private startKeepalive(): void { + this.stopKeepalive() + + const interval = TIMEOUTS.CDP_KEEPALIVE_INTERVAL + const timeout = TIMEOUTS.CDP_KEEPALIVE_TIMEOUT + + this.keepaliveTimer = setInterval(async () => { + if (!this.ws || !this.connected || this.disconnecting) return + + try { + await Promise.race([ + this.rawSend('Browser.getVersion'), + new Promise((_, reject) => + setTimeout( + () => reject(new Error('CDP keepalive timeout')), + timeout, + ), + ), + ]) + } catch { + logger.warn('CDP keepalive failed, connection may be dead') + this.handleDeadConnection() + } + }, interval) + } + + private stopKeepalive(): void { + if (this.keepaliveTimer) { + clearInterval(this.keepaliveTimer) + this.keepaliveTimer = null + } + } + + /** + * Force-close a zombie WebSocket that stopped responding but never + * fired onclose. This triggers the normal reconnection path. + */ + private handleDeadConnection(): void { if (this.disconnecting || this.reconnecting) return + this.stopKeepalive() + + if (this.ws) { + try { + this.ws.close() + } catch { + // Already dead, ignore + } + this.ws = null + } + this.connected = false + this.handleUnexpectedClose() + } + + private handleUnexpectedClose(): void { + if (this.disconnecting) return + + // Allow re-entry if a previous reconnection already finished. + // The old guard `if (this.reconnecting) return` caused permanent + // death when a freshly reconnected socket closed again before + // the .finally() callback reset the flag. + if (this.reconnecting) { + logger.warn( + 'CDP closed again while reconnecting — will retry after current attempt', + ) + return + } + + this.stopKeepalive() this.rejectPendingRequests() logger.error( 'CDP WebSocket closed unexpectedly, attempting reconnection...', ) this.reconnecting = true - this.reconnectOrCrash().finally(() => { + this.reconnectWithRetries().finally(() => { this.reconnecting = false }) } @@ -114,20 +183,24 @@ class CdpBackend implements ICdpBackend { private rejectPendingRequests(): void { const error = new Error('CDP connection lost') for (const request of this.pending.values()) { + clearTimeout(request.timer) request.reject(error) } this.pending.clear() } - private async reconnectOrCrash(): Promise { - const maxRetries = CDP_LIMITS.CONNECT_MAX_RETRIES - const retryDelay = TIMEOUTS.CDP_CONNECT_RETRY_DELAY + private async reconnectWithRetries(): Promise { + const maxRetries = CDP_LIMITS.RECONNECT_MAX_RETRIES + const delay = TIMEOUTS.CDP_RECONNECT_DELAY for (let attempt = 1; attempt <= maxRetries; attempt++) { + if (this.disconnecting) return + try { logger.info(`CDP reconnection attempt ${attempt}/${maxRetries}...`) - await Bun.sleep(retryDelay) + await Bun.sleep(delay) await this.attemptConnect() + this.startKeepalive() logger.info('CDP reconnected successfully') return } catch (error) { @@ -146,11 +219,13 @@ class CdpBackend implements ICdpBackend { async disconnect(): Promise { this.disconnecting = true + this.stopKeepalive() if (this.ws) { this.ws.close() this.ws = null this.connected = false } + this.rejectPendingRequests() } isConnected(): boolean { @@ -203,8 +278,24 @@ class CdpBackend implements ICdpBackend { const ws = this.ws return new Promise((resolve, reject) => { - this.pending.set(id, { resolve, reject }) - ws.send(JSON.stringify(message)) + const timer = setTimeout(() => { + this.pending.delete(id) + reject(new Error(`CDP request timeout: ${method} (id=${id})`)) + }, TIMEOUTS.CDP_REQUEST_TIMEOUT) + + this.pending.set(id, { resolve, reject, timer }) + + try { + ws.send(JSON.stringify(message)) + } catch (err) { + clearTimeout(timer) + this.pending.delete(id) + const msg = err instanceof Error ? err.message : String(err) + reject(new Error(`CDP send failed: ${msg}`)) + + // send() failure likely means the socket is dead + this.handleDeadConnection() + } }) } @@ -237,6 +328,7 @@ class CdpBackend implements ICdpBackend { if (message.id !== undefined) { const pending = this.pending.get(message.id) if (pending) { + clearTimeout(pending.timer) this.pending.delete(message.id) if (message.error) { pending.reject(new Error(`CDP error: ${message.error.message}`)) diff --git a/apps/server/src/browser/browser.ts b/apps/server/src/browser/browser.ts index 981a929b..5d5dceae 100644 --- a/apps/server/src/browser/browser.ts +++ b/apps/server/src/browser/browser.ts @@ -93,6 +93,10 @@ export class Browser { this.setupEventHandlers() } + isCdpConnected(): boolean { + return this.cdp.isConnected() + } + private setupEventHandlers(): void { this.cdp.Target.on('detachedFromTarget', (params) => { if (params.sessionId) { diff --git a/packages/shared/src/constants/limits.ts b/packages/shared/src/constants/limits.ts index 406b2fb8..e33c9998 100644 --- a/packages/shared/src/constants/limits.ts +++ b/packages/shared/src/constants/limits.ts @@ -27,6 +27,7 @@ export const PAGINATION = { export const CDP_LIMITS = { CONNECT_MAX_RETRIES: 3, + RECONNECT_MAX_RETRIES: 3, } as const export const CONTENT_LIMITS = { diff --git a/packages/shared/src/constants/timeouts.ts b/packages/shared/src/constants/timeouts.ts index 3d16060f..2d7e8ca8 100644 --- a/packages/shared/src/constants/timeouts.ts +++ b/packages/shared/src/constants/timeouts.ts @@ -22,6 +22,10 @@ export const TIMEOUTS = { // CDP connection CDP_CONNECT: 10_000, CDP_CONNECT_RETRY_DELAY: 1_000, + CDP_RECONNECT_DELAY: 5_000, + CDP_KEEPALIVE_INTERVAL: 30_000, + CDP_KEEPALIVE_TIMEOUT: 10_000, + CDP_REQUEST_TIMEOUT: 60_000, // External API calls KLAVIS_FETCH: 30_000, From e1bd74a32017a041d1fba1ffe0e744b2bf9b0f77 Mon Sep 17 00:00:00 2001 From: Nikhil Sonti Date: Thu, 26 Feb 2026 13:36:22 -0800 Subject: [PATCH 2/4] fix: self-kill if health checks don't arrive --- apps/server/src/api/routes/health.ts | 20 ++++++++++++++++++++ 1 file changed, 20 insertions(+) diff --git a/apps/server/src/api/routes/health.ts b/apps/server/src/api/routes/health.ts index dee29ed2..736a821a 100644 --- a/apps/server/src/api/routes/health.ts +++ b/apps/server/src/api/routes/health.ts @@ -4,15 +4,35 @@ * SPDX-License-Identifier: AGPL-3.0-or-later */ +import { EXIT_CODES } from '@browseros/shared/constants/exit-codes' import { Hono } from 'hono' import type { Browser } from '../../browser/browser' +import { logger } from '../../lib/logger' + +const HEALTH_CHECK_TIMEOUT = 5 * 60 * 1000 // 5 minutes interface HealthDeps { browser?: Browser } export function createHealthRoute(deps: HealthDeps = {}) { + let watchdogTimer: ReturnType | null = null + + function resetWatchdog() { + if (watchdogTimer) clearTimeout(watchdogTimer) + watchdogTimer = setTimeout(() => { + logger.error( + 'No health check received in 5 minutes, Chromium may be gone — exiting', + ) + process.exit(EXIT_CODES.GENERAL_ERROR) + }, HEALTH_CHECK_TIMEOUT) + } + + // Start the watchdog on creation + resetWatchdog() + return new Hono().get('/', (c) => { + resetWatchdog() const cdpConnected = deps.browser?.isCdpConnected() ?? true return c.json({ status: 'ok', cdpConnected }) }) From 11ea35818243ead9765ee3d380988f7c03947a87 Mon Sep 17 00:00:00 2001 From: Nikhil Sonti Date: Thu, 26 Feb 2026 13:36:37 -0800 Subject: [PATCH 3/4] fix: timer cdp race --- apps/server/src/browser/backends/cdp.ts | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/apps/server/src/browser/backends/cdp.ts b/apps/server/src/browser/backends/cdp.ts index 6f7caa81..e2403274 100644 --- a/apps/server/src/browser/backends/cdp.ts +++ b/apps/server/src/browser/backends/cdp.ts @@ -109,17 +109,20 @@ class CdpBackend implements ICdpBackend { this.keepaliveTimer = setInterval(async () => { if (!this.ws || !this.connected || this.disconnecting) return + let timeoutId: ReturnType | undefined try { await Promise.race([ this.rawSend('Browser.getVersion'), - new Promise((_, reject) => - setTimeout( + new Promise((_, reject) => { + timeoutId = setTimeout( () => reject(new Error('CDP keepalive timeout')), timeout, - ), - ), + ) + }), ]) + clearTimeout(timeoutId) } catch { + clearTimeout(timeoutId) logger.warn('CDP keepalive failed, connection may be dead') this.handleDeadConnection() } From 193151de982269bb90324b529f47cec1eb6314e6 Mon Sep 17 00:00:00 2001 From: Nikhil Sonti Date: Thu, 26 Feb 2026 13:52:00 -0800 Subject: [PATCH 4/4] fix: stale this --- apps/server/src/browser/backends/cdp.ts | 2 ++ 1 file changed, 2 insertions(+) diff --git a/apps/server/src/browser/backends/cdp.ts b/apps/server/src/browser/backends/cdp.ts index e2403274..ba984273 100644 --- a/apps/server/src/browser/backends/cdp.ts +++ b/apps/server/src/browser/backends/cdp.ts @@ -87,6 +87,8 @@ class CdpBackend implements ICdpBackend { } ws.onclose = () => { + // Guard against stale onclose from a replaced socket + if (this.ws !== ws) return this.connected = false this.ws = null if (opened) this.handleUnexpectedClose()