From 38781f494e47a01a3e97d5b2e0134d5bb190b6fd Mon Sep 17 00:00:00 2001 From: Avi Fenesh Date: Thu, 26 Feb 2026 03:06:03 +0200 Subject: [PATCH 1/9] feat(providers): add content detection config to X provider Add contentSelectors and contentBlockedIndicators fields to the X (Twitter) provider entry. These define the DOM selectors and text patterns used to detect when X.com blocks headless browsers from viewing feed content. Updated notes to document blocking behavior. --- scripts/providers.json | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/scripts/providers.json b/scripts/providers.json index 42462ac..d4ee8c9 100644 --- a/scripts/providers.json +++ b/scripts/providers.json @@ -53,8 +53,14 @@ "captchaTextPatterns": ["verify it's you", "complete the challenge", "confirm your identity"], "twoFactorHint": "X may prompt for a TOTP code, SMS code, or security key. X may also ask you to confirm your username or phone number as an identity check before password.", "twoFactorSelectors": ["input[data-testid=\"ocfEnterTextTextInput\"]", "input[name=\"text\"][autocomplete=\"one-time-code\"]"], + "contentSelectors": ["[data-testid=\"primaryColumn\"]", "article[data-testid=\"tweet\"]", "[data-testid=\"cellInnerDiv\"]"], + "contentBlockedIndicators": { + "selectors": ["[data-testid=\"empty_state_header_text\"]", "[data-testid=\"error-detail\"]"], + "textPatterns": ["something went wrong", "try again", "content is not available", "this page is not available"], + "emptyContentThreshold": 200 + }, "flowType": "spa", - "notes": "Multi-step SPA flow: identifier -> optional username challenge -> password -> optional 2FA. URL does not change between steps. Arkose FunCAPTCHA triggers on nearly every new IP. Most aggressive anti-bot of any provider." + "notes": "Multi-step SPA flow: identifier -> optional username challenge -> password -> optional 2FA. URL does not change between steps. Arkose FunCAPTCHA triggers on nearly every new IP. 
Most aggressive anti-bot of any provider. X.com blocks headless browsers from viewing feed content - pages load but timelines appear empty or show error states. Authenticated sessions with real browser profiles work reliably." }, { "slug": "reddit", From 0ee440f83d29b93b1537650246743dfdd87e1daf Mon Sep 17 00:00:00 2001 From: Avi Fenesh Date: Thu, 26 Feb 2026 03:07:00 +0200 Subject: [PATCH 2/9] feat(detect): add detectContentBlocked function for headless blocking Add a new detectContentBlocked() function that detects when sites serve pages but block actual content from headless browsers. Uses five ordered heuristics (OR logic): provider blocked selectors, provider blocked text patterns, empty content areas, generic error text with short body, and persistent loading indicators. Exports CONTENT_BLOCKED_TEXT_PATTERNS. --- scripts/auth-wall-detect.js | 174 +++++++++++++++++++++++++++++++++++- 1 file changed, 171 insertions(+), 3 deletions(-) diff --git a/scripts/auth-wall-detect.js b/scripts/auth-wall-detect.js index 7c3c1a3..183392a 100644 --- a/scripts/auth-wall-detect.js +++ b/scripts/auth-wall-detect.js @@ -1,14 +1,18 @@ 'use strict'; /** - * Auth wall detection module. + * Auth wall and content blocking detection module. * - * Detects whether a page is showing an authentication wall after navigation. + * detectAuthWall: Detects whether a page is showing an authentication wall. * Uses three heuristics (ALL must pass - AND logic): * 1. Domain cookies exist for the target URL * 2. Current page URL matches a known auth URL pattern * 3. Page DOM contains login-related elements or text * * Short-circuits: if cookie check fails, skips URL and DOM checks. + * + * detectContentBlocked: Detects when a site serves a page but blocks the + * actual content (e.g. X.com serving empty timelines to headless browsers). + * Uses provider-specific and generic heuristics (OR logic - any match triggers). 
*/ @@ -148,4 +152,168 @@ async function detectAuthWall(page, context, targetUrl) { return { detected: false, reason: 'no_auth_elements' }; } -module.exports = { detectAuthWall, AUTH_URL_PATTERNS, AUTH_DOM_SELECTORS, AUTH_TEXT_PATTERNS }; +// --- Content blocking detection --- + +const CONTENT_BLOCKED_TEXT_PATTERNS = [ + 'something went wrong', + 'try again', + 'content is not available', + 'this page is not available', + 'page isn\'t available', + 'page not found', + 'access denied', + 'please enable javascript' +]; + +const LOADING_INDICATOR_SELECTORS = [ + '[role="progressbar"]', + '[aria-busy="true"]', + '.spinner', + '.loading' +]; + +const DEFAULT_EMPTY_CONTENT_THRESHOLD = 200; + +/** + * Detect whether the page content is blocked (e.g. by headless browser detection). + * + * Unlike auth wall detection (AND logic), content blocking uses OR logic - + * any single heuristic match triggers detection. Checks are ordered from + * most specific (provider selectors) to most generic (persistent spinners). 
+ * + * @param {import('playwright').Page} page + * @param {object} [options={}] + * @param {string[]} [options.contentSelectors] - Provider-specific content selectors + * @param {object} [options.contentBlockedIndicators] - Provider-specific blocked indicators + * @param {string[]} [options.contentBlockedIndicators.selectors] - Selectors that indicate blocked content + * @param {string[]} [options.contentBlockedIndicators.textPatterns] - Text patterns that indicate blocked content + * @param {number} [options.contentBlockedIndicators.emptyContentThreshold] - Min chars for content to be considered present + * @param {number} [options.timeout] - Not currently used, reserved for future async checks + * @returns {Promise<{ detected: boolean, reason: string, details?: object }>} + */ +async function detectContentBlocked(page, options = {}) { + const { contentSelectors, contentBlockedIndicators } = options; + const blockedSelectors = contentBlockedIndicators?.selectors || []; + const blockedTextPatterns = contentBlockedIndicators?.textPatterns || []; + const emptyThreshold = contentBlockedIndicators?.emptyContentThreshold || DEFAULT_EMPTY_CONTENT_THRESHOLD; + + // 1. Provider-specific blocked selectors + if (blockedSelectors.length > 0) { + try { + const results = await Promise.allSettled( + blockedSelectors.map(async (sel) => ({ sel, el: await page.$(sel) })) + ); + for (const r of results) { + if (r.status === 'fulfilled' && r.value.el) { + return { + detected: true, + reason: 'provider_blocked_selector', + details: { selector: r.value.sel } + }; + } + } + } catch { + // DOM query failed - continue to next check + } + } + + // 2. 
Provider-specific blocked text patterns + if (blockedTextPatterns.length > 0) { + try { + const bodyText = (await page.textContent('body') || '').slice(0, 5000).toLowerCase(); + const matched = blockedTextPatterns.find(pattern => bodyText.includes(pattern)); + if (matched) { + return { + detected: true, + reason: 'provider_blocked_text', + details: { pattern: matched } + }; + } + } catch { + // textContent failed - continue to next check + } + } + + // 3. Provider content selectors exist but contain very little text + if (contentSelectors && contentSelectors.length > 0) { + try { + let totalContentLength = 0; + let anyContentSelectorFound = false; + + const results = await Promise.allSettled( + contentSelectors.map(async (sel) => { + const el = await page.$(sel); + if (!el) return { sel, found: false, length: 0 }; + const text = await el.textContent() || ''; + return { sel, found: true, length: text.trim().length }; + }) + ); + + for (const r of results) { + if (r.status === 'fulfilled' && r.value.found) { + anyContentSelectorFound = true; + totalContentLength += r.value.length; + } + } + + if (anyContentSelectorFound && totalContentLength < emptyThreshold) { + return { + detected: true, + reason: 'content_empty', + details: { contentLength: totalContentLength, threshold: emptyThreshold } + }; + } + } catch { + // DOM query failed - continue to next check + } + } + + // 4. Generic text patterns + short main content area + try { + const bodyText = (await page.textContent('body') || '').slice(0, 5000).toLowerCase(); + const genericMatch = CONTENT_BLOCKED_TEXT_PATTERNS.find(pattern => bodyText.includes(pattern)); + if (genericMatch && bodyText.length < 500) { + return { + detected: true, + reason: 'generic_blocked_text', + details: { pattern: genericMatch, bodyLength: bodyText.length } + }; + } + } catch { + // textContent failed - continue to next check + } + + // 5. 
Persistent loading indicators (spinners still visible) + try { + const results = await Promise.allSettled( + LOADING_INDICATOR_SELECTORS.map(async (sel) => { + const el = await page.$(sel); + if (!el) return { sel, visible: false }; + const visible = await el.isVisible(); + return { sel, visible }; + }) + ); + for (const r of results) { + if (r.status === 'fulfilled' && r.value.visible) { + return { + detected: true, + reason: 'persistent_loader', + details: { selector: r.value.sel } + }; + } + } + } catch { + // DOM query failed - no detection + } + + return { detected: false, reason: 'content_ok' }; +} + +module.exports = { + detectAuthWall, + detectContentBlocked, + AUTH_URL_PATTERNS, + AUTH_DOM_SELECTORS, + AUTH_TEXT_PATTERNS, + CONTENT_BLOCKED_TEXT_PATTERNS +}; From c6dd7d6739ec26aab2d796ce68a2e69497fb702d Mon Sep 17 00:00:00 2001 From: Avi Fenesh Date: Thu, 26 Feb 2026 03:07:40 +0200 Subject: [PATCH 3/9] feat(stealth): enhance anti-bot evasion in browser init script Expand the addInitScript block with additional stealth measures: - Spoof window.chrome object (present in real Chrome, missing in headless) - Spoof navigator.plugins with non-empty PluginArray-like object - Set navigator.languages to ['en-US', 'en'] - Override WebGL vendor/renderer to Intel Inc. 
/ Intel Iris OpenGL Engine - Override permissions.query for 'notifications' to return denied state --- scripts/browser-launcher.js | 37 +++++++++++++++++++++++++++++++++++++ 1 file changed, 37 insertions(+) diff --git a/scripts/browser-launcher.js b/scripts/browser-launcher.js index 1bd9268..7650ba4 100644 --- a/scripts/browser-launcher.js +++ b/scripts/browser-launcher.js @@ -106,7 +106,44 @@ async function launchBrowser(sessionName, options = {}) { // Anti-bot init script on all pages await context.addInitScript(() => { + // Hide webdriver flag Object.defineProperty(navigator, 'webdriver', { get: () => false }); + + // Spoof window.chrome object (present in real Chrome, missing in headless) + if (!window.chrome) { + window.chrome = { runtime: {}, csi: function() {}, loadTimes: function() {} }; + } + + // Spoof navigator.plugins to appear non-empty (headless has empty PluginArray) + Object.defineProperty(navigator, 'plugins', { + get: () => { + const arr = [{ name: 'Chrome PDF Plugin', filename: 'internal-pdf-viewer', description: 'Portable Document Format' }]; + arr.item = (i) => arr[i]; + arr.namedItem = (n) => arr.find(p => p.name === n) || null; + arr.refresh = () => {}; + return arr; + } + }); + + // Set navigator.languages (headless may report empty or single-entry) + Object.defineProperty(navigator, 'languages', { get: () => ['en-US', 'en'] }); + + // Override WebGL renderer to common hardware (Intel Iris) + const getParameter = WebGLRenderingContext.prototype.getParameter; + WebGLRenderingContext.prototype.getParameter = function(param) { + if (param === 0x9245) return 'Intel Inc.'; // UNMASKED_VENDOR_WEBGL + if (param === 0x9246) return 'Intel Iris OpenGL Engine'; // UNMASKED_RENDERER_WEBGL + return getParameter.call(this, param); + }; + + // Override permissions.query for 'notifications' (headless returns 'prompt') + const origQuery = navigator.permissions.query.bind(navigator.permissions); + navigator.permissions.query = (params) => { + if (params.name 
=== 'notifications') { + return Promise.resolve({ state: 'denied', onchange: null }); + } + return origQuery(params); + }; }); // Get or create the first page From bbc94361ef77707d955068073e6d4d90ba4cfb28 Mon Sep 17 00:00:00 2001 From: Avi Fenesh Date: Thu, 26 Feb 2026 03:08:53 +0200 Subject: [PATCH 4/9] feat(goto): integrate content blocking detection into goto action Import detectContentBlocked and add matchProviderByDomain helper with lazy-loaded Map for O(1) provider lookup. After goto navigation and waitForLoaded, detect content blocking using provider-specific config from providers.json. When detected, add contentBlocked, warning, contentBlockedReason, and suggestion fields to the result. Add --no-content-block-detect flag to skip detection. --- scripts/web-ctl.js | 58 +++++++++++++++++++++++++++++++++++++++++++--- 1 file changed, 55 insertions(+), 3 deletions(-) diff --git a/scripts/web-ctl.js b/scripts/web-ctl.js index ebcaf2a..72ebf85 100755 --- a/scripts/web-ctl.js +++ b/scripts/web-ctl.js @@ -3,7 +3,7 @@ const sessionStore = require('./session-store'); const { launchBrowser, closeBrowser, randomDelay, waitForStable, waitForLoaded, canLaunchHeaded } = require('./browser-launcher'); -const { detectAuthWall } = require('./auth-wall-detect'); +const { detectAuthWall, detectContentBlocked } = require('./auth-wall-detect'); const { runAuthFlow } = require('./auth-flow'); const { checkAuthSuccess } = require('./auth-check'); const { sanitizeWebContent, wrapOutput } = require('./redact'); @@ -21,7 +21,7 @@ const BOOLEAN_FLAGS = new Set([ '--allow-evaluate', '--no-snapshot', '--wait-stable', '--vnc', '--exact', '--accept', '--submit', '--dismiss', '--auto', '--snapshot-collapse', '--snapshot-text-only', '--snapshot-compact', - '--snapshot-full', '--no-auth-wall-detect', '--ensure-auth', '--wait-loaded', + '--snapshot-full', '--no-auth-wall-detect', '--no-content-block-detect', '--ensure-auth', '--wait-loaded', ]); function validateSessionName(name) { @@ -57,6 +57,37 @@ 
function parseOptions(args) { return opts; } +/** + * Match a provider by domain from providers.json. + * Uses a lazy-loaded Map keyed by domain for O(1) lookup. + */ +let _providerDomainMap = null; +function matchProviderByDomain(url) { + if (!_providerDomainMap) { + _providerDomainMap = new Map(); + try { + const providers = require('./providers.json'); + for (const p of providers) { + try { + const domain = new URL(p.loginUrl).hostname; + _providerDomainMap.set(domain, p); + } catch { + // Skip provider with invalid loginUrl + } + } + } catch { + // providers.json load failed - return null for all lookups + } + } + + try { + const domain = new URL(url).hostname; + return _providerDomainMap.get(domain) || null; + } catch { + return null; + } +} + /** * Convert selector string to Playwright locator. */ @@ -1044,8 +1075,28 @@ async function runAction(sessionName, action, actionArgs, opts) { if (opts.waitLoaded) { await waitForLoaded(page, { timeout: loadedTimeout }); } + // Content blocking detection (e.g. X.com empty feeds in headless) + let contentBlockResult = null; + if (!opts.noContentBlockDetect) { + const provider = matchProviderByDomain(url); + contentBlockResult = await detectContentBlocked(page, { + contentSelectors: provider?.contentSelectors, + contentBlockedIndicators: provider?.contentBlockedIndicators + }); + } const snapshot = await getSnapshot(page, opts); - result = { url: page.url(), status: response ? response.status() : null, ...(opts.waitLoaded && { waitLoaded: true }), ...(snapshot != null && { snapshot }) }; + result = { + url: page.url(), + status: response ? response.status() : null, + ...(opts.waitLoaded && { waitLoaded: true }), + ...(contentBlockResult?.detected && { + contentBlocked: true, + warning: 'content_blocked', + contentBlockedReason: contentBlockResult.reason, + suggestion: "Site may be blocking headless browsers. 
Try: (1) authenticate with 'session auth --provider ', (2) use --ensure-auth for headed mode" + }), + ...(snapshot != null && { snapshot }) + }; break; } @@ -1270,6 +1321,7 @@ Run actions: goto Navigate to URL [--ensure-auth] Poll for auth completion instead of timed checkpoint [--wait-loaded] Wait for async content to finish rendering + [--no-content-block-detect] Skip content blocking detection [--timeout ] Wait timeout (default: 15000) snapshot Get accessibility tree click Click element From 8063a109d337120f0d7f2beec1265349f101ebf4 Mon Sep 17 00:00:00 2001 From: Avi Fenesh Date: Thu, 26 Feb 2026 03:10:01 +0200 Subject: [PATCH 5/9] test: add comprehensive tests for detectContentBlocked Add 19 tests covering all detection heuristics: - Provider-specific blocked selectors and text patterns - Empty content area detection with threshold - Generic error text with short body - Persistent loading indicators (visible vs invisible) - Error handling for page.$() and textContent() failures - Default emptyContentThreshold of 200 - X.com-specific: empty feed, error state, no false positives - CONTENT_BLOCKED_TEXT_PATTERNS export validation --- tests/content-blocked.test.js | 306 ++++++++++++++++++++++++++++++++++ 1 file changed, 306 insertions(+) create mode 100644 tests/content-blocked.test.js diff --git a/tests/content-blocked.test.js b/tests/content-blocked.test.js new file mode 100644 index 0000000..58bc670 --- /dev/null +++ b/tests/content-blocked.test.js @@ -0,0 +1,306 @@ +'use strict'; + +const { describe, it } = require('node:test'); +const assert = require('node:assert/strict'); +const { + detectContentBlocked, + CONTENT_BLOCKED_TEXT_PATTERNS +} = require('../scripts/auth-wall-detect'); + +// --- Mock helpers --- + +function mockPage({ selectors, bodyText, elementTexts, visibleSelectors } = {}) { + return { + $: async (sel) => { + if (selectors && selectors.includes(sel)) { + const text = (elementTexts && elementTexts[sel]) || ''; + return { + textContent: async () 
=> text, + isVisible: async () => (visibleSelectors ? visibleSelectors.includes(sel) : false) + }; + } + return null; + }, + textContent: async (sel) => sel === 'body' ? (bodyText || '') : '' + }; +} + +// --- Tests --- + +describe('detectContentBlocked', () => { + + it('returns detected: false when page has sufficient content', async () => { + const page = mockPage({ + selectors: ['[data-testid="primaryColumn"]'], + elementTexts: { '[data-testid="primaryColumn"]': 'x'.repeat(300) }, + bodyText: 'Lots of content on this page including tweets and posts' + }); + const result = await detectContentBlocked(page, { + contentSelectors: ['[data-testid="primaryColumn"]'], + contentBlockedIndicators: { + selectors: ['[data-testid="empty_state_header_text"]'], + textPatterns: ['something went wrong'], + emptyContentThreshold: 200 + } + }); + assert.equal(result.detected, false); + assert.equal(result.reason, 'content_ok'); + }); + + it('returns detected: true when provider contentBlockedIndicators.selectors match', async () => { + const page = mockPage({ + selectors: ['[data-testid="empty_state_header_text"]'], + bodyText: 'Some page content' + }); + const result = await detectContentBlocked(page, { + contentBlockedIndicators: { + selectors: ['[data-testid="empty_state_header_text"]', '[data-testid="error-detail"]'], + textPatterns: [], + emptyContentThreshold: 200 + } + }); + assert.equal(result.detected, true); + assert.equal(result.reason, 'provider_blocked_selector'); + assert.equal(result.details.selector, '[data-testid="empty_state_header_text"]'); + }); + + it('returns detected: true when provider contentBlockedIndicators.textPatterns match', async () => { + const page = mockPage({ + bodyText: 'Something went wrong. Try reloading.' 
+ }); + const result = await detectContentBlocked(page, { + contentBlockedIndicators: { + selectors: [], + textPatterns: ['something went wrong', 'try again'], + emptyContentThreshold: 200 + } + }); + assert.equal(result.detected, true); + assert.equal(result.reason, 'provider_blocked_text'); + assert.equal(result.details.pattern, 'something went wrong'); + }); + + it('returns detected: true when provider contentSelectors exist but are empty (below threshold)', async () => { + const page = mockPage({ + selectors: ['[data-testid="primaryColumn"]'], + elementTexts: { '[data-testid="primaryColumn"]': 'X' }, + bodyText: 'Some page with navigation but no feed content' + }); + const result = await detectContentBlocked(page, { + contentSelectors: ['[data-testid="primaryColumn"]', 'article[data-testid="tweet"]'], + contentBlockedIndicators: { + selectors: [], + textPatterns: [], + emptyContentThreshold: 200 + } + }); + assert.equal(result.detected, true); + assert.equal(result.reason, 'content_empty'); + assert.ok(result.details.contentLength < 200); + assert.equal(result.details.threshold, 200); + }); + + it('returns detected: true with generic text patterns + short main content', async () => { + const shortBody = 'Something went wrong'; + const page = mockPage({ bodyText: shortBody }); + const result = await detectContentBlocked(page); + assert.equal(result.detected, true); + assert.equal(result.reason, 'generic_blocked_text'); + assert.equal(result.details.pattern, 'something went wrong'); + assert.ok(result.details.bodyLength < 500); + }); + + it('does NOT trigger generic text patterns when body is long', async () => { + const longBody = 'Something went wrong earlier but here is lots of content. 
' + 'x'.repeat(600); + const page = mockPage({ bodyText: longBody }); + const result = await detectContentBlocked(page); + assert.equal(result.detected, false); + assert.equal(result.reason, 'content_ok'); + }); + + it('returns detected: true when loading indicators persist (spinner still visible)', async () => { + const page = mockPage({ + selectors: ['[role="progressbar"]'], + visibleSelectors: ['[role="progressbar"]'], + bodyText: 'Loading content...' + 'x'.repeat(600) + }); + const result = await detectContentBlocked(page); + assert.equal(result.detected, true); + assert.equal(result.reason, 'persistent_loader'); + assert.equal(result.details.selector, '[role="progressbar"]'); + }); + + it('does NOT trigger on invisible loading indicators', async () => { + const page = mockPage({ + selectors: ['[role="progressbar"]'], + visibleSelectors: [], + bodyText: 'Normal page content with lots of text. ' + 'x'.repeat(600) + }); + const result = await detectContentBlocked(page); + assert.equal(result.detected, false); + assert.equal(result.reason, 'content_ok'); + }); + + it('returns detected: false with no options and normal content', async () => { + const page = mockPage({ + bodyText: 'This is a normal page with plenty of content. ' + 'x'.repeat(600) + }); + const result = await detectContentBlocked(page); + assert.equal(result.detected, false); + assert.equal(result.reason, 'content_ok'); + }); + + it('handles page.$() errors gracefully', async () => { + const page = { + $: async () => { throw new Error('Frame detached'); }, + textContent: async () => 'Normal page with plenty of content. 
' + 'x'.repeat(600) + }; + const result = await detectContentBlocked(page, { + contentSelectors: ['div.content'], + contentBlockedIndicators: { + selectors: ['div.error'], + textPatterns: [], + emptyContentThreshold: 200 + } + }); + assert.equal(result.detected, false); + assert.equal(result.reason, 'content_ok'); + }); + + it('handles page.textContent() errors gracefully', async () => { + const page = { + $: async () => null, + textContent: async () => { throw new Error('Frame detached'); } + }; + const result = await detectContentBlocked(page, { + contentBlockedIndicators: { + selectors: [], + textPatterns: ['error'], + emptyContentThreshold: 200 + } + }); + assert.equal(result.detected, false); + assert.equal(result.reason, 'content_ok'); + }); + + it('uses default emptyContentThreshold of 200', async () => { + const page = mockPage({ + selectors: ['div.feed'], + elementTexts: { 'div.feed': 'x'.repeat(150) }, + bodyText: 'Normal page' + }); + // No emptyContentThreshold specified - should default to 200 + const result = await detectContentBlocked(page, { + contentSelectors: ['div.feed'], + contentBlockedIndicators: { + selectors: [], + textPatterns: [] + } + }); + assert.equal(result.detected, true); + assert.equal(result.reason, 'content_empty'); + assert.equal(result.details.threshold, 200); + }); + + it('does not flag content_empty when threshold is met', async () => { + const page = mockPage({ + selectors: ['div.feed'], + elementTexts: { 'div.feed': 'x'.repeat(250) }, + bodyText: 'Normal page' + }); + const result = await detectContentBlocked(page, { + contentSelectors: ['div.feed'], + contentBlockedIndicators: { + selectors: [], + textPatterns: [], + emptyContentThreshold: 200 + } + }); + assert.equal(result.detected, false); + assert.equal(result.reason, 'content_ok'); + }); +}); + +describe('detectContentBlocked - X.com-specific', () => { + + it('detects empty feed (primaryColumn exists, no tweets)', async () => { + const page = mockPage({ + selectors: 
['[data-testid="primaryColumn"]'], + elementTexts: { '[data-testid="primaryColumn"]': 'Home' }, + bodyText: 'Home What is happening?!' + }); + const result = await detectContentBlocked(page, { + contentSelectors: ['[data-testid="primaryColumn"]', 'article[data-testid="tweet"]', '[data-testid="cellInnerDiv"]'], + contentBlockedIndicators: { + selectors: ['[data-testid="empty_state_header_text"]', '[data-testid="error-detail"]'], + textPatterns: ['something went wrong', 'try again', 'content is not available', 'this page is not available'], + emptyContentThreshold: 200 + } + }); + assert.equal(result.detected, true); + assert.equal(result.reason, 'content_empty'); + }); + + it('detects error state (Something went wrong)', async () => { + const page = mockPage({ + selectors: ['[data-testid="error-detail"]'], + bodyText: 'Something went wrong. Try reloading.' + }); + const result = await detectContentBlocked(page, { + contentSelectors: ['[data-testid="primaryColumn"]', 'article[data-testid="tweet"]'], + contentBlockedIndicators: { + selectors: ['[data-testid="empty_state_header_text"]', '[data-testid="error-detail"]'], + textPatterns: ['something went wrong', 'try again'], + emptyContentThreshold: 200 + } + }); + assert.equal(result.detected, true); + // Should match provider_blocked_selector first (error-detail found) + assert.equal(result.reason, 'provider_blocked_selector'); + assert.equal(result.details.selector, '[data-testid="error-detail"]'); + }); + + it('no false positive when tweets exist', async () => { + const tweetContent = 'Just posted a long tweet with lots of interesting content about programming. 
'.repeat(10); + const page = mockPage({ + selectors: ['[data-testid="primaryColumn"]', 'article[data-testid="tweet"]', '[data-testid="cellInnerDiv"]'], + elementTexts: { + '[data-testid="primaryColumn"]': tweetContent, + 'article[data-testid="tweet"]': tweetContent.slice(0, 200), + '[data-testid="cellInnerDiv"]': tweetContent.slice(0, 200) + }, + bodyText: 'Home ' + tweetContent + }); + const result = await detectContentBlocked(page, { + contentSelectors: ['[data-testid="primaryColumn"]', 'article[data-testid="tweet"]', '[data-testid="cellInnerDiv"]'], + contentBlockedIndicators: { + selectors: ['[data-testid="empty_state_header_text"]', '[data-testid="error-detail"]'], + textPatterns: ['something went wrong', 'try again', 'content is not available', 'this page is not available'], + emptyContentThreshold: 200 + } + }); + assert.equal(result.detected, false); + assert.equal(result.reason, 'content_ok'); + }); +}); + +describe('CONTENT_BLOCKED_TEXT_PATTERNS', () => { + + it('is a non-empty array', () => { + assert.ok(Array.isArray(CONTENT_BLOCKED_TEXT_PATTERNS)); + assert.ok(CONTENT_BLOCKED_TEXT_PATTERNS.length > 0); + }); + + it('contains expected patterns', () => { + assert.ok(CONTENT_BLOCKED_TEXT_PATTERNS.includes('something went wrong')); + assert.ok(CONTENT_BLOCKED_TEXT_PATTERNS.includes('try again')); + assert.ok(CONTENT_BLOCKED_TEXT_PATTERNS.includes('access denied')); + }); + + it('all entries are lowercase strings', () => { + for (const pattern of CONTENT_BLOCKED_TEXT_PATTERNS) { + assert.equal(typeof pattern, 'string'); + assert.equal(pattern, pattern.toLowerCase(), `Pattern "${pattern}" should be lowercase`); + } + }); +}); From 7c7186744688328866cd7c99c9ef858deae31447 Mon Sep 17 00:00:00 2001 From: Avi Fenesh Date: Thu, 26 Feb 2026 03:10:43 +0200 Subject: [PATCH 6/9] test: verify detectContentBlocked and CONTENT_BLOCKED_TEXT_PATTERNS exports Add two tests to the existing auth-wall-detect test suite confirming that the new detectContentBlocked function and 
CONTENT_BLOCKED_TEXT_PATTERNS constant are properly exported from the module. --- tests/auth-wall-detect.test.js | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/tests/auth-wall-detect.test.js b/tests/auth-wall-detect.test.js index e952425..bddb77a 100644 --- a/tests/auth-wall-detect.test.js +++ b/tests/auth-wall-detect.test.js @@ -4,9 +4,11 @@ const { describe, it } = require('node:test'); const assert = require('node:assert/strict'); const { detectAuthWall, + detectContentBlocked, AUTH_URL_PATTERNS, AUTH_DOM_SELECTORS, - AUTH_TEXT_PATTERNS + AUTH_TEXT_PATTERNS, + CONTENT_BLOCKED_TEXT_PATTERNS } = require('../scripts/auth-wall-detect'); // --- Mock helpers --- @@ -235,4 +237,13 @@ describe('detectAuthWall', () => { assert.ok(AUTH_DOM_SELECTORS.length > 0, 'AUTH_DOM_SELECTORS should not be empty'); assert.ok(AUTH_TEXT_PATTERNS.length > 0, 'AUTH_TEXT_PATTERNS should not be empty'); }); + + it('exports detectContentBlocked as a function', () => { + assert.equal(typeof detectContentBlocked, 'function'); + }); + + it('exports CONTENT_BLOCKED_TEXT_PATTERNS as a non-empty array', () => { + assert.ok(Array.isArray(CONTENT_BLOCKED_TEXT_PATTERNS), 'CONTENT_BLOCKED_TEXT_PATTERNS should be an array'); + assert.ok(CONTENT_BLOCKED_TEXT_PATTERNS.length > 0, 'CONTENT_BLOCKED_TEXT_PATTERNS should not be empty'); + }); }); From 8eb272d42b59fc066a9f51fc412755429b2966f9 Mon Sep 17 00:00:00 2001 From: Avi Fenesh Date: Thu, 26 Feb 2026 03:16:42 +0200 Subject: [PATCH 7/9] test: add integration tests for content-block detection in goto --- tests/content-blocked.test.js | 45 +++++++++++++++++++++++++++++++++++ tests/web-ctl-actions.test.js | 37 ++++++++++++++++++++++++++++ 2 files changed, 82 insertions(+) diff --git a/tests/content-blocked.test.js b/tests/content-blocked.test.js index 58bc670..b84b43e 100644 --- a/tests/content-blocked.test.js +++ b/tests/content-blocked.test.js @@ -304,3 +304,48 @@ describe('CONTENT_BLOCKED_TEXT_PATTERNS', () => { } }); 
}); + +describe('matchProviderByDomain', () => { + const fs = require('fs'); + const path = require('path'); + const webCtlSource = fs.readFileSync( + path.join(__dirname, '..', 'scripts', 'web-ctl.js'), + 'utf8' + ); + + it('matchProviderByDomain function exists in web-ctl.js', () => { + assert.ok( + webCtlSource.includes('function matchProviderByDomain(url)'), + 'matchProviderByDomain function should be defined in web-ctl.js' + ); + }); + + it('handles URL parsing with new URL', () => { + assert.ok( + webCtlSource.includes('new URL(url)'), + 'matchProviderByDomain should parse URL using new URL()' + ); + }); + + it('returns null for unmatched domains', () => { + assert.ok( + webCtlSource.includes('|| null'), + 'matchProviderByDomain should return null for unmatched domains' + ); + }); + + it('lazy-loads provider domain map', () => { + assert.ok( + webCtlSource.includes('_providerDomainMap'), + 'matchProviderByDomain should use lazy-loaded provider domain map' + ); + assert.ok( + webCtlSource.includes('if (!_providerDomainMap)'), + 'matchProviderByDomain should check if domain map needs initialization' + ); + assert.ok( + webCtlSource.includes('new Map()'), + 'matchProviderByDomain should create a Map for caching' + ); + }); +}); diff --git a/tests/web-ctl-actions.test.js b/tests/web-ctl-actions.test.js index c392a50..8490c92 100644 --- a/tests/web-ctl-actions.test.js +++ b/tests/web-ctl-actions.test.js @@ -760,6 +760,43 @@ describe('auth wall detection in goto', () => { }); }); +describe('content blocking detection in goto', () => { + const fs = require('fs'); + const path = require('path'); + const webCtlSource = fs.readFileSync( + path.join(__dirname, '..', 'scripts', 'web-ctl.js'), + 'utf8' + ); + + it('goto case imports detectContentBlocked', () => { + assert.ok( + webCtlSource.includes('detectContentBlocked'), + 'goto case should import detectContentBlocked' + ); + }); + + it('goto case checks noContentBlockDetect flag', () => { + assert.ok( + 
webCtlSource.includes('noContentBlockDetect'), + 'goto case should check noContentBlockDetect flag' + ); + }); + + it('goto case calls matchProviderByDomain', () => { + assert.ok( + webCtlSource.includes('matchProviderByDomain'), + 'goto case should match provider by domain' + ); + }); + + it('--no-content-block-detect is a valid boolean flag', () => { + assert.ok( + webCtlSource.includes("'--no-content-block-detect'"), + '--no-content-block-detect should be in BOOLEAN_FLAGS' + ); + }); +}); + describe('--ensure-auth flag', () => { const fs = require('fs'); const path = require('path'); From e34ef2f0244260586f03e337e9199fa7cf289037 Mon Sep 17 00:00:00 2001 From: Avi Fenesh Date: Thu, 26 Feb 2026 03:20:39 +0200 Subject: [PATCH 8/9] refactor: cache bodyText, export LOADING_INDICATOR_SELECTORS, add edge case tests - Cache bodyText fetch in detectContentBlocked to avoid redundant DOM query - Export LOADING_INDICATOR_SELECTORS for testability - Add empty contentSelectors array edge case test - Add LOADING_INDICATOR_SELECTORS validation tests --- scripts/auth-wall-detect.js | 37 +++++++++++++++++----------------- tests/content-blocked.test.js | 38 ++++++++++++++++++++++++++++++++++- 2 files changed, 56 insertions(+), 19 deletions(-) diff --git a/scripts/auth-wall-detect.js b/scripts/auth-wall-detect.js index 183392a..c7caf0c 100644 --- a/scripts/auth-wall-detect.js +++ b/scripts/auth-wall-detect.js @@ -197,6 +197,14 @@ async function detectContentBlocked(page, options = {}) { const blockedTextPatterns = contentBlockedIndicators?.textPatterns || []; const emptyThreshold = contentBlockedIndicators?.emptyContentThreshold || DEFAULT_EMPTY_CONTENT_THRESHOLD; + // Fetch body text once, reuse in steps 2 and 4 + let bodyText = null; + try { + bodyText = (await page.textContent('body') || '').slice(0, 5000).toLowerCase(); + } catch { + // textContent failed - text-based checks will be skipped + } + // 1. 
Provider-specific blocked selectors if (blockedSelectors.length > 0) { try { @@ -218,19 +226,14 @@ async function detectContentBlocked(page, options = {}) { } // 2. Provider-specific blocked text patterns - if (blockedTextPatterns.length > 0) { - try { - const bodyText = (await page.textContent('body') || '').slice(0, 5000).toLowerCase(); - const matched = blockedTextPatterns.find(pattern => bodyText.includes(pattern)); - if (matched) { - return { - detected: true, - reason: 'provider_blocked_text', - details: { pattern: matched } - }; - } - } catch { - // textContent failed - continue to next check + if (blockedTextPatterns.length > 0 && bodyText !== null) { + const matched = blockedTextPatterns.find(pattern => bodyText.includes(pattern)); + if (matched) { + return { + detected: true, + reason: 'provider_blocked_text', + details: { pattern: matched } + }; } } @@ -269,8 +272,7 @@ async function detectContentBlocked(page, options = {}) { } // 4. Generic text patterns + short main content area - try { - const bodyText = (await page.textContent('body') || '').slice(0, 5000).toLowerCase(); + if (bodyText !== null) { const genericMatch = CONTENT_BLOCKED_TEXT_PATTERNS.find(pattern => bodyText.includes(pattern)); if (genericMatch && bodyText.length < 500) { return { @@ -279,8 +281,6 @@ async function detectContentBlocked(page, options = {}) { details: { pattern: genericMatch, bodyLength: bodyText.length } }; } - } catch { - // textContent failed - continue to next check } // 5. 
Persistent loading indicators (spinners still visible) @@ -315,5 +315,6 @@ module.exports = { AUTH_URL_PATTERNS, AUTH_DOM_SELECTORS, AUTH_TEXT_PATTERNS, - CONTENT_BLOCKED_TEXT_PATTERNS + CONTENT_BLOCKED_TEXT_PATTERNS, + LOADING_INDICATOR_SELECTORS }; diff --git a/tests/content-blocked.test.js b/tests/content-blocked.test.js index b84b43e..06f4f9b 100644 --- a/tests/content-blocked.test.js +++ b/tests/content-blocked.test.js @@ -4,7 +4,8 @@ const { describe, it } = require('node:test'); const assert = require('node:assert/strict'); const { detectContentBlocked, - CONTENT_BLOCKED_TEXT_PATTERNS + CONTENT_BLOCKED_TEXT_PATTERNS, + LOADING_INDICATOR_SELECTORS } = require('../scripts/auth-wall-detect'); // --- Mock helpers --- @@ -202,6 +203,22 @@ describe('detectContentBlocked', () => { assert.equal(result.details.threshold, 200); }); + it('skips content check when contentSelectors is empty array', async () => { + const page = mockPage({ + bodyText: 'Normal page with plenty of content. ' + 'x'.repeat(600) + }); + const result = await detectContentBlocked(page, { + contentSelectors: [], + contentBlockedIndicators: { + selectors: [], + textPatterns: [], + emptyContentThreshold: 200 + } + }); + assert.equal(result.detected, false); + assert.equal(result.reason, 'content_ok'); + }); + it('does not flag content_empty when threshold is met', async () => { const page = mockPage({ selectors: ['div.feed'], @@ -305,6 +322,25 @@ describe('CONTENT_BLOCKED_TEXT_PATTERNS', () => { }); }); +describe('LOADING_INDICATOR_SELECTORS', () => { + + it('is a non-empty array', () => { + assert.ok(Array.isArray(LOADING_INDICATOR_SELECTORS)); + assert.ok(LOADING_INDICATOR_SELECTORS.length > 0); + }); + + it('contains expected selectors', () => { + assert.ok(LOADING_INDICATOR_SELECTORS.includes('[role="progressbar"]')); + assert.ok(LOADING_INDICATOR_SELECTORS.includes('[aria-busy="true"]')); + }); + + it('all entries are strings', () => { + for (const sel of LOADING_INDICATOR_SELECTORS) { + 
assert.equal(typeof sel, 'string'); + } + }); +}); + describe('matchProviderByDomain', () => { const fs = require('fs'); const path = require('path'); From 117ffc1cc8beda4638f7ea0d72578f98bc892157 Mon Sep 17 00:00:00 2001 From: Avi Fenesh Date: Thu, 26 Feb 2026 03:25:29 +0200 Subject: [PATCH 9/9] docs: document content blocking detection in goto action --- CHANGELOG.md | 1 + README.md | 3 ++- skills/web-browse/SKILL.md | 6 ++++-- 3 files changed, 7 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index c22c2ed..13305df 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -27,6 +27,7 @@ - `--min-wait <seconds>` flag for `session auth` to configure grace period before auth success polling starts (default: 5 seconds, clamped to 0-300) - `--max-field-length <n>` flag for `extract` macro to configure maximum characters per extracted field (default: 500, max: 2000) - `--wait-loaded` flag for goto action - waits for async-rendered content to finish loading before taking the snapshot. Combines network idle, DOM stability, and loading indicator absence detection (spinners, skeletons, progress bars, aria-busy). Use `--timeout <ms>` to set wait timeout (default: 15000ms) +- Automatic content blocking detection in goto action - detects when sites serve pages but block content from headless browsers (e.g., X.com empty timelines). Uses provider-specific heuristics (content selectors, blocked indicators) and generic checks (empty content, persistent spinners). Response includes `contentBlocked: true`, `warning: 'content_blocked'`, and recovery suggestions. Disable with `--no-content-block-detect` flag ### Fixed - Smart default snapshot scoping now includes complementary ARIA landmarks (`