From 5b0d93e92585cae9d4748a8e993b12f9cf4afe10 Mon Sep 17 00:00:00 2001 From: Avi Fenesh Date: Tue, 24 Feb 2026 17:32:10 +0200 Subject: [PATCH 1/3] feat(extract): add table-aware field detection for table structures - Skip TH header row when computing TR group signatures so mixed TH/TD tables pass the allSame check - Boost scoring for TR groups under TBODY/TABLE/THEAD - Add column-indexed extraction (column_1, column_2, ...) for headerless tables instead of falling back to generic extractItem - Support column_N fields in selector mode extractField - Update and add tests for headerless table extraction and selector mode column_N fields --- scripts/macros.js | 57 ++++++++++++++++++++++++++++++--- tests/macros.test.js | 76 ++++++++++++++++++++++++++++++++++++++++---- 2 files changed, 123 insertions(+), 10 deletions(-) diff --git a/scripts/macros.js b/scripts/macros.js index 15ca547..556ef89 100644 --- a/scripts/macros.js +++ b/scripts/macros.js @@ -753,6 +753,20 @@ async function extract(page, actionArgs, opts, helpers) { return img ? 
img.getAttribute('src') : null; } default: { + // Table column_N field support + var colMatch = name.match(/^column_(\d+)$/); + if (colMatch) { + var colNum = parseInt(colMatch[1], 10); + var tdIdx = 0; + var ch = el.children; + for (var ci = 0; ci < ch.length; ci++) { + if (ch[ci].tagName === 'TD') { + tdIdx++; + if (tdIdx === colNum) return truncate((ch[ci].textContent || '').trim()); + } + } + return null; + } // Generic: try [class*=name] (sanitize for defense-in-depth) var safeName = name.replace(/[^a-zA-Z0-9_-]/g, ''); if (!safeName) return null; @@ -919,12 +933,25 @@ async function extract(page, actionArgs, opts, helpers) { var keys = Object.keys(groups); for (var k = 0; k < keys.length; k++) { var group = groups[keys[k]]; - if (group.elements.length < 3) continue; - // Cache signature on first element, then compare - var sig = getSignature(group.elements[0]); + // For table TR groups, skip header row (all-TH children) in signature check + var sigStart = 0; + if (group.tag === 'TR') { + var pt = group.parent.tagName; + if (pt === 'TBODY' || pt === 'TABLE' || pt === 'THEAD') { + var firstKids = group.elements[0].children; + var allTH = firstKids.length > 0; + for (var th = 0; th < firstKids.length; th++) { + if (firstKids[th].tagName !== 'TH') { allTH = false; break; } + } + if (allTH) sigStart = 1; + } + } + if (group.elements.length - sigStart < 3) continue; + + var sig = getSignature(group.elements[sigStart]); var allSame = true; - for (var s = 1; s < group.elements.length; s++) { + for (var s = sigStart + 1; s < group.elements.length; s++) { if (getSignature(group.elements[s]) !== sig) { allSame = false; break; @@ -935,6 +962,10 @@ async function extract(page, actionArgs, opts, helpers) { var score = group.elements.length; if (isContentArea(group.parent)) score *= 3; if (isNavArea(group.parent)) score *= 0.3; + if (group.tag === 'TR') { + var pt2 = group.parent.tagName; + if (pt2 === 'TBODY' || pt2 === 'TABLE' || pt2 === 'THEAD') score *= 2; + } if (score 
> bestScore) { bestScore = score; @@ -1022,6 +1053,22 @@ async function extract(page, actionArgs, opts, helpers) { return item; } + function extractTableRowIndexed(tr) { + var item = {}; + var colIdx = 0; + var cells = tr.children; + for (var i = 0; i < cells.length; i++) { + if (cells[i].tagName === 'TD') { + colIdx++; + var cellText = truncate((cells[i].textContent || '').trim()); + if (cellText) item['column_' + colIdx] = cellText; + } + } + var a = tr.querySelector('a[href]'); + if (a) item.url = a.getAttribute('href'); + return item; + } + var tableHeaders = null; var headerRow = null; if (isTableGroup(bestGroup)) { @@ -1038,6 +1085,8 @@ async function extract(page, actionArgs, opts, helpers) { var item; if (tableHeaders && els[e] !== headerRow) { item = extractTableRow(els[e], tableHeaders); + } else if (!tableHeaders && isTableGroup(bestGroup)) { + item = extractTableRowIndexed(els[e]); } else if (!tableHeaders) { item = extractItem(els[e]); } else { diff --git a/tests/macros.test.js b/tests/macros.test.js index 31d67a2..653c838 100644 --- a/tests/macros.test.js +++ b/tests/macros.test.js @@ -1474,6 +1474,45 @@ describe('extract selector mode', () => { assert.equal(result.url, 'https://example.com/products'); assert.equal(result.snapshot, '(stub)'); }); + + it('extracts column_N fields from table rows', async () => { + const page = { + url: () => 'https://example.com/table', + $$eval: async (sel, fn, ...args) => fn( + [{ + children: [ + { tagName: 'TD', textContent: 'Alice' }, + { tagName: 'TD', textContent: 'Admin' }, + ], + querySelector: () => null, + }, { + children: [ + { tagName: 'TD', textContent: 'Bob' }, + { tagName: 'TD', textContent: 'User' }, + ], + querySelector: () => null, + }, { + children: [ + { tagName: 'TD', textContent: 'Carol' }, + { tagName: 'TD', textContent: 'Mod' }, + ], + querySelector: () => null, + }], + ...args + ), + }; + + const result = await macros['extract'](page, [], { + selector: 'table tr', + fields: 'column_1,column_2', 
+ }, stubHelpers); + + assert.equal(result.mode, 'selector'); + assert.equal(result.count, 3); + assert.equal(result.items[0].column_1, 'Alice'); + assert.equal(result.items[0].column_2, 'Admin'); + assert.equal(result.items[1].column_1, 'Bob'); + }); }); describe('extract auto-detect mode', () => { @@ -1602,14 +1641,14 @@ describe('extract auto-detect table mode', () => { assert.ok(result.fields.includes('Integrations')); }); - it('falls back to generic extraction when table has no headers', async () => { + it('extracts column-indexed fields when table has no headers', async () => { const page = { url: () => 'https://example.com/data', evaluate: async () => ({ items: [ - { text: 'row 1 cell A row 1 cell B' }, - { text: 'row 2 cell A row 2 cell B' }, - { text: 'row 3 cell A row 3 cell B' }, + { column_1: 'row 1 cell A', column_2: 'row 1 cell B' }, + { column_1: 'row 2 cell A', column_2: 'row 2 cell B' }, + { column_1: 'row 3 cell A', column_2: 'row 3 cell B' }, ], selector: 'table > tbody > tr', count: 3, @@ -1620,8 +1659,33 @@ describe('extract auto-detect table mode', () => { assert.equal(result.mode, 'auto'); assert.equal(result.count, 3); - assert.ok(result.items[0].text, 'should have text field for headerless table'); - assert.ok(result.fields.includes('text')); + assert.equal(result.items[0].column_1, 'row 1 cell A'); + assert.equal(result.items[0].column_2, 'row 1 cell B'); + assert.ok(result.fields.includes('column_1')); + assert.ok(result.fields.includes('column_2')); + }); + + it('headerless table includes url when link present', async () => { + const page = { + url: () => 'https://example.com/links-no-headers', + evaluate: async () => ({ + items: [ + { column_1: 'Alpha', column_2: 'First', url: '/items/alpha' }, + { column_1: 'Beta', column_2: 'Second', url: '/items/beta' }, + { column_1: 'Gamma', column_2: 'Third', url: '/items/gamma' }, + ], + selector: 'table > tbody > tr', + count: 3, + }), + }; + + const result = await macros['extract'](page, [], { 
auto: true }, stubHelpers); + + assert.equal(result.count, 3); + assert.equal(result.items[0].column_1, 'Alpha'); + assert.equal(result.items[0].url, '/items/alpha'); + assert.ok(result.fields.includes('column_1')); + assert.ok(result.fields.includes('url')); }); it('ignores extra cells beyond header count', async () => { From a62317bf2cc56193c842e421c438ec9838a0dbdf Mon Sep 17 00:00:00 2001 From: Avi Fenesh Date: Tue, 24 Feb 2026 17:39:20 +0200 Subject: [PATCH 2/3] fix: address review findings in table-aware extract - Restore element count guard before sigStart logic to prevent potential access to elements[0] on empty groups - Cache parent tagName once instead of accessing twice (pt and pt2) - Add indexOf prefix check before regex in column_N extraction --- scripts/macros.js | 32 +++++++++++++++++--------------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/scripts/macros.js b/scripts/macros.js index 556ef89..34b9308 100644 --- a/scripts/macros.js +++ b/scripts/macros.js @@ -754,15 +754,17 @@ async function extract(page, actionArgs, opts, helpers) { } default: { // Table column_N field support - var colMatch = name.match(/^column_(\d+)$/); - if (colMatch) { - var colNum = parseInt(colMatch[1], 10); - var tdIdx = 0; - var ch = el.children; - for (var ci = 0; ci < ch.length; ci++) { - if (ch[ci].tagName === 'TD') { - tdIdx++; - if (tdIdx === colNum) return truncate((ch[ci].textContent || '').trim()); + if (name.indexOf('column_') === 0) { + var colMatch = name.match(/^column_(\d+)$/); + if (colMatch) { + var colNum = parseInt(colMatch[1], 10); + var tdIdx = 0; + var ch = el.children; + for (var ci = 0; ci < ch.length; ci++) { + if (ch[ci].tagName === 'TD') { + tdIdx++; + if (tdIdx === colNum) return truncate((ch[ci].textContent || '').trim()); + } } } return null; @@ -933,12 +935,15 @@ async function extract(page, actionArgs, opts, helpers) { var keys = Object.keys(groups); for (var k = 0; k < keys.length; k++) { var group = groups[keys[k]]; + if 
(group.elements.length < 3) continue; // For table TR groups, skip header row (all-TH children) in signature check var sigStart = 0; + var isTableTR = false; if (group.tag === 'TR') { - var pt = group.parent.tagName; - if (pt === 'TBODY' || pt === 'TABLE' || pt === 'THEAD') { + var parentTag = group.parent.tagName; + if (parentTag === 'TBODY' || parentTag === 'TABLE' || parentTag === 'THEAD') { + isTableTR = true; var firstKids = group.elements[0].children; var allTH = firstKids.length > 0; for (var th = 0; th < firstKids.length; th++) { @@ -962,10 +967,7 @@ async function extract(page, actionArgs, opts, helpers) { var score = group.elements.length; if (isContentArea(group.parent)) score *= 3; if (isNavArea(group.parent)) score *= 0.3; - if (group.tag === 'TR') { - var pt2 = group.parent.tagName; - if (pt2 === 'TBODY' || pt2 === 'TABLE' || pt2 === 'THEAD') score *= 2; - } + if (isTableTR) score *= 2; if (score > bestScore) { bestScore = score; From 36788acf7a8b61373587b2479522222e9830d858 Mon Sep 17 00:00:00 2001 From: Avi Fenesh Date: Tue, 24 Feb 2026 17:47:30 +0200 Subject: [PATCH 3/3] docs: update table-aware extraction documentation Document headerless table column-indexed extraction (column_1, column_2) and selector mode column_N field support in CHANGELOG, README, and SKILL.md. --- CHANGELOG.md | 2 +- README.md | 2 +- skills/web-browse/SKILL.md | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/CHANGELOG.md b/CHANGELOG.md index b432acb..c9d21bb 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -6,7 +6,7 @@ - `--snapshot-max-lines ` flag to truncate snapshot output to a maximum number of lines, with a `... (K more lines)` marker when lines are omitted - `--snapshot-collapse` flag to collapse repeated consecutive siblings of the same ARIA type - keeps first 2 with subtrees, replaces the rest with `... (K more )` markers. 
Works recursively on nested structures - `--snapshot-text-only` flag to strip structural container nodes (list, listitem, group, region, main, form, table, row, grid, generic, etc.) and keep only content-bearing nodes. Labeled structural nodes are preserved. Indentation is re-compressed -- `extract` macro for structured data extraction from repeated list items with two modes: selector mode (`--selector --fields f1,f2,...`) for targeted extraction and auto-detect mode (`--auto`) that finds repeated patterns automatically using structural signature matching. Auto-detect is table-aware - when a table with `<th>` headers is detected, returns per-column data (e.g., `{ Service: "Runtime", Description: "..." }`) instead of a single concatenated `text` field +- `extract` macro for structured data extraction from repeated list items with two modes: selector mode (`--selector --fields f1,f2,...`) for targeted extraction and auto-detect mode (`--auto`) that finds repeated patterns automatically using structural signature matching. Auto-detect is table-aware - when a table with `<th>` headers is detected, returns per-column data (e.g., `{ Service: "Runtime", Description: "..." }`) instead of a single concatenated `text` field. Tables without headers use column-indexed fields (`column_1`, `column_2`, etc.). Table groups receive a scoring boost and tolerate mixed TH/TD header rows. Selector mode supports `column_N` field names for extracting specific table columns by index - Auto-create sessions on first `run` command - sessions are created automatically if they don't exist, eliminating the need for explicit `session start` before browsing. Response includes `autoCreated: true` flag when a session was auto-created. 
- `next-page` macro to auto-detect and follow pagination links using multiple heuristics (rel="next", ARIA roles, CSS patterns, page numbers) - `paginate` macro to collect items across paginated pages with `--selector`, `--max-pages` (default 5, max 20), and `--max-items` (default 100, max 500) options diff --git a/README.md b/README.md index 32e98dd..0b15f18 100644 --- a/README.md +++ b/README.md @@ -128,7 +128,7 @@ web-ctl session end github | `extract` | `run extract --selector [--fields f1,f2] [--max-items N] [--max-field-length N]` | `{ url, mode, selector, fields, count, items, snapshot }` | | `extract` | `run extract --auto [--max-items N] [--max-field-length N]` | `{ url, mode, selector, fields, count, items, snapshot }` | -**Table-aware extraction**: When `--auto` detects a table with `<th>` headers, items include per-column data (e.g., `{ Service: "Runtime", Description: "..." }`) instead of a single `text` field. Falls back to generic extraction when no headers are found. +**Table-aware extraction**: When `--auto` detects a table with `<th>` headers, items include per-column data (e.g., `{ Service: "Runtime", Description: "..." }`). Tables without headers use column-indexed keys (`column_1`, `column_2`, etc.). In selector mode, use `--fields column_1,column_2` to extract specific columns by index. ### click vs click-wait diff --git a/skills/web-browse/SKILL.md b/skills/web-browse/SKILL.md index d5bb787..794a461 100644 --- a/skills/web-browse/SKILL.md +++ b/skills/web-browse/SKILL.md @@ -342,7 +342,7 @@ Returns: `{ url, mode, selector, fields, count, items, snapshot }` Auto-detect mode also returns the detected CSS selector, which can be reused with selector mode for subsequent pages. -**Table-aware extraction**: When auto-detect identifies a table with `<th>` headers (in `<thead>` or first row), items include per-column data using header text as keys (e.g., `{ Service: "Runtime", Description: "..." }`). Empty headers are auto-numbered as `column_1`, `column_2`, etc. 
Falls back to generic field extraction (`title`, `url`, `text`) when no headers are found. +**Table-aware extraction**: When auto-detect identifies a table with `<th>` headers (in `<thead>` or first row), items include per-column data using header text as keys (e.g., `{ Service: "Runtime", Description: "..." }`). Empty headers are auto-numbered as `column_1`, `column_2`, etc. Tables without any headers use column-indexed extraction (`column_1`, `column_2`, ...). In selector mode, use `column_N` field names (e.g., `--fields column_1,column_2`) to extract specific columns from table rows. ## Snapshot Control