Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,7 +6,7 @@
- `--snapshot-max-lines <N>` flag to truncate snapshot output to a maximum number of lines, with a `... (K more lines)` marker when lines are omitted
- `--snapshot-collapse` flag to collapse repeated consecutive siblings of the same ARIA type - keeps first 2 with subtrees, replaces the rest with `... (K more <type>)` markers. Works recursively on nested structures
- `--snapshot-text-only` flag to strip structural container nodes (list, listitem, group, region, main, form, table, row, grid, generic, etc.) and keep only content-bearing nodes. Labeled structural nodes are preserved. Indentation is re-compressed
- `extract` macro for structured data extraction from repeated list items with two modes: selector mode (`--selector <sel> --fields f1,f2,...`) for targeted extraction and auto-detect mode (`--auto`) that finds repeated patterns automatically using structural signature matching. Auto-detect is table-aware - when a table with `<th>` headers is detected, returns per-column data (e.g., `{ Service: "Runtime", Description: "..." }`) instead of a single concatenated `text` field
- `extract` macro for structured data extraction from repeated list items with two modes: selector mode (`--selector <sel> --fields f1,f2,...`) for targeted extraction and auto-detect mode (`--auto`) that finds repeated patterns automatically using structural signature matching. Auto-detect is table-aware - when a table with `<th>` headers is detected, returns per-column data (e.g., `{ Service: "Runtime", Description: "..." }`) instead of a single concatenated `text` field. Tables without headers use column-indexed fields (`column_1`, `column_2`, etc.). Table groups receive a scoring boost and tolerate mixed TH/TD header rows. Selector mode supports `column_N` field names for extracting specific table columns by index
- Auto-create sessions on first `run` command - sessions are created automatically if they don't exist, eliminating the need for explicit `session start` before browsing. Response includes `autoCreated: true` flag when a session was auto-created.
- `next-page` macro to auto-detect and follow pagination links using multiple heuristics (rel="next", ARIA roles, CSS patterns, page numbers)
- `paginate` macro to collect items across paginated pages with `--selector`, `--max-pages` (default 5, max 20), and `--max-items` (default 100, max 500) options
Expand Down
2 changes: 1 addition & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -128,7 +128,7 @@ web-ctl session end github
| `extract` | `run <s> extract --selector <sel> [--fields f1,f2] [--max-items N] [--max-field-length N]` | `{ url, mode, selector, fields, count, items, snapshot }` |
| `extract` | `run <s> extract --auto [--max-items N] [--max-field-length N]` | `{ url, mode, selector, fields, count, items, snapshot }` |

**Table-aware extraction**: When `--auto` detects a table with `<th>` headers, items include per-column data (e.g., `{ Service: "Runtime", Description: "..." }`) instead of a single `text` field. Falls back to generic extraction when no headers are found.
**Table-aware extraction**: When `--auto` detects a table with `<th>` headers, items include per-column data (e.g., `{ Service: "Runtime", Description: "..." }`). Tables without headers use column-indexed keys (`column_1`, `column_2`, etc.). In selector mode, use `--fields column_1,column_2` to extract specific columns by index.

### click vs click-wait

Expand Down
57 changes: 54 additions & 3 deletions scripts/macros.js
Original file line number Diff line number Diff line change
Expand Up @@ -753,6 +753,22 @@ async function extract(page, actionArgs, opts, helpers) {
return img ? img.getAttribute('src') : null;
}
default: {
// Table column_N field support
if (name.indexOf('column_') === 0) {
var colMatch = name.match(/^column_(\d+)$/);
if (colMatch) {
var colNum = parseInt(colMatch[1], 10);
var tdIdx = 0;
var ch = el.children;
for (var ci = 0; ci < ch.length; ci++) {
if (ch[ci].tagName === 'TD') {
tdIdx++;
if (tdIdx === colNum) return truncate((ch[ci].textContent || '').trim());
}
}
}
return null;
}
// Generic: try [class*=name] (sanitize for defense-in-depth)
var safeName = name.replace(/[^a-zA-Z0-9_-]/g, '');
if (!safeName) return null;
Expand Down Expand Up @@ -921,10 +937,26 @@ async function extract(page, actionArgs, opts, helpers) {
var group = groups[keys[k]];
if (group.elements.length < 3) continue;

// Cache signature on first element, then compare
var sig = getSignature(group.elements[0]);
// For table TR groups, skip header row (all-TH children) in signature check
var sigStart = 0;
var isTableTR = false;
if (group.tag === 'TR') {
var parentTag = group.parent.tagName;
if (parentTag === 'TBODY' || parentTag === 'TABLE' || parentTag === 'THEAD') {
isTableTR = true;
var firstKids = group.elements[0].children;
var allTH = firstKids.length > 0;
for (var th = 0; th < firstKids.length; th++) {
if (firstKids[th].tagName !== 'TH') { allTH = false; break; }
}
if (allTH) sigStart = 1;
}
}
if (group.elements.length - sigStart < 3) continue;

var sig = getSignature(group.elements[sigStart]);
var allSame = true;
for (var s = 1; s < group.elements.length; s++) {
for (var s = sigStart + 1; s < group.elements.length; s++) {
if (getSignature(group.elements[s]) !== sig) {
allSame = false;
break;
Expand All @@ -935,6 +967,7 @@ async function extract(page, actionArgs, opts, helpers) {
var score = group.elements.length;
if (isContentArea(group.parent)) score *= 3;
if (isNavArea(group.parent)) score *= 0.3;
if (isTableTR) score *= 2;

if (score > bestScore) {
bestScore = score;
Expand Down Expand Up @@ -1022,6 +1055,22 @@ async function extract(page, actionArgs, opts, helpers) {
return item;
}

/**
 * Build an item from a table row that has no header names to key by.
 *
 * Each direct TD child is numbered by its 1-based position among the row's
 * TD children and stored under `column_<position>`. Non-TD children (for
 * example TH) are ignored and do not advance the position. A TD whose
 * trimmed text is empty still consumes its position but produces no key,
 * so later columns keep stable indexes.
 *
 * If the row contains an `a[href]` descendant, the first match's raw
 * `href` attribute is stored under `url`.
 *
 * NOTE(review): `truncate` is defined elsewhere in the enclosing scope;
 * presumably it caps the field length - confirm against its definition.
 *
 * @param {Element} tr - Table row whose direct children are inspected.
 * @returns {Object} Item with `column_N` keys and an optional `url`.
 */
function extractTableRowIndexed(tr) {
  var row = {};
  var position = 0;

  Array.prototype.forEach.call(tr.children, function (cell) {
    if (cell.tagName !== 'TD') return;
    position += 1;
    var text = truncate((cell.textContent || '').trim());
    if (text) row['column_' + position] = text;
  });

  var link = tr.querySelector('a[href]');
  if (link) row.url = link.getAttribute('href');

  return row;
}

var tableHeaders = null;
var headerRow = null;
if (isTableGroup(bestGroup)) {
Expand All @@ -1038,6 +1087,8 @@ async function extract(page, actionArgs, opts, helpers) {
var item;
if (tableHeaders && els[e] !== headerRow) {
item = extractTableRow(els[e], tableHeaders);
} else if (!tableHeaders && isTableGroup(bestGroup)) {
item = extractTableRowIndexed(els[e]);
} else if (!tableHeaders) {
item = extractItem(els[e]);
} else {
Expand Down
2 changes: 1 addition & 1 deletion skills/web-browse/SKILL.md
Original file line number Diff line number Diff line change
Expand Up @@ -342,7 +342,7 @@ Returns: `{ url, mode, selector, fields, count, items, snapshot }`

Auto-detect mode also returns the detected CSS selector, which can be reused with selector mode for subsequent pages.

**Table-aware extraction**: When auto-detect identifies a table with `<th>` headers (in `<thead>` or first row), items include per-column data using header text as keys (e.g., `{ Service: "Runtime", Description: "..." }`). Empty headers are auto-numbered as `column_1`, `column_2`, etc. Falls back to generic field extraction (`title`, `url`, `text`) when no headers are found.
**Table-aware extraction**: When auto-detect identifies a table with `<th>` headers (in `<thead>` or first row), items include per-column data using header text as keys (e.g., `{ Service: "Runtime", Description: "..." }`). Empty headers are auto-numbered as `column_1`, `column_2`, etc. Tables without any headers use column-indexed extraction (`column_1`, `column_2`, ...). In selector mode, use `column_N` field names (e.g., `--fields column_1,column_2`) to extract specific columns from table rows.

## Snapshot Control

Expand Down
76 changes: 70 additions & 6 deletions tests/macros.test.js
Original file line number Diff line number Diff line change
Expand Up @@ -1474,6 +1474,45 @@ describe('extract selector mode', () => {
assert.equal(result.url, 'https://example.com/products');
assert.equal(result.snapshot, '(stub)');
});

it('extracts column_N fields from table rows', async () => {
  // Fake <tr>: two TD cells and no link inside the row.
  const fakeRow = (first, second) => ({
    children: [
      { tagName: 'TD', textContent: first },
      { tagName: 'TD', textContent: second },
    ],
    querySelector: () => null,
  });

  // Fresh row objects are built on every $$eval call.
  const buildRows = () => [
    fakeRow('Alice', 'Admin'),
    fakeRow('Bob', 'User'),
    fakeRow('Carol', 'Mod'),
  ];

  // Stub page: $$eval ignores the selector and runs the callback directly
  // against the fake rows, forwarding any extra arguments.
  const page = {
    url: () => 'https://example.com/table',
    $$eval: async (sel, fn, ...args) => fn(buildRows(), ...args),
  };

  const opts = { selector: 'table tr', fields: 'column_1,column_2' };
  const result = await macros['extract'](page, [], opts, stubHelpers);

  assert.equal(result.mode, 'selector');
  assert.equal(result.count, 3);

  const firstItem = result.items[0];
  assert.equal(firstItem.column_1, 'Alice');
  assert.equal(firstItem.column_2, 'Admin');
  assert.equal(result.items[1].column_1, 'Bob');
});
});

describe('extract auto-detect mode', () => {
Expand Down Expand Up @@ -1602,14 +1641,14 @@ describe('extract auto-detect table mode', () => {
assert.ok(result.fields.includes('Integrations'));
});

it('falls back to generic extraction when table has no headers', async () => {
it('extracts column-indexed fields when table has no headers', async () => {
const page = {
url: () => 'https://example.com/data',
evaluate: async () => ({
items: [
{ text: 'row 1 cell A row 1 cell B' },
{ text: 'row 2 cell A row 2 cell B' },
{ text: 'row 3 cell A row 3 cell B' },
{ column_1: 'row 1 cell A', column_2: 'row 1 cell B' },
{ column_1: 'row 2 cell A', column_2: 'row 2 cell B' },
{ column_1: 'row 3 cell A', column_2: 'row 3 cell B' },
],
selector: 'table > tbody > tr',
count: 3,
Expand All @@ -1620,8 +1659,33 @@ describe('extract auto-detect table mode', () => {

assert.equal(result.mode, 'auto');
assert.equal(result.count, 3);
assert.ok(result.items[0].text, 'should have text field for headerless table');
assert.ok(result.fields.includes('text'));
assert.equal(result.items[0].column_1, 'row 1 cell A');
assert.equal(result.items[0].column_2, 'row 1 cell B');
assert.ok(result.fields.includes('column_1'));
assert.ok(result.fields.includes('column_2'));
});

it('headerless table includes url when link present', async () => {
  // Rows as [column_1, column_2, url] tuples; expanded into item objects below.
  const rows = [
    ['Alpha', 'First', '/items/alpha'],
    ['Beta', 'Second', '/items/beta'],
    ['Gamma', 'Third', '/items/gamma'],
  ];

  // Stub page: evaluate resolves to a canned auto-detect payload.
  const page = {
    url: () => 'https://example.com/links-no-headers',
    evaluate: async () => ({
      items: rows.map(([column_1, column_2, url]) => ({ column_1, column_2, url })),
      selector: 'table > tbody > tr',
      count: 3,
    }),
  };

  const result = await macros['extract'](page, [], { auto: true }, stubHelpers);

  assert.equal(result.count, 3);

  const firstItem = result.items[0];
  assert.equal(firstItem.column_1, 'Alpha');
  assert.equal(firstItem.url, '/items/alpha');

  // Both the indexed column key and the link key must be reported as fields.
  assert.ok(result.fields.includes('column_1'));
  assert.ok(result.fields.includes('url'));
});

it('ignores extra cells beyond header count', async () => {
Expand Down
Loading