From 5b0d93e92585cae9d4748a8e993b12f9cf4afe10 Mon Sep 17 00:00:00 2001
From: Avi Fenesh <aviarchi1994@gmail.com>
Date: Tue, 24 Feb 2026 17:32:10 +0200
Subject: [PATCH 1/3] feat(extract): add table-aware detection and column_N
 fields

- Skip a leading all-TH header row when computing TR group signatures
  so tables whose first row is a TH header pass the allSame check
- Double the score of TR groups whose parent is TBODY/TABLE/THEAD
- Add column-indexed extraction (column_1, column_2, ...) for
  headerless tables instead of falling back to generic extractItem;
  only TD cells are counted, and empty cells are omitted from the item
- Support column_N fields in selector mode extractField
- Update and add tests for headerless table extraction and
  selector mode column_N fields
---
 scripts/macros.js    | 57 ++++++++++++++++++++++++++++++---
 tests/macros.test.js | 76 ++++++++++++++++++++++++++++++++++++++++----
 2 files changed, 123 insertions(+), 10 deletions(-)

diff --git a/scripts/macros.js b/scripts/macros.js
index 15ca547..556ef89 100644
--- a/scripts/macros.js
+++ b/scripts/macros.js
@@ -753,6 +753,20 @@ async function extract(page, actionArgs, opts, helpers) {
             return img ? img.getAttribute('src') : null;
           }
           default: {
+            // Table column_N field support
+            var colMatch = name.match(/^column_(\d+)$/);
+            if (colMatch) {
+              var colNum = parseInt(colMatch[1], 10);
+              var tdIdx = 0;
+              var ch = el.children;
+              for (var ci = 0; ci < ch.length; ci++) {
+                if (ch[ci].tagName === 'TD') {
+                  tdIdx++;
+                  if (tdIdx === colNum) return truncate((ch[ci].textContent || '').trim());
+                }
+              }
+              return null;
+            }
             // Generic: try [class*=name] (sanitize for defense-in-depth)
             var safeName = name.replace(/[^a-zA-Z0-9_-]/g, '');
             if (!safeName) return null;
@@ -919,12 +933,25 @@ async function extract(page, actionArgs, opts, helpers) {
     var keys = Object.keys(groups);
     for (var k = 0; k < keys.length; k++) {
       var group = groups[keys[k]];
-      if (group.elements.length < 3) continue;
 
-      // Cache signature on first element, then compare
-      var sig = getSignature(group.elements[0]);
+      // For table TR groups, skip header row (all-TH children) in signature check
+      var sigStart = 0;
+      if (group.tag === 'TR') {
+        var pt = group.parent.tagName;
+        if (pt === 'TBODY' || pt === 'TABLE' || pt === 'THEAD') {
+          var firstKids = group.elements[0].children;
+          var allTH = firstKids.length > 0;
+          for (var th = 0; th < firstKids.length; th++) {
+            if (firstKids[th].tagName !== 'TH') { allTH = false; break; }
+          }
+          if (allTH) sigStart = 1;
+        }
+      }
+      if (group.elements.length - sigStart < 3) continue;
+
+      var sig = getSignature(group.elements[sigStart]);
       var allSame = true;
-      for (var s = 1; s < group.elements.length; s++) {
+      for (var s = sigStart + 1; s < group.elements.length; s++) {
         if (getSignature(group.elements[s]) !== sig) {
           allSame = false;
           break;
@@ -935,6 +962,10 @@ async function extract(page, actionArgs, opts, helpers) {
       var score = group.elements.length;
       if (isContentArea(group.parent)) score *= 3;
       if (isNavArea(group.parent)) score *= 0.3;
+      if (group.tag === 'TR') {
+        var pt2 = group.parent.tagName;
+        if (pt2 === 'TBODY' || pt2 === 'TABLE' || pt2 === 'THEAD') score *= 2;
+      }
 
       if (score > bestScore) {
         bestScore = score;
@@ -1022,6 +1053,22 @@ async function extract(page, actionArgs, opts, helpers) {
       return item;
     }
 
+    function extractTableRowIndexed(tr) {
+      var item = {};
+      var colIdx = 0;
+      var cells = tr.children;
+      for (var i = 0; i < cells.length; i++) {
+        if (cells[i].tagName === 'TD') {
+          colIdx++;
+          var cellText = truncate((cells[i].textContent || '').trim());
+          if (cellText) item['column_' + colIdx] = cellText;
+        }
+      }
+      var a = tr.querySelector('a[href]');
+      if (a) item.url = a.getAttribute('href');
+      return item;
+    }
+
     var tableHeaders = null;
     var headerRow = null;
     if (isTableGroup(bestGroup)) {
@@ -1038,6 +1085,8 @@ async function extract(page, actionArgs, opts, helpers) {
       var item;
       if (tableHeaders && els[e] !== headerRow) {
         item = extractTableRow(els[e], tableHeaders);
+      } else if (!tableHeaders && isTableGroup(bestGroup)) {
+        item = extractTableRowIndexed(els[e]);
       } else if (!tableHeaders) {
         item = extractItem(els[e]);
       } else {
diff --git a/tests/macros.test.js b/tests/macros.test.js
index 31d67a2..653c838 100644
--- a/tests/macros.test.js
+++ b/tests/macros.test.js
@@ -1474,6 +1474,45 @@ describe('extract selector mode', () => {
     assert.equal(result.url, 'https://example.com/products');
     assert.equal(result.snapshot, '(stub)');
   });
+
+  it('extracts column_N fields from table rows', async () => {
+    const page = {
+      url: () => 'https://example.com/table',
+      $$eval: async (sel, fn, ...args) => fn(
+        [{
+          children: [
+            { tagName: 'TD', textContent: 'Alice' },
+            { tagName: 'TD', textContent: 'Admin' },
+          ],
+          querySelector: () => null,
+        }, {
+          children: [
+            { tagName: 'TD', textContent: 'Bob' },
+            { tagName: 'TD', textContent: 'User' },
+          ],
+          querySelector: () => null,
+        }, {
+          children: [
+            { tagName: 'TD', textContent: 'Carol' },
+            { tagName: 'TD', textContent: 'Mod' },
+          ],
+          querySelector: () => null,
+        }],
+        ...args
+      ),
+    };
+
+    const result = await macros['extract'](page, [], {
+      selector: 'table tr',
+      fields: 'column_1,column_2',
+    }, stubHelpers);
+
+    assert.equal(result.mode, 'selector');
+    assert.equal(result.count, 3);
+    assert.equal(result.items[0].column_1, 'Alice');
+    assert.equal(result.items[0].column_2, 'Admin');
+    assert.equal(result.items[1].column_1, 'Bob');
+  });
 });
 
 describe('extract auto-detect mode', () => {
@@ -1602,14 +1641,14 @@ describe('extract auto-detect table mode', () => {
     assert.ok(result.fields.includes('Integrations'));
   });
 
-  it('falls back to generic extraction when table has no headers', async () => {
+  it('extracts column-indexed fields when table has no headers', async () => {
     const page = {
       url: () => 'https://example.com/data',
       evaluate: async () => ({
         items: [
-          { text: 'row 1 cell A row 1 cell B' },
-          { text: 'row 2 cell A row 2 cell B' },
-          { text: 'row 3 cell A row 3 cell B' },
+          { column_1: 'row 1 cell A', column_2: 'row 1 cell B' },
+          { column_1: 'row 2 cell A', column_2: 'row 2 cell B' },
+          { column_1: 'row 3 cell A', column_2: 'row 3 cell B' },
         ],
         selector: 'table > tbody > tr',
         count: 3,
@@ -1620,8 +1659,33 @@ describe('extract auto-detect table mode', () => {
 
     assert.equal(result.mode, 'auto');
     assert.equal(result.count, 3);
-    assert.ok(result.items[0].text, 'should have text field for headerless table');
-    assert.ok(result.fields.includes('text'));
+    assert.equal(result.items[0].column_1, 'row 1 cell A');
+    assert.equal(result.items[0].column_2, 'row 1 cell B');
+    assert.ok(result.fields.includes('column_1'));
+    assert.ok(result.fields.includes('column_2'));
+  });
+
+  it('headerless table includes url when link present', async () => {
+    const page = {
+      url: () => 'https://example.com/links-no-headers',
+      evaluate: async () => ({
+        items: [
+          { column_1: 'Alpha', column_2: 'First', url: '/items/alpha' },
+          { column_1: 'Beta', column_2: 'Second', url: '/items/beta' },
+          { column_1: 'Gamma', column_2: 'Third', url: '/items/gamma' },
+        ],
+        selector: 'table > tbody > tr',
+        count: 3,
+      }),
+    };
+
+    const result = await macros['extract'](page, [], { auto: true }, stubHelpers);
+
+    assert.equal(result.count, 3);
+    assert.equal(result.items[0].column_1, 'Alpha');
+    assert.equal(result.items[0].url, '/items/alpha');
+    assert.ok(result.fields.includes('column_1'));
+    assert.ok(result.fields.includes('url'));
   });
 
   it('ignores extra cells beyond header count', async () => {

From a62317bf2cc56193c842e421c438ec9838a0dbdf Mon Sep 17 00:00:00 2001
From: Avi Fenesh <aviarchi1994@gmail.com>
Date: Tue, 24 Feb 2026 17:39:20 +0200
Subject: [PATCH 2/3] fix: address review findings in table-aware extract

- Restore element count guard before sigStart logic to prevent potential
  access to elements[0] on empty groups
- Record the table-parent check in an isTableTR flag and reuse it for
  the score boost instead of reading parent.tagName twice (pt and pt2)
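- Add indexOf prefix check before regex in column_N extraction. Note:
  field names that start with `column_` but do not match
  `column_<digits>` now return null instead of falling through to the
  generic [class*=name] lookup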
---
 scripts/macros.js | 32 +++++++++++++++++---------------
 1 file changed, 17 insertions(+), 15 deletions(-)

diff --git a/scripts/macros.js b/scripts/macros.js
index 556ef89..34b9308 100644
--- a/scripts/macros.js
+++ b/scripts/macros.js
@@ -754,15 +754,17 @@ async function extract(page, actionArgs, opts, helpers) {
           }
           default: {
             // Table column_N field support
-            var colMatch = name.match(/^column_(\d+)$/);
-            if (colMatch) {
-              var colNum = parseInt(colMatch[1], 10);
-              var tdIdx = 0;
-              var ch = el.children;
-              for (var ci = 0; ci < ch.length; ci++) {
-                if (ch[ci].tagName === 'TD') {
-                  tdIdx++;
-                  if (tdIdx === colNum) return truncate((ch[ci].textContent || '').trim());
+            if (name.indexOf('column_') === 0) {
+              var colMatch = name.match(/^column_(\d+)$/);
+              if (colMatch) {
+                var colNum = parseInt(colMatch[1], 10);
+                var tdIdx = 0;
+                var ch = el.children;
+                for (var ci = 0; ci < ch.length; ci++) {
+                  if (ch[ci].tagName === 'TD') {
+                    tdIdx++;
+                    if (tdIdx === colNum) return truncate((ch[ci].textContent || '').trim());
+                  }
                 }
               }
               return null;
@@ -933,12 +935,15 @@ async function extract(page, actionArgs, opts, helpers) {
     var keys = Object.keys(groups);
     for (var k = 0; k < keys.length; k++) {
       var group = groups[keys[k]];
+      if (group.elements.length < 3) continue;
 
       // For table TR groups, skip header row (all-TH children) in signature check
       var sigStart = 0;
+      var isTableTR = false;
       if (group.tag === 'TR') {
-        var pt = group.parent.tagName;
-        if (pt === 'TBODY' || pt === 'TABLE' || pt === 'THEAD') {
+        var parentTag = group.parent.tagName;
+        if (parentTag === 'TBODY' || parentTag === 'TABLE' || parentTag === 'THEAD') {
+          isTableTR = true;
           var firstKids = group.elements[0].children;
           var allTH = firstKids.length > 0;
           for (var th = 0; th < firstKids.length; th++) {
@@ -962,10 +967,7 @@ async function extract(page, actionArgs, opts, helpers) {
       var score = group.elements.length;
       if (isContentArea(group.parent)) score *= 3;
       if (isNavArea(group.parent)) score *= 0.3;
-      if (group.tag === 'TR') {
-        var pt2 = group.parent.tagName;
-        if (pt2 === 'TBODY' || pt2 === 'TABLE' || pt2 === 'THEAD') score *= 2;
-      }
+      if (isTableTR) score *= 2;
 
       if (score > bestScore) {
         bestScore = score;

From 36788acf7a8b61373587b2479522222e9830d858 Mon Sep 17 00:00:00 2001
From: Avi Fenesh <aviarchi1994@gmail.com>
Date: Tue, 24 Feb 2026 17:47:30 +0200
Subject: [PATCH 3/3] docs: update table-aware extraction documentation

Document headerless table column-indexed extraction (column_1, column_2)
and selector mode column_N field support in CHANGELOG, README, and
SKILL.md.
---
 CHANGELOG.md               | 2 +-
 README.md                  | 2 +-
 skills/web-browse/SKILL.md | 2 +-
 3 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/CHANGELOG.md b/CHANGELOG.md
index b432acb..c9d21bb 100644
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
@@ -6,7 +6,7 @@
 - `--snapshot-max-lines <N>` flag to truncate snapshot output to a maximum number of lines, with a `... (K more lines)` marker when lines are omitted
 - `--snapshot-collapse` flag to collapse repeated consecutive siblings of the same ARIA type - keeps first 2 with subtrees, replaces the rest with `... (K more <type>)` markers. Works recursively on nested structures
 - `--snapshot-text-only` flag to strip structural container nodes (list, listitem, group, region, main, form, table, row, grid, generic, etc.) and keep only content-bearing nodes. Labeled structural nodes are preserved. Indentation is re-compressed
-- `extract` macro for structured data extraction from repeated list items with two modes: selector mode (`--selector <sel> --fields f1,f2,...`) for targeted extraction and auto-detect mode (`--auto`) that finds repeated patterns automatically using structural signature matching. Auto-detect is table-aware - when a table with `<th>` headers is detected, returns per-column data (e.g., `{ Service: "Runtime", Description: "..." }`) instead of a single concatenated `text` field
+- `extract` macro for structured data extraction from repeated list items with two modes: selector mode (`--selector <sel> --fields f1,f2,...`) for targeted extraction and auto-detect mode (`--auto`) that finds repeated patterns automatically using structural signature matching. Auto-detect is table-aware - when a table with `<th>` headers is detected, returns per-column data (e.g., `{ Service: "Runtime", Description: "..." }`) instead of a single concatenated `text` field. Tables without headers use column-indexed fields (`column_1`, `column_2`, etc.). Table groups receive a scoring boost and tolerate mixed TH/TD header rows. Selector mode supports `column_N` field names for extracting specific table columns by index
 - Auto-create sessions on first `run` command - sessions are created automatically if they don't exist, eliminating the need for explicit `session start` before browsing. Response includes `autoCreated: true` flag when a session was auto-created.
 - `next-page` macro to auto-detect and follow pagination links using multiple heuristics (rel="next", ARIA roles, CSS patterns, page numbers)
 - `paginate` macro to collect items across paginated pages with `--selector`, `--max-pages` (default 5, max 20), and `--max-items` (default 100, max 500) options
diff --git a/README.md b/README.md
index 32e98dd..0b15f18 100644
--- a/README.md
+++ b/README.md
@@ -128,7 +128,7 @@ web-ctl session end github
 | `extract` | `run <s> extract --selector <sel> [--fields f1,f2] [--max-items N] [--max-field-length N]` | `{ url, mode, selector, fields, count, items, snapshot }` |
 | `extract` | `run <s> extract --auto [--max-items N] [--max-field-length N]` | `{ url, mode, selector, fields, count, items, snapshot }` |
 
-**Table-aware extraction**: When `--auto` detects a table with `<th>` headers, items include per-column data (e.g., `{ Service: "Runtime", Description: "..." }`) instead of a single `text` field. Falls back to generic extraction when no headers are found.
+**Table-aware extraction**: When `--auto` detects a table with `<th>` headers, items include per-column data (e.g., `{ Service: "Runtime", Description: "..." }`). Tables without headers use column-indexed keys (`column_1`, `column_2`, etc.). In selector mode, use `--fields column_1,column_2` to extract specific columns by index.
 
 ### click vs click-wait
 
diff --git a/skills/web-browse/SKILL.md b/skills/web-browse/SKILL.md
index d5bb787..794a461 100644
--- a/skills/web-browse/SKILL.md
+++ b/skills/web-browse/SKILL.md
@@ -342,7 +342,7 @@ Returns: `{ url, mode, selector, fields, count, items, snapshot }`
 
 Auto-detect mode also returns the detected CSS selector, which can be reused with selector mode for subsequent pages.
 
-**Table-aware extraction**: When auto-detect identifies a table with `<th>` headers (in `<thead>` or first row), items include per-column data using header text as keys (e.g., `{ Service: "Runtime", Description: "..." }`). Empty headers are auto-numbered as `column_1`, `column_2`, etc. Falls back to generic field extraction (`title`, `url`, `text`) when no headers are found.
+**Table-aware extraction**: When auto-detect identifies a table with `<th>` headers (in `<thead>` or first row), items include per-column data using header text as keys (e.g., `{ Service: "Runtime", Description: "..." }`). Empty headers are auto-numbered as `column_1`, `column_2`, etc. Tables without any headers use column-indexed extraction (`column_1`, `column_2`, ...). In selector mode, use `column_N` field names (e.g., `--fields column_1,column_2`) to extract specific columns from table rows.
 
 ## Snapshot Control