Skip to content

Commit

Permalink
instead of puppeteer.exposeFunction(), use cdp function bindings dire…
Browse files Browse the repository at this point in the history
…ctly to avoid issues with custom toJSON overrides:

- add Runtime.addBinding for each exposed function, handle in one place with Runtime.bindingCalled
- convert binding names to BxFunctionBindings enum
- update to browsertrix-behaviors 0.7.1 to avoid waiting for return value
- fixes #770
  • Loading branch information
ikreymer committed Feb 21, 2025
1 parent c25c677 commit cf3a4dc
Show file tree
Hide file tree
Showing 5 changed files with 58 additions and 31 deletions.
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
"dependencies": {
"@novnc/novnc": "1.4.0",
"@webrecorder/wabac": "^2.20.8",
"browsertrix-behaviors": "^0.7.0",
"browsertrix-behaviors": "^0.7.1",
"client-zip": "^2.4.5",
"css-selector-parser": "^3.0.5",
"fetch-socks": "^1.3.0",
Expand Down
66 changes: 45 additions & 21 deletions src/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,7 @@ import { collectCustomBehaviors, getInfoString } from "./util/file_reader.js";
import { Browser } from "./util/browser.js";

import {
ADD_LINK_FUNC,
BEHAVIOR_LOG_FUNC,
FETCH_FUNC,
BxFunctionBindings,
DISPLAY,
ExtractSelector,
PAGE_OP_TIMEOUT_SECS,
Expand Down Expand Up @@ -768,17 +766,43 @@ export class Crawler {
await this.screencaster.screencastPage(page, cdp, workerid);
}

await page.exposeFunction(
ADD_LINK_FUNC,
(url: string) => callbacks.addLink && callbacks.addLink(url),
);
// Single dispatch point for all page-exposed functions registered through
// Runtime.addBinding: the binding name selects the action and `payload` is
// the argument passed along with the binding call.
cdp.on("Runtime.bindingCalled", (params) => {
  const { name, payload } = params;

  switch (name as BxFunctionBindings) {
    case BxFunctionBindings.AddLinkFunc:
      callbacks.addLink && callbacks.addLink(payload);
      break;

    case BxFunctionBindings.BehaviorLogFunc: {
      // The payload originates from page-side code, so it may not be valid
      // JSON. A parse failure must not throw out of this event listener;
      // log it and drop the message instead.
      let logdata: { data: string; type: string };
      try {
        logdata = JSON.parse(payload);
      } catch (e) {
        logger.warn("Behavior log payload is not valid JSON", e);
        break;
      }
      this._behaviorLog(logdata, page.url(), workerid);
      break;
    }

    case BxFunctionBindings.FetchFunc:
      // Only forwarded when a recorder is present; otherwise ignored.
      if (recorder) {
        recorder.addExternalFetch(payload, cdp);
      }
      break;

    case BxFunctionBindings.AddToSeenSet:
      // Fire-and-forget: failures are logged rather than propagated.
      this.crawlState
        .addToUserSet(payload)
        .catch((e) => logger.warn("Adding to URL set error", e));
      break;
  }
});

await cdp.send("Runtime.addBinding", {
name: BxFunctionBindings.AddLinkFunc,
});

if (this.params.behaviorOpts) {
await page.exposeFunction(
BEHAVIOR_LOG_FUNC,
(logdata: { data: string; type: string }) =>
this._behaviorLog(logdata, page.url(), workerid),
);
await cdp.send("Runtime.addBinding", {
name: BxFunctionBindings.BehaviorLogFunc,
});
await this.browser.addInitScript(page, behaviors);

const initScript = `
Expand All @@ -791,9 +815,11 @@ self.__bx_behaviors.selectMainBehavior();
this.behaviorsChecked = true;
}

await page.exposeFunction(FETCH_FUNC, (url: string) => {
return recorder ? recorder.addExternalFetch(url, cdp) : true;
});
if (recorder) {
await cdp.send("Runtime.addBinding", {
name: BxFunctionBindings.FetchFunc,
});
}

await this.browser.addInitScript(page, initScript);
}
Expand Down Expand Up @@ -873,11 +899,9 @@ self.__bx_behaviors.selectMainBehavior();
}
}

await page.exposeFunction("__bx_addSet", (data: string) =>
this.crawlState.addToUserSet(data),
);

// await page.exposeFunction("__bx_hasSet", (data: string) => this.crawlState.hasUserSet(data));
await cdp.send("Runtime.addBinding", {
name: BxFunctionBindings.AddToSeenSet,
});
}

async setupExecContextEvents(
Expand Down Expand Up @@ -2295,7 +2319,7 @@ self.__bx_behaviors.selectMainBehavior();
selector,
extract,
isAttribute,
addLinkFunc: ADD_LINK_FUNC,
addLinkFunc: BxFunctionBindings.AddLinkFunc,
})
.catch((e) =>
logger.warn("Link Extraction failed in frame", {
Expand Down
4 changes: 2 additions & 2 deletions src/util/argParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@ import { hideBin } from "yargs/helpers";
import { createParser } from "css-selector-parser";

import {
BEHAVIOR_LOG_FUNC,
WAIT_UNTIL_OPTS,
EXTRACT_TEXT_TYPES,
SERVICE_WORKER_OPTS,
DEFAULT_SELECTORS,
BEHAVIOR_TYPES,
ExtractSelector,
DEFAULT_MAX_RETRIES,
BxFunctionBindings,
} from "./constants.js";
import { ScopedSeed } from "./seeds.js";
import { interpolateFilename } from "./storage.js";
Expand Down Expand Up @@ -721,7 +721,7 @@ class ArgParser {
);
}
});
behaviorOpts.log = BEHAVIOR_LOG_FUNC;
behaviorOpts.log = BxFunctionBindings.BehaviorLogFunc;
behaviorOpts.startEarly = true;
behaviorOpts.clickSelector = argv.clickSelector;
argv.behaviorOpts = JSON.stringify(behaviorOpts);
Expand Down
9 changes: 6 additions & 3 deletions src/util/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,12 @@ export const DETECT_SITEMAP = "<detect>";

export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"];

export const BEHAVIOR_LOG_FUNC = "__bx_log";
export const ADD_LINK_FUNC = "__bx_addLink";
export const FETCH_FUNC = "__bx_fetch";
/**
 * Names of the functions exposed to the page as CDP bindings.
 *
 * Each value is the binding name registered via `Runtime.addBinding` and
 * matched against the `name` of `Runtime.bindingCalled` events in the crawler.
 */
export enum BxFunctionBindings {
  // Payload is a JSON string parsed into { data, type } and passed to the behavior log.
  BehaviorLogFunc = "__bx_log",
  // Payload is passed to the crawler's addLink callback; also used as `addLinkFunc` for link extraction.
  AddLinkFunc = "__bx_addLink",
  // Payload is passed to the recorder's addExternalFetch, when a recorder is present.
  FetchFunc = "__bx_fetch",
  // Payload is passed to the crawl state's addToUserSet.
  AddToSeenSet = "__bx_addSet",
}

export const MAX_DEPTH = 1000000;
export const DEFAULT_MAX_RETRIES = 2;
Expand Down
8 changes: 4 additions & 4 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -1460,10 +1460,10 @@ browserslist@^4.24.0:
node-releases "^2.0.18"
update-browserslist-db "^1.1.1"

browsertrix-behaviors@^0.7.0:
version "0.7.0"
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.7.0.tgz#a08b7d3e9cd449d0d76b14a438e28472124fd1a4"
integrity sha512-t0X74puXJsH8sVkkVZwEdo8L5E1PYtzX/RkVXM4fwwBIL804bOB8WIV+5Dfwov/odaukhB67KZhM00hN60SiBA==
browsertrix-behaviors@^0.7.1:
version "0.7.1"
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.7.1.tgz#dcb30c038e4060ef2393eb001ce9f10e3ce71c39"
integrity sha512-tZ7Bv/IAWzLTNORf/yQqGHpPAQ4tP8sxql8YT491VHlCk939F1YIUrQ36XJOaSyfjmmm2WV9nCMXkDpCsw6zQg==
dependencies:
query-selector-shadow-dom "^1.0.1"

Expand Down

0 comments on commit cf3a4dc

Please sign in to comment.