From cf3a4dcb44bcf32d9c7414be3833176f105f6ebb Mon Sep 17 00:00:00 2001 From: Ilya Kreymer Date: Thu, 20 Feb 2025 20:49:48 -0800 Subject: [PATCH] instead of puppeteer.exposeFunction(), use cdp function bindings directly to avoid issues with custom toJSON overrides: - add Runtime.addBinding for each exposed function, handle in one place with Runtime.bindingCalled - convert binding names to BxFunctionBindings enum - update to browsertrix-behaviors 0.7.1 to avoid waiting for return value - fixes #770 --- package.json | 2 +- src/crawler.ts | 66 +++++++++++++++++++++++++++++-------------- src/util/argParser.ts | 4 +-- src/util/constants.ts | 9 ++++-- yarn.lock | 8 +++--- 5 files changed, 58 insertions(+), 31 deletions(-) diff --git a/package.json b/package.json index fa251859..6a89e2bb 100644 --- a/package.json +++ b/package.json @@ -18,7 +18,7 @@ "dependencies": { "@novnc/novnc": "1.4.0", "@webrecorder/wabac": "^2.20.8", - "browsertrix-behaviors": "^0.7.0", + "browsertrix-behaviors": "^0.7.1", "client-zip": "^2.4.5", "css-selector-parser": "^3.0.5", "fetch-socks": "^1.3.0", diff --git a/src/crawler.ts b/src/crawler.ts index 16a3ffcd..63763be8 100644 --- a/src/crawler.ts +++ b/src/crawler.ts @@ -40,9 +40,7 @@ import { collectCustomBehaviors, getInfoString } from "./util/file_reader.js"; import { Browser } from "./util/browser.js"; import { - ADD_LINK_FUNC, - BEHAVIOR_LOG_FUNC, - FETCH_FUNC, + BxFunctionBindings, DISPLAY, ExtractSelector, PAGE_OP_TIMEOUT_SECS, @@ -768,17 +766,43 @@ export class Crawler { await this.screencaster.screencastPage(page, cdp, workerid); } - await page.exposeFunction( - ADD_LINK_FUNC, - (url: string) => callbacks.addLink && callbacks.addLink(url), - ); + cdp.on("Runtime.bindingCalled", (params) => { + const { name, payload } = params; + + switch (name as BxFunctionBindings) { + case BxFunctionBindings.AddLinkFunc: + callbacks.addLink && callbacks.addLink(payload); + break; + + case BxFunctionBindings.BehaviorLogFunc: + { + const logdata: { data: 
string; type: string } = JSON.parse(payload); + this._behaviorLog(logdata, page.url(), workerid); + } + break; + + case BxFunctionBindings.FetchFunc: + if (recorder) { + recorder.addExternalFetch(payload, cdp); + } + break; + + case BxFunctionBindings.AddToSeenSet: + this.crawlState + .addToUserSet(payload) + .catch((e) => logger.warn("Adding to URL set error", e)); + break; + } + }); + + await cdp.send("Runtime.addBinding", { + name: BxFunctionBindings.AddLinkFunc, + }); if (this.params.behaviorOpts) { - await page.exposeFunction( - BEHAVIOR_LOG_FUNC, - (logdata: { data: string; type: string }) => - this._behaviorLog(logdata, page.url(), workerid), - ); + await cdp.send("Runtime.addBinding", { + name: BxFunctionBindings.BehaviorLogFunc, + }); await this.browser.addInitScript(page, behaviors); const initScript = ` @@ -791,9 +815,11 @@ self.__bx_behaviors.selectMainBehavior(); this.behaviorsChecked = true; } - await page.exposeFunction(FETCH_FUNC, (url: string) => { - return recorder ? recorder.addExternalFetch(url, cdp) : true; - }); + if (recorder) { + await cdp.send("Runtime.addBinding", { + name: BxFunctionBindings.FetchFunc, + }); + } await this.browser.addInitScript(page, initScript); } @@ -873,11 +899,9 @@ self.__bx_behaviors.selectMainBehavior(); } } - await page.exposeFunction("__bx_addSet", (data: string) => - this.crawlState.addToUserSet(data), - ); - - // await page.exposeFunction("__bx_hasSet", (data: string) => this.crawlState.hasUserSet(data)); + await cdp.send("Runtime.addBinding", { + name: BxFunctionBindings.AddToSeenSet, + }); } async setupExecContextEvents( @@ -2295,7 +2319,7 @@ self.__bx_behaviors.selectMainBehavior(); selector, extract, isAttribute, - addLinkFunc: ADD_LINK_FUNC, + addLinkFunc: BxFunctionBindings.AddLinkFunc, }) .catch((e) => logger.warn("Link Extraction failed in frame", { diff --git a/src/util/argParser.ts b/src/util/argParser.ts index bb02d435..2f30d170 100644 --- a/src/util/argParser.ts +++ b/src/util/argParser.ts @@ -10,7 
+10,6 @@ import { hideBin } from "yargs/helpers"; import { createParser } from "css-selector-parser"; import { - BEHAVIOR_LOG_FUNC, WAIT_UNTIL_OPTS, EXTRACT_TEXT_TYPES, SERVICE_WORKER_OPTS, @@ -18,6 +17,7 @@ import { BEHAVIOR_TYPES, ExtractSelector, DEFAULT_MAX_RETRIES, + BxFunctionBindings, } from "./constants.js"; import { ScopedSeed } from "./seeds.js"; import { interpolateFilename } from "./storage.js"; @@ -721,7 +721,7 @@ class ArgParser { ); } }); - behaviorOpts.log = BEHAVIOR_LOG_FUNC; + behaviorOpts.log = BxFunctionBindings.BehaviorLogFunc; behaviorOpts.startEarly = true; behaviorOpts.clickSelector = argv.clickSelector; argv.behaviorOpts = JSON.stringify(behaviorOpts); diff --git a/src/util/constants.ts b/src/util/constants.ts index 83ce539a..5f7a37a4 100644 --- a/src/util/constants.ts +++ b/src/util/constants.ts @@ -22,9 +22,12 @@ export const DETECT_SITEMAP = ""; export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"]; -export const BEHAVIOR_LOG_FUNC = "__bx_log"; -export const ADD_LINK_FUNC = "__bx_addLink"; -export const FETCH_FUNC = "__bx_fetch"; +export enum BxFunctionBindings { + BehaviorLogFunc = "__bx_log", + AddLinkFunc = "__bx_addLink", + FetchFunc = "__bx_fetch", + AddToSeenSet = "__bx_addSet", +} export const MAX_DEPTH = 1000000; export const DEFAULT_MAX_RETRIES = 2; diff --git a/yarn.lock b/yarn.lock index 6389bfa5..a3d99906 100644 --- a/yarn.lock +++ b/yarn.lock @@ -1460,10 +1460,10 @@ browserslist@^4.24.0: node-releases "^2.0.18" update-browserslist-db "^1.1.1" -browsertrix-behaviors@^0.7.0: - version "0.7.0" - resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.7.0.tgz#a08b7d3e9cd449d0d76b14a438e28472124fd1a4" - integrity sha512-t0X74puXJsH8sVkkVZwEdo8L5E1PYtzX/RkVXM4fwwBIL804bOB8WIV+5Dfwov/odaukhB67KZhM00hN60SiBA== +browsertrix-behaviors@^0.7.1: + version "0.7.1" + resolved 
"https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.7.1.tgz#dcb30c038e4060ef2393eb001ce9f10e3ce71c39" + integrity sha512-tZ7Bv/IAWzLTNORf/yQqGHpPAQ4tP8sxql8YT491VHlCk939F1YIUrQ36XJOaSyfjmmm2WV9nCMXkDpCsw6zQg== dependencies: query-selector-shadow-dom "^1.0.1"