Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

instead of puppeteer.exposeFunction(), use cdp function bindings directly to avoid issues with custom toJSON overrides: #775

Open
wants to merge 1 commit into
base: main
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion package.json
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@
"dependencies": {
"@novnc/novnc": "1.4.0",
"@webrecorder/wabac": "^2.20.8",
"browsertrix-behaviors": "^0.7.0",
"browsertrix-behaviors": "^0.7.1",
"client-zip": "^2.4.5",
"css-selector-parser": "^3.0.5",
"fetch-socks": "^1.3.0",
Expand Down
66 changes: 45 additions & 21 deletions src/crawler.ts
Original file line number Diff line number Diff line change
Expand Up @@ -40,9 +40,7 @@ import { collectCustomBehaviors, getInfoString } from "./util/file_reader.js";
import { Browser } from "./util/browser.js";

import {
ADD_LINK_FUNC,
BEHAVIOR_LOG_FUNC,
FETCH_FUNC,
BxFunctionBindings,
DISPLAY,
ExtractSelector,
PAGE_OP_TIMEOUT_SECS,
Expand Down Expand Up @@ -768,17 +766,43 @@ export class Crawler {
await this.screencaster.screencastPage(page, cdp, workerid);
}

await page.exposeFunction(
ADD_LINK_FUNC,
(url: string) => callbacks.addLink && callbacks.addLink(url),
);
// Dispatch calls that in-page scripts make through the CDP bindings
// registered with Runtime.addBinding. `payload` is always the raw string
// passed by the page, so it must be treated as untrusted input.
cdp.on("Runtime.bindingCalled", (params) => {
  const { name, payload } = params;

  switch (name as BxFunctionBindings) {
    case BxFunctionBindings.AddLinkFunc:
      callbacks.addLink && callbacks.addLink(payload);
      break;

    case BxFunctionBindings.BehaviorLogFunc: {
      // The payload is expected to be a JSON-serialized log entry, but any
      // script on the page can invoke the binding with arbitrary text.
      // Guard the parse so a malformed payload is reported instead of
      // throwing out of the event listener.
      let logdata: { data: string; type: string };
      try {
        logdata = JSON.parse(payload);
      } catch (e) {
        logger.warn("Behavior log payload is not valid JSON", e);
        break;
      }
      this._behaviorLog(logdata, page.url(), workerid);
      break;
    }

    case BxFunctionBindings.FetchFunc:
      if (recorder) {
        recorder.addExternalFetch(payload, cdp);
      }
      break;

    case BxFunctionBindings.AddToSeenSet:
      // Fire-and-forget: failures are logged rather than propagated.
      this.crawlState
        .addToUserSet(payload)
        .catch((e) => logger.warn("Adding to URL set error", e));
      break;
  }
});

await cdp.send("Runtime.addBinding", {
name: BxFunctionBindings.AddLinkFunc,
});

if (this.params.behaviorOpts) {
await page.exposeFunction(
BEHAVIOR_LOG_FUNC,
(logdata: { data: string; type: string }) =>
this._behaviorLog(logdata, page.url(), workerid),
);
await cdp.send("Runtime.addBinding", {
name: BxFunctionBindings.BehaviorLogFunc,
});
await this.browser.addInitScript(page, behaviors);

const initScript = `
Expand All @@ -791,9 +815,11 @@ self.__bx_behaviors.selectMainBehavior();
this.behaviorsChecked = true;
}

await page.exposeFunction(FETCH_FUNC, (url: string) => {
return recorder ? recorder.addExternalFetch(url, cdp) : true;
});
if (recorder) {
await cdp.send("Runtime.addBinding", {
name: BxFunctionBindings.FetchFunc,
});
}

await this.browser.addInitScript(page, initScript);
}
Expand Down Expand Up @@ -873,11 +899,9 @@ self.__bx_behaviors.selectMainBehavior();
}
}

await page.exposeFunction("__bx_addSet", (data: string) =>
this.crawlState.addToUserSet(data),
);

// await page.exposeFunction("__bx_hasSet", (data: string) => this.crawlState.hasUserSet(data));
await cdp.send("Runtime.addBinding", {
name: BxFunctionBindings.AddToSeenSet,
});
}

async setupExecContextEvents(
Expand Down Expand Up @@ -2295,7 +2319,7 @@ self.__bx_behaviors.selectMainBehavior();
selector,
extract,
isAttribute,
addLinkFunc: ADD_LINK_FUNC,
addLinkFunc: BxFunctionBindings.AddLinkFunc,
})
.catch((e) =>
logger.warn("Link Extraction failed in frame", {
Expand Down
4 changes: 2 additions & 2 deletions src/util/argParser.ts
Original file line number Diff line number Diff line change
Expand Up @@ -10,14 +10,14 @@ import { hideBin } from "yargs/helpers";
import { createParser } from "css-selector-parser";

import {
BEHAVIOR_LOG_FUNC,
WAIT_UNTIL_OPTS,
EXTRACT_TEXT_TYPES,
SERVICE_WORKER_OPTS,
DEFAULT_SELECTORS,
BEHAVIOR_TYPES,
ExtractSelector,
DEFAULT_MAX_RETRIES,
BxFunctionBindings,
} from "./constants.js";
import { ScopedSeed } from "./seeds.js";
import { interpolateFilename } from "./storage.js";
Expand Down Expand Up @@ -721,7 +721,7 @@ class ArgParser {
);
}
});
behaviorOpts.log = BEHAVIOR_LOG_FUNC;
behaviorOpts.log = BxFunctionBindings.BehaviorLogFunc;
behaviorOpts.startEarly = true;
behaviorOpts.clickSelector = argv.clickSelector;
argv.behaviorOpts = JSON.stringify(behaviorOpts);
Expand Down
9 changes: 6 additions & 3 deletions src/util/constants.ts
Original file line number Diff line number Diff line change
Expand Up @@ -22,9 +22,12 @@ export const DETECT_SITEMAP = "<detect>";

export const EXTRACT_TEXT_TYPES = ["to-pages", "to-warc", "final-to-warc"];

export const BEHAVIOR_LOG_FUNC = "__bx_log";
export const ADD_LINK_FUNC = "__bx_addLink";
export const FETCH_FUNC = "__bx_fetch";
/**
 * Names of the functions exposed to in-page scripts.
 *
 * Each value is the global function name as seen by the page. In crawler.ts
 * these names are registered via the CDP `Runtime.addBinding` command and
 * matched against the `name` field of `Runtime.bindingCalled` events.
 */
export enum BxFunctionBindings {
// receives a JSON string that the crawler parses and forwards to its behavior log
BehaviorLogFunc = "__bx_log",
// receives a URL string that is passed to the `addLink` callback, when one is set
AddLinkFunc = "__bx_addLink",
// receives a string handed to the recorder's `addExternalFetch`, when a recorder exists
FetchFunc = "__bx_fetch",
// receives a string that is added to the crawl state's user set
AddToSeenSet = "__bx_addSet",
}

export const MAX_DEPTH = 1000000;
export const DEFAULT_MAX_RETRIES = 2;
Expand Down
8 changes: 4 additions & 4 deletions yarn.lock
Original file line number Diff line number Diff line change
Expand Up @@ -1460,10 +1460,10 @@ browserslist@^4.24.0:
node-releases "^2.0.18"
update-browserslist-db "^1.1.1"

browsertrix-behaviors@^0.7.0:
version "0.7.0"
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.7.0.tgz#a08b7d3e9cd449d0d76b14a438e28472124fd1a4"
integrity sha512-t0X74puXJsH8sVkkVZwEdo8L5E1PYtzX/RkVXM4fwwBIL804bOB8WIV+5Dfwov/odaukhB67KZhM00hN60SiBA==
browsertrix-behaviors@^0.7.1:
version "0.7.1"
resolved "https://registry.yarnpkg.com/browsertrix-behaviors/-/browsertrix-behaviors-0.7.1.tgz#dcb30c038e4060ef2393eb001ce9f10e3ce71c39"
integrity sha512-tZ7Bv/IAWzLTNORf/yQqGHpPAQ4tP8sxql8YT491VHlCk939F1YIUrQ36XJOaSyfjmmm2WV9nCMXkDpCsw6zQg==
dependencies:
query-selector-shadow-dom "^1.0.1"

Expand Down