
Commit 472322e

[Keyword search] Webcrawler folders update & backfill (#9515)
* folders update
* delete folders
* backfill
* clean
* log
1 parent 43481ea commit 472322e

File tree

2 files changed: +121 -4 lines changed

Lines changed: 93 additions & 0 deletions
@@ -0,0 +1,93 @@
import assert from "node:assert";

import { concurrentExecutor } from "@dust-tt/types";
import _ from "lodash";
import { makeScript } from "scripts/helpers";

import { dataSourceConfigFromConnector } from "@connectors/lib/api/data_source_config";
import { upsertDataSourceFolder } from "@connectors/lib/data_sources";
import { WebCrawlerFolder } from "@connectors/lib/models/webcrawler";
import { ConnectorResource } from "@connectors/resources/connector_resource";

makeScript(
  {
    nextConnectorId: {
      type: "number",
      required: false,
      default: 0,
    },
    connectorId: {
      type: "number",
      required: false,
      default: 0,
    },
  },
  async ({ execute, nextConnectorId }, logger) => {
    logger.info(
      {
        nextConnectorId,
      },
      "Starting backfill"
    );

    const connectors = await ConnectorResource.listByType("webcrawler", {});

    // sort connectors by id and start from nextConnectorId
    const sortedConnectors = connectors
      .sort((a, b) => a.id - b.id)
      .filter((_, idx) => idx >= nextConnectorId);

    for (const connector of sortedConnectors) {
      const dataSourceConfig = dataSourceConfigFromConnector(connector);
      const connectorId = connector.id;

      const folders = await WebCrawlerFolder.findAll({
        where: {
          connectorId,
        },
      });

      const foldersByUrl = _.keyBy(folders, "url");

      const getParents = (folder: WebCrawlerFolder): string[] => {
        assert(
          folder.parentUrl === null || foldersByUrl[folder.parentUrl],
          "Parent folder not found"
        );
        const parentFolder = folder.parentUrl
          ? foldersByUrl[folder.parentUrl]
          : null;
        return [
          folder.internalId,
          ...(parentFolder ? getParents(parentFolder) : []),
        ];
      };
      await concurrentExecutor(
        folders,
        async (folder) => {
          logger.info({
            folderId: folder.internalId,
            folderUrl: folder.url,
            execute,
          });
          if (execute) {
            const result = await upsertDataSourceFolder({
              dataSourceConfig,
              folderId: folder.internalId,
              timestampMs: folder.updatedAt.getTime(),
              parents: getParents(folder),
              title: folder.url,
              mimeType: "application/vnd.dust.webcrawler.folder",
            });
            logger.info({
              result,
              folderId: folder.internalId,
              folderUrl: folder.url,
            });
          }
        },
        { concurrency: 8 }
      );
    }
  }
);
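For reference, a minimal, self-contained sketch of the parent-chain logic used in the backfill above. The Folder shape, URLs, and ids below are hypothetical stand-ins for the real WebCrawlerFolder rows: the parents array is the folder's own internalId followed by the internalIds of its ancestors up to the root, which is what the script passes to upsertDataSourceFolder.

// Minimal sketch of the parent-chain logic, with a hypothetical Folder shape
// (not the real WebCrawlerFolder model).
type Folder = { internalId: string; url: string; parentUrl: string | null };

const foldersByUrl: Record<string, Folder> = {
  "https://example.com/": {
    internalId: "folder-root",
    url: "https://example.com/",
    parentUrl: null,
  },
  "https://example.com/blog/": {
    internalId: "folder-blog",
    url: "https://example.com/blog/",
    parentUrl: "https://example.com/",
  },
};

const getParents = (folder: Folder): string[] => {
  const parent = folder.parentUrl ? foldersByUrl[folder.parentUrl] : null;
  return [folder.internalId, ...(parent ? getParents(parent) : [])];
};

// ["folder-blog", "folder-root"]: the folder itself first, then its ancestors up to the root.
console.log(getParents(foldersByUrl["https://example.com/blog/"]));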

connectors/src/connectors/webcrawler/temporal/activities.ts

Lines changed: 28 additions & 4 deletions
@@ -30,8 +30,10 @@ import {
 import { dataSourceConfigFromConnector } from "@connectors/lib/api/data_source_config";
 import {
   deleteDataSourceDocument,
+  deleteDataSourceFolder,
   MAX_SMALL_DOCUMENT_TXT_LEN,
   upsertDataSourceDocument,
+  upsertDataSourceFolder,
 } from "@connectors/lib/data_sources";
 import {
   WebCrawlerFolder,
@@ -251,16 +253,21 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
         totalExtracted += extracted.length;
         const pageTitle = $("title").text();

-        const folders = getAllFoldersForUrl(request.url);
-        for (const folder of folders) {
+        // note that parentFolderUrls.length === parentFolderIds.length - 1
+        // since parentFolderIds includes the page as first element
+        // and parentFolderUrls does not
+        const parentFolderUrls = getAllFoldersForUrl(request.url);
+        const parentFolderIds = getParentsForPage(request.url, false);
+
+        for (const [index, folder] of parentFolderUrls.entries()) {
           if (createdFolders.has(folder)) {
             continue;
           }

           const logicalParent = isTopFolder(request.url)
             ? null
             : getFolderForUrl(folder);
-          await WebCrawlerFolder.upsert({
+          const [webCrawlerFolder] = await WebCrawlerFolder.upsert({
             url: folder,
             parentUrl: logicalParent,
             connectorId: connector.id,
@@ -272,6 +279,19 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
             lastSeenAt: new Date(),
           });

+          await upsertDataSourceFolder({
+            dataSourceConfig,
+            folderId: webCrawlerFolder.internalId,
+            timestampMs: webCrawlerFolder.updatedAt.getTime(),
+            // parent folder ids of the page are in hierarchy order from the
+            // page to the root so for the current folder, its parents start at
+            // index+1 (including itself as first parent) and end at the root
+            parents: parentFolderIds.slice(index + 1),
+            title: folder,
+            mimeType: "application/vnd.dust.webcrawler.folder",
+          });
+
           createdFolders.add(folder);
         }
         const documentId = stableIdForUrl({
@@ -342,7 +362,7 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
           documentUrl: validatedUrl.standardized,
           timestampMs: new Date().getTime(),
           tags: [`title:${stripNullBytes(pageTitle)}`],
-          parents: getParentsForPage(request.url, false),
+          parents: parentFolderIds,
           upsertContext: {
             sync_type: "batch",
           },
@@ -552,6 +572,10 @@ export async function webCrawlerGarbageCollector(
       type: "delete_folder",
     });
     for (const folder of foldersToDelete) {
+      await deleteDataSourceFolder({
+        dataSourceConfig,
+        folderId: folder.internalId,
+      });
       await folder.destroy();
     }
   } while (foldersToDelete.length > 0);
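To make the index arithmetic in the crawl loop easier to follow, here is a rough, self-contained illustration. The URLs and ids are made up, and the two arrays are simplified stand-ins for the return values of getAllFoldersForUrl and getParentsForPage: parentFolderIds lists the page id first and then its ancestor folder ids from closest to root, while parentFolderUrls lists only the ancestor folders in the same order, so the parent chain of the folder at position index starts at index + 1 in parentFolderIds.

// Simplified stand-ins for getAllFoldersForUrl / getParentsForPage (hypothetical values).
const parentFolderUrls = [
  "https://example.com/a/b/", // closest ancestor folder of the crawled page
  "https://example.com/a/",
  "https://example.com/",
];
// The page id comes first, then the same ancestors in the same order,
// hence parentFolderUrls.length === parentFolderIds.length - 1.
const parentFolderIds = ["page-a-b-c", "folder-a-b", "folder-a", "folder-root"];

for (const [index, folderUrl] of parentFolderUrls.entries()) {
  // The folder at `index` has its own id at `index + 1` in parentFolderIds, so
  // slicing from there yields "itself, then its ancestors up to the root".
  const parents = parentFolderIds.slice(index + 1);
  console.log(folderUrl, parents);
  // https://example.com/a/b/ -> ["folder-a-b", "folder-a", "folder-root"]
  // https://example.com/a/   -> ["folder-a", "folder-root"]
  // https://example.com/     -> ["folder-root"]
}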
