@@ -30,8 +30,10 @@ import {
 import { dataSourceConfigFromConnector } from "@connectors/lib/api/data_source_config";
 import {
   deleteDataSourceDocument,
+  deleteDataSourceFolder,
   MAX_SMALL_DOCUMENT_TXT_LEN,
   upsertDataSourceDocument,
+  upsertDataSourceFolder,
 } from "@connectors/lib/data_sources";
 import {
   WebCrawlerFolder,

@@ -251,16 +253,21 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
         totalExtracted += extracted.length;
         const pageTitle = $("title").text();

-        const folders = getAllFoldersForUrl(request.url);
-        for (const folder of folders) {
+        // Note that parentFolderUrls.length === parentFolderIds.length - 1,
+        // since parentFolderIds includes the page as its first element
+        // and parentFolderUrls does not.
+        const parentFolderUrls = getAllFoldersForUrl(request.url);
+        const parentFolderIds = getParentsForPage(request.url, false);
+
+        for (const [index, folder] of parentFolderUrls.entries()) {
           if (createdFolders.has(folder)) {
             continue;
           }

           const logicalParent = isTopFolder(request.url)
             ? null
             : getFolderForUrl(folder);
-          await WebCrawlerFolder.upsert({
+          const [webCrawlerFolder] = await WebCrawlerFolder.upsert({
             url: folder,
             parentUrl: logicalParent,
             connectorId: connector.id,
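
To make the length invariant concrete, here is a minimal illustrative sketch. The exact return shapes of getAllFoldersForUrl and getParentsForPage are assumptions (the real ids come from stableIdForUrl; plain strings stand in for them here), but the comment in the hunk implies both lists run from the page toward the root:

// Hypothetical values for a page two folders deep. Illustrative only.
const url = "https://example.com/blog/2024/post";

// Assumed: ancestor folder URLs, nearest folder first, page excluded.
const parentFolderUrls = [
  "https://example.com/blog/2024",
  "https://example.com/blog",
];

// Assumed: matching stable ids, with the page's own id prepended.
const parentFolderIds = ["id(post)", "id(blog/2024)", "id(blog)"];

// The invariant stated in the comment above:
console.assert(parentFolderUrls.length === parentFolderIds.length - 1);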

@@ -272,6 +279,19 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
             lastSeenAt: new Date(),
           });

+          await upsertDataSourceFolder({
+            dataSourceConfig,
+            folderId: webCrawlerFolder.internalId,
+            timestampMs: webCrawlerFolder.updatedAt.getTime(),
+
+            // Parent folder ids of the page are in hierarchy order from the
+            // page to the root, so the current folder's parents start at
+            // index + 1 (including itself as first parent) and end at the root.
+            parents: parentFolderIds.slice(index + 1),
+            title: folder,
+            mimeType: "application/vnd.dust.webcrawler.folder",
+          });
+
           createdFolders.add(folder);
         }
         const documentId = stableIdForUrl({
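
Continuing the sketch above, slice(index + 1) drops the page id plus the index folders below the current one, so each folder's parents array begins with its own id and walks up to the root:

// Illustrative, reusing the hypothetical values from the previous sketch.
for (const [index, folder] of parentFolderUrls.entries()) {
  console.log(folder, "->", parentFolderIds.slice(index + 1));
}
// index 0: https://example.com/blog/2024 -> ["id(blog/2024)", "id(blog)"]
// index 1: https://example.com/blog      -> ["id(blog)"]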

@@ -342,7 +362,7 @@ export async function crawlWebsiteByConnectorId(connectorId: ModelId) {
           documentUrl: validatedUrl.standardized,
           timestampMs: new Date().getTime(),
           tags: [`title:${stripNullBytes(pageTitle)}`],
-          parents: getParentsForPage(request.url, false),
+          parents: parentFolderIds,
           upsertContext: {
             sync_type: "batch",
           },

@@ -552,6 +572,10 @@ export async function webCrawlerGarbageCollector(
       type: "delete_folder",
     });
     for (const folder of foldersToDelete) {
+      await deleteDataSourceFolder({
+        dataSourceConfig,
+        folderId: folder.internalId,
+      });
       await folder.destroy();
     }
   } while (foldersToDelete.length > 0);
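
One note on ordering in the garbage collector: deleting the folder upstream before destroying the local row means that if the remote call fails, the WebCrawlerFolder row survives and the folder is retried on the next pass rather than leaking upstream. A minimal sketch of that reasoning, reusing the names from the hunk:

// Sketch only: assumes deleteDataSourceFolder throws on failure.
for (const folder of foldersToDelete) {
  // Remote deletion first; a failure here aborts before the local destroy,
  // so the row remains and this folder is picked up again on the next pass.
  await deleteDataSourceFolder({ dataSourceConfig, folderId: folder.internalId });
  await folder.destroy();
}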