@@ -415,6 +453,7 @@ function CompactView({ bookmark, title, footer, className }: Props) {
+
{bookmark.content.type === BookmarkTypes.LINK &&
bookmark.content.favicon && (
;
case "bookmarkTypeIs":
return ;
+ case "bookmarkSourceIs":
+ return ;
case "hasTag":
return ;
case "isFavourited":
@@ -214,6 +220,51 @@ export function ConditionBuilder({
);
+ case "bookmarkSourceIs":
+ return (
+
+
+ onChange({
+ ...value,
+ source: source as "api" | "web" | "cli" | "mobile" | "extension" | "singlefile" | "rss" | "import",
+ })
+ }
+ >
+
+
+
+
+
+ {t("common.bookmark_sources.web")}
+
+
+ {t("common.bookmark_sources.extension")}
+
+
+ {t("common.bookmark_sources.mobile")}
+
+
+ {t("common.bookmark_sources.rss")}
+
+
+ {t("common.bookmark_sources.api")}
+
+
+ {t("common.bookmark_sources.cli")}
+
+
+ {t("common.bookmark_sources.singlefile")}
+
+
+ {t("common.bookmark_sources.import")}
+
+
+
+
+ );
+
case "hasTag":
return (
@@ -314,6 +365,9 @@ export function ConditionBuilder({
{t("settings.rules.conditions_types.bookmark_type_is")}
+
+ {t("settings.rules.conditions_types.bookmark_source_is")}
+
{t("settings.rules.conditions_types.has_tag")}
diff --git a/apps/web/lib/i18n/locales/en/translation.json b/apps/web/lib/i18n/locales/en/translation.json
index 03206c828..5cc883bd7 100644
--- a/apps/web/lib/i18n/locales/en/translation.json
+++ b/apps/web/lib/i18n/locales/en/translation.json
@@ -41,6 +41,16 @@
"text": "Text",
"media": "Media"
},
+ "bookmark_sources": {
+ "api": "API",
+ "web": "Web",
+ "extension": "Browser Extension",
+ "cli": "CLI",
+ "mobile": "Mobile",
+ "singlefile": "SingleFile",
+ "rss": "RSS Feed",
+ "import": "Import"
+ },
"quota": "Quota",
"bookmarks": "Bookmarks",
"storage": "Storage"
@@ -370,8 +380,9 @@
 "has_tag": "Has Tag",
 "is_favourited": "Is Favourited",
 "is_archived": "Is Archived",
+ "bookmark_source_is": "Bookmark Source Is",
 "and": "All of the following are true",
 "or": "Any of the following are true"
},
"actions_types": {
"add_tag": "Add Tag",
diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts
index 85d83265f..c1960cdd7 100644
--- a/apps/workers/workers/crawlerWorker.ts
+++ b/apps/workers/workers/crawlerWorker.ts
@@ -1556,55 +1556,67 @@ async function handleAsAssetBookmark(
},
},
async () => {
- const downloaded = await downloadAndStoreFile(
- url,
- userId,
- jobId,
- assetType,
- abortSignal,
- );
- if (!downloaded) {
- return;
- }
- const fileName = path.basename(new URL(url).pathname);
- await db.transaction(async (trx) => {
- await updateAsset(
- undefined,
+ let downloadedAssetId: string | undefined;
+ try {
+ const downloaded = await downloadAndStoreFile(
+ url,
+ userId,
+ jobId,
+ assetType,
+ abortSignal,
+ );
+ if (!downloaded) {
+ return;
+ }
+ downloadedAssetId = downloaded.assetId;
+ const fileName = path.basename(new URL(url).pathname);
+ await db.transaction(async (trx) => {
+ await updateAsset(
+ undefined,
+ {
+ id: downloaded.assetId,
+ bookmarkId,
+ userId,
+ assetType: AssetTypes.BOOKMARK_ASSET,
+ contentType: downloaded.contentType,
+ size: downloaded.size,
+ fileName,
+ },
+ trx,
+ );
+ await trx.insert(bookmarkAssets).values({
+ id: bookmarkId,
+ assetType,
+ assetId: downloaded.assetId,
+ content: null,
+ fileName,
+ sourceUrl: url,
+ });
+ // Switch the type of the bookmark from LINK to ASSET
+ await trx
+ .update(bookmarks)
+ .set({ type: BookmarkTypes.ASSET })
+ .where(eq(bookmarks.id, bookmarkId));
+ await trx.delete(bookmarkLinks).where(eq(bookmarkLinks.id, bookmarkId));
+ });
+ await AssetPreprocessingQueue.enqueue(
{
- id: downloaded.assetId,
bookmarkId,
- userId,
- assetType: AssetTypes.BOOKMARK_ASSET,
- contentType: downloaded.contentType,
- size: downloaded.size,
- fileName,
+ fixMode: false,
+ },
+ {
+ groupId: userId,
},
- trx,
);
- await trx.insert(bookmarkAssets).values({
- id: bookmarkId,
- assetType,
- assetId: downloaded.assetId,
- content: null,
- fileName,
- sourceUrl: url,
- });
- // Switch the type of the bookmark from LINK to ASSET
- await trx
- .update(bookmarks)
- .set({ type: BookmarkTypes.ASSET })
- .where(eq(bookmarks.id, bookmarkId));
- await trx.delete(bookmarkLinks).where(eq(bookmarkLinks.id, bookmarkId));
- });
- await AssetPreprocessingQueue.enqueue(
- {
- bookmarkId,
- fixMode: false,
- },
- {
- groupId: userId,
- },
- );
+ } catch (error) {
+ if (downloadedAssetId) {
+ logger.error(
+ `[Crawler][${jobId}] handleAsAssetBookmark encountered an error, cleaning up new asset ${downloadedAssetId}: ${error}`,
+ );
+ await silentDeleteAsset(userId, downloadedAssetId);
+ }
+ throw error;
+ }
},
);
}
@@ -1722,267 +1734,293 @@ async function crawlAndParseUrl(
},
},
async () => {
- let result: {
- htmlContent: string;
- screenshot: Buffer | undefined;
- pdf: Buffer | undefined;
- statusCode: number | null;
- url: string;
- };
-
- if (precrawledArchiveAssetId) {
- logger.info(
- `[Crawler][${jobId}] The page has been precrawled. Will use the precrawled archive instead.`,
- );
- const asset = await readAsset({
- userId,
- assetId: precrawledArchiveAssetId,
- });
- result = {
- htmlContent: asset.asset.toString(),
- screenshot: undefined,
- pdf: undefined,
- statusCode: 200,
- url,
+ const newAssetIds: string[] = [];
+ try {
+ let result: {
+ htmlContent: string;
+ screenshot: Buffer | undefined;
+ pdf: Buffer | undefined;
+ statusCode: number | null;
+ url: string;
};
- } else {
- result = await crawlPage(
- jobId,
- url,
- userId,
- forceStorePdf,
- abortSignal,
- );
- }
- abortSignal.throwIfAborted();
-
- const {
- htmlContent,
- screenshot,
- pdf,
- statusCode,
- url: browserUrl,
- } = result;
-
- // Track status code in Prometheus
- if (statusCode !== null) {
- crawlerStatusCodeCounter.labels(statusCode.toString()).inc();
- setSpanAttributes({
- "crawler.statusCode": statusCode,
- });
- }
- if (shouldRetryCrawlStatusCode(statusCode)) {
- if (numRetriesLeft > 0) {
- throw new Error(
- `[Crawler][${jobId}] Received status code ${statusCode}. Will retry crawl. Retries left: ${numRetriesLeft}`,
+ if (precrawledArchiveAssetId) {
+ logger.info(
+ `[Crawler][${jobId}] The page has been precrawled. Will use the precrawled archive instead.`,
+ );
+ const asset = await readAsset({
+ userId,
+ assetId: precrawledArchiveAssetId,
+ });
+ result = {
+ htmlContent: asset.asset.toString(),
+ screenshot: undefined,
+ pdf: undefined,
+ statusCode: 200,
+ url,
+ };
+ } else {
+ result = await crawlPage(
+ jobId,
+ url,
+ userId,
+ forceStorePdf,
+ abortSignal,
);
}
- logger.info(
- `[Crawler][${jobId}] Received status code ${statusCode} on latest retry attempt. Proceeding without retry.`,
- );
- }
-
- const { metadata: meta, readableContent: parsedReadableContent } =
- await runParseSubprocess(htmlContent, browserUrl, jobId, abortSignal);
- abortSignal.throwIfAborted();
+ abortSignal.throwIfAborted();
- const parseDate = (date: string | null | undefined) => {
- if (!date) {
- return null;
- }
- try {
- return new Date(date);
- } catch {
- return null;
+ const {
+ htmlContent,
+ screenshot,
+ pdf,
+ statusCode,
+ url: browserUrl,
+ } = result;
+
+ // Track status code in Prometheus
+ if (statusCode !== null) {
+ crawlerStatusCodeCounter.labels(statusCode.toString()).inc();
+ setSpanAttributes({
+ "crawler.statusCode": statusCode,
+ });
}
- };
- // Phase 1: Write metadata immediately for fast user feedback.
- // Content and asset storage happen later and can be slow (banner
- // image download, screenshot/pdf upload, etc.).
- await db
- .update(bookmarkLinks)
- .set({
- title: meta.title,
- description: meta.description,
- // Don't store data URIs as they're not valid URLs and are usually quite large
- imageUrl: meta.image?.startsWith("data:") ? null : meta.image,
- favicon: meta.logo,
- crawlStatusCode: statusCode,
- author: meta.author,
- publisher: meta.publisher,
- datePublished: parseDate(meta.datePublished),
- dateModified: parseDate(meta.dateModified),
- })
- .where(eq(bookmarkLinks.id, bookmarkId));
-
- let readableContent = parsedReadableContent;
-
- const screenshotAssetInfo = await Promise.race([
- storeScreenshot(screenshot, userId, jobId),
- abortPromise(abortSignal),
- ]);
- abortSignal.throwIfAborted();
-
- const pdfAssetInfo = await Promise.race([
- storePdf(pdf, userId, jobId),
- abortPromise(abortSignal),
- ]);
- abortSignal.throwIfAborted();
-
- const htmlContentAssetInfo = await storeHtmlContent(
- readableContent?.content,
- userId,
- jobId,
- );
- abortSignal.throwIfAborted();
- let imageAssetInfo: DBAssetType | null = null;
- if (meta.image) {
- const downloaded = await downloadAndStoreImage(
- meta.image,
- userId,
- jobId,
- abortSignal,
- );
- if (downloaded) {
- imageAssetInfo = {
- id: downloaded.assetId,
- bookmarkId,
- userId,
- assetType: AssetTypes.LINK_BANNER_IMAGE,
- contentType: downloaded.contentType,
- size: downloaded.size,
- };
+ if (shouldRetryCrawlStatusCode(statusCode)) {
+ if (numRetriesLeft > 0) {
+ throw new Error(
+ `[Crawler][${jobId}] Received status code ${statusCode}. Will retry crawl. Retries left: ${numRetriesLeft}`,
+ );
+ }
+ logger.info(
+ `[Crawler][${jobId}] Received status code ${statusCode} on latest retry attempt. Proceeding without retry.`,
+ );
}
- }
- abortSignal.throwIfAborted();
-
- // Phase 2: Write content and asset references.
- // TODO(important): Restrict the size of content to store
- const assetDeletionTasks: Promise<void>[] = [];
- const inlineHtmlContent =
- htmlContentAssetInfo.result === "store_inline"
- ? (readableContent?.content ?? null)
- : null;
- readableContent = null;
- await db.transaction(async (txn) => {
- await txn
+
+ const { metadata: meta, readableContent: parsedReadableContent } =
+ await runParseSubprocess(htmlContent, browserUrl, jobId, abortSignal);
+ abortSignal.throwIfAborted();
+
+ const parseDate = (date: string | null | undefined) => {
+ if (!date) {
+ return null;
+ }
+ try {
+ return new Date(date);
+ } catch {
+ return null;
+ }
+ };
+
+ // Phase 1: Write metadata immediately for fast user feedback.
+ // Content and asset storage happen later and can be slow (banner
+ // image download, screenshot/pdf upload, etc.).
+ await db
.update(bookmarkLinks)
.set({
- crawledAt: new Date(),
- htmlContent: inlineHtmlContent,
- contentAssetId:
- htmlContentAssetInfo.result === "stored"
- ? htmlContentAssetInfo.assetId
- : null,
+ title: meta.title,
+ description: meta.description,
+ // Don't store data URIs as they're not valid URLs and are usually quite large
+ imageUrl: meta.image?.startsWith("data:") ? null : meta.image,
+ favicon: meta.logo,
+ crawlStatusCode: statusCode,
+ author: meta.author,
+ publisher: meta.publisher,
+ datePublished: parseDate(meta.datePublished),
+ dateModified: parseDate(meta.dateModified),
})
.where(eq(bookmarkLinks.id, bookmarkId));
+ let readableContent = parsedReadableContent;
+
+ const screenshotAssetInfo = await Promise.race([
+ storeScreenshot(screenshot, userId, jobId),
+ abortPromise(abortSignal),
+ ]);
if (screenshotAssetInfo) {
- await updateAsset(
- oldScreenshotAssetId,
- {
- id: screenshotAssetInfo.assetId,
- bookmarkId,
- userId,
- assetType: AssetTypes.LINK_SCREENSHOT,
- contentType: screenshotAssetInfo.contentType,
- size: screenshotAssetInfo.size,
- fileName: screenshotAssetInfo.fileName,
- },
- txn,
- );
- assetDeletionTasks.push(
- silentDeleteAsset(userId, oldScreenshotAssetId),
- );
+ newAssetIds.push(screenshotAssetInfo.assetId);
}
+ abortSignal.throwIfAborted();
+
+ const pdfAssetInfo = await Promise.race([
+ storePdf(pdf, userId, jobId),
+ abortPromise(abortSignal),
+ ]);
if (pdfAssetInfo) {
- await updateAsset(
- oldPdfAssetId,
- {
- id: pdfAssetInfo.assetId,
- bookmarkId,
- userId,
- assetType: AssetTypes.LINK_PDF,
- contentType: pdfAssetInfo.contentType,
- size: pdfAssetInfo.size,
- fileName: pdfAssetInfo.fileName,
- },
- txn,
- );
- assetDeletionTasks.push(silentDeleteAsset(userId, oldPdfAssetId));
- }
- if (imageAssetInfo) {
- await updateAsset(oldImageAssetId, imageAssetInfo, txn);
- assetDeletionTasks.push(silentDeleteAsset(userId, oldImageAssetId));
+ newAssetIds.push(pdfAssetInfo.assetId);
}
+ abortSignal.throwIfAborted();
+
+ const htmlContentAssetInfo = await storeHtmlContent(
+ readableContent?.content,
+ userId,
+ jobId,
+ );
if (htmlContentAssetInfo.result === "stored") {
- await updateAsset(
- oldContentAssetId,
- {
- id: htmlContentAssetInfo.assetId,
- bookmarkId,
- userId,
- assetType: AssetTypes.LINK_HTML_CONTENT,
- contentType: ASSET_TYPES.TEXT_HTML,
- size: htmlContentAssetInfo.size,
- fileName: null,
- },
- txn,
- );
- assetDeletionTasks.push(silentDeleteAsset(userId, oldContentAssetId));
- } else if (oldContentAssetId) {
- // Unlink the old content asset
- await txn.delete(assets).where(eq(assets.id, oldContentAssetId));
- assetDeletionTasks.push(silentDeleteAsset(userId, oldContentAssetId));
+ newAssetIds.push(htmlContentAssetInfo.assetId);
}
- });
-
- // Delete the old assets if any
- await Promise.all(assetDeletionTasks);
-
- return async () => {
- if (
- !precrawledArchiveAssetId &&
- (serverConfig.crawler.fullPageArchive || archiveFullPage)
- ) {
- const archiveResult = await archiveWebpage(
- htmlContent,
- browserUrl,
+ abortSignal.throwIfAborted();
+ let imageAssetInfo: DBAssetType | null = null;
+ if (meta.image) {
+ const downloaded = await downloadAndStoreImage(
+ meta.image,
userId,
jobId,
abortSignal,
);
+ if (downloaded) {
+ newAssetIds.push(downloaded.assetId);
+ imageAssetInfo = {
+ id: downloaded.assetId,
+ bookmarkId,
+ userId,
+ assetType: AssetTypes.LINK_BANNER_IMAGE,
+ contentType: downloaded.contentType,
+ size: downloaded.size,
+ };
+ }
+ }
+ abortSignal.throwIfAborted();
- if (archiveResult) {
- const {
- assetId: fullPageArchiveAssetId,
- size,
- contentType,
- } = archiveResult;
-
- await db.transaction(async (txn) => {
- await updateAsset(
- oldFullPageArchiveAssetId,
- {
- id: fullPageArchiveAssetId,
- bookmarkId,
- userId,
- assetType: AssetTypes.LINK_FULL_PAGE_ARCHIVE,
- contentType,
- size,
- fileName: null,
- },
- txn,
- );
- });
- if (oldFullPageArchiveAssetId) {
- await silentDeleteAsset(userId, oldFullPageArchiveAssetId);
+ // Phase 2: Write content and asset references.
+ // TODO(important): Restrict the size of content to store
+ const assetDeletionTasks: Promise<void>[] = [];
+ const inlineHtmlContent =
+ htmlContentAssetInfo.result === "store_inline"
+ ? (readableContent?.content ?? null)
+ : null;
+ readableContent = null;
+ await db.transaction(async (txn) => {
+ await txn
+ .update(bookmarkLinks)
+ .set({
+ crawledAt: new Date(),
+ htmlContent: inlineHtmlContent,
+ contentAssetId:
+ htmlContentAssetInfo.result === "stored"
+ ? htmlContentAssetInfo.assetId
+ : null,
+ })
+ .where(eq(bookmarkLinks.id, bookmarkId));
+
+ if (screenshotAssetInfo) {
+ await updateAsset(
+ oldScreenshotAssetId,
+ {
+ id: screenshotAssetInfo.assetId,
+ bookmarkId,
+ userId,
+ assetType: AssetTypes.LINK_SCREENSHOT,
+ contentType: screenshotAssetInfo.contentType,
+ size: screenshotAssetInfo.size,
+ fileName: screenshotAssetInfo.fileName,
+ },
+ txn,
+ );
+ assetDeletionTasks.push(
+ silentDeleteAsset(userId, oldScreenshotAssetId),
+ );
+ }
+ if (pdfAssetInfo) {
+ await updateAsset(
+ oldPdfAssetId,
+ {
+ id: pdfAssetInfo.assetId,
+ bookmarkId,
+ userId,
+ assetType: AssetTypes.LINK_PDF,
+ contentType: pdfAssetInfo.contentType,
+ size: pdfAssetInfo.size,
+ fileName: pdfAssetInfo.fileName,
+ },
+ txn,
+ );
+ assetDeletionTasks.push(silentDeleteAsset(userId, oldPdfAssetId));
+ }
+ if (imageAssetInfo) {
+ await updateAsset(oldImageAssetId, imageAssetInfo, txn);
+ assetDeletionTasks.push(silentDeleteAsset(userId, oldImageAssetId));
+ }
+ if (htmlContentAssetInfo.result === "stored") {
+ await updateAsset(
+ oldContentAssetId,
+ {
+ id: htmlContentAssetInfo.assetId,
+ bookmarkId,
+ userId,
+ assetType: AssetTypes.LINK_HTML_CONTENT,
+ contentType: ASSET_TYPES.TEXT_HTML,
+ size: htmlContentAssetInfo.size,
+ fileName: null,
+ },
+ txn,
+ );
+ assetDeletionTasks.push(
+ silentDeleteAsset(userId, oldContentAssetId),
+ );
+ } else if (oldContentAssetId) {
+ // Unlink the old content asset
+ await txn.delete(assets).where(eq(assets.id, oldContentAssetId));
+ assetDeletionTasks.push(
+ silentDeleteAsset(userId, oldContentAssetId),
+ );
+ }
+ });
+
+ // Delete the old assets if any
+ await Promise.all(assetDeletionTasks);
+
+ return async () => {
+ if (
+ !precrawledArchiveAssetId &&
+ (serverConfig.crawler.fullPageArchive || archiveFullPage)
+ ) {
+ const archiveResult = await archiveWebpage(
+ htmlContent,
+ browserUrl,
+ userId,
+ jobId,
+ abortSignal,
+ );
+
+ if (archiveResult) {
+ const {
+ assetId: fullPageArchiveAssetId,
+ size,
+ contentType,
+ } = archiveResult;
+
+ await db.transaction(async (txn) => {
+ await updateAsset(
+ oldFullPageArchiveAssetId,
+ {
+ id: fullPageArchiveAssetId,
+ bookmarkId,
+ userId,
+ assetType: AssetTypes.LINK_FULL_PAGE_ARCHIVE,
+ contentType,
+ size,
+ fileName: null,
+ },
+ txn,
+ );
+ });
+ if (oldFullPageArchiveAssetId) {
+ await silentDeleteAsset(userId, oldFullPageArchiveAssetId);
+ }
}
}
- }
- };
+ };
+ } catch (error) {
+ logger.error(
+ `[Crawler][${jobId}] crawlAndParseUrl encountered an error, cleaning up new assets: ${error}`,
+ );
+ // Clean up any assets that were created during this attempt but not committed
+ await Promise.all(
+ newAssetIds.map((assetId) => silentDeleteAsset(userId, assetId)),
+ );
+ throw error;
+ }
},
);
}
diff --git a/packages/shared/types/rules.ts b/packages/shared/types/rules.ts
index fd99c2666..fb8638759 100644
--- a/packages/shared/types/rules.ts
+++ b/packages/shared/types/rules.ts
@@ -79,6 +79,20 @@ const zBookmarkTypeIsCondition = z.object({
bookmarkType: z.enum(["link", "text", "asset"]),
});
+const zBookmarkSourceIsCondition = z.object({
+ type: z.literal("bookmarkSourceIs"),
+ source: z.enum([
+ "api",
+ "web",
+ "cli",
+ "mobile",
+ "extension",
+ "singlefile",
+ "rss",
+ "import",
+ ]),
+});
+
const zHasTagCondition = z.object({
type: z.literal("hasTag"),
tagId: z.string(),
@@ -100,6 +114,7 @@ const nonRecursiveCondition = z.discriminatedUnion("type", [
zTitleDoesNotContainCondition,
zImportedFromFeedCondition,
zBookmarkTypeIsCondition,
+ zBookmarkSourceIsCondition,
zHasTagCondition,
zIsFavouritedCondition,
zIsArchivedCondition,
@@ -121,6 +136,7 @@ export const zRuleEngineConditionSchema: z.ZodType =
zTitleDoesNotContainCondition,
zImportedFromFeedCondition,
zBookmarkTypeIsCondition,
+ zBookmarkSourceIsCondition,
zHasTagCondition,
zIsFavouritedCondition,
zIsArchivedCondition,
@@ -244,6 +260,7 @@ const ruleValidaitorFn = (
switch (condition.type) {
case "alwaysTrue":
case "bookmarkTypeIs":
+ case "bookmarkSourceIs":
case "isFavourited":
case "isArchived":
return true;
diff --git a/packages/trpc/lib/ruleEngine.ts b/packages/trpc/lib/ruleEngine.ts
index acfd747ee..66b845089 100644
--- a/packages/trpc/lib/ruleEngine.ts
+++ b/packages/trpc/lib/ruleEngine.ts
@@ -118,6 +118,9 @@ export class RuleEngine {
case "bookmarkTypeIs": {
return this.bookmark.type === condition.bookmarkType;
}
+ case "bookmarkSourceIs": {
+ return this.bookmark.source === condition.source;
+ }
case "hasTag": {
return this.bookmark.tagsOnBookmarks.some(
(t) => t.tagId === condition.tagId,