diff --git a/apps/web/components/dashboard/bookmarks/BookmarkLayoutAdaptingCard.tsx b/apps/web/components/dashboard/bookmarks/BookmarkLayoutAdaptingCard.tsx index 996b255ac..eab6f3a5a 100644 --- a/apps/web/components/dashboard/bookmarks/BookmarkLayoutAdaptingCard.tsx +++ b/apps/web/components/dashboard/bookmarks/BookmarkLayoutAdaptingCard.tsx @@ -21,7 +21,9 @@ import { Check, GripVertical, Image as ImageIcon, + MousePointer2, NotebookPen, + Rss, } from "lucide-react"; import { useTheme } from "next-themes"; import { toast } from "sonner"; @@ -78,6 +80,36 @@ function BottomRow({ ); } +function SourceIndicator({ bookmark }: { bookmark: ZBookmark }) { + const { t } = useTranslation(); + if (bookmark.source === "rss") { + return ( +
+ +
+ ); + } + + // Show a "manual" indicator for everything else except API/Import maybe? + // For now let's just show it for web/extension/mobile/cli + const manualSources = ["web", "extension", "mobile", "cli"]; + if (manualSources.includes(bookmark.source)) { + return ( +
+ +
+ ); + } + + return null; +} + function OwnerIndicator({ bookmark }: { bookmark: ZBookmark }) { const api = useTRPC(); const listContext = useBookmarkListContext(); @@ -313,6 +345,9 @@ function ListView({ className="left-1 top-1/2 -translate-y-1/2" /> +
+ +
{image("list", cn("size-32 rounded-lg", imgFitClass))}
@@ -375,6 +410,9 @@ function GridView({ +
+ +
{img &&
{img}
}
@@ -415,6 +453,7 @@ function CompactView({ bookmark, title, footer, className }: Props) {
+ {bookmark.content.type === BookmarkTypes.LINK && bookmark.content.favicon && ( ; case "bookmarkTypeIs": return ; + case "bookmarkSourceIs": + return ; case "hasTag": return ; case "isFavourited": @@ -214,6 +220,51 @@ export function ConditionBuilder({
); + case "bookmarkSourceIs": + return ( +
+ +
+ ); + case "hasTag": return (
@@ -314,6 +365,9 @@ export function ConditionBuilder({ {t("settings.rules.conditions_types.bookmark_type_is")} + + {t("settings.rules.conditions_types.bookmark_source_is")} + {t("settings.rules.conditions_types.has_tag")} diff --git a/apps/web/lib/i18n/locales/en/translation.json b/apps/web/lib/i18n/locales/en/translation.json index 03206c828..5cc883bd7 100644 --- a/apps/web/lib/i18n/locales/en/translation.json +++ b/apps/web/lib/i18n/locales/en/translation.json @@ -41,6 +41,16 @@ "text": "Text", "media": "Media" }, + "bookmark_sources": { + "api": "API", + "web": "Web", + "extension": "Browser Extension", + "cli": "CLI", + "mobile": "Mobile", + "singlefile": "SingleFile", + "rss": "RSS Feed", + "import": "Import" + }, "quota": "Quota", "bookmarks": "Bookmarks", "storage": "Storage" @@ -370,8 +380,8 @@ "has_tag": "Has Tag", "is_favourited": "Is Favourited", "is_archived": "Is Archived", - "and": "All of the following are true", - "or": "Any of the following are true" + "bookmark_source_is": "Bookmark Source Is", + "and": "All of the following are true", "or": "Any of the following are true" }, "actions_types": { "add_tag": "Add Tag", diff --git a/apps/workers/workers/crawlerWorker.ts b/apps/workers/workers/crawlerWorker.ts index 85d83265f..c1960cdd7 100644 --- a/apps/workers/workers/crawlerWorker.ts +++ b/apps/workers/workers/crawlerWorker.ts @@ -1556,55 +1556,67 @@ async function handleAsAssetBookmark( }, }, async () => { - const downloaded = await downloadAndStoreFile( - url, - userId, - jobId, - assetType, - abortSignal, - ); - if (!downloaded) { - return; - } - const fileName = path.basename(new URL(url).pathname); - await db.transaction(async (trx) => { - await updateAsset( - undefined, + let downloadedAssetId: string | undefined; + try { + const downloaded = await downloadAndStoreFile( + url, + userId, + jobId, + assetType, + abortSignal, + ); + if (!downloaded) { + return; + } + downloadedAssetId = downloaded.assetId; + const fileName = path.basename(new URL(url).pathname); + await db.transaction(async (trx) => { + await updateAsset( + undefined, + { + id: downloaded.assetId, + bookmarkId, + userId, + assetType: AssetTypes.BOOKMARK_ASSET, + contentType: downloaded.contentType, + size: downloaded.size, + fileName, + }, + trx, + ); + await trx.insert(bookmarkAssets).values({ + id: bookmarkId, + assetType, + assetId: downloaded.assetId, + content: null, + fileName, + sourceUrl: url, + }); + // Switch the type of the bookmark from LINK to ASSET + await trx + .update(bookmarks) + .set({ type: BookmarkTypes.ASSET }) + .where(eq(bookmarks.id, bookmarkId)); + await trx.delete(bookmarkLinks).where(eq(bookmarkLinks.id, bookmarkId)); + }); + await AssetPreprocessingQueue.enqueue( { - id: downloaded.assetId, bookmarkId, - userId, - assetType: AssetTypes.BOOKMARK_ASSET, - contentType: downloaded.contentType, - size: downloaded.size, - fileName, + fixMode: false, + }, + { + groupId: userId, }, - trx, ); - await trx.insert(bookmarkAssets).values({ - id: bookmarkId, - assetType, - assetId: downloaded.assetId, - content: null, - fileName, - sourceUrl: url, - }); - // Switch the type of the bookmark from LINK to ASSET - await trx - .update(bookmarks) - .set({ type: BookmarkTypes.ASSET }) - .where(eq(bookmarks.id, bookmarkId)); - await trx.delete(bookmarkLinks).where(eq(bookmarkLinks.id, bookmarkId)); - }); - await AssetPreprocessingQueue.enqueue( - { - bookmarkId, - fixMode: false, - }, - { - groupId: userId, - }, - ); + } catch (error) { + if (downloadedAssetId) { + logger.error( + `[Crawler][${jobId}] handleAsAssetBookmark encountered an error, cleaning up new asset ${downloadedAssetId}: ${error}`, + ); + await silentDeleteAsset(userId, downloadedAssetId); + } + throw error; + } }, ); } @@ -1722,267 +1734,293 @@ async function crawlAndParseUrl( }, }, async () => { - let result: { - htmlContent: string; - screenshot: Buffer | undefined; - pdf: Buffer | undefined; - statusCode: number | null; - url: string; - }; - - if (precrawledArchiveAssetId) { - logger.info( - `[Crawler][${jobId}] The page has been precrawled. Will use the precrawled archive instead.`, - ); - const asset = await readAsset({ - userId, - assetId: precrawledArchiveAssetId, - }); - result = { - htmlContent: asset.asset.toString(), - screenshot: undefined, - pdf: undefined, - statusCode: 200, - url, + const newAssetIds: string[] = []; + try { + let result: { + htmlContent: string; + screenshot: Buffer | undefined; + pdf: Buffer | undefined; + statusCode: number | null; + url: string; }; - } else { - result = await crawlPage( - jobId, - url, - userId, - forceStorePdf, - abortSignal, - ); - } - abortSignal.throwIfAborted(); - - const { - htmlContent, - screenshot, - pdf, - statusCode, - url: browserUrl, - } = result; - - // Track status code in Prometheus - if (statusCode !== null) { - crawlerStatusCodeCounter.labels(statusCode.toString()).inc(); - setSpanAttributes({ - "crawler.statusCode": statusCode, - }); - } - if (shouldRetryCrawlStatusCode(statusCode)) { - if (numRetriesLeft > 0) { - throw new Error( - `[Crawler][${jobId}] Received status code ${statusCode}. Will retry crawl. Retries left: ${numRetriesLeft}`, + if (precrawledArchiveAssetId) { + logger.info( + `[Crawler][${jobId}] The page has been precrawled. Will use the precrawled archive instead.`, + ); + const asset = await readAsset({ + userId, + assetId: precrawledArchiveAssetId, + }); + result = { + htmlContent: asset.asset.toString(), + screenshot: undefined, + pdf: undefined, + statusCode: 200, + url, + }; + } else { + result = await crawlPage( + jobId, + url, + userId, + forceStorePdf, + abortSignal, ); } - logger.info( - `[Crawler][${jobId}] Received status code ${statusCode} on latest retry attempt. Proceeding without retry.`, - ); - } - - const { metadata: meta, readableContent: parsedReadableContent } = - await runParseSubprocess(htmlContent, browserUrl, jobId, abortSignal); - abortSignal.throwIfAborted(); + abortSignal.throwIfAborted(); - const parseDate = (date: string | null | undefined) => { - if (!date) { - return null; - } - try { - return new Date(date); - } catch { - return null; + const { + htmlContent, + screenshot, + pdf, + statusCode, + url: browserUrl, + } = result; + + // Track status code in Prometheus + if (statusCode !== null) { + crawlerStatusCodeCounter.labels(statusCode.toString()).inc(); + setSpanAttributes({ + "crawler.statusCode": statusCode, + }); } - }; - // Phase 1: Write metadata immediately for fast user feedback. - // Content and asset storage happen later and can be slow (banner - // image download, screenshot/pdf upload, etc.). - await db - .update(bookmarkLinks) - .set({ - title: meta.title, - description: meta.description, - // Don't store data URIs as they're not valid URLs and are usually quite large - imageUrl: meta.image?.startsWith("data:") ? null : meta.image, - favicon: meta.logo, - crawlStatusCode: statusCode, - author: meta.author, - publisher: meta.publisher, - datePublished: parseDate(meta.datePublished), - dateModified: parseDate(meta.dateModified), - }) - .where(eq(bookmarkLinks.id, bookmarkId)); - - let readableContent = parsedReadableContent; - - const screenshotAssetInfo = await Promise.race([ - storeScreenshot(screenshot, userId, jobId), - abortPromise(abortSignal), - ]); - abortSignal.throwIfAborted(); - - const pdfAssetInfo = await Promise.race([ - storePdf(pdf, userId, jobId), - abortPromise(abortSignal), - ]); - abortSignal.throwIfAborted(); - - const htmlContentAssetInfo = await storeHtmlContent( - readableContent?.content, - userId, - jobId, - ); - abortSignal.throwIfAborted(); - let imageAssetInfo: DBAssetType | null = null; - if (meta.image) { - const downloaded = await downloadAndStoreImage( - meta.image, - userId, - jobId, - abortSignal, - ); - if (downloaded) { - imageAssetInfo = { - id: downloaded.assetId, - bookmarkId, - userId, - assetType: AssetTypes.LINK_BANNER_IMAGE, - contentType: downloaded.contentType, - size: downloaded.size, - }; + if (shouldRetryCrawlStatusCode(statusCode)) { + if (numRetriesLeft > 0) { + throw new Error( + `[Crawler][${jobId}] Received status code ${statusCode}. Will retry crawl. Retries left: ${numRetriesLeft}`, + ); + } + logger.info( + `[Crawler][${jobId}] Received status code ${statusCode} on latest retry attempt. Proceeding without retry.`, + ); } - } - abortSignal.throwIfAborted(); - - // Phase 2: Write content and asset references. - // TODO(important): Restrict the size of content to store - const assetDeletionTasks: Promise[] = []; - const inlineHtmlContent = - htmlContentAssetInfo.result === "store_inline" - ? (readableContent?.content ?? null) - : null; - readableContent = null; - await db.transaction(async (txn) => { - await txn + + const { metadata: meta, readableContent: parsedReadableContent } = + await runParseSubprocess(htmlContent, browserUrl, jobId, abortSignal); + abortSignal.throwIfAborted(); + + const parseDate = (date: string | null | undefined) => { + if (!date) { + return null; + } + try { + return new Date(date); + } catch { + return null; + } + }; + + // Phase 1: Write metadata immediately for fast user feedback. + // Content and asset storage happen later and can be slow (banner + // image download, screenshot/pdf upload, etc.). + await db .update(bookmarkLinks) .set({ - crawledAt: new Date(), - htmlContent: inlineHtmlContent, - contentAssetId: - htmlContentAssetInfo.result === "stored" - ? htmlContentAssetInfo.assetId - : null, + title: meta.title, + description: meta.description, + // Don't store data URIs as they're not valid URLs and are usually quite large + imageUrl: meta.image?.startsWith("data:") ? null : meta.image, + favicon: meta.logo, + crawlStatusCode: statusCode, + author: meta.author, + publisher: meta.publisher, + datePublished: parseDate(meta.datePublished), + dateModified: parseDate(meta.dateModified), }) .where(eq(bookmarkLinks.id, bookmarkId)); + let readableContent = parsedReadableContent; + + const screenshotAssetInfo = await Promise.race([ + storeScreenshot(screenshot, userId, jobId), + abortPromise(abortSignal), + ]); if (screenshotAssetInfo) { - await updateAsset( - oldScreenshotAssetId, - { - id: screenshotAssetInfo.assetId, - bookmarkId, - userId, - assetType: AssetTypes.LINK_SCREENSHOT, - contentType: screenshotAssetInfo.contentType, - size: screenshotAssetInfo.size, - fileName: screenshotAssetInfo.fileName, - }, - txn, - ); - assetDeletionTasks.push( - silentDeleteAsset(userId, oldScreenshotAssetId), - ); + newAssetIds.push(screenshotAssetInfo.assetId); } + abortSignal.throwIfAborted(); + + const pdfAssetInfo = await Promise.race([ + storePdf(pdf, userId, jobId), + abortPromise(abortSignal), + ]); if (pdfAssetInfo) { - await updateAsset( - oldPdfAssetId, - { - id: pdfAssetInfo.assetId, - bookmarkId, - userId, - assetType: AssetTypes.LINK_PDF, - contentType: pdfAssetInfo.contentType, - size: pdfAssetInfo.size, - fileName: pdfAssetInfo.fileName, - }, - txn, - ); - assetDeletionTasks.push(silentDeleteAsset(userId, oldPdfAssetId)); - } - if (imageAssetInfo) { - await updateAsset(oldImageAssetId, imageAssetInfo, txn); - assetDeletionTasks.push(silentDeleteAsset(userId, oldImageAssetId)); + newAssetIds.push(pdfAssetInfo.assetId); } + abortSignal.throwIfAborted(); + + const htmlContentAssetInfo = await storeHtmlContent( + readableContent?.content, + userId, + jobId, + ); if (htmlContentAssetInfo.result === "stored") { - await updateAsset( - oldContentAssetId, - { - id: htmlContentAssetInfo.assetId, - bookmarkId, - userId, - assetType: AssetTypes.LINK_HTML_CONTENT, - contentType: ASSET_TYPES.TEXT_HTML, - size: htmlContentAssetInfo.size, - fileName: null, - }, - txn, - ); - assetDeletionTasks.push(silentDeleteAsset(userId, oldContentAssetId)); - } else if (oldContentAssetId) { - // Unlink the old content asset - await txn.delete(assets).where(eq(assets.id, oldContentAssetId)); - assetDeletionTasks.push(silentDeleteAsset(userId, oldContentAssetId)); + newAssetIds.push(htmlContentAssetInfo.assetId); } - }); - - // Delete the old assets if any - await Promise.all(assetDeletionTasks); - - return async () => { - if ( - !precrawledArchiveAssetId && - (serverConfig.crawler.fullPageArchive || archiveFullPage) - ) { - const archiveResult = await archiveWebpage( - htmlContent, - browserUrl, + abortSignal.throwIfAborted(); + let imageAssetInfo: DBAssetType | null = null; + if (meta.image) { + const downloaded = await downloadAndStoreImage( + meta.image, userId, jobId, abortSignal, ); + if (downloaded) { + newAssetIds.push(downloaded.assetId); + imageAssetInfo = { + id: downloaded.assetId, + bookmarkId, + userId, + assetType: AssetTypes.LINK_BANNER_IMAGE, + contentType: downloaded.contentType, + size: downloaded.size, + }; + } + } + abortSignal.throwIfAborted(); - if (archiveResult) { - const { - assetId: fullPageArchiveAssetId, - size, - contentType, - } = archiveResult; - - await db.transaction(async (txn) => { - await updateAsset( - oldFullPageArchiveAssetId, - { - id: fullPageArchiveAssetId, - bookmarkId, - userId, - assetType: AssetTypes.LINK_FULL_PAGE_ARCHIVE, - contentType, - size, - fileName: null, - }, - txn, - ); - }); - if (oldFullPageArchiveAssetId) { - await silentDeleteAsset(userId, oldFullPageArchiveAssetId); + // Phase 2: Write content and asset references. + // TODO(important): Restrict the size of content to store + const assetDeletionTasks: Promise[] = []; + const inlineHtmlContent = + htmlContentAssetInfo.result === "store_inline" + ? (readableContent?.content ?? null) + : null; + readableContent = null; + await db.transaction(async (txn) => { + await txn + .update(bookmarkLinks) + .set({ + crawledAt: new Date(), + htmlContent: inlineHtmlContent, + contentAssetId: + htmlContentAssetInfo.result === "stored" + ? htmlContentAssetInfo.assetId + : null, + }) + .where(eq(bookmarkLinks.id, bookmarkId)); + + if (screenshotAssetInfo) { + await updateAsset( + oldScreenshotAssetId, + { + id: screenshotAssetInfo.assetId, + bookmarkId, + userId, + assetType: AssetTypes.LINK_SCREENSHOT, + contentType: screenshotAssetInfo.contentType, + size: screenshotAssetInfo.size, + fileName: screenshotAssetInfo.fileName, + }, + txn, + ); + assetDeletionTasks.push( + silentDeleteAsset(userId, oldScreenshotAssetId), + ); + } + if (pdfAssetInfo) { + await updateAsset( + oldPdfAssetId, + { + id: pdfAssetInfo.assetId, + bookmarkId, + userId, + assetType: AssetTypes.LINK_PDF, + contentType: pdfAssetInfo.contentType, + size: pdfAssetInfo.size, + fileName: pdfAssetInfo.fileName, + }, + txn, + ); + assetDeletionTasks.push(silentDeleteAsset(userId, oldPdfAssetId)); + } + if (imageAssetInfo) { + await updateAsset(oldImageAssetId, imageAssetInfo, txn); + assetDeletionTasks.push(silentDeleteAsset(userId, oldImageAssetId)); + } + if (htmlContentAssetInfo.result === "stored") { + await updateAsset( + oldContentAssetId, + { + id: htmlContentAssetInfo.assetId, + bookmarkId, + userId, + assetType: AssetTypes.LINK_HTML_CONTENT, + contentType: ASSET_TYPES.TEXT_HTML, + size: htmlContentAssetInfo.size, + fileName: null, + }, + txn, + ); + assetDeletionTasks.push( + silentDeleteAsset(userId, oldContentAssetId), + ); + } else if (oldContentAssetId) { + // Unlink the old content asset + await txn.delete(assets).where(eq(assets.id, oldContentAssetId)); + assetDeletionTasks.push( + silentDeleteAsset(userId, oldContentAssetId), + ); + } + }); + + // Delete the old assets if any + await Promise.all(assetDeletionTasks); + + return async () => { + if ( + !precrawledArchiveAssetId && + (serverConfig.crawler.fullPageArchive || archiveFullPage) + ) { + const archiveResult = await archiveWebpage( + htmlContent, + browserUrl, + userId, + jobId, + abortSignal, + ); + + if (archiveResult) { + const { + assetId: fullPageArchiveAssetId, + size, + contentType, + } = archiveResult; + + await db.transaction(async (txn) => { + await updateAsset( + oldFullPageArchiveAssetId, + { + id: fullPageArchiveAssetId, + bookmarkId, + userId, + assetType: AssetTypes.LINK_FULL_PAGE_ARCHIVE, + contentType, + size, + fileName: null, + }, + txn, + ); + }); + if (oldFullPageArchiveAssetId) { + await silentDeleteAsset(userId, oldFullPageArchiveAssetId); + } } } - } - }; + }; + } catch (error) { + logger.error( + `[Crawler][${jobId}] crawlAndParseUrl encountered an error, cleaning up new assets: ${error}`, + ); + // Clean up any assets that were created during this attempt but not committed + await Promise.all( + newAssetIds.map((assetId) => silentDeleteAsset(userId, assetId)), + ); + throw error; + } }, ); } diff --git a/packages/shared/types/rules.ts b/packages/shared/types/rules.ts index fd99c2666..fb8638759 100644 --- a/packages/shared/types/rules.ts +++ b/packages/shared/types/rules.ts @@ -79,6 +79,20 @@ const zBookmarkTypeIsCondition = z.object({ bookmarkType: z.enum(["link", "text", "asset"]), }); +const zBookmarkSourceIsCondition = z.object({ + type: z.literal("bookmarkSourceIs"), + source: z.enum([ + "api", + "web", + "cli", + "mobile", + "extension", + "singlefile", + "rss", + "import", + ]), +}); + const zHasTagCondition = z.object({ type: z.literal("hasTag"), tagId: z.string(), @@ -100,6 +114,7 @@ const nonRecursiveCondition = z.discriminatedUnion("type", [ zTitleDoesNotContainCondition, zImportedFromFeedCondition, zBookmarkTypeIsCondition, + zBookmarkSourceIsCondition, zHasTagCondition, zIsFavouritedCondition, zIsArchivedCondition, @@ -121,6 +136,7 @@ export const zRuleEngineConditionSchema: z.ZodType = zTitleDoesNotContainCondition, zImportedFromFeedCondition, zBookmarkTypeIsCondition, + zBookmarkSourceIsCondition, zHasTagCondition, zIsFavouritedCondition, zIsArchivedCondition, @@ -244,6 +260,7 @@ const ruleValidaitorFn = ( switch (condition.type) { case "alwaysTrue": case "bookmarkTypeIs": + case "bookmarkSourceIs": case "isFavourited": case "isArchived": return true; diff --git a/packages/trpc/lib/ruleEngine.ts b/packages/trpc/lib/ruleEngine.ts index acfd747ee..66b845089 100644 --- a/packages/trpc/lib/ruleEngine.ts +++ b/packages/trpc/lib/ruleEngine.ts @@ -118,6 +118,9 @@ export class RuleEngine { case "bookmarkTypeIs": { return this.bookmark.type === condition.bookmarkType; } + case "bookmarkSourceIs": { + return this.bookmark.source === condition.source; + } case "hasTag": { return this.bookmark.tagsOnBookmarks.some( (t) => t.tagId === condition.tagId,