diff --git a/package.json b/package.json index 0939c02b..b2934fdc 100644 --- a/package.json +++ b/package.json @@ -113,7 +113,8 @@ "undici": "^5.28.5", "vaul": "^1.1.2", "vercel": "^44.7.3", - "zod": "^4.0.17" + "zod": "^4.0.17", + "cheerio": "^1.1.2" }, "devDependencies": { "@eslint/js": "^9.33.0", diff --git a/src/lib/search-service.ts b/src/lib/search-service.ts index 8b5d9157..49f6c9e9 100644 --- a/src/lib/search-service.ts +++ b/src/lib/search-service.ts @@ -1,7 +1,7 @@ import * as Sentry from '@sentry/react'; import { withTimeout } from './ai-utils'; import { createTokenBucketRateLimiter } from './rate-limiter'; - +import * as cheerio from 'cheerio'; const { logger } = Sentry; // Simple in-memory rate limiter per session (client-side) @@ -316,12 +316,14 @@ export class BraveSearchService { } private extractTextContent(html: string): string { - return html - .replace(/]*>.*?<\/script>/gi, '') - .replace(/]*>.*?<\/style>/gi, '') - .replace(/<[^>]*>/g, ' ') - .replace(/\s+/g, ' ') - .trim(); + // Use Cheerio to parse HTML and remove