From 3b8ce04c3b09f162b545bc71a95bc6cf01119501 Mon Sep 17 00:00:00 2001
From: mavrick-1
Date: Wed, 17 Sep 2025 15:20:50 +0530
Subject: [PATCH 1/2] Add context limit support for MCP compatibility

- Add maxResponseSize parameter to all tools (scrape, map, search, crawl, check_crawl_status, extract)
- Implement intelligent content truncation in asText function
- Add truncation message when content exceeds specified limit
- Update tool descriptions with usage examples
- Add comprehensive documentation in README
- Maintain full backward compatibility
- Addresses GitHub issue #2165 for bounty
---
 README.md         | 19 +++++++++++++
 package-lock.json | 47 +++++++++++++++++++++++++++++--
 package.json      |  1 +
 src/index.ts      | 72 ++++++++++++++++++++++++++++++++---------------
 4 files changed, 115 insertions(+), 24 deletions(-)

diff --git a/README.md b/README.md
index d9cef77..d271d55 100644
--- a/README.md
+++ b/README.md
@@ -21,6 +21,25 @@ A Model Context Protocol (MCP) server implementation that integrates with [Firec
 - Automatic retries and rate limiting
 - Cloud and self-hosted support
 - SSE support
+- **Context limit support for MCP compatibility**
+
+## Context Limiting for MCP
+
+All tools now support the `maxResponseSize` parameter to limit response size for better MCP compatibility. This is especially useful for large responses that may exceed MCP context limits.
+
+**Example Usage:**
+```json
+{
+  "name": "firecrawl_scrape",
+  "arguments": {
+    "url": "https://example.com",
+    "formats": ["markdown"],
+    "maxResponseSize": 50000
+  }
+}
+```
+
+When the response exceeds the specified limit, content will be truncated with a clear message indicating truncation occurred. This parameter is optional and preserves full backward compatibility.
 
 > Play around with [our MCP Server on MCP.so's playground](https://mcp.so/playground?server=firecrawl-mcp-server) or on [Klavis AI](https://www.klavis.ai/mcp-servers).
 
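The README hunk above shows the request payload but describes the truncation outcome only in prose. As a rough sketch of how the new parameter rides along with an ordinary MCP `tools/call` request, consider the following TypeScript snippet; the `ScrapeToolCallParams` type, the `params` variable, and the surrounding transport are illustrative assumptions, not part of this patch.

```typescript
// Hypothetical typing for the tools/call params shown in the README example above.
// Only maxResponseSize is new in this patch; the rest is existing tool input.
interface ScrapeToolCallParams {
  name: 'firecrawl_scrape';
  arguments: {
    url: string;
    formats?: string[];
    maxResponseSize?: number; // cap, in characters, applied to returned content
  };
}

const params: ScrapeToolCallParams = {
  name: 'firecrawl_scrape',
  arguments: {
    url: 'https://example.com',
    formats: ['markdown'],
    // Responses larger than this come back truncated, ending with the
    // "[Content truncated due to size limit. ...]" marker described above.
    maxResponseSize: 50000,
  },
};

// Whatever MCP client is in use sends this as the params of a JSON-RPC "tools/call" request.
console.log(JSON.stringify({ method: 'tools/call', params }, null, 2));
```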
diff --git a/package-lock.json b/package-lock.json index 7382b08..908493d 100644 --- a/package-lock.json +++ b/package-lock.json @@ -1,17 +1,18 @@ { "name": "firecrawl-mcp", - "version": "3.1.9", + "version": "3.2.0", "lockfileVersion": 3, "requires": true, "packages": { "": { "name": "firecrawl-mcp", - "version": "3.1.9", + "version": "3.2.0", "license": "MIT", "dependencies": { "@mendable/firecrawl-js": "^4.3.4", "dotenv": "^17.2.2", "firecrawl-fastmcp": "^1.0.2", + "node-fetch": "^2.7.0", "typescript": "^5.9.2", "zod": "^4.1.5" }, @@ -1223,6 +1224,26 @@ "node": ">= 0.6" } }, + "node_modules/node-fetch": { + "version": "2.7.0", + "resolved": "https://registry.npmjs.org/node-fetch/-/node-fetch-2.7.0.tgz", + "integrity": "sha512-c4FRfUm/dbcWZ7U+1Wq0AwCyFL+3nt2bEw05wfxSz+DWpWsitgmSgYmy2dQdWyKC1694ELPqMs/YzUSNozLt8A==", + "license": "MIT", + "dependencies": { + "whatwg-url": "^5.0.0" + }, + "engines": { + "node": "4.x || >=6.0.0" + }, + "peerDependencies": { + "encoding": "^0.1.0" + }, + "peerDependenciesMeta": { + "encoding": { + "optional": true + } + } + }, "node_modules/npm-run-path": { "version": "6.0.0", "resolved": "https://registry.npmjs.org/npm-run-path/-/npm-run-path-6.0.0.tgz", @@ -1732,6 +1753,12 @@ "url": "https://github.com/sponsors/Borewit" } }, + "node_modules/tr46": { + "version": "0.0.3", + "resolved": "https://registry.npmjs.org/tr46/-/tr46-0.0.3.tgz", + "integrity": "sha512-N3WMsuqV66lT30CrXNbEjx4GEwlow3v6rr4mCcv6prnfwhS01rkgyFdjPNBYd9br7LpXV1+Emh01fHnq2Gdgrw==", + "license": "MIT" + }, "node_modules/type-is": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/type-is/-/type-is-2.0.1.tgz", @@ -1841,6 +1868,22 @@ "node": ">= 0.8" } }, + "node_modules/webidl-conversions": { + "version": "3.0.1", + "resolved": "https://registry.npmjs.org/webidl-conversions/-/webidl-conversions-3.0.1.tgz", + "integrity": "sha512-2JAn3z8AR6rjK8Sm8orRC0h/bcl/DqL7tRPdGZ4I1CjdF+EaMLmYxBHyXuKL849eucPFhvBoxMsflfOb8kxaeQ==", + "license": "BSD-2-Clause" + }, + "node_modules/whatwg-url": { + "version": "5.0.0", + "resolved": "https://registry.npmjs.org/whatwg-url/-/whatwg-url-5.0.0.tgz", + "integrity": "sha512-saE57nupxk6v3HY35+jzBwYa0rKSy0XR8JSxZPwgLr7ys0IBzhGviA1/TUGJLmSVqs8pb9AnvICXEuOHLprYTw==", + "license": "MIT", + "dependencies": { + "tr46": "~0.0.3", + "webidl-conversions": "^3.0.0" + } + }, "node_modules/which": { "version": "2.0.2", "resolved": "https://registry.npmjs.org/which/-/which-2.0.2.tgz", diff --git a/package.json b/package.json index 2c9be2d..0e549ac 100644 --- a/package.json +++ b/package.json @@ -30,6 +30,7 @@ "@mendable/firecrawl-js": "^4.3.4", "dotenv": "^17.2.2", "firecrawl-fastmcp": "^1.0.2", + "node-fetch": "^2.7.0", "typescript": "^5.9.2", "zod": "^4.1.5" }, diff --git a/src/index.ts b/src/index.ts index 40116bf..40ef39b 100644 --- a/src/index.ts +++ b/src/index.ts @@ -150,8 +150,15 @@ function getClient(session?: SessionData): FirecrawlApp { return createClient(session?.firecrawlApiKey); } -function asText(data: unknown): string { - return JSON.stringify(data, null, 2); +function asText(data: unknown, maxResponseSize?: number): string { + const text = JSON.stringify(data, null, 2); + + if (maxResponseSize && maxResponseSize > 0 && text.length > maxResponseSize) { + const truncatedText = text.substring(0, maxResponseSize - 100); // Reserve space for truncation message + return truncatedText + '\n\n[Content truncated due to size limit. 
Increase maxResponseSize parameter to see full content.]'; + } + + return text; } // scrape tool (v2 semantics, minimal args) @@ -225,12 +232,13 @@ const scrapeParamsSchema = z.object({ .optional(), storeInCache: z.boolean().optional(), maxAge: z.number().optional(), + maxResponseSize: z.number().optional(), }); server.addTool({ name: 'firecrawl_scrape', description: ` -Scrape content from a single URL with advanced options. +Scrape content from a single URL with advanced options. This is the most powerful, fastest and most reliable scraper tool, if available you should always default to using this tool for any web scraping needs. **Best for:** Single page content extraction, when you know exactly which page contains the information. @@ -244,11 +252,13 @@ This is the most powerful, fastest and most reliable scraper tool, if available "arguments": { "url": "https://example.com", "formats": ["markdown"], - "maxAge": 172800000 + "maxAge": 172800000, + "maxResponseSize": 50000 } } \`\`\` **Performance:** Add maxAge parameter for 500% faster scrapes using cached data. +**Context Limiting:** Use maxResponseSize parameter to limit response size for MCP compatibility (e.g., 50000 characters). **Returns:** Markdown, HTML, or other formats as specified. `, parameters: scrapeParamsSchema, @@ -256,12 +266,12 @@ This is the most powerful, fastest and most reliable scraper tool, if available args: unknown, { session, log }: { session?: SessionData; log: Logger } ): Promise => { - const { url, ...options } = args as { url: string } & Record; + const { url, maxResponseSize, ...options } = args as { url: string; maxResponseSize?: number } & Record; const client = getClient(session); const cleaned = removeEmptyTopLevel(options as Record); log.info('Scraping URL', { url: String(url) }); const res = await client.scrape(String(url), { ...cleaned, origin: ORIGIN } as any); - return asText(res); + return asText(res, maxResponseSize); }, }); @@ -273,13 +283,15 @@ Map a website to discover all indexed URLs on the site. **Best for:** Discovering URLs on a website before deciding what to scrape; finding specific sections of a website. **Not recommended for:** When you already know which specific URL you need (use scrape or batch_scrape); when you need the content of the pages (use scrape after mapping). **Common mistakes:** Using crawl to discover URLs instead of map. +**Context Limiting:** Use maxResponseSize parameter to limit response size for MCP compatibility. **Prompt Example:** "List all URLs on example.com." **Usage Example:** \`\`\`json { "name": "firecrawl_map", "arguments": { - "url": "https://example.com" + "url": "https://example.com", + "maxResponseSize": 50000 } } \`\`\` @@ -292,17 +304,18 @@ Map a website to discover all indexed URLs on the site. 
includeSubdomains: z.boolean().optional(), limit: z.number().optional(), ignoreQueryParameters: z.boolean().optional(), + maxResponseSize: z.number().optional(), }), execute: async ( args: unknown, { session, log }: { session?: SessionData; log: Logger } ): Promise => { - const { url, ...options } = args as { url: string } & Record; + const { url, maxResponseSize, ...options } = args as { url: string; maxResponseSize?: number } & Record; const client = getClient(session); const cleaned = removeEmptyTopLevel(options as Record); log.info('Mapping URL', { url: String(url) }); const res = await client.map(String(url), { ...cleaned, origin: ORIGIN } as any); - return asText(res); + return asText(res, maxResponseSize); }, }); @@ -347,10 +360,12 @@ Search the web and optionally extract content from search results. This is the m "scrapeOptions": { "formats": ["markdown"], "onlyMainContent": true - } + }, + "maxResponseSize": 50000 } } \`\`\` +**Context Limiting:** Use maxResponseSize parameter to limit response size for MCP compatibility. **Returns:** Array of search results (with optional scraped content). `, parameters: z.object({ @@ -363,20 +378,21 @@ Search the web and optionally extract content from search results. This is the m .array(z.object({ type: z.enum(['web', 'images', 'news']) })) .optional(), scrapeOptions: scrapeParamsSchema.omit({ url: true }).partial().optional(), + maxResponseSize: z.number().optional(), }), execute: async ( args: unknown, { session, log }: { session?: SessionData; log: Logger } ): Promise => { const client = getClient(session); - const { query, ...opts } = args as Record; + const { query, maxResponseSize, ...opts } = args as { query: string; maxResponseSize?: number } & Record; const cleaned = removeEmptyTopLevel(opts as Record); log.info('Searching', { query: String(query) }); const res = await client.search(query as string, { ...(cleaned as any), origin: ORIGIN, }); - return asText(res); + return asText(res, maxResponseSize); }, }); @@ -400,10 +416,12 @@ server.addTool({ "limit": 20, "allowExternalLinks": false, "deduplicateSimilarURLs": true, - "sitemap": "include" + "sitemap": "include", + "maxResponseSize": 50000 } } \`\`\` + **Context Limiting:** Use maxResponseSize parameter to limit response size for MCP compatibility. **Returns:** Operation ID for status checking; use firecrawl_check_crawl_status to check progress. `, parameters: z.object({ @@ -431,9 +449,10 @@ server.addTool({ deduplicateSimilarURLs: z.boolean().optional(), ignoreQueryParameters: z.boolean().optional(), scrapeOptions: scrapeParamsSchema.omit({ url: true }).partial().optional(), + maxResponseSize: z.number().optional(), }), execute: async (args, { session, log }) => { - const { url, ...options } = args as Record; + const { url, maxResponseSize, ...options } = args as { url: string; maxResponseSize?: number } & Record; const client = getClient(session); const cleaned = removeEmptyTopLevel(options as Record); log.info('Starting crawl', { url: String(url) }); @@ -441,7 +460,7 @@ server.addTool({ ...(cleaned as any), origin: ORIGIN, }); - return asText(res); + return asText(res, maxResponseSize); }, }); @@ -455,20 +474,26 @@ Check the status of a crawl job. { "name": "firecrawl_check_crawl_status", "arguments": { - "id": "550e8400-e29b-41d4-a716-446655440000" + "id": "550e8400-e29b-41d4-a716-446655440000", + "maxResponseSize": 50000 } } \`\`\` +**Context Limiting:** Use maxResponseSize parameter to limit response size for MCP compatibility. 
 **Returns:** Status and progress of the crawl job, including results if available.
 `,
-  parameters: z.object({ id: z.string() }),
+  parameters: z.object({
+    id: z.string(),
+    maxResponseSize: z.number().optional(),
+  }),
   execute: async (
     args: unknown,
     { session }: { session?: SessionData }
   ): Promise<string> => {
+    const { id, maxResponseSize } = args as { id: string; maxResponseSize?: number };
     const client = getClient(session);
-    const res = await client.getCrawlStatus((args as any).id as string);
-    return asText(res);
+    const res = await client.getCrawlStatus(id);
+    return asText(res, maxResponseSize);
   },
 });
@@ -505,10 +530,12 @@ Extract structured information from web pages using LLM capabilities. Supports b
     },
     "allowExternalLinks": false,
     "enableWebSearch": false,
-    "includeSubdomains": false
+    "includeSubdomains": false,
+    "maxResponseSize": 50000
   }
 }
 \`\`\`
+**Context Limiting:** Use maxResponseSize parameter to limit response size for MCP compatibility.
 **Returns:** Extracted structured data as defined by your schema.
 `,
   parameters: z.object({
@@ -518,13 +545,14 @@ Extract structured information from web pages using LLM capabilities. Supports b
     allowExternalLinks: z.boolean().optional(),
     enableWebSearch: z.boolean().optional(),
     includeSubdomains: z.boolean().optional(),
+    maxResponseSize: z.number().optional(),
   }),
   execute: async (
     args: unknown,
     { session, log }: { session?: SessionData; log: Logger }
   ): Promise<string> => {
     const client = getClient(session);
-    const a = args as Record<string, unknown>;
+    const a = args as { maxResponseSize?: number } & Record<string, unknown>;
     log.info('Extracting from URLs', {
       count: Array.isArray(a.urls) ? a.urls.length : 0,
     });
@@ -538,7 +566,7 @@ Extract structured information from web pages using LLM capabilities. Supports b
       origin: ORIGIN,
     });
     const res = await client.extract(extractBody as any);
-    return asText(res);
+    return asText(res, a.maxResponseSize);
   },
 });
 const PORT = Number(process.env.PORT || 3000);

From f5a4561081f910a77c990c231e5204733707e652 Mon Sep 17 00:00:00 2001
From: mavrick-1
Date: Fri, 26 Sep 2025 15:56:25 +0530
Subject: [PATCH 2/2] Fix: Return valid JSON with truncated content fields

- Fixed JSON structure preservation when using maxResponseSize
- Now truncates only content fields (markdown, html, etc.)
  instead of breaking JSON
- Maintains all metadata, links, and structural data
- Adds clear truncation message within content fields
- Ensures MCP compatibility with proper JSON responses
---
 src/index.ts | 38 +++++++++++++++++++++++++++++-----
 1 file changed, 33 insertions(+), 5 deletions(-)

diff --git a/src/index.ts b/src/index.ts
index 40ef39b..f250925 100644
--- a/src/index.ts
+++ b/src/index.ts
@@ -151,14 +151,42 @@ function getClient(session?: SessionData): FirecrawlApp {
 }
 
 function asText(data: unknown, maxResponseSize?: number): string {
-  const text = JSON.stringify(data, null, 2);
+  // If no size limit, return full JSON
+  if (!maxResponseSize || maxResponseSize <= 0) {
+    return JSON.stringify(data, null, 2);
+  }
+
+  // Deep clone to avoid modifying original data
+  const dataCopy = JSON.parse(JSON.stringify(data));
+
+  // Function to recursively truncate markdown/content fields
+  function truncateMarkdown(obj: any): void {
+    if (!obj || typeof obj !== 'object') return;
+
+    if (Array.isArray(obj)) {
+      obj.forEach(truncateMarkdown);
+    } else {
+      // Truncate markdown and other content fields if they exist
+      const contentFields = ['markdown', 'html', 'rawHtml', 'content', 'text'];
 
-  if (maxResponseSize && maxResponseSize > 0 && text.length > maxResponseSize) {
-    const truncatedText = text.substring(0, maxResponseSize - 100); // Reserve space for truncation message
-    return truncatedText + '\n\n[Content truncated due to size limit. Increase maxResponseSize parameter to see full content.]';
+      for (const field of contentFields) {
+        if (obj[field] && typeof obj[field] === 'string' && obj[field].length > maxResponseSize!) {
+          obj[field] = obj[field].substring(0, maxResponseSize! - 100) +
+            '\n\n[Content truncated due to size limit. Increase maxResponseSize parameter to see full content.]';
+        }
+      }
+
+      // Recurse into nested objects
+      for (const key in obj) {
+        if (typeof obj[key] === 'object') {
+          truncateMarkdown(obj[key]);
+        }
+      }
+    }
   }
 
-  return text;
+  truncateMarkdown(dataCopy);
+  return JSON.stringify(dataCopy, null, 2);
 }
 
 // scrape tool (v2 semantics, minimal args)
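To see what the PATCH 2/2 behavior amounts to in practice, here is a minimal standalone sketch that restates the field-level truncation as a pure function and runs it on fabricated data. It mirrors the logic of the new `asText`, but the `truncateContentFields` helper, the sample object, and the non-mutating style are illustrative assumptions rather than the module's actual export.

```typescript
// A simplified, non-mutating restatement of the truncation added in PATCH 2/2:
// only known content fields are shortened, so the surrounding JSON stays intact.
const CONTENT_FIELDS = ['markdown', 'html', 'rawHtml', 'content', 'text'];
const TRUNCATION_NOTE =
  '\n\n[Content truncated due to size limit. Increase maxResponseSize parameter to see full content.]';

function truncateContentFields(value: unknown, maxResponseSize: number): unknown {
  if (Array.isArray(value)) {
    // Arrays (e.g. lists of crawl results) are handled element by element.
    return value.map((item) => truncateContentFields(item, maxResponseSize));
  }
  if (value && typeof value === 'object') {
    const out: Record<string, unknown> = {};
    for (const [key, v] of Object.entries(value as Record<string, unknown>)) {
      if (CONTENT_FIELDS.includes(key) && typeof v === 'string' && v.length > maxResponseSize) {
        // Shorten oversized content and append the marker the patch uses.
        out[key] = v.slice(0, Math.max(0, maxResponseSize - 100)) + TRUNCATION_NOTE;
      } else {
        out[key] = truncateContentFields(v, maxResponseSize);
      }
    }
    return out;
  }
  return value; // numbers, booleans, short strings, null: left untouched
}

// Fabricated example input (not real Firecrawl output): a long markdown body
// next to metadata and links that must survive untouched.
const sample = {
  markdown: 'x'.repeat(5_000),
  metadata: { title: 'Example', sourceURL: 'https://example.com' },
  links: ['https://example.com/a', 'https://example.com/b'],
};

const limited = truncateContentFields(sample, 500);
// Still valid JSON with metadata and links intact; only `markdown` now ends
// with the truncation marker.
console.log(JSON.stringify(limited, null, 2));
```

Because only the known content fields are shortened, the result still round-trips through `JSON.parse`/`JSON.stringify`, which is the MCP-compatibility property the commit message claims.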