From 173cf6c13d59a130397bb0d0b803d0ff54789e0d Mon Sep 17 00:00:00 2001
From: Quaylyn Rimer
Date: Mon, 4 Aug 2025 15:59:55 -0600
Subject: [PATCH] fix: handle query parameter exclusions that glob patterns cannot match

- Adds transformRequestFunction to enhance exclude pattern matching
- Detects patterns like '**hl=**' and parses URLs to check query parameters
- Fixes issue #179 where URLs with query params like ?hl=de were not excluded
- Maintains backward compatibility with existing exclude patterns
- Uses URL parsing for reliable query parameter detection

Resolves #179
---
 src/core.ts | 30 ++++++++++++++++++++++++++++++
 1 file changed, 30 insertions(+)

diff --git a/src/core.ts b/src/core.ts
index 05a9f8e3..1f182738 100644
--- a/src/core.ts
+++ b/src/core.ts
@@ -97,6 +97,36 @@ export async function crawl(config: Config) {
               typeof config.exclude === "string"
                 ? [config.exclude]
                 : config.exclude ?? [],
+            transformRequestFunction: (req) => {
+              // Globs cannot reliably match query strings, so an exclude
+              // pattern shaped like "**hl=**" or "**?hl=**" also means
+              // "skip every URL that carries this query parameter".
+              const excludePatterns =
+                typeof config.exclude === "string"
+                  ? [config.exclude]
+                  : config.exclude ?? [];
+
+              // Parse the URL once per request rather than once per pattern.
+              let searchParams: URLSearchParams;
+              try {
+                searchParams = new URL(req.url).searchParams;
+              } catch {
+                // Unparseable URL: leave the decision to the normal filters.
+                return req;
+              }
+
+              for (const pattern of excludePatterns) {
+                if (typeof pattern !== "string") {
+                  continue;
+                }
+                // Capture the parameter name between the globstar and "=**".
+                const paramName = /\*\*\??([^=]+)=\*\*/.exec(pattern)?.[1];
+                if (paramName !== undefined && searchParams.has(paramName)) {
+                  return false; // Exclude this URL
+                }
+              }
+              return req;
+            },
           });
         },
         // Comment this option to scrape the full website.