Skip to content

Commit fd55fbc

Browse files
VinciGit00 and claude
committed
feat: add webhook_url parameter to crawler endpoint
Add support for webhook notifications when crawl jobs complete. This allows users to receive POST notifications at a specified URL when their crawl job finishes processing.

Changes:
- Python SDK: Added webhook_url to CrawlRequest model with validation
- Python SDK: Updated sync and async client crawl methods
- JavaScript SDK: Added webhookUrl option to crawl function

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 31e9fef commit fd55fbc

File tree

4 files changed

+39
-1
lines changed

4 files changed

+39
-1
lines changed

scrapegraph-js/src/crawl.js

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import { getMockResponse } from './utils/mockResponse.js';
2525
* @param {boolean} [options.stealth=false] - Enable stealth mode to avoid bot detection
2626
* @param {Array<string>} [options.includePaths] - List of path patterns to include (e.g., ['/products/*', '/blog/**']). Supports wildcards: * matches any characters, ** matches any path segments
2727
* @param {Array<string>} [options.excludePaths] - List of path patterns to exclude (e.g., ['/admin/*', '/api/*']). Supports wildcards and takes precedence over includePaths
28+
* @param {string} [options.webhookUrl] - URL to receive webhook notifications when the crawl job completes
2829
* @returns {Promise<Object>} The crawl job response
2930
* @throws {Error} Throws an error if the HTTP request fails
3031
*/
@@ -35,7 +36,7 @@ export async function crawl(
3536
schema,
3637
options = {}
3738
) {
38-
const { mock = null, renderHeavyJs = false, stealth = false, includePaths = null, excludePaths = null } = options;
39+
const { mock = null, renderHeavyJs = false, stealth = false, includePaths = null, excludePaths = null, webhookUrl = null } = options;
3940

4041
// Check if mock mode is enabled
4142
const useMock = mock !== null ? mock : isMockEnabled();
@@ -98,6 +99,10 @@ export async function crawl(
9899
payload.exclude_paths = excludePaths;
99100
}
100101

102+
if (webhookUrl) {
103+
payload.webhook_url = webhookUrl;
104+
}
105+
101106
try {
102107
const response = await axios.post(endpoint, payload, { headers });
103108
return response.data;

scrapegraph-py/scrapegraph_py/async_client.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -864,6 +864,7 @@ async def crawl(
864864
stealth: bool = False,
865865
include_paths: Optional[list[str]] = None,
866866
exclude_paths: Optional[list[str]] = None,
867+
webhook_url: Optional[str] = None,
867868
return_toon: bool = False,
868869
):
869870
"""Send a crawl request with support for both AI extraction and
@@ -887,6 +888,7 @@ async def crawl(
887888
Supports wildcards: * matches any characters, ** matches any path segments
888889
exclude_paths: List of path patterns to exclude (e.g., ['/admin/*', '/api/*'])
889890
Supports wildcards and takes precedence over include_paths
891+
webhook_url: URL to receive webhook notifications when the crawl completes
890892
return_toon: If True, return response in TOON format (reduces token usage by 30-60%)
891893
"""
892894
logger.info("🔍 Starting crawl request")
@@ -916,6 +918,8 @@ async def crawl(
916918
logger.debug(f"✅ Include paths: {include_paths}")
917919
if exclude_paths:
918920
logger.debug(f"❌ Exclude paths: {exclude_paths}")
921+
if webhook_url:
922+
logger.debug(f"🔔 Webhook URL: {webhook_url}")
919923
if return_toon:
920924
logger.debug("🎨 TOON format output enabled")
921925

@@ -945,6 +949,8 @@ async def crawl(
945949
request_data["include_paths"] = include_paths
946950
if exclude_paths is not None:
947951
request_data["exclude_paths"] = exclude_paths
952+
if webhook_url is not None:
953+
request_data["webhook_url"] = webhook_url
948954

949955
request = CrawlRequest(**request_data)
950956
logger.debug("✅ Request validation passed")

scrapegraph-py/scrapegraph_py/client.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -874,6 +874,7 @@ def crawl(
874874
stealth: bool = False,
875875
include_paths: Optional[list[str]] = None,
876876
exclude_paths: Optional[list[str]] = None,
877+
webhook_url: Optional[str] = None,
877878
return_toon: bool = False,
878879
):
879880
"""Send a crawl request with support for both AI extraction and
@@ -897,6 +898,7 @@ def crawl(
897898
Supports wildcards: * matches any characters, ** matches any path segments
898899
exclude_paths: List of path patterns to exclude (e.g., ['/admin/*', '/api/*'])
899900
Supports wildcards and takes precedence over include_paths
901+
webhook_url: URL to receive webhook notifications when the crawl completes
900902
return_toon: If True, return response in TOON format (reduces token usage by 30-60%)
901903
"""
902904
logger.info("🔍 Starting crawl request")
@@ -926,6 +928,8 @@ def crawl(
926928
logger.debug(f"✅ Include paths: {include_paths}")
927929
if exclude_paths:
928930
logger.debug(f"❌ Exclude paths: {exclude_paths}")
931+
if webhook_url:
932+
logger.debug(f"🔔 Webhook URL: {webhook_url}")
929933
if return_toon:
930934
logger.debug("🎨 TOON format output enabled")
931935

@@ -955,6 +959,8 @@ def crawl(
955959
request_data["include_paths"] = include_paths
956960
if exclude_paths is not None:
957961
request_data["exclude_paths"] = exclude_paths
962+
if webhook_url is not None:
963+
request_data["webhook_url"] = webhook_url
958964

959965
request = CrawlRequest(**request_data)
960966
logger.debug("✅ Request validation passed")

scrapegraph-py/scrapegraph_py/models/crawl.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,12 @@ class CrawlRequest(BaseModel):
9494
"Takes precedence over include_paths.",
9595
example=["/admin/*", "/api/**"]
9696
)
97+
webhook_url: Optional[str] = Field(
98+
default=None,
99+
description="URL to receive webhook notifications when the crawl job completes. "
100+
"The webhook will receive a POST request with the crawl results.",
101+
example="https://example.com/webhook"
102+
)
97103

98104
@model_validator(mode="after")
99105
def validate_url(self) -> "CrawlRequest":
@@ -169,6 +175,21 @@ def validate_path_patterns(self) -> "CrawlRequest":
169175

170176
return self
171177

178+
@model_validator(mode="after")
179+
def validate_webhook_url(self) -> "CrawlRequest":
180+
"""Validate webhook URL format if provided"""
181+
if self.webhook_url is not None:
182+
if not self.webhook_url.strip():
183+
raise ValueError("Webhook URL cannot be empty")
184+
if not (
185+
self.webhook_url.startswith("http://")
186+
or self.webhook_url.startswith("https://")
187+
):
188+
raise ValueError(
189+
"Invalid webhook URL - must start with http:// or https://"
190+
)
191+
return self
192+
172193

173194
class GetCrawlRequest(BaseModel):
174195
"""Request model for get_crawl endpoint"""

0 commit comments

Comments (0)