Skip to content

Commit fd55fbc

Browse files
VinciGit00 and claude
committed
feat: add webhook_url parameter to crawler endpoint
Add support for webhook notifications when crawl jobs complete. This allows users to receive POST notifications at a specified URL when their crawl job finishes processing.

Changes:
- Python SDK: Added webhook_url to CrawlRequest model with validation
- Python SDK: Updated sync and async client crawl methods
- JavaScript SDK: Added webhookUrl option to crawl function

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 31e9fef commit fd55fbc

File tree

4 files changed

+39
-1
lines changed

4 files changed

+39
-1
lines changed

scrapegraph-js/src/crawl.js

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ import { getMockResponse } from './utils/mockResponse.js';
2525
* @param {boolean} [options.stealth=false] - Enable stealth mode to avoid bot detection
2626
* @param {Array<string>} [options.includePaths] - List of path patterns to include (e.g., ['/products/*', '/blog/**']). Supports wildcards: * matches any characters, ** matches any path segments
2727
* @param {Array<string>} [options.excludePaths] - List of path patterns to exclude (e.g., ['/admin/*', '/api/*']). Supports wildcards and takes precedence over includePaths
28+
* @param {string} [options.webhookUrl] - URL to receive webhook notifications when the crawl job completes
2829
* @returns {Promise<Object>} The crawl job response
2930
* @throws {Error} Throws an error if the HTTP request fails
3031
*/
@@ -35,7 +36,7 @@ export async function crawl(
3536
schema,
3637
options = {}
3738
) {
38-
const { mock = null, renderHeavyJs = false, stealth = false, includePaths = null, excludePaths = null } = options;
39+
const { mock = null, renderHeavyJs = false, stealth = false, includePaths = null, excludePaths = null, webhookUrl = null } = options;
3940

4041
// Check if mock mode is enabled
4142
const useMock = mock !== null ? mock : isMockEnabled();
@@ -98,6 +99,10 @@ export async function crawl(
9899
payload.exclude_paths = excludePaths;
99100
}
100101

102+
if (webhookUrl) {
103+
payload.webhook_url = webhookUrl;
104+
}
105+
101106
try {
102107
const response = await axios.post(endpoint, payload, { headers });
103108
return response.data;

scrapegraph-py/scrapegraph_py/async_client.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -864,6 +864,7 @@ async def crawl(
864864
stealth: bool = False,
865865
include_paths: Optional[list[str]] = None,
866866
exclude_paths: Optional[list[str]] = None,
867+
webhook_url: Optional[str] = None,
867868
return_toon: bool = False,
868869
):
869870
"""Send a crawl request with support for both AI extraction and
@@ -887,6 +888,7 @@ async def crawl(
887888
Supports wildcards: * matches any characters, ** matches any path segments
888889
exclude_paths: List of path patterns to exclude (e.g., ['/admin/*', '/api/*'])
889890
Supports wildcards and takes precedence over include_paths
891+
webhook_url: URL to receive webhook notifications when the crawl completes
890892
return_toon: If True, return response in TOON format (reduces token usage by 30-60%)
891893
"""
892894
logger.info("🔍 Starting crawl request")
@@ -916,6 +918,8 @@ async def crawl(
916918
logger.debug(f"✅ Include paths: {include_paths}")
917919
if exclude_paths:
918920
logger.debug(f"❌ Exclude paths: {exclude_paths}")
921+
if webhook_url:
922+
logger.debug(f"🔔 Webhook URL: {webhook_url}")
919923
if return_toon:
920924
logger.debug("🎨 TOON format output enabled")
921925

@@ -945,6 +949,8 @@ async def crawl(
945949
request_data["include_paths"] = include_paths
946950
if exclude_paths is not None:
947951
request_data["exclude_paths"] = exclude_paths
952+
if webhook_url is not None:
953+
request_data["webhook_url"] = webhook_url
948954

949955
request = CrawlRequest(**request_data)
950956
logger.debug("✅ Request validation passed")

scrapegraph-py/scrapegraph_py/client.py

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -874,6 +874,7 @@ def crawl(
874874
stealth: bool = False,
875875
include_paths: Optional[list[str]] = None,
876876
exclude_paths: Optional[list[str]] = None,
877+
webhook_url: Optional[str] = None,
877878
return_toon: bool = False,
878879
):
879880
"""Send a crawl request with support for both AI extraction and
@@ -897,6 +898,7 @@ def crawl(
897898
Supports wildcards: * matches any characters, ** matches any path segments
898899
exclude_paths: List of path patterns to exclude (e.g., ['/admin/*', '/api/*'])
899900
Supports wildcards and takes precedence over include_paths
901+
webhook_url: URL to receive webhook notifications when the crawl completes
900902
return_toon: If True, return response in TOON format (reduces token usage by 30-60%)
901903
"""
902904
logger.info("🔍 Starting crawl request")
@@ -926,6 +928,8 @@ def crawl(
926928
logger.debug(f"✅ Include paths: {include_paths}")
927929
if exclude_paths:
928930
logger.debug(f"❌ Exclude paths: {exclude_paths}")
931+
if webhook_url:
932+
logger.debug(f"🔔 Webhook URL: {webhook_url}")
929933
if return_toon:
930934
logger.debug("🎨 TOON format output enabled")
931935

@@ -955,6 +959,8 @@ def crawl(
955959
request_data["include_paths"] = include_paths
956960
if exclude_paths is not None:
957961
request_data["exclude_paths"] = exclude_paths
962+
if webhook_url is not None:
963+
request_data["webhook_url"] = webhook_url
958964

959965
request = CrawlRequest(**request_data)
960966
logger.debug("✅ Request validation passed")

scrapegraph-py/scrapegraph_py/models/crawl.py

Lines changed: 21 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -94,6 +94,12 @@ class CrawlRequest(BaseModel):
9494
"Takes precedence over include_paths.",
9595
example=["/admin/*", "/api/**"]
9696
)
97+
webhook_url: Optional[str] = Field(
98+
default=None,
99+
description="URL to receive webhook notifications when the crawl job completes. "
100+
"The webhook will receive a POST request with the crawl results.",
101+
example="https://example.com/webhook"
102+
)
97103

98104
@model_validator(mode="after")
99105
def validate_url(self) -> "CrawlRequest":
@@ -169,6 +175,21 @@ def validate_path_patterns(self) -> "CrawlRequest":
169175

170176
return self
171177

178+
@model_validator(mode="after")
179+
def validate_webhook_url(self) -> "CrawlRequest":
180+
"""Validate webhook URL format if provided"""
181+
if self.webhook_url is not None:
182+
if not self.webhook_url.strip():
183+
raise ValueError("Webhook URL cannot be empty")
184+
if not (
185+
self.webhook_url.startswith("http://")
186+
or self.webhook_url.startswith("https://")
187+
):
188+
raise ValueError(
189+
"Invalid webhook URL - must start with http:// or https://"
190+
)
191+
return self
192+
172193

173194
class GetCrawlRequest(BaseModel):
174195
"""Request model for get_crawl endpoint"""

0 commit comments

Comments (0)