Skip to content

Commit 580b8ba

Browse files
authored
Merge pull request #78 from ScrapeGraphAI/feat/add-wait-ms-parameter
feat: add wait_ms parameter to SmartScraper, Scrape, and Markdownify
2 parents 65b287a + a196d8d commit 580b8ba

File tree

7 files changed

+90 -8 lines changed

scrapegraph-py/scrapegraph_py/async_client.py

Lines changed: 10 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -445,16 +445,17 @@ def new_id(prefix: str) -> str:
445445
return {"status": "mock", "url": url, "method": method, "kwargs": kwargs}
446446

447447
async def markdownify(
448-
self, website_url: str, headers: Optional[dict[str, str]] = None, mock: bool = False, render_heavy_js: bool = False, stealth: bool = False, return_toon: bool = False
448+
self, website_url: str, headers: Optional[dict[str, str]] = None, mock: bool = False, render_heavy_js: bool = False, stealth: bool = False, wait_ms: Optional[int] = None, return_toon: bool = False
449449
):
450450
"""Send a markdownify request
451-
451+
452452
Args:
453453
website_url: The URL to convert to markdown
454454
headers: Optional HTTP headers
455455
mock: Enable mock mode for testing
456456
render_heavy_js: Enable heavy JavaScript rendering
457457
stealth: Enable stealth mode to avoid bot detection
458+
wait_ms: Number of milliseconds to wait before scraping the website
458459
return_toon: If True, return response in TOON format (reduces token usage by 30-60%)
459460
"""
460461
logger.info(f"🔍 Starting markdownify request for {website_url}")
@@ -467,7 +468,7 @@ async def markdownify(
467468
if return_toon:
468469
logger.debug("🎨 TOON format output enabled")
469470

470-
request = MarkdownifyRequest(website_url=website_url, headers=headers, mock=mock, render_heavy_js=render_heavy_js, stealth=stealth)
471+
request = MarkdownifyRequest(website_url=website_url, headers=headers, mock=mock, render_heavy_js=render_heavy_js, stealth=stealth, wait_ms=wait_ms)
471472
logger.debug("✅ Request validation passed")
472473

473474
result = await self._make_request(
@@ -504,6 +505,7 @@ async def scrape(
504505
branding: bool = False,
505506
headers: Optional[dict[str, str]] = None,
506507
stealth: bool = False,
508+
wait_ms: Optional[int] = None,
507509
return_toon: bool = False,
508510
):
509511
"""Send a scrape request to get HTML content from a website
@@ -514,6 +516,7 @@ async def scrape(
514516
branding: Whether to include branding in the response (defaults to False)
515517
headers: Optional headers to send with the request
516518
stealth: Enable stealth mode to avoid bot detection
519+
wait_ms: Number of milliseconds to wait before scraping the website
517520
return_toon: If True, return response in TOON format (reduces token usage by 30-60%)
518521
"""
519522
logger.info(f"🔍 Starting scrape request for {website_url}")
@@ -532,6 +535,7 @@ async def scrape(
532535
branding=branding,
533536
headers=headers,
534537
stealth=stealth,
538+
wait_ms=wait_ms,
535539
)
536540
logger.debug("✅ Request validation passed")
537541

@@ -619,6 +623,7 @@ async def smartscraper(
619623
plain_text: bool = False,
620624
render_heavy_js: bool = False,
621625
stealth: bool = False,
626+
wait_ms: Optional[int] = None,
622627
return_toon: bool = False,
623628
):
624629
"""
@@ -643,6 +648,7 @@ async def smartscraper(
643648
plain_text: Return plain text instead of structured data
644649
render_heavy_js: Enable heavy JavaScript rendering
645650
stealth: Enable stealth mode to avoid bot detection
651+
wait_ms: Number of milliseconds to wait before scraping the website
646652
return_toon: If True, return response in TOON format (reduces token usage by 30-60%)
647653
648654
Returns:
@@ -689,6 +695,7 @@ async def smartscraper(
689695
plain_text=plain_text,
690696
render_heavy_js=render_heavy_js,
691697
stealth=stealth,
698+
wait_ms=wait_ms,
692699
)
693700

694701
logger.debug("✅ Request validation passed")

scrapegraph-py/scrapegraph_py/client.py

Lines changed: 11 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -458,15 +458,16 @@ def new_id(prefix: str) -> str:
458458
# Generic fallback
459459
return {"status": "mock", "url": url, "method": method, "kwargs": kwargs}
460460

461-
def markdownify(self, website_url: str, headers: Optional[dict[str, str]] = None, mock: bool = False, render_heavy_js: bool = False, stealth: bool = False, return_toon: bool = False):
461+
def markdownify(self, website_url: str, headers: Optional[dict[str, str]] = None, mock: bool = False, render_heavy_js: bool = False, stealth: bool = False, wait_ms: Optional[int] = None, return_toon: bool = False):
462462
"""Send a markdownify request
463-
463+
464464
Args:
465465
website_url: The URL to convert to markdown
466466
headers: Optional HTTP headers
467467
mock: Enable mock mode for testing
468468
render_heavy_js: Enable heavy JavaScript rendering
469469
stealth: Enable stealth mode to avoid bot detection
470+
wait_ms: Number of milliseconds to wait before scraping the website
470471
return_toon: If True, return response in TOON format (reduces token usage by 30-60%)
471472
"""
472473
logger.info(f"🔍 Starting markdownify request for {website_url}")
@@ -479,7 +480,7 @@ def markdownify(self, website_url: str, headers: Optional[dict[str, str]] = None
479480
if return_toon:
480481
logger.debug("🎨 TOON format output enabled")
481482

482-
request = MarkdownifyRequest(website_url=website_url, headers=headers, mock=mock, render_heavy_js=render_heavy_js, stealth=stealth)
483+
request = MarkdownifyRequest(website_url=website_url, headers=headers, mock=mock, render_heavy_js=render_heavy_js, stealth=stealth, wait_ms=wait_ms)
483484
logger.debug("✅ Request validation passed")
484485

485486
result = self._make_request(
@@ -515,6 +516,7 @@ def scrape(
515516
headers: Optional[dict[str, str]] = None,
516517
mock:bool=False,
517518
stealth:bool=False,
519+
wait_ms: Optional[int] = None,
518520
return_toon: bool = False,
519521
):
520522
"""Send a scrape request to get HTML content from a website
@@ -526,6 +528,7 @@ def scrape(
526528
headers: Optional headers to send with the request
527529
mock: Enable mock mode for testing
528530
stealth: Enable stealth mode to avoid bot detection
531+
wait_ms: Number of milliseconds to wait before scraping the website
529532
return_toon: If True, return response in TOON format (reduces token usage by 30-60%)
530533
"""
531534
logger.info(f"🔍 Starting scrape request for {website_url}")
@@ -544,7 +547,8 @@ def scrape(
544547
branding=branding,
545548
headers=headers,
546549
mock=mock,
547-
stealth=stealth
550+
stealth=stealth,
551+
wait_ms=wait_ms,
548552
)
549553
logger.debug("✅ Request validation passed")
550554

@@ -631,6 +635,7 @@ def smartscraper(
631635
plain_text: bool = False,
632636
render_heavy_js: bool = False,
633637
stealth: bool = False,
638+
wait_ms: Optional[int] = None,
634639
return_toon: bool = False,
635640
):
636641
"""
@@ -655,6 +660,7 @@ def smartscraper(
655660
plain_text: Return plain text instead of structured data
656661
render_heavy_js: Enable heavy JavaScript rendering
657662
stealth: Enable stealth mode to avoid bot detection
663+
wait_ms: Number of milliseconds to wait before scraping the website
658664
return_toon: If True, return response in TOON format (reduces token usage by 30-60%)
659665
660666
Returns:
@@ -701,6 +707,7 @@ def smartscraper(
701707
plain_text=plain_text,
702708
render_heavy_js=render_heavy_js,
703709
stealth=stealth,
710+
wait_ms=wait_ms,
704711
)
705712
logger.debug("✅ Request validation passed")
706713

scrapegraph-py/scrapegraph_py/models/markdownify.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -46,6 +46,7 @@ class MarkdownifyRequest(BaseModel):
4646
mock: bool = Field(default=False, description="Whether to use mock mode for the request")
4747
render_heavy_js: bool = Field(default=False, description="Whether to render heavy JavaScript on the page")
4848
stealth: bool = Field(default=False, description="Enable stealth mode to avoid bot detection")
49+
wait_ms: Optional[int] = Field(default=None, description="The number of milliseconds to wait before scraping the website")
4950

5051
@model_validator(mode="after")
5152
def validate_url(self) -> "MarkdownifyRequest":

scrapegraph-py/scrapegraph_py/models/scrape.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -54,9 +54,10 @@ class ScrapeRequest(BaseModel):
5454
},
5555
description="Optional headers to send with the request, including cookies "
5656
"and user agent",
57-
),
57+
)
5858
mock: bool = Field(default=False, description="Whether to use mock mode for the request")
5959
stealth: bool = Field(default=False, description="Enable stealth mode to avoid bot detection")
60+
wait_ms: Optional[int] = Field(default=None, description="The number of milliseconds to wait before scraping the website")
6061

6162
@model_validator(mode="after")
6263
def validate_url(self) -> "ScrapeRequest":

scrapegraph-py/scrapegraph_py/models/smartscraper.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -100,6 +100,7 @@ class SmartScraperRequest(BaseModel):
100100
plain_text: bool = Field(default=False, description="Whether to return the result as plain text")
101101
render_heavy_js: bool = Field(default=False, description="Whether to render heavy JavaScript on the page")
102102
stealth: bool = Field(default=False, description="Enable stealth mode to avoid bot detection")
103+
wait_ms: Optional[int] = Field(default=None, description="The number of milliseconds to wait before scraping the website")
103104

104105
@model_validator(mode="after")
105106
def validate_user_prompt(self) -> "SmartScraperRequest":

scrapegraph-py/tests/test_scrape_models.py

Lines changed: 29 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -130,6 +130,35 @@ def test_url_validation_edge_cases(self):
130130
ScrapeRequest(website_url=url)
131131

132132

133+
def test_wait_ms_default(self):
134+
"""Test scrape request wait_ms defaults to None"""
135+
request = ScrapeRequest(website_url="https://example.com")
136+
assert request.wait_ms is None
137+
138+
def test_wait_ms_custom_value(self):
139+
"""Test scrape request with custom wait_ms"""
140+
request = ScrapeRequest(
141+
website_url="https://example.com",
142+
wait_ms=5000,
143+
)
144+
assert request.wait_ms == 5000
145+
146+
def test_wait_ms_serialization(self):
147+
"""Test wait_ms is excluded from serialization when None"""
148+
request = ScrapeRequest(website_url="https://example.com")
149+
data = request.model_dump()
150+
assert "wait_ms" not in data
151+
152+
def test_wait_ms_serialization_with_value(self):
153+
"""Test wait_ms is included in serialization when set"""
154+
request = ScrapeRequest(
155+
website_url="https://example.com",
156+
wait_ms=5000,
157+
)
158+
data = request.model_dump()
159+
assert data["wait_ms"] == 5000
160+
161+
133162
class TestGetScrapeRequest:
134163
"""Test GetScrapeRequest model"""
135164

scrapegraph-py/tests/test_smartscraper_models.py

Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -95,6 +95,42 @@ def test_serialization_include_all(self):
9595
data = request.model_dump(exclude_none=False)
9696
assert data["render_heavy_js"] is False
9797

98+
def test_wait_ms_default(self):
99+
"""Test smartscraper request wait_ms defaults to None"""
100+
request = SmartScraperRequest(
101+
user_prompt="Extract data",
102+
website_url="https://example.com",
103+
)
104+
assert request.wait_ms is None
105+
106+
def test_wait_ms_custom_value(self):
107+
"""Test smartscraper request with custom wait_ms"""
108+
request = SmartScraperRequest(
109+
user_prompt="Extract data",
110+
website_url="https://example.com",
111+
wait_ms=5000,
112+
)
113+
assert request.wait_ms == 5000
114+
115+
def test_wait_ms_serialization(self):
116+
"""Test wait_ms is excluded from serialization when None"""
117+
request = SmartScraperRequest(
118+
user_prompt="Extract data",
119+
website_url="https://example.com",
120+
)
121+
data = request.model_dump()
122+
assert "wait_ms" not in data
123+
124+
def test_wait_ms_serialization_with_value(self):
125+
"""Test wait_ms is included in serialization when set"""
126+
request = SmartScraperRequest(
127+
user_prompt="Extract data",
128+
website_url="https://example.com",
129+
wait_ms=5000,
130+
)
131+
data = request.model_dump()
132+
assert data["wait_ms"] == 5000
133+
98134
def test_invalid_empty_prompt(self):
99135
"""Test smartscraper request with empty prompt"""
100136
with pytest.raises(ValidationError):

0 commit comments

Comments
 (0)