
Commit 520920b

Merge pull request #63 from ScrapeGraphAI/add-docstring
feat: add docstring
2 parents: 447fdd8 + 8984f4b

17 files changed: +617 additions, −22 deletions

scrapegraph-py/scrapegraph_py/__init__.py

Lines changed: 48 additions & 0 deletions
@@ -1,3 +1,51 @@
+"""
+ScrapeGraphAI Python SDK
+
+A comprehensive Python SDK for the ScrapeGraphAI API, providing both synchronous
+and asynchronous clients for all API endpoints.
+
+Main Features:
+- SmartScraper: AI-powered web scraping with structured data extraction
+- SearchScraper: Web research across multiple sources
+- Agentic Scraper: Automated browser interactions and form filling
+- Crawl: Website crawling with AI extraction or markdown conversion
+- Markdownify: Convert web pages to clean markdown
+- Schema Generation: AI-assisted schema creation for data extraction
+- Scheduled Jobs: Automate recurring scraping tasks
+
+Quick Start:
+    >>> from scrapegraph_py import Client
+    >>>
+    >>> # Initialize client from environment variables
+    >>> client = Client.from_env()
+    >>>
+    >>> # Basic scraping
+    >>> result = client.smartscraper(
+    ...     website_url="https://example.com",
+    ...     user_prompt="Extract all product information"
+    ... )
+    >>>
+    >>> # With context manager
+    >>> with Client.from_env() as client:
+    ...     result = client.scrape(website_url="https://example.com")
+
+Async Usage:
+    >>> import asyncio
+    >>> from scrapegraph_py import AsyncClient
+    >>>
+    >>> async def main():
+    ...     async with AsyncClient.from_env() as client:
+    ...         result = await client.smartscraper(
+    ...             website_url="https://example.com",
+    ...             user_prompt="Extract products"
+    ...         )
+    >>>
+    >>> asyncio.run(main())
+
+For more information visit: https://scrapegraphai.com
+Documentation: https://docs.scrapegraphai.com
+"""
 from .async_client import AsyncClient
 from .client import Client
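Taken together, the new quick-start docstring and the APIError example added later in this commit suggest a minimal end-to-end script. A sketch using only calls shown in this diff; the environment variable name is an assumption, since the diff only says "from environment variables":

from scrapegraph_py import Client
from scrapegraph_py.exceptions import APIError

try:
    # from_env() reads the API key from the environment
    # (variable name assumed, e.g. SGAI_API_KEY)
    with Client.from_env() as client:
        result = client.smartscraper(
            website_url="https://example.com",
            user_prompt="Extract all product information",
        )
        print(result)  # parsed JSON; exact shape depends on the prompt
except APIError as e:
    print(f"API error {e.status_code}: {e.message}")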

scrapegraph-py/scrapegraph_py/async_client.py

Lines changed: 78 additions & 1 deletion
@@ -1,3 +1,38 @@
+"""
+Asynchronous HTTP client for the ScrapeGraphAI API.
+
+This module provides an asynchronous client for interacting with all ScrapeGraphAI
+API endpoints including smartscraper, searchscraper, crawl, agentic scraper,
+markdownify, schema generation, scheduled jobs, and utility functions.
+
+The AsyncClient class supports:
+- API key authentication
+- SSL verification configuration
+- Request timeout configuration
+- Automatic retry logic with exponential backoff
+- Mock mode for testing
+- Async context manager support for proper resource cleanup
+- Concurrent requests using asyncio
+
+Example:
+    Basic usage with environment variables:
+
+    >>> import asyncio
+    >>> from scrapegraph_py import AsyncClient
+    >>> async def main():
+    ...     client = AsyncClient.from_env()
+    ...     result = await client.smartscraper(
+    ...         website_url="https://example.com",
+    ...         user_prompt="Extract product information"
+    ...     )
+    ...     await client.close()
+    >>> asyncio.run(main())
+
+    Using async context manager:
+
+    >>> async def main():
+    ...     async with AsyncClient(api_key="sgai-...") as client:
+    ...         result = await client.scrape(website_url="https://example.com")
+    >>> asyncio.run(main())
+"""
 import asyncio
 from typing import Any, Dict, Optional, Callable

@@ -45,6 +80,30 @@
 class AsyncClient:
+    """
+    Asynchronous client for the ScrapeGraphAI API.
+
+    This class provides asynchronous methods for all ScrapeGraphAI API endpoints.
+    It handles authentication, request management, error handling, and supports
+    mock mode for testing. Uses aiohttp for efficient async HTTP requests.
+
+    Attributes:
+        api_key (str): The API key for authentication
+        headers (dict): Default headers including API key
+        timeout (ClientTimeout): Request timeout configuration
+        max_retries (int): Maximum number of retry attempts
+        retry_delay (float): Base delay between retries in seconds
+        mock (bool): Whether mock mode is enabled
+        session (ClientSession): aiohttp session for connection pooling
+
+    Example:
+        >>> async def example():
+        ...     async with AsyncClient.from_env() as client:
+        ...         result = await client.smartscraper(
+        ...             website_url="https://example.com",
+        ...             user_prompt="Extract all products"
+        ...         )
+    """
     @classmethod
     def from_env(
         cls,

@@ -145,7 +204,25 @@ def __init__(
         logger.info("✅ AsyncClient initialized successfully")

     async def _make_request(self, method: str, url: str, **kwargs) -> Any:
-        """Make HTTP request with retry logic."""
+        """
+        Make asynchronous HTTP request with retry logic and error handling.
+
+        Args:
+            method: HTTP method (GET, POST, etc.)
+            url: Full URL for the request
+            **kwargs: Additional arguments to pass to aiohttp
+
+        Returns:
+            Parsed JSON response data
+
+        Raises:
+            APIError: If the API returns an error response
+            ConnectionError: If unable to connect after all retries
+
+        Note:
+            In mock mode, this method returns deterministic responses without
+            making actual HTTP requests.
+        """
         # Short-circuit when mock mode is enabled
         if getattr(self, "mock", False):
             return self._mock_response(method, url, **kwargs)
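The module docstring lists "Concurrent requests using asyncio" but shows no example of it. A minimal sketch of fanning several smartscraper calls out with asyncio.gather, using only calls confirmed in this diff; the URL list is illustrative:

import asyncio
from scrapegraph_py import AsyncClient

async def main():
    urls = ["https://example.com/a", "https://example.com/b"]
    async with AsyncClient.from_env() as client:
        # All requests share one aiohttp session and run concurrently
        results = await asyncio.gather(*(
            client.smartscraper(
                website_url=url,
                user_prompt="Extract product information",
            )
            for url in urls
        ))
    for url, result in zip(urls, results):
        print(url, result)

asyncio.run(main())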

scrapegraph-py/scrapegraph_py/client.py

Lines changed: 70 additions & 2 deletions
@@ -1,4 +1,31 @@
-# Client implementation goes here
+"""
+Synchronous HTTP client for the ScrapeGraphAI API.
+
+This module provides a synchronous client for interacting with all ScrapeGraphAI
+API endpoints including smartscraper, searchscraper, crawl, agentic scraper,
+markdownify, schema generation, scheduled jobs, and utility functions.
+
+The Client class supports:
+- API key authentication
+- SSL verification configuration
+- Request timeout configuration
+- Automatic retry logic with exponential backoff
+- Mock mode for testing
+- Context manager support for proper resource cleanup
+
+Example:
+    Basic usage with environment variables:
+
+    >>> from scrapegraph_py import Client
+    >>> client = Client.from_env()
+    >>> result = client.smartscraper(
+    ...     website_url="https://example.com",
+    ...     user_prompt="Extract product information"
+    ... )
+
+    Using context manager:
+
+    >>> with Client(api_key="sgai-...") as client:
+    ...     result = client.scrape(website_url="https://example.com")
+"""
 import uuid as _uuid
 from typing import Any, Callable, Dict, Optional
 from urllib.parse import urlparse

@@ -51,6 +78,29 @@
 class Client:
+    """
+    Synchronous client for the ScrapeGraphAI API.
+
+    This class provides synchronous methods for all ScrapeGraphAI API endpoints.
+    It handles authentication, request management, error handling, and supports
+    mock mode for testing.
+
+    Attributes:
+        api_key (str): The API key for authentication
+        headers (dict): Default headers including API key
+        timeout (Optional[float]): Request timeout in seconds
+        max_retries (int): Maximum number of retry attempts
+        retry_delay (float): Delay between retries in seconds
+        mock (bool): Whether mock mode is enabled
+        session (requests.Session): HTTP session for connection pooling
+
+    Example:
+        >>> client = Client.from_env()
+        >>> result = client.smartscraper(
+        ...     website_url="https://example.com",
+        ...     user_prompt="Extract all products"
+        ... )
+    """
     @classmethod
     def from_env(
         cls,

@@ -174,7 +224,25 @@ def __init__(
         logger.info("✅ Client initialized successfully")

     def _make_request(self, method: str, url: str, **kwargs) -> Any:
-        """Make HTTP request with error handling."""
+        """
+        Make HTTP request with error handling and retry logic.
+
+        Args:
+            method: HTTP method (GET, POST, etc.)
+            url: Full URL for the request
+            **kwargs: Additional arguments to pass to requests
+
+        Returns:
+            Parsed JSON response data
+
+        Raises:
+            APIError: If the API returns an error response
+            ConnectionError: If unable to connect to the API
+
+        Note:
+            In mock mode, this method returns deterministic responses without
+            making actual HTTP requests.
+        """
         # Short-circuit when mock mode is enabled
         if getattr(self, "mock", False):
             return self._mock_response(method, url, **kwargs)
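Both _make_request docstrings mention retry logic with exponential backoff without showing it, and the implementation itself is outside this diff. The pattern is roughly the following sketch; request_with_backoff is a hypothetical helper, and max_retries and retry_delay mirror the documented Client attributes:

import time
import requests

def request_with_backoff(session: requests.Session, method: str, url: str,
                         max_retries: int = 3, retry_delay: float = 1.0, **kwargs):
    """Retry a request, doubling the wait after each failed attempt."""
    for attempt in range(max_retries + 1):
        try:
            response = session.request(method, url, **kwargs)
            response.raise_for_status()
            return response.json()
        except requests.RequestException:
            if attempt == max_retries:
                raise ConnectionError(f"Failed to reach {url} after {max_retries} retries")
            # Exponential backoff: wait retry_delay * 2**attempt seconds
            time.sleep(retry_delay * (2 ** attempt))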

scrapegraph-py/scrapegraph_py/config.py

Lines changed: 10 additions & 1 deletion
@@ -1,4 +1,13 @@
-# Configuration and constants
+"""
+Configuration and constants for the ScrapeGraphAI SDK.
+
+This module contains API configuration settings including the base URL
+and default headers used for all API requests.
+
+Attributes:
+    API_BASE_URL (str): Base URL for the ScrapeGraphAI API endpoints
+    DEFAULT_HEADERS (dict): Default HTTP headers for API requests
+"""
 API_BASE_URL = "https://api.scrapegraphai.com/v1"
 DEFAULT_HEADERS = {
     "accept": "application/json",

scrapegraph-py/scrapegraph_py/exceptions.py

Lines changed: 24 additions & 1 deletion
@@ -1,5 +1,28 @@
+"""
+Custom exceptions for the ScrapeGraphAI SDK.
+
+This module defines custom exception classes used throughout the SDK
+for handling API errors and other exceptional conditions.
+"""
+
+
 class APIError(Exception):
-    """Base exception for API errors."""
+    """
+    Exception raised for API errors.
+
+    This exception is raised when the API returns an error response,
+    providing both the error message and HTTP status code for debugging.
+
+    Attributes:
+        message (str): The error message from the API
+        status_code (int): HTTP status code of the error response
+
+    Example:
+        >>> try:
+        ...     client.smartscraper(website_url="invalid")
+        ... except APIError as e:
+        ...     print(f"API error {e.status_code}: {e.message}")
+    """

     def __init__(self, message: str, status_code: int = None):
         self.status_code = status_code
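Because APIError carries both message and status_code, callers can branch on the code, for example to treat rate limiting differently from other failures. A sketch using only the attributes documented above; scrape_once and the 429 handling are illustrative:

from scrapegraph_py import Client
from scrapegraph_py.exceptions import APIError

def scrape_once(client: Client, url: str):
    try:
        return client.smartscraper(
            website_url=url,
            user_prompt="Extract product information",
        )
    except APIError as e:
        if e.status_code == 429:
            # Rate limited: let the caller decide when to retry
            print(f"Rate limited: {e.message}")
            return None
        raise  # re-raise all other API errors unchanged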
