
Commit cc2a08e

Add Firecrawl Tools For The New `arcade_web` Toolkit (#110)
# PR Description

This PR adds 6 new tools inside the new `arcade_web` toolkit. None of these tools require auth. They do, however, require the `FIRECRAWL_API_KEY` API key to be set. The new tools implement the [Firecrawl](https://www.firecrawl.dev/) APIs `/scrape (POST)`, `/crawl (POST)`, `/crawl/{id} (GET)`, `/crawl/{id} (DELETE)`, and `/map (POST)`.

The six tools are:

* `Web.ScrapeUrl`
  - In the future I would like this tool to support actions (clicking, scrolling, screenshotting, etc.) and extract (specify what you want to scrape) parameters. Firecrawl supports both of these parameters.
* `Web.CrawlWebsite`
  - If `async_crawl` is true, then the tool just returns the ID of the crawl job, which you can retrieve later with the `Web.GetCrawlData` tool. If `async_crawl` is false, then the entire contents of the crawl are returned. (A usage sketch of this workflow follows the list.)
* `Web.GetCrawlStatus`
  - Works for in-progress or recently finished crawl jobs (Firecrawl's limitation).
* `Web.GetCrawlData`
  - Works for in-progress or recently finished crawl jobs (Firecrawl's limitation).
* `Web.CancelCrawl`
  - You can cancel an in-progress async crawl job.
* `Web.MapWebsite`
  - This endpoint is in alpha, but it can give you all of the links of an entire website, or optionally, you can specify in natural language what type of links you want to map by using the `search` parameter. For example, "only map webpages that are about AI".
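Below is a minimal sketch of the asynchronous crawl workflow described above. It is illustrative only: it calls the new Python functions from this PR directly rather than going through the Arcade engine (which exposes them as the `Web.*` tools), assumes the `@tool`-decorated coroutines can be awaited as plain async functions, assumes the module path `arcade_web.tools.firecrawl`, and requires `FIRECRAWL_API_KEY` to be set in the environment. `https://example.com` is a placeholder.

```python
import asyncio

# Module path assumed from this PR's toolkit layout.
from arcade_web.tools.firecrawl import (
    cancel_crawl,
    crawl_website,
    get_crawl_data,
    get_crawl_status,
)


async def main() -> None:
    # Start an asynchronous crawl; only the job ID comes back (the non-clickable
    # "url" field is stripped by the tool).
    job = await crawl_website("https://example.com", limit=5, async_crawl=True)
    crawl_id = job["id"]

    # Check how the job is doing, then fetch whatever pages have been crawled so far.
    print(await get_crawl_status(crawl_id))
    print(await get_crawl_data(crawl_id))

    # An in-progress async crawl can also be cancelled.
    print(await cancel_crawl(crawl_id))


asyncio.run(main())
```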
1 parent 1c6e3f4 commit cc2a08e

File tree

9 files changed: +557 −0 lines changed

toolkits/web/arcade_web/__init__.py

Whitespace-only changes.

toolkits/web/arcade_web/tools/__init__.py

Whitespace-only changes.
Lines changed: 180 additions & 0 deletions
@@ -0,0 +1,180 @@
```python
from typing import Annotated, Any, Optional

from firecrawl import FirecrawlApp

from arcade.sdk import tool
from arcade_web.tools.models import Formats
from arcade_web.tools.utils import get_secret


# TODO: Support actions. This would enable clicking, scrolling, screenshotting, etc.
# TODO: Support extract.
# TODO: Support headers param?
@tool
async def scrape_url(
    url: Annotated[str, "URL to scrape"],
    formats: Annotated[
        Optional[list[Formats]], "Formats to retrieve. Defaults to ['markdown']."
    ] = None,
    only_main_content: Annotated[
        Optional[bool],
        "Only return the main content of the page excluding headers, navs, footers, etc.",
    ] = True,
    include_tags: Annotated[list[str] | None, "List of tags to include in the output"] = None,
    exclude_tags: Annotated[list[str] | None, "List of tags to exclude from the output"] = None,
    wait_for: Annotated[
        Optional[int],
        "Specify a delay in milliseconds before fetching the content, allowing the page sufficient time to load.",
    ] = 10,
    timeout: Annotated[Optional[int], "Timeout in milliseconds for the request"] = 30000,
) -> Annotated[dict[str, Any], "Scraped data in specified formats"]:
    """Scrape a URL using Firecrawl and return the data in specified formats."""

    api_key = get_secret("FIRECRAWL_API_KEY")

    formats = formats or [Formats.MARKDOWN]

    app = FirecrawlApp(api_key=api_key)
    params = {
        "formats": formats,
        "onlyMainContent": only_main_content,
        "includeTags": include_tags or [],
        "excludeTags": exclude_tags or [],
        "waitFor": wait_for,
        "timeout": timeout,
    }
    response = app.scrape_url(url, params=params)

    return response


# TODO: Support scrapeOptions.
@tool
async def crawl_website(
    url: Annotated[str, "URL to crawl"],
    exclude_paths: Annotated[list[str] | None, "URL patterns to exclude from the crawl"] = None,
    include_paths: Annotated[list[str] | None, "URL patterns to include in the crawl"] = None,
    max_depth: Annotated[int, "Maximum depth to crawl relative to the entered URL"] = 2,
    ignore_sitemap: Annotated[bool, "Ignore the website sitemap when crawling"] = True,
    limit: Annotated[int, "Limit the number of pages to crawl"] = 10,
    allow_backward_links: Annotated[
        bool,
        "Enable navigation to previously linked pages and enable crawling sublinks that are not children of the 'url' input parameter.",
    ] = False,
    allow_external_links: Annotated[bool, "Allow following links to external websites"] = False,
    webhook: Annotated[
        Optional[str],
        "The URL to send a POST request to when the crawl is started, updated and completed.",
    ] = None,
    async_crawl: Annotated[bool, "Run the crawl asynchronously"] = True,
) -> Annotated[dict[str, Any], "Crawl status and data"]:
    """
    Crawl a website using Firecrawl. If the crawl is asynchronous, then returns the crawl ID.
    If the crawl is synchronous, then returns the crawl data.
    """

    api_key = get_secret("FIRECRAWL_API_KEY")

    app = FirecrawlApp(api_key=api_key)
    params = {
        "limit": limit,
        "excludePaths": exclude_paths or [],
        "includePaths": include_paths or [],
        "maxDepth": max_depth,
        "ignoreSitemap": ignore_sitemap,
        "allowBackwardLinks": allow_backward_links,
        "allowExternalLinks": allow_external_links,
    }
    if webhook:
        params["webhook"] = webhook

    if async_crawl:
        response = app.async_crawl_url(url, params=params)
        if (
            "url" in response
        ):  # Url isn't clickable, so removing it since only the ID is needed to check status
            del response["url"]
    else:
        response = app.crawl_url(url, params=params)

    return response


@tool
async def get_crawl_status(
    crawl_id: Annotated[str, "The ID of the crawl job"],
) -> Annotated[dict[str, Any], "Crawl status information"]:
    """
    Get the status of a Firecrawl 'crawl' that is either in progress or recently completed.
    """

    api_key = get_secret("FIRECRAWL_API_KEY")

    app = FirecrawlApp(api_key=api_key)
    crawl_status = app.check_crawl_status(crawl_id)

    if "data" in crawl_status:
        del crawl_status["data"]

    return crawl_status


# TODO: Support responses greater than 10 MB. If the response is greater than 10 MB, then the Firecrawl API response will have a next_url field.
@tool
async def get_crawl_data(
    crawl_id: Annotated[str, "The ID of the crawl job"],
) -> Annotated[dict[str, Any], "Crawl data information"]:
    """
    Get the data of a Firecrawl 'crawl' that is either in progress or recently completed.
    """

    api_key = get_secret("FIRECRAWL_API_KEY")

    app = FirecrawlApp(api_key=api_key)
    crawl_data = app.check_crawl_status(crawl_id)

    return crawl_data


@tool
async def cancel_crawl(
    crawl_id: Annotated[str, "The ID of the asynchronous crawl job to cancel"],
) -> Annotated[dict[str, Any], "Cancellation status information"]:
    """
    Cancel an asynchronous crawl job that is in progress using the Firecrawl API.
    """

    api_key = get_secret("FIRECRAWL_API_KEY")

    app = FirecrawlApp(api_key=api_key)
    cancellation_status = app.cancel_crawl(crawl_id)

    return cancellation_status


@tool
async def map_website(
    url: Annotated[str, "The base URL to start crawling from"],
    search: Annotated[Optional[str], "Search query to use for mapping"] = None,
    ignore_sitemap: Annotated[bool, "Ignore the website sitemap when crawling"] = True,
    include_subdomains: Annotated[bool, "Include subdomains of the website"] = False,
    limit: Annotated[int, "Maximum number of links to return"] = 5000,
) -> Annotated[dict[str, Any], "Website map data"]:
    """
    Map a website from a single URL to a map of the entire website.
    """

    api_key = get_secret("FIRECRAWL_API_KEY")

    app = FirecrawlApp(api_key=api_key)
    params = {
        "ignoreSitemap": ignore_sitemap,
        "includeSubdomains": include_subdomains,
        "limit": limit,
    }
    if search:
        params["search"] = search

    map_result = app.map_url(url, params=params)

    return map_result
```
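For the scrape and map tools, a similar hedged sketch (same assumptions as the crawl example above: direct calls to the decorated coroutines, assumed module path, `FIRECRAWL_API_KEY` set, placeholder URL):

```python
import asyncio

from arcade_web.tools.firecrawl import map_website, scrape_url  # module path assumed
from arcade_web.tools.models import Formats


async def main() -> None:
    # Scrape a single page, asking for both markdown and the page's links.
    page = await scrape_url(
        "https://example.com",
        formats=[Formats.MARKDOWN, Formats.LINKS],
        only_main_content=True,
    )
    print(page)

    # Map a site, filtering links with a natural-language search query.
    site_map = await map_website(
        "https://example.com",
        search="only map webpages that are about AI",
        limit=100,
    )
    print(site_map)


asyncio.run(main())
```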
Lines changed: 11 additions & 0 deletions
@@ -0,0 +1,11 @@
```python
from enum import Enum


# Models and enums for firecrawl web tools
class Formats(str, Enum):
    MARKDOWN = "markdown"
    HTML = "html"
    RAW_HTML = "rawHtml"
    LINKS = "links"
    SCREENSHOT = "screenshot"
    SCREENSHOT_AT_FULL_PAGE = "screenshot@fullPage"
```
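A small illustrative note (not part of the diff): because `Formats` subclasses `str`, its members compare equal to and serialize as their string values, which is why they can be dropped straight into the `formats` list sent to Firecrawl.

```python
assert Formats.MARKDOWN == "markdown"
assert Formats.SCREENSHOT_AT_FULL_PAGE.value == "screenshot@fullPage"
```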
Lines changed: 9 additions & 0 deletions
@@ -0,0 +1,9 @@
```python
import os
from typing import Any, Optional


def get_secret(name: str, default: Optional[Any] = None) -> Any:
    secret = os.getenv(name)
    if secret is None and default is not None:
        return default
    return secret
```
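A short illustration of `get_secret`'s fallback behavior (values are hypothetical): the default is used only when one is supplied, and an unset variable with no default yields `None` rather than raising.

```python
import os

os.environ["FIRECRAWL_API_KEY"] = "fc-example-key"  # hypothetical placeholder value

assert get_secret("FIRECRAWL_API_KEY") == "fc-example-key"
assert get_secret("SOME_UNSET_NAME", default="fallback") == "fallback"
assert get_secret("SOME_UNSET_NAME") is None
```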
