diff --git a/skills/tinyfish-web-agent/SKILL.md b/skills/tinyfish-web-agent/SKILL.md
new file mode 100644
index 0000000..66a4e9c
--- /dev/null
+++ b/skills/tinyfish-web-agent/SKILL.md
@@ -0,0 +1,106 @@
+---
+name: tinyfish
+description: Use the TinyFish/Mino web agent to scrape websites, extract structured data, and automate browser actions using natural language. Use when you need to extract or scrape data from websites, handle bot-protected sites, or automate web tasks.
+---
+
+# TinyFish Web Agent
+
+Requires: `MINO_API_KEY` environment variable
+
+## Best Practices
+
+1. **Specify JSON format**: Always describe the exact structure you want returned
+2. **Parallel calls**: When extracting from multiple independent sites, make separate parallel calls instead of combining into one prompt
+
+## Basic Extract/Scrape
+
+Extract data from a page. Specify the JSON structure you want:
+
+```python
+import requests
+import json
+import os
+
+response = requests.post(
+    "https://mino.ai/v1/automation/run-sse",
+    headers={
+        "X-API-Key": os.environ["MINO_API_KEY"],
+        "Content-Type": "application/json",
+    },
+    json={
+        "url": "https://example.com",
+        "goal": "Extract product info as JSON: {\"name\": str, \"price\": str, \"in_stock\": bool}",
+    },
+    stream=True,
+)
+
+# The run-sse endpoint streams Server-Sent Events; the final COMPLETE event carries the result
+for line in response.iter_lines():
+    if line:
+        line_str = line.decode("utf-8")
+        if line_str.startswith("data: "):
+            event = json.loads(line_str[6:])
+            if event.get("type") == "COMPLETE" and event.get("status") == "COMPLETED":
+                print(json.dumps(event["resultJson"], indent=2))
+```
+
+## Multiple Items
+
+Extract lists of data with explicit structure. The `json=` snippets in this and the following sections replace the `json=` argument of the basic request above:
+
+```python
+json={
+    "url": "https://example.com/products",
+    "goal": "Extract all products as JSON array: [{\"name\": str, \"price\": str, \"url\": str}]",
+}
+```
+
+## Stealth Mode
+
+For bot-protected sites:
+
+```python
+json={
+    "url": "https://protected-site.com",
+    "goal": "Extract product data as JSON: {\"name\": str, \"price\": str, \"description\": str}",
+    "browser_profile": "stealth",
+}
+```
+
+## Proxy
+
+Route traffic through a specific country:
+
+```python
+json={
+    "url": "https://geo-restricted-site.com",
+    "goal": "Extract pricing data as JSON: {\"item\": str, \"price\": str, \"currency\": str}",
+    "browser_profile": "stealth",
+    "proxy_config": {
+        "enabled": True,
+        "country_code": "US",
+    },
+}
+```
+
+## Output
+
+Results are in `event["resultJson"]` when `event["type"] == "COMPLETE"` and `event["status"] == "COMPLETED"`.
+
+## Parallel Extraction
+
+When extracting from multiple independent sources, make separate parallel API calls instead of combining into one prompt:
+
+**Good** - Parallel calls (`extract` is a helper such as `scripts/extract.py`):
+```python
+# Compare pizza prices - run these simultaneously
+call_1 = extract("https://pizzahut.com", "Extract pizza prices as JSON: [{\"name\": str, \"price\": str}]")
+call_2 = extract("https://dominos.com", "Extract pizza prices as JSON: [{\"name\": str, \"price\": str}]")
+```
+
+**Bad** - Single combined call:
+```python
+# Don't do this - less reliable and slower
+extract("https://pizzahut.com", "Extract prices from Pizza Hut and also go to Dominos...")
+```
+
+Each independent extraction task should be its own API call. This is faster (parallel execution) and more reliable.
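+
+Below is a minimal sketch of actually running the two calls concurrently with the standard-library `concurrent.futures` module. The `extract` helper here just wraps the basic request shown above (a fuller version lives in `scripts/extract.py`); the pizza URLs and goal string are illustrative placeholders:
+
+```python
+import concurrent.futures
+import json
+import os
+
+import requests
+
+
+def extract(url, goal):
+    """Run one extraction and return its resultJson (None if it never completes)."""
+    response = requests.post(
+        "https://mino.ai/v1/automation/run-sse",
+        headers={
+            "X-API-Key": os.environ["MINO_API_KEY"],
+            "Content-Type": "application/json",
+        },
+        json={"url": url, "goal": goal},
+        stream=True,
+    )
+    for line in response.iter_lines():
+        if line:
+            line_str = line.decode("utf-8")
+            if line_str.startswith("data: "):
+                event = json.loads(line_str[6:])
+                if event.get("type") == "COMPLETE" and event.get("status") == "COMPLETED":
+                    return event["resultJson"]
+    return None
+
+
+# Submit both extractions at once; each is an independent API call.
+goal = 'Extract pizza prices as JSON: [{"name": str, "price": str}]'
+urls = ["https://pizzahut.com", "https://dominos.com"]
+with concurrent.futures.ThreadPoolExecutor() as pool:
+    futures = {pool.submit(extract, url, goal): url for url in urls}
+    for future in concurrent.futures.as_completed(futures):
+        print(futures[future], json.dumps(future.result(), indent=2))
+```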
diff --git a/skills/tinyfish-web-agent/scripts/extract.py b/skills/tinyfish-web-agent/scripts/extract.py
new file mode 100644
index 0000000..0238039
--- /dev/null
+++ b/skills/tinyfish-web-agent/scripts/extract.py
@@ -0,0 +1,79 @@
+#!/usr/bin/env python3
+"""
+TinyFish web extract/scrape helper
+
+Usage:
+    extract.py <url> <goal> [--stealth] [--proxy US]
+
+Best practice: Specify the JSON format you want in the goal for better results.
+
+Examples:
+    extract.py "https://example.com" "Extract product as JSON: {\"name\": str, \"price\": str}"
+    extract.py "https://site.com" "Get all links as JSON: [{\"text\": str, \"url\": str}]" --stealth
+    extract.py "https://site.com" "Extract items as JSON: [{\"title\": str, \"price\": str}]" --stealth --proxy US
+"""
+
+import argparse
+import json
+import os
+import sys
+import urllib.request
+
+
+def extract(url, goal, stealth=False, proxy_country=None):
+    """Extract/scrape data from a website using TinyFish."""
+    api_key = os.environ.get("MINO_API_KEY")
+    if not api_key:
+        print("Error: MINO_API_KEY environment variable not set", file=sys.stderr)
+        sys.exit(1)
+
+    payload = {
+        "url": url,
+        "goal": goal,
+    }
+
+    if stealth:
+        payload["browser_profile"] = "stealth"
+
+    if proxy_country:
+        payload["proxy_config"] = {
+            "enabled": True,
+            "country_code": proxy_country,
+        }
+
+    req = urllib.request.Request(
+        "https://mino.ai/v1/automation/run-sse",
+        data=json.dumps(payload).encode(),
+        headers={
+            "X-API-Key": api_key,
+            "Content-Type": "application/json",
+        },
+    )
+
+    print(f"Extracting from {url}...", file=sys.stderr)
+
+    with urllib.request.urlopen(req) as response:
+        for line in response:
+            line_str = line.decode("utf-8").strip()
+            if line_str.startswith("data: "):
+                event = json.loads(line_str[6:])
+
+                # Print status updates to stderr so stdout stays clean JSON
+                if event.get("type") == "STATUS_UPDATE":
+                    print(f"[{event.get('status')}] {event.get('message', '')}", file=sys.stderr)
+
+                # Print final result; treat any other terminal status as failure
+                # (assumption: a COMPLETE event with a non-COMPLETED status means the run did not succeed)
+                if event.get("type") == "COMPLETE":
+                    if event.get("status") == "COMPLETED":
+                        print(json.dumps(event["resultJson"], indent=2))
+                        return event["resultJson"]
+                    print(f"Error: run ended with status {event.get('status')}", file=sys.stderr)
+                    sys.exit(1)
+
+    # The stream ended without ever sending a COMPLETE event
+    print("Error: stream ended without a COMPLETE event", file=sys.stderr)
+    sys.exit(1)
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser(description="TinyFish web extract/scrape tool")
+    parser.add_argument("url", help="URL to extract/scrape from")
+    parser.add_argument("goal", help="What to extract (natural language)")
+    parser.add_argument("--stealth", action="store_true", help="Use stealth mode")
+    parser.add_argument("--proxy", help="Proxy country code (e.g., US, UK, DE)")
+
+    args = parser.parse_args()
+    extract(args.url, args.goal, args.stealth, args.proxy)