web_search_engine.py
import argparse
import requests
from bs4 import BeautifulSoup
from urllib.parse import urlparse, urljoin
from collections import deque
# Global set to store visited URLs to avoid duplicate work
visited_urls = set()
def crawl(start_url, max_depth=1):
    """Breadth-first crawl starting from start_url, up to max_depth link hops."""
    urls_to_crawl = deque([(start_url, 0)])
    indexed_urls = []
    while urls_to_crawl:
        url, depth = urls_to_crawl.popleft()
        # Skip URLs we have already seen or that are beyond the depth limit.
        if url in visited_urls or depth > max_depth:
            continue
        visited_urls.add(url)
        try:
            response = requests.get(url, timeout=10)
            response.raise_for_status()
        except requests.RequestException as e:
            print(f"Failed to fetch {url}: {e}")
            continue
        soup = BeautifulSoup(response.text, 'html.parser')
        indexed_urls.append(url)
        # Queue outgoing links for the next level, unless we are already at max depth.
        if depth < max_depth:
            for link in soup.find_all('a', href=True):
                full_url = urljoin(url, link['href'])
                if is_valid_url(full_url):
                    urls_to_crawl.append((full_url, depth + 1))
    return indexed_urls
def is_valid_url(url):
    """Return True if the URL has both a scheme and a network location."""
    parsed = urlparse(url)
    return bool(parsed.netloc) and bool(parsed.scheme)
def search(indexed_urls, query):
    """Return the indexed URLs whose address contains the query string."""
    return [url for url in indexed_urls if query in url]
def main():
    parser = argparse.ArgumentParser(description="A simple web crawler and search engine.")
    parser.add_argument('start_url', help='The starting URL to begin crawling from.')
    parser.add_argument('-d', '--depth', type=int, default=1, help='The maximum depth to crawl.')
    parser.add_argument('-q', '--query', required=True, help='The search query.')
    parser.add_argument('-o', '--output', help='Output results to a file.')
    args = parser.parse_args()

    print(f"Crawling {args.start_url} up to depth {args.depth}...")
    indexed_urls = crawl(args.start_url, args.depth)
    print(f"Indexed {len(indexed_urls)} URLs.")

    print(f"Searching for '{args.query}'...")
    results = search(indexed_urls, args.query)
    print(f"Found {len(results)} results:")
    for result in results:
        print(result)

    if args.output:
        with open(args.output, 'w') as f:
            for result in results:
                f.write(result + '\n')
        print(f"Results written to {args.output}")


if __name__ == "__main__":
    main()
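
# A minimal usage sketch (the URL, query, and output file below are hypothetical
# placeholders; substitute your own values):
#
#   python web_search_engine.py https://example.com -d 2 -q blog -o results.txt
#
# This crawls https://example.com two levels deep, keeps only indexed URLs that
# contain the substring "blog", prints them, and writes them to results.txt.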