diff --git a/README.md b/README.md
index 3362b30..e4a5606 100644
--- a/README.md
+++ b/README.md
@@ -11,9 +11,9 @@ Initial Landing page![Initial Landing page](https://github.com/user-attachments/
-To run the scraper, execute the main.py script by running the command
-python main.py
+python src/main.py
--Make sure you are in the src directory when you run the command (the directory that contains main.py).
+-Make sure you are in the webscraper directory when you run the command
 
 ##Where is the entry point?
diff --git a/webscraper/ABC/__pycache__/base_scraper.cpython-39.pyc b/webscraper/ABC/__pycache__/base_scraper.cpython-39.pyc
new file mode 100644
index 0000000..dd1704d
Binary files /dev/null and b/webscraper/ABC/__pycache__/base_scraper.cpython-39.pyc differ
diff --git a/webscraper/api/__pycache__/interface.cpython-39.pyc b/webscraper/api/__pycache__/interface.cpython-39.pyc
new file mode 100644
index 0000000..4597f1e
Binary files /dev/null and b/webscraper/api/__pycache__/interface.cpython-39.pyc differ
diff --git a/webscraper/output.json b/webscraper/output.json
new file mode 100644
index 0000000..e64146a
--- /dev/null
+++ b/webscraper/output.json
@@ -0,0 +1,24 @@
+{
+    "/": [
+        "A Light in the Attic",
+        "Tipping the Velvet",
+        "Soumission",
+        "Sharp Objects",
+        "Sapiens: A Brief History of Humankind",
+        "The Requiem Red",
+        "The Dirty Little Secrets of Getting Your Dream Job",
+        "The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull",
+        "The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics",
+        "The Black Maria",
+        "Starving Hearts (Triangular Trade Trilogy, #1)",
+        "Shakespeare's Sonnets",
+        "Set Me Free",
+        "Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)",
+        "Rip it Up and Start Again",
+        "Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991",
+        "Olio",
+        "Mesaerion: The Best Science Fiction Stories 1800-1849",
+        "Libertarianism for Beginners",
+        "It's Only the Himalayas"
+    ]
+}
\ No newline at end of file
diff --git a/webscraper/src/Cheaper_Scraper.py b/webscraper/src/Cheaper_Scraper.py
index 27fffc0..2807c0c 100644
--- a/webscraper/src/Cheaper_Scraper.py
+++ b/webscraper/src/Cheaper_Scraper.py
@@ -1,42 +1,51 @@
 import requests
 import time
 from bs4 import BeautifulSoup
+from urllib.parse import urlparse
 import logging
 from typing import Dict, List, Optional
 from webscraper.ABC.base_scraper import BaseScraper
 from webscraper.src.robot_check import RoboCheck
 from webscraper.api.interface import ScraperAPIInterface
+from webscraper.src.fetch_utils import cached_get
+from functools import lru_cache
-
-
-class CheaperScraper(BaseScraper, ScraperAPIInterface):
-    def __init__(self, base_url: str = "", user_agent: str = "CheaperBot/0.1", delay: float = 2.0) -> None:
+class CheaperScraper(BaseScraper):
+    def __init__(self, base_url: str, user_agent: str = "CheaperBot/0.1", delay: float = 2.0) -> None:
         """Initialize the scraper with base parameters.
-
+
         Args:
             base_url: The base URL to scrape
             user_agent: User agent string to identify the scraper
             delay: Time in seconds to wait between requests
         """
+        parsed_url = urlparse(base_url)
+        if not parsed_url.scheme or not parsed_url.netloc:
+            raise ValueError(f"Invalid base URL: {base_url}")
         self.base_url = base_url.rstrip('/')
         self.delay = delay
         self.user_agent = user_agent
+        # initialize session
         self.session = requests.Session()
         self.session.headers.update({"User-Agent": self.user_agent})
-        # robot logic checks if there are instances not able to be
+
+        # robots.txt logic: checks whether there are paths we are not allowed to fetch
         self.robots = RoboCheck(base_url, user_agent)
+
+
+
     def fetch(self, path: str = "/") -> Optional[str]:
         """Fetch content from a specific path.
-
+
         Args:
             path: The URL path to fetch
-
+
         Returns:
             HTML content as string if successful, None otherwise
         """
@@ -46,42 +55,42 @@ def fetch(self, path: str = "/") -> Optional[str]:
             return None
         url = self.base_url + path
-
-        try:
-            response = self.session.get(url, timeout=10)
-            response.raise_for_status()
-            time.sleep(self.delay)  # delay to simulate a user
-            return response.text
-        except requests.RequestException as e:
-            logging.error(f"Error fetching {url}: {e}")
-            return None
-
+        cached_before = cached_get.cache_info().hits
+        html = cached_get(url, self.user_agent)
+        cached_after = cached_get.cache_info().hits
+
+        if cached_after == cached_before:
+            time.sleep(self.delay)
+
+        return html
+
     def parse(self, html: str) -> List[str]:
         """Parse HTML content.
-
+
         Args:
             html: The HTML content to parse
-
+
         Returns:
             List of parsed items from the HTML
         """
         soup = BeautifulSoup(html, "html.parser")
         results = []
-
+
         for book in soup.find_all("article", class_="product_pod"):
             title = book.h3.a["title"]
             results.append(title)
-
+
         return results
-
-
+
+
+
     def scrape(self, paths: List[str]) -> Dict[str, List[str]]:
         """Scrape multiple paths.
-
+
         Args:
             paths: List of URL paths to scrape
-
+
         Returns:
             Dictionary mapping paths to their parsed results
         """
@@ -92,6 +101,7 @@ def scrape(self, paths: List[str]) -> Dict[str, List[str]]:
         if html:
             results[path] = self.parse(html)
         return results
+
    def get_scraped_data(self, paths: List[str]) -> Dict[str, List[str]]:
        return self.scrape(paths)
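
The new fetch() decides whether to apply the polite delay by comparing lru_cache
hit counts before and after the cached_get call: if the hit count did not change,
the request actually went out to the network, so the scraper sleeps. A minimal
usage sketch (the books.toscrape.com base URL is borrowed from the test file
further down):

    from webscraper.src.Cheaper_Scraper import CheaperScraper

    scraper = CheaperScraper("https://books.toscrape.com")
    first = scraper.fetch("/")   # cache miss: real HTTP request, then a 2.0 s sleep
    second = scraper.fetch("/")  # cache hit: served from lru_cache, no sleep
    assert first == second       # both calls return the same HTML string
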
diff --git a/webscraper/src/__pycache__/Cheaper_Scraper.cpython-39.pyc b/webscraper/src/__pycache__/Cheaper_Scraper.cpython-39.pyc
index ac8006a..036324a 100644
Binary files a/webscraper/src/__pycache__/Cheaper_Scraper.cpython-39.pyc and b/webscraper/src/__pycache__/Cheaper_Scraper.cpython-39.pyc differ
diff --git a/webscraper/src/__pycache__/__init__.cpython-39.pyc b/webscraper/src/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000..d43fde4
Binary files /dev/null and b/webscraper/src/__pycache__/__init__.cpython-39.pyc differ
diff --git a/webscraper/src/__pycache__/fetch_utils.cpython-39.pyc b/webscraper/src/__pycache__/fetch_utils.cpython-39.pyc
new file mode 100644
index 0000000..8e69c48
Binary files /dev/null and b/webscraper/src/__pycache__/fetch_utils.cpython-39.pyc differ
diff --git a/webscraper/src/__pycache__/main.cpython-39.pyc b/webscraper/src/__pycache__/main.cpython-39.pyc
new file mode 100644
index 0000000..bd33919
Binary files /dev/null and b/webscraper/src/__pycache__/main.cpython-39.pyc differ
diff --git a/webscraper/src/__pycache__/robot_check.cpython-39.pyc b/webscraper/src/__pycache__/robot_check.cpython-39.pyc
index 4806464..d1557b9 100644
Binary files a/webscraper/src/__pycache__/robot_check.cpython-39.pyc and b/webscraper/src/__pycache__/robot_check.cpython-39.pyc differ
diff --git a/webscraper/src/fetch_utils.py b/webscraper/src/fetch_utils.py
new file mode 100644
index 0000000..1f15afa
--- /dev/null
+++ b/webscraper/src/fetch_utils.py
@@ -0,0 +1,21 @@
+import requests
+import logging
+from functools import lru_cache
+from typing import Optional
+
+
+@lru_cache(maxsize=128)
+def cached_get(url: str, user_agent: str) -> Optional[str]:
+    print(f"[HTTP Request] Fetching from web: {url}")
+    headers = {"User-Agent": user_agent}
+    try:
+        response = requests.get(url, headers=headers, timeout=10)
+        response.raise_for_status()
+        return response.text
+    except requests.RequestException as e:
+        logging.error(f"Error fetching {url}: {e}")
+        return None
+
+
+
+
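
Because cached_get is wrapped in functools.lru_cache, responses are memoized per
(url, user_agent) pair and the standard cache_info()/cache_clear() helpers are
available on it. A small sketch of direct use (URL and agent string illustrative):

    from webscraper.src.fetch_utils import cached_get

    html = cached_get("https://books.toscrape.com/", "CheaperBot/0.1")   # network fetch
    again = cached_get("https://books.toscrape.com/", "CheaperBot/0.1")  # served from cache
    print(cached_get.cache_info())  # e.g. CacheInfo(hits=1, misses=1, maxsize=128, currsize=1)
    cached_get.cache_clear()        # drop cached pages; note a failed fetch caches None too
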
diff --git a/webscraper/src/main.py b/webscraper/src/main.py
index 700835b..4a27839 100644
--- a/webscraper/src/main.py
+++ b/webscraper/src/main.py
@@ -1,6 +1,12 @@
-from Cheaper_Scraper import CheaperScraper
 import json
+# import time  # for testing
+# I added these imports below because when I ran it, Python wasn't finding the package folders
+import sys
+import os
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
+from src.Cheaper_Scraper import CheaperScraper
+
 
 def main():
     # Set up the scraper for a simple legal-to-scrape website
@@ -26,3 +32,5 @@
 
 if __name__ == "__main__":
     main()
+
+
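
The sys.path edit above is what lets the python src/main.py command run from
inside the webscraper directory. Assuming webscraper/ itself is a package (the
unittest command in the test file below relies on the same layout), an
alternative that avoids mutating sys.path is to launch main as a module from the
directory that contains webscraper/:

    python -m webscraper.src.main
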
diff --git a/webscraper/src/tests/__pycache__/__init__.cpython-39.pyc b/webscraper/src/tests/__pycache__/__init__.cpython-39.pyc
new file mode 100644
index 0000000..ba417b4
Binary files /dev/null and b/webscraper/src/tests/__pycache__/__init__.cpython-39.pyc differ
diff --git a/webscraper/src/tests/__pycache__/test_fetch_and_cache.cpython-39.pyc b/webscraper/src/tests/__pycache__/test_fetch_and_cache.cpython-39.pyc
new file mode 100644
index 0000000..c22e0aa
Binary files /dev/null and b/webscraper/src/tests/__pycache__/test_fetch_and_cache.cpython-39.pyc differ
diff --git a/webscraper/src/tests/test_fetch_and_cache.py b/webscraper/src/tests/test_fetch_and_cache.py
new file mode 100644
index 0000000..69a4c3a
--- /dev/null
+++ b/webscraper/src/tests/test_fetch_and_cache.py
@@ -0,0 +1,72 @@
+import unittest
+import time
+
+from webscraper.src.Cheaper_Scraper import CheaperScraper
+from webscraper.src.fetch_utils import cached_get
+
+# To run these tests, be in the webscraper directory and use the following command in the terminal:
+# python -m unittest webscraper.src.tests.test_fetch_and_cache -v
+
+
+
+class TestCheaperScraperFetchCache(unittest.TestCase):
+
+    def setUp(self):
+        self.scraper = CheaperScraper("https://books.toscrape.com")
+        cached_get.cache_clear()  # Reset cache before each test
+
+    def test_valid_fetch(self):
+        html = self.scraper.fetch("/")
+        self.assertIsInstance(html, str)
+        self.assertIn("
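
The diff is truncated above, so the rest of the 72-line test file is not shown.
Purely as an illustration (not the file's actual contents), a test pinning down
the cache-skips-delay behaviour could look like this; it hits the live site, as
the existing tests do:

    import time
    import unittest

    from webscraper.src.Cheaper_Scraper import CheaperScraper
    from webscraper.src.fetch_utils import cached_get

    class TestCacheSkipsDelay(unittest.TestCase):
        def setUp(self):
            self.scraper = CheaperScraper("https://books.toscrape.com")
            cached_get.cache_clear()  # start every test from a cold cache

        def test_second_fetch_is_cached_and_fast(self):
            self.scraper.fetch("/")          # miss: real request plus the polite delay
            start = time.monotonic()
            self.scraper.fetch("/")          # hit: served from lru_cache, no sleep
            self.assertLess(time.monotonic() - start, self.scraper.delay)
            self.assertEqual(cached_get.cache_info().hits, 1)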