4 changes: 2 additions & 2 deletions README.md
@@ -11,9 +11,9 @@ Initial Landing page![Initial Landing page](https://github.com/user-attachments/

To run the scraper, execute the main.py script by running the command

-python main.py
+python src/main.py

-Make sure you are in the src directory when you run the command (the directory that contains main.py).
+Make sure you are in the webscraper directory when you run the command

##Where is the entry point?

Binary file not shown.
Binary file added webscraper/api/__pycache__/interface.cpython-39.pyc
Binary file not shown.
24 changes: 24 additions & 0 deletions webscraper/output.json
@@ -0,0 +1,24 @@
{
"/": [
"A Light in the Attic",
"Tipping the Velvet",
"Soumission",
"Sharp Objects",
"Sapiens: A Brief History of Humankind",
"The Requiem Red",
"The Dirty Little Secrets of Getting Your Dream Job",
"The Coming Woman: A Novel Based on the Life of the Infamous Feminist, Victoria Woodhull",
"The Boys in the Boat: Nine Americans and Their Epic Quest for Gold at the 1936 Berlin Olympics",
"The Black Maria",
"Starving Hearts (Triangular Trade Trilogy, #1)",
"Shakespeare's Sonnets",
"Set Me Free",
"Scott Pilgrim's Precious Little Life (Scott Pilgrim #1)",
"Rip it Up and Start Again",
"Our Band Could Be Your Life: Scenes from the American Indie Underground, 1981-1991",
"Olio",
"Mesaerion: The Best Science Fiction Stories 1800-1849",
"Libertarianism for Beginners",
"It's Only the Himalayas"
]
}
62 changes: 36 additions & 26 deletions webscraper/src/Cheaper_Scraper.py
@@ -1,42 +1,51 @@
import requests
import time
from bs4 import BeautifulSoup
from urllib.parse import urlparse
import logging
from typing import Dict, List, Optional
from webscraper.ABC.base_scraper import BaseScraper
from webscraper.src.robot_check import RoboCheck
from webscraper.api.interface import ScraperAPIInterface
from webscraper.src.fetch_utils import cached_get
from functools import lru_cache




class CheaperScraper(BaseScraper, ScraperAPIInterface):
def __init__(self, base_url: str = "", user_agent: str = "CheaperBot/0.1", delay: float = 2.0) -> None:
class CheaperScraper(BaseScraper):
def __init__(self, base_url:str, user_agent: str= "CheaperBot/0.1", delay: float=2.0) -> None:
"""Initialize the scraper with base parameters.

Args:
base_url: The base URL to scrape
user_agent: User agent string to identify the scraper
delay: Time in seconds to wait between requests
"""
parsed_url = urlparse(base_url)
if not parsed_url.scheme or not parsed_url.netloc:
raise ValueError(f"Invalid base URL: {base_url}")

self.base_url = base_url.rstrip('/')
self.delay = delay
self.user_agent = user_agent


#initialize session
self.session = requests.Session()
self.session.headers.update({"User-Agent": self.user_agent})

# robot logic checks if there are instances not able to be

# robot logic checks if there are instances not able to be
self.robots = RoboCheck(base_url, user_agent)




def fetch(self, path: str = "/") -> Optional[str]:
"""Fetch content from a specific path.

Args:
path: The URL path to fetch

Returns:
HTML content as string if successful, None otherwise
"""
@@ -46,42 +55,42 @@ def fetch(self, path: str = "/") -> Optional[str]:
return None

url = self.base_url + path

try:
response = self.session.get(url, timeout=10)
response.raise_for_status()
time.sleep(self.delay) # delay to simulate a user
return response.text
except requests.RequestException as e:
logging.error(f"Error fetching {url}: {e}")
return None

cached_before = cached_get.cache_info().hits
html = cached_get(url, self.user_agent)
cached_after = cached_get.cache_info().hits

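# No new cache hit here means the request actually went to the network, so apply the politeness delay.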
if cached_after == cached_before:
time.sleep(self.delay)

return html

def parse(self, html: str) -> List[str]:
"""Parse HTML content.

Args:
html: The HTML content to parse

Returns:
List of parsed items from the HTML
"""
soup = BeautifulSoup(html, "html.parser")
results = []

for book in soup.find_all("article", class_="product_pod"):
title = book.h3.a["title"]
results.append(title)

return results






def scrape(self, paths: List[str]) -> Dict[str, List[str]]:
"""Scrape multiple paths.

Args:
paths: List of URL paths to scrape

Returns:
Dictionary mapping paths to their parsed results
"""
@@ -92,6 +101,7 @@ def scrape(self, paths: List[str]) -> Dict[str, List[str]]:
if html:
results[path] = self.parse(html)
return results


def get_scraped_data(self, paths: List[str]) -> Dict[str, List[str]]:
return self.scrape(paths)
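
For context (not part of the diff): a minimal usage sketch of the updated class. The base URL and path are the ones already exercised by main.py, the tests, and output.json in this PR; everything else is illustrative.

# Illustrative sketch only -- mirrors how main.py and the tests exercise CheaperScraper.
from webscraper.src.Cheaper_Scraper import CheaperScraper

scraper = CheaperScraper("https://books.toscrape.com")  # an invalid base URL raises ValueError in __init__
data = scraper.scrape(["/"])  # e.g. {"/": ["A Light in the Attic", "Tipping the Velvet", ...]}
print(data)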
Binary file modified webscraper/src/__pycache__/Cheaper_Scraper.cpython-39.pyc
Binary file not shown.
Binary file added webscraper/src/__pycache__/__init__.cpython-39.pyc
Binary file not shown.
Binary file not shown.
Binary file added webscraper/src/__pycache__/main.cpython-39.pyc
Binary file not shown.
Binary file modified webscraper/src/__pycache__/robot_check.cpython-39.pyc
Binary file not shown.
21 changes: 21 additions & 0 deletions webscraper/src/fetch_utils.py
@@ -0,0 +1,21 @@
import requests
import logging
from functools import lru_cache
from typing import Optional


@lru_cache(maxsize=128)
def cached_get(url: str, user_agent: str) -> Optional[str]:
print(f"[HTTP Request] Fetching from web: {url}")
headers = {"User-Agent": user_agent}
try:
response = requests.get(url, headers=headers, timeout=10)
response.raise_for_status()
return response.text
except requests.RequestException as e:
logging.error(f"Error fetching {url}: {e}")
return None




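For context (not part of the diff): a small sketch of how cached_get's lru_cache behaves; the URL and user agent match the ones used by the tests later in this PR.

# Illustrative sketch only.
from webscraper.src.fetch_utils import cached_get

cached_get.cache_clear()
cached_get("https://books.toscrape.com/", "CheaperBot/0.1")  # first call: network request (cache miss)
cached_get("https://books.toscrape.com/", "CheaperBot/0.1")  # same arguments: answered from the cache
print(cached_get.cache_info())  # e.g. CacheInfo(hits=1, misses=1, maxsize=128, currsize=1)
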
10 changes: 9 additions & 1 deletion webscraper/src/main.py
@@ -1,6 +1,12 @@

from Cheaper_Scraper import CheaperScraper
import json
# import time  # for testing
# I added these imports below because, when I ran it, it wasn't finding the folders.
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..')))
from src.Cheaper_Scraper import CheaperScraper


def main():
# Set up the scraper for a simple legal-to-scrape website
@@ -26,3 +32,5 @@ def main():

if __name__ == "__main__":
main()


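For context (not part of the diff): what the sys.path line in main.py accomplishes, assuming the layout shown in this PR (webscraper/src/main.py).

# Illustrative only: reproduces the path math from main.py for the layout in this PR.
import os

src_dir = "webscraper/src"                                   # directory containing main.py
package_root = os.path.abspath(os.path.join(src_dir, ".."))  # resolves to .../webscraper
print(package_root)  # appending this to sys.path lets "from src.Cheaper_Scraper import ..." resolve
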
Binary file not shown.
Binary file not shown.
72 changes: 72 additions & 0 deletions webscraper/src/tests/test_fetch_and_cache.py
@@ -0,0 +1,72 @@
import unittest
import time

from webscraper.src.Cheaper_Scraper import CheaperScraper
from webscraper.src.fetch_utils import cached_get

# To test, be in the webscraper directory and use the following command in the terminal:
# python -m unittest webscraper.src.tests.test_fetch_and_cache -v



class TestCheaperScraperFetchCache(unittest.TestCase):

def setUp(self):
self.scraper = CheaperScraper("https://books.toscrape.com")
cached_get.cache_clear() # Reset cache before each test

def test_valid_fetch(self):
html = self.scraper.fetch("/")
self.assertIsInstance(html, str)
self.assertIn("<html", html.lower())

def test_invalid_path_fetch(self):
html = self.scraper.fetch("/this-page-does-not-exist")
# Even though it doesn't exist, the site may return a 200 with a 404 page
self.assertTrue(html is None or "<html" in html.lower())

def test_cache_effectiveness(self):
start = time.time()
self.scraper.fetch("/") # First fetch
time1 = time.time() - start

start = time.time()
self.scraper.fetch("/") # Second fetch (should be cached)
time2 = time.time() - start

cache_info = cached_get.cache_info()
self.assertLess(time2, time1)
self.assertGreaterEqual(cache_info.hits, 1)

def test_non_http_url(self):
with self.assertRaises(ValueError):
CheaperScraper("not_a_real_url")

def test_cache_timing_and_stats(self):
print("\n=== Cache Timing and Stats Test ===")

# First fetch (expected to be slow and hit the network)
start = time.time()
html1 = self.scraper.fetch("/")
time1 = round(time.time() - start, 2)
print(f"First fetch took: {time1} seconds")

# Second fetch (expected to be fast due to cache)
start = time.time()
html2 = self.scraper.fetch("/")
time2 = round(time.time() - start, 2)
print(f"Second fetch took: {time2} seconds")

# Confirm that the second fetch was faster
self.assertLess(time2, time1, "Second fetch should be faster due to caching")

# Print and assert cache stats
stats = cached_get.cache_info()
print("Cache stats:", stats)
self.assertGreaterEqual(stats.hits, 1, "There should be at least 1 cache hit")




if __name__ == "__main__":
unittest.main()