diff --git a/environment.yml b/environment.yml
index a4219bb..1a6ff6f 100644
--- a/environment.yml
+++ b/environment.yml
@@ -8,6 +8,7 @@ dependencies:
   - numpy
   - pandas
   - requests
+  - flask
   - pip:
     - beautifulsoup4
     - lxml
diff --git a/webscraper/ABC/__pycache__/base_scraper.cpython-311.pyc b/webscraper/ABC/__pycache__/base_scraper.cpython-311.pyc
new file mode 100644
index 0000000..2dc46e3
Binary files /dev/null and b/webscraper/ABC/__pycache__/base_scraper.cpython-311.pyc differ
diff --git a/webscraper/api/__pycache__/interface.cpython-311.pyc b/webscraper/api/__pycache__/interface.cpython-311.pyc
new file mode 100644
index 0000000..1b9240d
Binary files /dev/null and b/webscraper/api/__pycache__/interface.cpython-311.pyc differ
diff --git a/webscraper/api/__pycache__/routes.cpython-311.pyc b/webscraper/api/__pycache__/routes.cpython-311.pyc
new file mode 100644
index 0000000..5e18603
Binary files /dev/null and b/webscraper/api/__pycache__/routes.cpython-311.pyc differ
diff --git a/webscraper/api/interface.py b/webscraper/api/interface.py
new file mode 100644
index 0000000..af2ef9d
--- /dev/null
+++ b/webscraper/api/interface.py
@@ -0,0 +1,8 @@
+from abc import ABC, abstractmethod
+
+class ScraperAPIInterface(ABC):
+
+    @abstractmethod
+    def get_scraped_data(self, paths: list[str]) -> dict:
+        """Given a list of paths, return scraped results."""
+        pass
diff --git a/webscraper/api/routes.py b/webscraper/api/routes.py
new file mode 100644
index 0000000..919633c
--- /dev/null
+++ b/webscraper/api/routes.py
@@ -0,0 +1,26 @@
+import sys
+import os
+sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')))
+
+from flask import Flask, jsonify, request
+from webscraper.src.Cheaper_Scraper import CheaperScraper
+
+app = Flask(__name__)
+scraper = CheaperScraper(base_url="https://books.toscrape.com")
+
+@app.route('/')
+def home():
+    return jsonify({"message": "Welcome to Cheaper API!"})
+
+@app.route('/scrape', methods=['GET'])
+def scrape_books():
+    paths = request.args.getlist('path')
+    if not paths:
+        return jsonify({"error": "No paths provided"}), 400
+
+    results = scraper.get_scraped_data(paths)
+    return jsonify(results)
+
+if __name__ == '__main__':
+    app.run(debug=True)
+
diff --git a/webscraper/api/tests/__pycache__/test_routes.cpython-311.pyc b/webscraper/api/tests/__pycache__/test_routes.cpython-311.pyc
new file mode 100644
index 0000000..1ce37af
Binary files /dev/null and b/webscraper/api/tests/__pycache__/test_routes.cpython-311.pyc differ
diff --git a/webscraper/api/tests/test_routes.py b/webscraper/api/tests/test_routes.py
new file mode 100644
index 0000000..4edd3f9
--- /dev/null
+++ b/webscraper/api/tests/test_routes.py
@@ -0,0 +1,26 @@
+import unittest
+from webscraper.api.routes import app
+
+class TestRoutes(unittest.TestCase):
+
+    def setUp(self):
+        self.client = app.test_client()
+        self.client.testing = True
+
+    def test_home_route(self):
+        response = self.client.get('/')
+        self.assertEqual(response.status_code, 200)
+        self.assertIn(b"Welcome to Cheaper API", response.data)
+
+    def test_scrape_no_params(self):
+        response = self.client.get('/scrape')
+        self.assertEqual(response.status_code, 400)
+        self.assertIn(b"No paths provided", response.data)
+
+    def test_scrape_valid_path(self):
+        response = self.client.get('/scrape?path=/catalogue/page-1.html')
+        self.assertEqual(response.status_code, 200)
+        self.assertIsInstance(response.get_json(), dict)
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/webscraper/src/Cheaper_Scraper.py b/webscraper/src/Cheaper_Scraper.py
index 2524a63..27fffc0 100644
--- a/webscraper/src/Cheaper_Scraper.py
+++ b/webscraper/src/Cheaper_Scraper.py
@@ -3,12 +3,15 @@
 from bs4 import BeautifulSoup
 import logging
 from typing import Dict, List, Optional
-from ABC.base_scraper import BaseScraper
-from Robot_Check import RoboCheck
+from webscraper.ABC.base_scraper import BaseScraper
+from webscraper.src.robot_check import RoboCheck
+from webscraper.api.interface import ScraperAPIInterface
 
 
-class CheaperScraper(BaseScraper):
-    def __init__(self, base_url:str, user_agent: str= "CheaperBot/0.1", delay: float=2.0) -> None:
+
+
+class CheaperScraper(BaseScraper, ScraperAPIInterface):
+    def __init__(self, base_url: str = "", user_agent: str = "CheaperBot/0.1", delay: float = 2.0) -> None:
         """Initialize the scraper with base parameters.
 
         Args:
@@ -88,4 +91,7 @@ def scrape(self, paths: List[str]) -> Dict[str, List[str]]:
             html = self.fetch(path)
             if html:
                 results[path] = self.parse(html)
-        return results
\ No newline at end of file
+        return results
+
+    def get_scraped_data(self, paths: List[str]) -> Dict[str, List[str]]:
+        return self.scrape(paths)
diff --git a/webscraper/src/__pycache__/Cheaper_Scraper.cpython-311.pyc b/webscraper/src/__pycache__/Cheaper_Scraper.cpython-311.pyc
index b453941..dedb95d 100644
Binary files a/webscraper/src/__pycache__/Cheaper_Scraper.cpython-311.pyc and b/webscraper/src/__pycache__/Cheaper_Scraper.cpython-311.pyc differ
diff --git a/webscraper/src/__pycache__/__init__.cpython-311.pyc b/webscraper/src/__pycache__/__init__.cpython-311.pyc
new file mode 100644
index 0000000..c420941
Binary files /dev/null and b/webscraper/src/__pycache__/__init__.cpython-311.pyc differ
diff --git a/webscraper/src/__pycache__/robot_check.cpython-311.pyc b/webscraper/src/__pycache__/robot_check.cpython-311.pyc
index 0d139ac..e7f181b 100644
Binary files a/webscraper/src/__pycache__/robot_check.cpython-311.pyc and b/webscraper/src/__pycache__/robot_check.cpython-311.pyc differ
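
A minimal usage sketch of the new /scrape route, for reviewers trying the change locally: it assumes webscraper/api/routes.py is running via its app.run(debug=True) entry point (Flask's default port 5000) and uses the requests package already listed in environment.yml. The second catalogue path is illustrative only; /catalogue/page-1.html is the path exercised in test_routes.py.

import requests

# Hypothetical client call against a locally running dev server (assumed URL/port).
resp = requests.get(
    "http://127.0.0.1:5000/scrape",
    params=[("path", "/catalogue/page-1.html"), ("path", "/catalogue/page-2.html")],
)
resp.raise_for_status()

# The route returns a JSON object mapping each requested path to the list of
# items CheaperScraper parsed for that page.
for path, items in resp.json().items():
    print(path, len(items))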