Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
1 change: 1 addition & 0 deletions environment.yml
Original file line number Diff line number Diff line change
Expand Up @@ -8,6 +8,7 @@ dependencies:
- numpy
- pandas
- requests
- flask
- pip:
- beautifulsoup4
- lxml
Binary file not shown.
Binary file added webscraper/api/__pycache__/interface.cpython-311.pyc
Binary file not shown.
Binary file added webscraper/api/__pycache__/routes.cpython-311.pyc
Binary file not shown.
8 changes: 8 additions & 0 deletions webscraper/api/interface.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,8 @@
from abc import ABC, abstractmethod

class ScraperAPIInterface(ABC):
    """Abstract contract for objects that supply scraped data to the API layer."""

    @abstractmethod
    def get_scraped_data(self, paths: list[str]) -> dict:
        """Return the scraped results for the given list of paths.

        Concrete subclasses must override this method; the class cannot be
        instantiated until they do.
        """
26 changes: 26 additions & 0 deletions webscraper/api/routes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import sys
import os
sys.path.append(os.path.abspath(os.path.join(os.path.dirname(__file__), '..', '..')))

from flask import Flask, jsonify, request
from webscraper.src.Cheaper_Scraper import CheaperScraper

# Flask application object; the route decorators below register handlers on it.
app = Flask(__name__)
# Single module-level scraper, created at import time and used by the /scrape handler.
scraper = CheaperScraper(base_url="https://books.toscrape.com")

@app.route("/")
def home():
    """Serve the API root with a JSON welcome message."""
    greeting = {"message": "Welcome to Cheaper API!"}
    return jsonify(greeting)

@app.route("/scrape", methods=["GET"])
def scrape_books():
    """Scrape every ``path`` query parameter and return the results as JSON.

    Responds with HTTP 400 and an ``error`` payload when the request carries
    no ``path`` parameter at all.
    """
    requested = request.args.getlist("path")
    if requested:
        return jsonify(scraper.get_scraped_data(requested))
    return jsonify({"error": "No paths provided"}), 400

if __name__ == '__main__':
    # Debug mode turns on the Werkzeug interactive debugger, which lets anyone
    # who can reach the server execute arbitrary code. Keep it off unless the
    # developer explicitly opts in via the FLASK_DEBUG environment variable.
    debug_enabled = os.environ.get("FLASK_DEBUG", "").strip().lower() in {"1", "true", "yes", "on"}
    app.run(debug=debug_enabled)

Binary file not shown.
26 changes: 26 additions & 0 deletions webscraper/api/tests/test_routes.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,26 @@
import unittest
from webscraper.api.routes import app

class TestRoutes(unittest.TestCase):
    """HTTP-level tests for the Flask routes in ``webscraper.api.routes``."""

    def setUp(self):
        # The testing flag belongs on the application (it makes exceptions
        # propagate to the test); an attribute set on the test client is
        # never read by Flask.
        app.testing = True
        self.client = app.test_client()

    def test_home_route(self):
        """The root route answers 200 with the welcome message."""
        response = self.client.get('/')
        self.assertEqual(response.status_code, 200)
        self.assertIn(b"Welcome to Cheaper API", response.data)

    def test_scrape_no_params(self):
        """A request without any ``path`` parameter is rejected with 400."""
        response = self.client.get('/scrape')
        self.assertEqual(response.status_code, 400)
        self.assertIn(b"No paths provided", response.data)

    def test_scrape_valid_path(self):
        """A request with a path returns the scraper's results as JSON."""
        from unittest.mock import patch

        canned = {"/catalogue/page-1.html": ["A Light in the Attic"]}
        # Replace the module-level scraper so the test does not depend on
        # network access to the target site.
        with patch("webscraper.api.routes.scraper") as fake_scraper:
            fake_scraper.get_scraped_data.return_value = canned
            response = self.client.get('/scrape?path=/catalogue/page-1.html')

        fake_scraper.get_scraped_data.assert_called_once_with(
            ["/catalogue/page-1.html"]
        )
        self.assertEqual(response.status_code, 200)
        self.assertEqual(response.get_json(), canned)

# Allow this test module to be run directly as a script.
if __name__ == '__main__':
    unittest.main()
16 changes: 11 additions & 5 deletions webscraper/src/Cheaper_Scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,12 +3,15 @@
from bs4 import BeautifulSoup
import logging
from typing import Dict, List, Optional
from ABC.base_scraper import BaseScraper
from Robot_Check import RoboCheck
from webscraper.ABC.base_scraper import BaseScraper
from webscraper.src.robot_check import RoboCheck
from webscraper.api.interface import ScraperAPIInterface


class CheaperScraper(BaseScraper):
def __init__(self, base_url:str, user_agent: str= "CheaperBot/0.1", delay: float=2.0) -> None:


class CheaperScraper(BaseScraper, ScraperAPIInterface):
def __init__(self, base_url: str = "", user_agent: str = "CheaperBot/0.1", delay: float = 2.0) -> None:
"""Initialize the scraper with base parameters.

Args:
Expand Down Expand Up @@ -88,4 +91,7 @@ def scrape(self, paths: List[str]) -> Dict[str, List[str]]:
html = self.fetch(path)
if html:
results[path] = self.parse(html)
return results
return results

def get_scraped_data(self, paths: List[str]) -> Dict[str, List[str]]:
    """Return the scraped results for ``paths`` by delegating to ``scrape``.

    This is the ``ScraperAPIInterface`` entry point; the argument is passed
    through unchanged and the result of ``scrape`` is returned as-is.
    """
    scraped = self.scrape(paths)
    return scraped
Binary file modified webscraper/src/__pycache__/Cheaper_Scraper.cpython-311.pyc
Binary file not shown.
Binary file added webscraper/src/__pycache__/__init__.cpython-311.pyc
Binary file not shown.
Binary file modified webscraper/src/__pycache__/robot_check.cpython-311.pyc
Binary file not shown.