Feature/scraper for piter com #29
Changes from all commits: 5a49ce0, 885aeb8, aa173ef, 91464bc, fc3c6e6, 5edd520, d8381c4, 6f3a6ab, 987909c, 0d6d52f
@@ -199,7 +199,7 @@ poetry.toml

```
# ruff
.ruff_cache/

pgdata
# LSP config files
pyrightconfig.json
```
@@ -0,0 +1,83 @@ (new file)

```python
import asyncio
from asgiref.sync import sync_to_async
from django.core.management.base import BaseCommand
from urllib.parse import urljoin

from apps.books.models import Book, Author, Publisher
from apps.books.services.book_saver import BookSaver
from apps.books.scrapers.piter_publ.book_parser import BookParser
from apps.books.scrapers.piter_publ.piter_scraper import PiterScraper
from apps.books.scrapers.base_scraper import BaseScraper
from apps.books.services.author_service import AuthorService
from apps.books.services.publisher_service import PublisherService
from logger.books.log import get_logger

logger = get_logger(__name__)
author_service = AuthorService(Author)
publisher_service = PublisherService(Publisher)
book_saver = BookSaver(Book, publisher_service, author_service, logger)


class AsyncBookFetcher(BaseScraper):
    def __init__(self, base_domain: str, delay: float = 2.0, max_concurrent: int = 3):
        super().__init__(delay)
        self.base_domain = base_domain
        self.semaphore = asyncio.Semaphore(max_concurrent)
        self._last_request_time = 0

    async def scrape_book(self, url: str):
        async with self.semaphore:
            # simple rate limiter: wait out the remainder of `delay`
            # since the previous request before firing the next one
            now = asyncio.get_event_loop().time()
            elapsed = now - self._last_request_time
            if elapsed < self.delay:
                await asyncio.sleep(self.delay - elapsed)
            self._last_request_time = asyncio.get_event_loop().time()

            # book links may be site-relative; resolve against the base domain
            if url.startswith("/"):
                url = urljoin(self.base_domain, url)

            logger.info(f"fetching book: {url}")
            html = await self.fetch(url)
            if not html:
                logger.warning(f"failed to fetch {url}")
                return None

            parser = BookParser(html)
            book_data = {
                "url": url,
                "book_title": parser.extract_book_name().get("book_title", ""),
```
Owner: This looks strange to me here; why not handle it inside the method?
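One way to address this (a minimal sketch, assuming `extract_book_name` has no other callers that rely on the dict shape) is to return a plain string and keep the empty-string fallback inside the parser, so the caller can drop the `.get("book_title", "")`:

```python
# Hypothetical refactor of BookParser.extract_book_name: the fallback
# lives inside the method, and callers get a plain string back.
def extract_book_name(self) -> str:
    title = self.soup.select_one("div.product-info h1")
    if not title:
        logger.warning("book title not found in HTML")
        return ""
    return title.get_text(strip=True)
```

The call site would then become `"book_title": parser.extract_book_name(),`.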
| "author": parser.extract_authors(), | ||
| "price": parser.extract_price(), | ||
| "details": parser.extract_all_params(), | ||
| "description": parser.extract_description().get("description", ""), | ||
| "cover": parser.extract_cover_image(), | ||
| } | ||
| logger.debug(f"parsed book data for: {book_data['book_title']}") | ||
| return book_data | ||
|
Owner: Review the whole pull request against the SOLID principles (SRP first, DI second, the rest as you see fit).
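On the DI point, one option (a sketch; only the constructor signatures are taken from the module-level wiring above, the injection pattern itself is an assumption) is to build the dependencies inside the command instead of at import time, so tests can swap them out:

```python
class Command(BaseCommand):
    def __init__(self, *args, book_saver: BookSaver | None = None, **kwargs) -> None:
        super().__init__(*args, **kwargs)
        # falls back to the default wiring; tests can inject a fake saver
        self.book_saver = book_saver or BookSaver(
            Book,
            PublisherService(Publisher),
            AuthorService(Author),
            get_logger(__name__),
        )
```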
```python
class Command(BaseCommand):
```
Owner: Docstrings and type hints could be added here.
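A sketch of what that could look like (the docstring text is made up):

```python
class Command(BaseCommand):
    """Imports books from piter.com into the database."""

    def handle(self, *args: object, **kwargs: object) -> None:
        """Entry point: wraps the async import in asyncio.run()."""
        ...
```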
```python
    help = "Scrapes books from the Piter site and saves them to the database"

    def handle(self, *args, **kwargs):
        logger.info("starting book import from Piter")
        asyncio.run(self.import_books())
        logger.info("book import finished")

    async def import_books(self):
        piter = PiterScraper()
        book_scraper = AsyncBookFetcher(base_domain="https://www.piter.com")

        # schedule every book page fetch up front; the fetcher's semaphore
        # caps how many run concurrently
        tasks = []
        async for link in piter.scrape_book_links():
            logger.debug(f"found book link: {link}")
            task = asyncio.create_task(book_scraper.scrape_book(link))
            tasks.append(task)

        # save each book as soon as its fetch completes
        for coro in asyncio.as_completed(tasks):
            book = await coro
            if book:
                await self.save_book(book)

    async def save_book(self, item: dict):
        # book_saver is synchronous Django ORM code, so hop off the event loop
        await sync_to_async(book_saver.save_book)(item)
```
@@ -0,0 +1,24 @@ (new file)

```python
import asyncio
import httpx

from bs4 import BeautifulSoup


class BaseScraper:
    def __init__(self, delay: float = 1.0):
        self.headers = {"User-Agent": "Mozilla/5.0"}
        self.delay = delay

    async def fetch(self, url):
        try:
            # fixed politeness delay before every request
            await asyncio.sleep(self.delay)
            async with httpx.AsyncClient(timeout=10.0) as client:
                response = await client.get(url, headers=self.headers)
                response.raise_for_status()
                return response.text
        except (httpx.HTTPError, asyncio.TimeoutError) as e:
            print(f"request failed: {url}, error: {str(e)}")
            return None

    def parse(self, html) -> BeautifulSoup:
        return BeautifulSoup(html, "html.parser")
```
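A minimal usage sketch of `BaseScraper` (the URL is just an example):

```python
import asyncio

async def main() -> None:
    scraper = BaseScraper(delay=1.0)
    html = await scraper.fetch("https://www.piter.com/collection/all")
    if html:
        soup = scraper.parse(html)
        print(soup.title.get_text(strip=True) if soup.title else "no <title>")

asyncio.run(main())
```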
@@ -0,0 +1,148 @@ (new file)

```python
from bs4 import BeautifulSoup
from typing import List, Dict
from urllib.parse import urljoin

from logger.books.log import get_logger

logger = get_logger(__name__)


class BookParser:
    def __init__(self, html: str, base_domain: str = "https://www.piter.com"):
```
Owner: I would move https://www.piter.com out into a global variable somewhere.
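A sketch of that suggestion (the constant name and module are hypothetical):

```python
# apps/books/scrapers/piter_publ/constants.py (hypothetical module)
PITER_BASE_URL = "https://www.piter.com"
```

The signature would then become `def __init__(self, html: str, base_domain: str = PITER_BASE_URL):`, and the hardcoded copies of the domain elsewhere in the class could import the same constant.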
```python
        self.soup = BeautifulSoup(html, "lxml")
        self.base_domain = base_domain

    def extract_book_name(self):
        try:
            title = self.soup.select_one("div.product-info h1")
            if not title:
                logger.warning("book title not found in HTML")
                return {"book_title": ""}

            result = {"book_title": title.get_text(strip=True) if title else ""}
            logger.debug(f"extracted book title: {result['book_title']}")
            return result
        except Exception as e:
            logger.error(f"book title extraction failed {str(e)}")
            return {"book_title": ""}

    def extract_description(self):
        try:
            description = self.soup.find("div", id="tab-1")
            if not description:
                logger.warning("book description not found in HTML")
                return {"description": ""}
            full_text = description.get_text(separator="\n", strip=True)
            logger.debug(f"extracted book description: {full_text}")
            return {"description": full_text}
        except Exception as e:
            logger.error(f"description extraction failed {str(e)}")
            return {"description": ""}

    def extract_all_params(self):
        result = {}
        items = self.soup.select("div.params li")

        # each <li> holds a label/value pair of book metadata
        for li in items:
            label = li.select_one("span.grid-5")
            value = li.select_one("span.grid-7")
            if label and value:
                key = label.get_text(strip=True).rstrip(":")
                val = value.get_text(strip=True)
                result[key] = val
        return result

    def extract_cover_image(self):
        try:
            container = self.soup.select_one('div.photo, div[class*="photo"]')
            if container:
                img = container.select_one("img")
                if img and img.get("src"):
                    src = img["src"].strip()
                    logger.debug(f"extracted cover image from container: {src}")
                    return {"cover_image": urljoin(self.base_domain, src)}

            # fallback: first <img> anywhere on the page
            img = self.soup.select_one("img")
            if img and img.get("src"):
                src = img["src"].strip()
                logger.debug(f"extracted cover image from img: {src}")
                return {"cover_image": urljoin(self.base_domain, src)}

            return {"cover_image": ""}
        except Exception as e:
            logger.error(f"cover extraction failed {str(e)}")
            return {"cover_image": ""}

    def extract_authors(self) -> List[Dict[str, str]]:
        try:
            authors = []
            author_blocks = self.soup.select("#tab-2 .autor-wrapper")

            for block in author_blocks:
                name_tag = block.select_one("h2")
                if name_tag:
                    full_name = name_tag.get_text(strip=True)
                    parts = full_name.split()
                    if len(parts) == 1:
```
Owner: 1, 2, 3 here are the magic numbers antipattern.
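One way to factor this out (a sketch; `split_full_name` is a hypothetical helper, and the splitting rules are copied from the branches below):

```python
SINGLE_PART = 1
TWO_PARTS = 2

def split_full_name(full_name: str) -> tuple[str, str]:
    """Returns (first_name, last_name) using the same rules as the branches below."""
    parts = full_name.split()
    if not parts:
        return "", ""
    if len(parts) == SINGLE_PART:
        return "", parts[0]
    if len(parts) == TWO_PARTS:
        last, first = parts
        return first, last
    # three or more parts: the second token is the first name,
    # everything else joins into the last name
    return parts[1], " ".join([parts[0]] + parts[2:])
```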
```python
                        last_name = parts[0]
                        first_name = ""
                    elif len(parts) == 2:
                        last_name, first_name = parts
                    elif len(parts) >= 3:
                        first_name = parts[1]
                        last_name = " ".join([parts[0]] + parts[2:])
                    else:
                        last_name = ""
                        first_name = ""
                        logger.warning(f"empty author name: {full_name}")

                    # the bio is the loose text nodes that sit next to
                    # the <h2> inside the same parent block
                    description_block = name_tag.parent
                    bio_parts = []
                    for bio in description_block.contents:
                        if bio != name_tag and isinstance(bio, str):
                            bio_parts.append(bio.strip())
                    bio = " ".join(bio_parts).strip()

                    authors.append(
                        {
                            "first_name": first_name.strip("."),
                            "last_name": last_name,
                            "bio": bio,
                        }
                    )

            logger.info(f"parsed {len(authors)} authors from tab-2")
            return authors
        except Exception as e:
            logger.error(f"failed to parse authors from tab-2: {str(e)}")
            logger.exception("tab-2 author parsing error details")
            return []

    def extract_author_bio(self) -> str:
        try:
            block = self.soup.select_one("div.author-wrapper div.grid-9.s-grid-12")
            if not block:
                logger.warning("block with author_bio not found in HTML")
                return ""
            name_tag = block.select_one("h2")
            name_text = name_tag.get_text(strip=True) if name_tag else ""
            full_text = block.get_text(separator=" ", strip=True)
            logger.debug(f"extracted author_bio: {full_text}")
            return full_text.replace(name_text, "", 1).strip()
        except Exception as e:
            logger.error(f"author_bio extraction failed {str(e)}")
            return ""

    def extract_price(self):
        try:
            price = self.soup.select("div.price.color")
            if len(price) >= 2:
                logger.debug(f"extracted price: {price}")
                return {
                    "price": price[0].text.strip(),
                    "electronic_price": price[1].text.strip(),
                }
            return {}
        except Exception as e:
            logger.error(f"price extraction failed {str(e)}")
            return {"price": "", "electronic_price": ""}
```
@@ -0,0 +1,38 @@ (new file)

```python
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from typing import List

from logger.books.log import get_logger

logger = get_logger(__name__)


class LinkExtractor:
    def __init__(
        self, base_domain: str, expected_prefix: str = "/collection/all/product/"
    ):
        self.base_domain = base_domain
        self.expected_prefix = expected_prefix

    def extract_links(self, soup: BeautifulSoup) -> List[str]:
        """Extracts full links to book pages."""
        try:
            links = []

            container = soup.find("div", class_="products-list")
            if not container:
                return []

            for tag in container.find_all("a"):
                href = tag.get("href")
                if href and href.startswith(self.expected_prefix):
                    full_url = urljoin(self.base_domain, href)
                    links.append(full_url)

            logger.info(f"collected links: {links}")
            return links
        except Exception as e:
            logger.error(f"failed to parse links: {str(e)}")
            return []
```
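A quick usage sketch (the HTML snippet is invented to match the selectors above):

```python
from bs4 import BeautifulSoup

html = """
<div class="products-list">
  <a href="/collection/all/product/example-book/">Book</a>
  <a href="/pages/about/">About</a>
</div>
"""
extractor = LinkExtractor(base_domain="https://www.piter.com")
print(extractor.extract_links(BeautifulSoup(html, "html.parser")))
# ['https://www.piter.com/collection/all/product/example-book/']
```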
@@ -0,0 +1,32 @@ (new file)

```python
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin

from logger.books.log import get_logger

logger = get_logger(__name__)


class Paginator:
    def __init__(self, base_domain: str):
        self.base_domain = base_domain

    def get_next_page(self, soup: BeautifulSoup):
        """Returns the full URL of the next page, if one exists."""
        # the pagination link carries the site's Russian "Next" label
        next_button = soup.find("a", string="Следующая")
        if not next_button:
            logger.info("no next page")
            return None

        href = next_button.get("href")
        if not href:
            return None

        match = re.search(r"page=(\d+)", href)
        if match:
            logger.success(f"found next page: {href}")
            return urljoin(self.base_domain, href)

        return None
```
It seems better to move this into a separate module.
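A quick usage sketch of `Paginator` (the pagination markup is invented):

```python
from bs4 import BeautifulSoup

html = '<a href="/collection/all?page=3">Следующая</a>'
paginator = Paginator(base_domain="https://www.piter.com")
print(paginator.get_next_page(BeautifulSoup(html, "html.parser")))
# https://www.piter.com/collection/all?page=3
```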