Merged
2 changes: 1 addition & 1 deletion .gitignore
@@ -199,7 +199,7 @@ poetry.toml

# ruff
.ruff_cache/

pgdata
# LSP config files
pyrightconfig.json

29 changes: 28 additions & 1 deletion README.md
@@ -253,6 +253,32 @@ docker-compose logs django # Django only
- [Docker Documentation](https://docs.docker.com/)
- [PostgreSQL Documentation](https://www.postgresql.org/docs/)


**Building and running the containers:**
```bash
docker-compose build --no-cache
docker-compose up  # builds and starts the services
```

## Running a task in Celery

**Commands to run the task:**
First, start Redis:
```bash
docker run -d -p 6379:6379 --name redis redis:alpine
```

Then start Celery Beat:
```bash
celery -A config beat -l info
```

- `-A config` - points to where the Celery application is located
- `beat` - starts Celery Beat, the component that periodically sends tasks to the queue
- `-l info` - logging level (DEBUG, INFO, WARNING, ERROR)
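
Note that `beat` only schedules the periodic tasks; in a typical setup a worker process also has to be running to execute them. Assuming the same `config` app module, that would be:

```bash
celery -A config worker -l info
```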

## 📄 License

This project is licensed under the MIT License - see the [LICENSE](LICENSE) file for details.
@@ -266,4 +292,5 @@ docker-compose logs django # Django only

---

**Developed with ❤️ using modern Python tools**
**Developed with ❤️ using modern Python tools**

83 changes: 83 additions & 0 deletions apps/books/management/commands/parse_books.py
@@ -0,0 +1,83 @@
import asyncio
from asgiref.sync import sync_to_async
from django.core.management.base import BaseCommand
from urllib.parse import urljoin

from apps.books.models import Book, Author, Publisher
from apps.books.services.book_saver import BookSaver
from apps.books.scrapers.piter_publ.book_parser import BookParser
from apps.books.scrapers.piter_publ.piter_scraper import PiterScraper
from apps.books.scrapers.base_scraper import BaseScraper
from apps.books.services.author_service import AuthorService
from apps.books.services.publisher_service import PublisherService
from logger.books.log import get_logger

logger = get_logger(__name__)
author_service = AuthorService(Author)
publisher_service = PublisherService(Publisher)
book_saver = BookSaver(Book, publisher_service, author_service, logger)


class AsyncBookFetcher(BaseScraper):
Owner: It seems better to move this into a separate module.
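
A hedged sketch of what the suggested extraction could look like; the module path `apps/books/scrapers/async_book_fetcher.py` is an assumption, not something in this PR:

```python
# apps/books/scrapers/async_book_fetcher.py  (hypothetical module path)
import asyncio

from apps.books.scrapers.base_scraper import BaseScraper
from logger.books.log import get_logger

logger = get_logger(__name__)


class AsyncBookFetcher(BaseScraper):
    """Rate-limited, concurrency-capped fetcher for individual book pages."""

    def __init__(self, base_domain: str, delay: float = 2.0, max_concurrent: int = 3):
        super().__init__(delay)
        self.base_domain = base_domain
        self.semaphore = asyncio.Semaphore(max_concurrent)
        self._last_request_time = 0.0

    # scrape_book() would move here unchanged from parse_books.py
```

The management command would then only need `from apps.books.scrapers.async_book_fetcher import AsyncBookFetcher`.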

def __init__(self, base_domain: str, delay: float = 2.0, max_concurrent: int = 3):
super().__init__(delay)
self.base_domain = base_domain
self.semaphore = asyncio.Semaphore(max_concurrent)
self._last_request_time = 0

async def scrape_book(self, url: str):
async with self.semaphore:
now = asyncio.get_event_loop().time()
elapsed = now - self._last_request_time
if elapsed < self.delay:
await asyncio.sleep(self.delay - elapsed)
self._last_request_time = asyncio.get_event_loop().time()

if url.startswith("/"):
url = urljoin(self.base_domain, url)

logger.info(f"fetching book: {url}")
html = await self.fetch(url)
if not html:
logger.warning(f"failed to fetch {url}")
return None

parser = BookParser(html)
book_data = {
"url": url,
"book_title": parser.extract_book_name().get("book_title", ""),
Owner: This looks a bit odd to me here; why not handle it inside the method?
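
For illustration, a hedged sketch of handling this inside the method so that `extract_book_name()` returns the string directly (a suggestion, not code from this PR; `extract_description()` could be treated the same way):

```python
def extract_book_name(self) -> str:
    """Return the book title, or an empty string when it cannot be found."""
    try:
        title = self.soup.select_one("div.product-info h1")
        if not title:
            logger.warning("book title not found in HTML")
            return ""
        return title.get_text(strip=True)
    except Exception as e:
        logger.error(f"book title extraction failed {str(e)}")
        return ""
```

The caller would then shrink to `"book_title": parser.extract_book_name(),`.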

"author": parser.extract_authors(),
"price": parser.extract_price(),
"details": parser.extract_all_params(),
"description": parser.extract_description().get("description", ""),
"cover": parser.extract_cover_image(),
}
logger.debug(f"parsed book data for: {book_data['book_title']}")
return book_data
Owner: Review the whole pull request for compliance with the SOLID principles (SRP first, DI second, the rest optional).

SOLID habr

SOLID claude



class Command(BaseCommand):
Owner: Docstrings and type hints could be added.
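
A minimal sketch of the command with docstrings and type hints added (the docstring wording is illustrative, not from the PR):

```python
class Command(BaseCommand):
    """Management command that imports books from piter.com into the database."""

    help = "Парсит книги с сайта Piter и сохраняет в базу данных"

    def handle(self, *args, **kwargs) -> None:
        """Entry point for `manage.py parse_books`."""
        logger.info("starting book import from Piter")
        asyncio.run(self.import_books())
        logger.info("book import finished")

    async def save_book(self, item: dict) -> None:
        """Persist a single parsed book dict through the synchronous BookSaver."""
        await sync_to_async(book_saver.save_book)(item)
```

`import_books()` would get the same treatment (`-> None` plus a one-line docstring).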

help = "Парсит книги с сайта Piter и сохраняет в базу данных"

def handle(self, *args, **kwargs):
logger.info("starting book import from Piter")
asyncio.run(self.import_books())
logger.info("book import finished")

async def import_books(self):
piter = PiterScraper()
book_scraper = AsyncBookFetcher(base_domain="https://www.piter.com")

tasks = []
async for link in piter.scrape_book_links():
logger.debug(f"found book link: {link}")
task = asyncio.create_task(book_scraper.scrape_book(link))
tasks.append(task)

for coro in asyncio.as_completed(tasks):
book = await coro
if book:
await self.save_book(book)

async def save_book(self, item: dict):
await sync_to_async(book_saver.save_book)(item)
Empty file added apps/books/scrapers/__init__.py
24 changes: 24 additions & 0 deletions apps/books/scrapers/base_scraper.py
@@ -0,0 +1,24 @@
import asyncio
import httpx

from bs4 import BeautifulSoup


class BaseScraper:
def __init__(self, delay: float = 1.0):
self.headers = {"User-Agent": "Mozilla/5.0"}
self.delay = delay

async def fetch(self, url):
try:
await asyncio.sleep(self.delay)
async with httpx.AsyncClient(timeout=10.0) as client:
response = await client.get(url, headers=self.headers)
response.raise_for_status()
return response.text
except (httpx.HTTPError, asyncio.TimeoutError) as e:
print(f"request failed: {url}, error: {str(e)}")
return None

def parse(self, html) -> BeautifulSoup:
return BeautifulSoup(html, "html.parser")
148 changes: 148 additions & 0 deletions apps/books/scrapers/piter_publ/book_parser.py
@@ -0,0 +1,148 @@
from bs4 import BeautifulSoup
from typing import List, Dict
from urllib.parse import urljoin

from logger.books.log import get_logger

logger = get_logger(__name__)


class BookParser:
def __init__(self, html: str, base_domain: str = "https://www.piter.com"):
Owner: I would move https://www.piter.com out into a global variable somewhere.
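
A hedged sketch of that suggestion; the constant name and its location are assumptions:

```python
# apps/books/scrapers/piter_publ/constants.py  (hypothetical module)
PITER_BASE_DOMAIN = "https://www.piter.com"
```

`BookParser.__init__` (and the hard-coded urljoin calls in `extract_cover_image`) could then default to `PITER_BASE_DOMAIN` instead of repeating the literal.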

self.soup = BeautifulSoup(html, "lxml")
self.base_domain = base_domain

def extract_book_name(self):
try:
title = self.soup.select_one("div.product-info h1")
if not title:
logger.warning("book title not found in HTML")
return {"book_title": ""}

result = {"book_title": title.get_text(strip=True) if title else ""}
logger.debug(f"extracted book title: {result['book_title']}")
return result
except Exception as e:
logger.error(f"book title extraction failed {str(e)}")
return {"book_title": ""}

def extract_description(self):
try:
description = self.soup.find("div", id="tab-1")
if not description:
logger.warning("book description not found in HTML")
return {"description": ""}
full_text = description.get_text(separator="\n", strip=True)
logger.debug(f"extracted book description: {full_text}")
return {"description": full_text}
except Exception as e:
logger.error(f"description extraction failed {str(e)}")
return {"description": ""}

def extract_all_params(self):
result = {}
items = self.soup.select("div.params li")

for li in items:
label = li.select_one("span.grid-5")
value = li.select_one("span.grid-7")
if label and value:
key = label.get_text(strip=True).rstrip(":")
val = value.get_text(strip=True)
result[key] = val
return result

def extract_cover_image(self):
try:
container = self.soup.select_one('div.photo, div[class*="photo"]')
if container:
img = container.select_one("img")
if img and img.get("src"):
src = img["src"].strip()
logger.debug(f"extracted cover image from container: {src}")
return {"cover_image": urljoin("https://www.piter.com", src)}

img = self.soup.select_one("img")
if img and img.get("src"):
src = img["src"].strip()
logger.debug(f"extracted cover image from img: {src}")
return {"cover_image": urljoin("https://www.piter.com", src)}

return {"cover_image": ""}
except Exception as e:
logger.error(f"cover extraction failed {str(e)}")
return {"cover": ""}

def extract_authors(self) -> List[Dict[str, str]]:
try:
authors = []
author_blocks = self.soup.select("#tab-2 .autor-wrapper")

for block in author_blocks:
name_tag = block.select_one("h2")
if name_tag:
full_name = name_tag.get_text(strip=True)
parts = full_name.split()
if len(parts) == 1:
Owner: The bare 1, 2, 3 here are the magic numbers antipattern.
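
One hedged way to address this is a small helper that also collapses the two-word and three-plus-word branches (the helper name and its placement are illustrative):

```python
from typing import Tuple


def split_author_name(full_name: str) -> Tuple[str, str]:
    """Split a raw 'Last First [Middle ...]' author string into (first_name, last_name)."""
    parts = full_name.split()
    if not parts:
        return "", ""
    if len(parts) == 1:
        # a single word is treated as the last name
        return "", parts[0]
    # the second word is the first name; everything else joins into the last name
    first_name = parts[1]
    last_name = " ".join([parts[0]] + parts[2:])
    return first_name, last_name
```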

last_name = parts[0]
first_name = ""
elif len(parts) == 2:
last_name, first_name = parts
elif len(parts) >= 3:
first_name = parts[1]
last_name = " ".join([parts[0]] + parts[2:])
else:
last_name = ""
first_name = ""
logger.warning(f"empty author name: {full_name}")

description_block = name_tag.parent
bio_parts = []
for bio in description_block.contents:
if bio != name_tag and isinstance(bio, str):
bio_parts.append(bio.strip())
bio = " ".join(bio_parts).strip()

authors.append(
{
"first_name": first_name.strip("."),
"last_name": last_name,
"bio": bio,
}
)

logger.info(f"parsed {len(authors)} authors from tab-2")
return authors
except Exception as e:
logger.error(f"failed to parse authors from tab-2: {str(e)}")
logger.exception("tab-2 author parsing error details")
return []

def extract_author_bio(self) -> str:
try:
block = self.soup.select_one("div.author-wrapper div.grid-9.s-grid-12")
if not block:
logger.warning("block with author_bio not found in HTML")
return ""
name_tag = block.select_one("h2")
name_text = name_tag.get_text(strip=True) if name_tag else ""
full_text = block.get_text(separator=" ", strip=True)
logger.debug(f"extracted author_bio: {full_text}")
return full_text.replace(name_text, "", 1).strip()
except Exception as e:
logger.error(f"author_bio extraction failed {str(e)}")
return {"author_bio": ""}

def extract_price(self):
try:
price = self.soup.select("div.price.color")
if len(price) >= 2:
logger.debug(f"extracted price: {price}")
return {
"price": price[0].text.strip(),
"electronic_price": price[1].text.strip(),
}
return {}
except Exception as e:
logger.error(f"price extraction failed {str(e)}")
return {"price": "", "electronic_price": ""}
38 changes: 38 additions & 0 deletions apps/books/scrapers/piter_publ/link_extractor.py
@@ -0,0 +1,38 @@
from bs4 import BeautifulSoup
from urllib.parse import urljoin
from typing import List

from logger.books.log import get_logger

logger = get_logger(__name__)


class LinkExtractor:
def __init__(
self, base_domain: str, expected_prefix: str = "/collection/all/product/"
):
self.base_domain = base_domain
self.expected_prefix = expected_prefix

def extract_links(self, soup: BeautifulSoup) -> List[str]:
"""
Extracts the full links to book pages from a listing page.
"""
try:
links = []

container = soup.find("div", class_="products-list")
if not container:
return []

for tag in container.find_all("a"):
href = tag.get("href")
if href and href.startswith(self.expected_prefix):
full_url = urljoin(self.base_domain, href)
links.append(full_url)

logger.info(f"collected links: {links}")
return links
except Exception as e:
logger.error(f"failed to parse links: {str(e)}")
return []
32 changes: 32 additions & 0 deletions apps/books/scrapers/piter_publ/paginator.py
@@ -0,0 +1,32 @@
import re
from bs4 import BeautifulSoup
from urllib.parse import urljoin

from logger.books.log import get_logger

logger = get_logger(__name__)


class Paginator:
def __init__(self, base_domain: str):
self.base_domain = base_domain

def get_next_page(self, soup: BeautifulSoup):
"""
возвращает полный URL следующей страницы, если она существует
"""
next_button = soup.find("a", string="Следующая")
if not next_button:
logger.info("no next page")
return None

href = next_button.get("href")
if not href:
return None

match = re.search(r"page=(\d+)", href)
if match:
logger.success(f"found next page: {href}")
return urljoin(self.base_domain, href)

return None