From d766df4b12478336b88d5340e9fc9cc0e4336da9 Mon Sep 17 00:00:00 2001 From: thkim7 Date: Mon, 15 Sep 2025 18:55:59 +0900 Subject: [PATCH 1/3] =?UTF-8?q?feat:=20crawling=5Futil=20=ED=95=9C?= =?UTF-8?q?=EA=B0=9C=EB=A1=9C=20=ED=95=A9=EC=B9=A8=201.=20service=20?= =?UTF-8?q?=EC=88=98=EC=A0=95=202.=20schemas=20=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../app/api/endpoints/blog.py | 6 +- .../app/api/endpoints/keywords.py | 6 +- .../app/api/endpoints/product.py | 45 ++--- apps/pre-processing-service/app/api/router.py | 1 - .../service/blog/base_blog_post_service.py | 65 +++---- .../service/blog/blogger_blog_post_adapter.py | 82 ++++++++ .../service/blog/blogger_blog_post_service.py | 93 ++++----- .../app/service/crawl_service.py | 5 +- .../app/service/crawlers/__init__.py | 0 .../app/service/crawlers/base_crawler.py | 56 ++++++ .../crawlers/detail_crawler.py} | 182 +----------------- .../app/service/crawlers/search_crawler.py | 136 +++++++++++++ .../app/service/search_service.py | 2 +- .../app/service/similarity_service.py | 12 -- .../app/test/test_keyword.py | 11 -- .../app/test/test_match_service.py | 10 - .../app/test/test_sadagu_crawl.py | 11 +- .../app/test/test_search_service.py | 8 +- .../app/test/test_similarity_service.py | 13 -- .../app/utils/crawling_util.py | 77 ++++---- 20 files changed, 420 insertions(+), 401 deletions(-) create mode 100644 apps/pre-processing-service/app/service/blog/blogger_blog_post_adapter.py create mode 100644 apps/pre-processing-service/app/service/crawlers/__init__.py create mode 100644 apps/pre-processing-service/app/service/crawlers/base_crawler.py rename apps/pre-processing-service/app/{utils/crawler_utils.py => service/crawlers/detail_crawler.py} (55%) create mode 100644 apps/pre-processing-service/app/service/crawlers/search_crawler.py diff --git a/apps/pre-processing-service/app/api/endpoints/blog.py b/apps/pre-processing-service/app/api/endpoints/blog.py index 138fb706..158faf20 100644 --- a/apps/pre-processing-service/app/api/endpoints/blog.py +++ b/apps/pre-processing-service/app/api/endpoints/blog.py @@ -4,7 +4,7 @@ from ...model.schemas import * from app.service.blog.tistory_blog_post_service import TistoryBlogPostService from app.service.blog.naver_blog_post_service import NaverBlogPostService -from ...service.blog.blogger_blog_post_service import BloggerBlogPostService +from ...service.blog.blogger_blog_post_adapter import BloggerBlogPostAdapter # 수정된 import router = APIRouter() @@ -62,7 +62,7 @@ async def publish(request: RequestBlogPublish): return ResponseBlogPublish(status="success", metadata=result) elif request.tag == "blogger": - blogger_service = BloggerBlogPostService() + blogger_service = BloggerBlogPostAdapter() # 수정: Adapter 사용 result = blogger_service.post_content( title=request.post_title, content=request.post_content, @@ -74,4 +74,4 @@ async def publish(request: RequestBlogPublish): "블로거 블로그 포스팅에 실패했습니다.", status_code=500 ) - return ResponseBlogPublish(status="success", metadata=result) + return ResponseBlogPublish(status="success", metadata=result) \ No newline at end of file diff --git a/apps/pre-processing-service/app/api/endpoints/keywords.py b/apps/pre-processing-service/app/api/endpoints/keywords.py index 92c8a66b..43c0049b 100644 --- a/apps/pre-processing-service/app/api/endpoints/keywords.py +++ b/apps/pre-processing-service/app/api/endpoints/keywords.py @@ -5,6 +5,9 @@ router = APIRouter() +@router.get("/") +async def root(): + return {"message": "keyword API"} @router.post( "/search", response_model=ResponseNaverSearch, summary="네이버 키워드 검색" @@ -15,9 +18,6 @@ async def search(request: RequestNaverSearch): 요청 예시: { - "job_id": 1, - "schedule_id": 1, - "schedule_his_id": 1, "tag": "naver", "category": "50000000", "start_date": "2025-09-01", diff --git a/apps/pre-processing-service/app/api/endpoints/product.py b/apps/pre-processing-service/app/api/endpoints/product.py index 95b983e4..ab309595 100644 --- a/apps/pre-processing-service/app/api/endpoints/product.py +++ b/apps/pre-processing-service/app/api/endpoints/product.py @@ -8,6 +8,7 @@ from ...service.crawl_service import CrawlService from ...service.search_service import SearchService from ...service.match_service import MatchService +from ...service.similarity_service import SimilarityService # from ...service.similarity_service import SimilarityService @@ -54,33 +55,33 @@ async def match(request: RequestSadaguMatch): raise HTTPException(status_code=500, detail=str(e)) -# @router.post( -# "/similarity", response_model=ResponseSadaguSimilarity, summary="상품 유사도 분석" -# ) -# async def similarity(request: RequestSadaguSimilarity): -# """ -# 매칭된 상품들 중 키워드와의 유사도를 계산하여 최적의 상품을 선택합니다. -# """ -# try: -# similarity_service = SimilarityService() -# result = similarity_service.select_product_by_similarity(request) -# -# if not result: -# raise CustomException( -# 500, "유사도 분석에 실패했습니다.", "SIMILARITY_FAILED" -# ) -# -# return result -# except InvalidItemDataException as e: -# raise HTTPException(status_code=e.status_code, detail=e.detail) -# except Exception as e: -# raise HTTPException(status_code=500, detail=str(e)) +@router.post( + "/similarity", response_model=ResponseSadaguSimilarity, summary="상품 유사도 분석" +) +async def similarity(request: RequestSadaguSimilarity): + """ + 매칭된 상품들 중 키워드와의 유사도를 계산하여 최적의 상품을 선택합니다. + """ + try: + similarity_service = SimilarityService() + result = similarity_service.select_product_by_similarity(request) + + if not result: + raise CustomException( + 500, "유사도 분석에 실패했습니다.", "SIMILARITY_FAILED" + ) + + return result + except InvalidItemDataException as e: + raise HTTPException(status_code=e.status_code, detail=e.detail) + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) @router.post( "/crawl", response_model=ResponseSadaguCrawl, summary="상품 상세 정보 크롤링" ) -async def crawl(request: Request, body: RequestSadaguCrawl): +async def crawl(body: RequestSadaguCrawl): """ 상품 상세 페이지를 크롤링하여 상세 정보를 수집합니다. """ diff --git a/apps/pre-processing-service/app/api/router.py b/apps/pre-processing-service/app/api/router.py index 99286cf6..b180c97e 100644 --- a/apps/pre-processing-service/app/api/router.py +++ b/apps/pre-processing-service/app/api/router.py @@ -17,7 +17,6 @@ # 모듈 테스터를 위한 endpoint -> 추후 삭제 예정 api_router.include_router(test.router, prefix="/tests", tags=["Test"]) - @api_router.get("/ping") async def root(): return {"message": "서버 실행중입니다."} diff --git a/apps/pre-processing-service/app/service/blog/base_blog_post_service.py b/apps/pre-processing-service/app/service/blog/base_blog_post_service.py index ff4b2754..d6d6989b 100644 --- a/apps/pre-processing-service/app/service/blog/base_blog_post_service.py +++ b/apps/pre-processing-service/app/service/blog/base_blog_post_service.py @@ -1,61 +1,48 @@ from abc import ABC, abstractmethod -from typing import Dict, List, Optional +from typing import Dict from app.utils.crawling_util import CrawlingUtil from app.errors.BlogPostingException import * from app.errors.CrawlingException import * - class BaseBlogPostService(ABC): """ 블로그 포스팅 서비스 추상 클래스 """ - def __init__(self, config_file="blog_config.json"): - """공통 초기화 로직""" - # Selenium 기반 서비스를 위한 초기화 - if self._requires_webdriver(): + def __init__(self, use_webdriver=True): + """ + 공통 초기화 로직 + :param use_webdriver: 웹드라이버 사용 여부 (API 서비스의 경우 False) + """ + self.use_webdriver = use_webdriver + + if self.use_webdriver: try: - self.crawling_service = CrawlingUtil() + # 블로그 포스팅용 설정으로 초기화 + self.crawling_service = CrawlingUtil( + headless=False, # 네이버 탐지 우회를 위해 headless 비활성화 + for_blog_posting=True + ) self.web_driver = self.crawling_service.get_driver() self.wait_driver = self.crawling_service.get_wait() except Exception: raise WebDriverConnectionException() else: - # API 기반 서비스의 경우 WebDriver가 필요 없음 self.crawling_service = None self.web_driver = None self.wait_driver = None - # API 기반 서비스를 위한 초기화 - self.config_file = config_file - self.config = {} - self.current_upload_account = None - - # API 관련 속성들 (사용하지 않는 서비스에서는 None으로 유지) - self.blogger_service = None - self.blog_id = None - self.scopes = None - self._load_config() - def _requires_webdriver(self) -> bool: - """ - 서브클래스에서 WebDriver가 필요한지 여부를 반환 - 기본값은 True (Selenium 기반), API 기반 서비스에서는 False로 오버라이드 - """ - return True - @abstractmethod def _load_config(self) -> None: """플랫폼별 설정 로드""" pass + @abstractmethod def _login(self) -> None: - """ - 플랫폼별 로그인 구현 (API 기반 서비스의 경우 인증으로 대체) - 기본 구현은 아무것도 하지 않음 (API 서비스용) - """ + """플랫폼별 로그인 구현""" pass @abstractmethod @@ -74,15 +61,21 @@ def _get_platform_name(self) -> str: pass @abstractmethod - def _validate_content( - self, title: str, content: str, tags: Optional[List[str]] = None - ) -> None: + def _validate_content(self, title: str, content: str, tags: Optional[List[str]] = None) -> None: """ 공통 유효성 검사 로직 :param title: 포스트 제목 :param content: 포스트 내용 :param tags: 포스트 태그 리스트 """ + # if not title or not title.strip(): + # raise BlogContentValidationException("title", "제목이 비어있습니다") + # + # if not content or not content.strip(): + # raise BlogContentValidationException("content", "내용이 비어있습니다") + # + # if tags is None: + # raise BlogContentValidationException("tags", "태그가 비어있습니다") pass def post_content(self, title: str, content: str, tags: List[str] = None) -> Dict: @@ -96,7 +89,7 @@ def post_content(self, title: str, content: str, tags: List[str] = None) -> Dict # 1. 콘텐츠 유효성 검사 self._validate_content(title, content, tags) - # 2. 로그인 (Selenium 기반) 또는 인증 (API 기반) + # 2. 로그인 self._login() # 3. 포스트 작성 및 발행 @@ -107,10 +100,10 @@ def post_content(self, title: str, content: str, tags: List[str] = None) -> Dict "platform": self._get_platform_name(), "title": title, "content_length": len(content), - "tags": tags or [], + "tags": tags or [] } def __del__(self): """공통 리소스 정리""" - if hasattr(self, "web_driver") and self.web_driver: - self.web_driver.quit() + if hasattr(self, 'web_driver') and self.web_driver: + self.web_driver.quit() \ No newline at end of file diff --git a/apps/pre-processing-service/app/service/blog/blogger_blog_post_adapter.py b/apps/pre-processing-service/app/service/blog/blogger_blog_post_adapter.py new file mode 100644 index 00000000..1daba4af --- /dev/null +++ b/apps/pre-processing-service/app/service/blog/blogger_blog_post_adapter.py @@ -0,0 +1,82 @@ +from typing import Dict, List, Optional +from app.service.blog.base_blog_post_service import BaseBlogPostService +from app.service.blog.blogger_blog_post_service import BloggerApiService +from app.errors.BlogPostingException import * + + +class BloggerBlogPostAdapter(BaseBlogPostService): + """ + BaseBlogPostService와 호환되도록 BloggerApiService를 감싼 어댑터 + 현재 BaseBlogPostService 인터페이스와 호환 + """ + + def __init__(self, config_file="blog_config.json"): + # API 전용 서비스 (Adaptee) 먼저 초기화 + self.api_service = BloggerApiService(config_file=config_file) + + try: + # 부모 클래스의 웹드라이버 초기화를 시도하지만, 실패해도 무시 + # 이렇게 하면 부모의 다른 초기화 로직은 실행됨 + super().__init__() + except Exception: + # 웹드라이버 초기화 실패 시 API 서비스용으로 속성 설정 + self.crawling_service = None + self.web_driver = None + self.wait_driver = None + # 설정 로드는 직접 호출 + self._load_config() + + def _load_config(self) -> None: + """ + BloggerApiService 내부에서 이미 처리되므로 별도 구현 불필요 + """ + # API 서비스의 설정이 이미 로드되었으므로 추가 작업 없음 + pass + + def _login(self) -> None: + """ + Selenium 로그인과 달리, OAuth 인증으로 대체 + """ + try: + self.api_service.authenticate_with_google_oauth() + except Exception as e: + raise BlogLoginException("Blogger", f"OAuth 인증 실패: {str(e)}") + + def _write_content(self, title: str, content: str, tags: List[str] = None) -> None: + """ + API를 통한 포스트 작성 + """ + try: + result = self.api_service.create_post_via_api(title, content, labels=tags) + # 결과 로깅 + print(f"포스트 생성 완료: {result.get('published_url', 'URL 없음')}") + except Exception as e: + raise BlogPostPublishException("Blogger", f"포스트 작성 실패: {str(e)}") + + def _get_platform_name(self) -> str: + """플랫폼 이름 반환""" + return "Blogger" + + def _validate_content(self, title: str, content: str, tags: Optional[List[str]] = None) -> None: + """ + API 전용 유효성 검사 호출 + """ + try: + # Optional을 List로 변환 (None인 경우 빈 리스트) + tags_list = tags if tags is not None else [] + self.api_service.validate_api_content(title, content, labels=tags_list) + except Exception as e: + # BloggerApiService의 예외를 BaseBlogPostService 호환 예외로 변환 + if "title" in str(e).lower(): + raise BlogContentValidationException("title", str(e)) + elif "content" in str(e).lower(): + raise BlogContentValidationException("content", str(e)) + else: + raise BlogContentValidationException("general", str(e)) + + def __del__(self): + """ + API 서비스이므로 웹드라이버 정리가 불필요 + """ + # 웹드라이버가 없으므로 정리할 것이 없음 + pass \ No newline at end of file diff --git a/apps/pre-processing-service/app/service/blog/blogger_blog_post_service.py b/apps/pre-processing-service/app/service/blog/blogger_blog_post_service.py index 07e337d9..86de82a6 100644 --- a/apps/pre-processing-service/app/service/blog/blogger_blog_post_service.py +++ b/apps/pre-processing-service/app/service/blog/blogger_blog_post_service.py @@ -1,39 +1,32 @@ import json import os import pickle -from typing import Dict, List, Optional - from googleapiclient.discovery import build from google.auth.transport.requests import Request from google_auth_oauthlib.flow import InstalledAppFlow - from app.errors.BlogPostingException import * -from app.service.blog.base_blog_post_service import BaseBlogPostService +from typing import Dict -class BloggerBlogPostService(BaseBlogPostService): +class BloggerApiService: """ - Blogger API를 사용하여 포스팅을 관리하는 서비스 + 호환되지 않는 Blogger API 서비스 (Adaptee) + 완전히 다른 초기화/인증 방식을 사용 """ def __init__(self, config_file="blog_config.json"): - # 부모 클래스 생성자 호출 (WebDriver는 None으로 설정됨) - super().__init__() - - # API 관련 추가 초기화 self.config_file = config_file + self.config = {} + self.current_upload_account = None self.blogger_service = None self.blog_id = None self.scopes = ["https://www.googleapis.com/auth/blogger"] + self.authenticated = False - def _requires_webdriver(self) -> bool: - """API 기반 서비스는 WebDriver가 필요하지 않음""" - return False + self._load_api_config() - def _load_config(self) -> None: - """ - 플랫폼별 설정 로드 - """ + def _load_api_config(self) -> None: + """API 전용 설정 로드""" try: with open(self.config_file, "r", encoding="utf-8") as f: self.config = json.load(f) @@ -48,16 +41,11 @@ def _load_config(self) -> None: self.config = default_config self.current_upload_account = self.config["upload_account"] - def _login(self) -> None: - """ - API 인증 (Selenium의 로그인을 대체) - """ - self._authenticate_api() + def authenticate_with_google_oauth(self) -> bool: + """Google OAuth 인증 (Selenium 로그인과 완전히 다름)""" + if self.authenticated: + return True - def _authenticate_api(self): - """ - API 인증 및 서비스 객체 생성 - """ token_file = f"token_{self.current_upload_account.replace('@', '_').replace('.', '_')}.pkl" try: @@ -85,22 +73,24 @@ def _authenticate_api(self): if blogs.get("items"): self.blog_id = blogs["items"][0]["id"] print(f"API 설정 완료 - 블로그: {blogs['items'][0]['name']}") + self.authenticated = True return True else: - print("블로그를 찾을 수 없습니다.") - return False + raise BloggerApiException("블로그를 찾을 수 없습니다") + except Exception as e: - print(f"API 인증/설정 실패: {e}") raise BloggerApiException("API 인증 실패", e) - def _write_content(self, title: str, content: str, tags: List[str] = None) -> None: - """ - API를 사용하여 포스팅 작성 - """ - if not self.blogger_service or not self.blog_id: - self._authenticate_api() + def create_post_via_api(self, title: str, content: str, labels: List[str] = None) -> Dict: + """API를 통한 포스트 생성 (Selenium write_content와 완전히 다름)""" + if not self.authenticated: + self.authenticate_with_google_oauth() - post_data = {"title": title, "content": content, "labels": tags or []} + post_data = { + "title": title, + "content": content, + "labels": labels or [] + } try: result = ( @@ -109,35 +99,20 @@ def _write_content(self, title: str, content: str, tags: List[str] = None) -> No .execute() ) - print(f"포스트 생성 완료: {result.get('url')}") + return { + "blogger_post_id": result.get("id"), + "published_url": result.get("url"), + "status": "published" + } except Exception as e: raise BlogPostPublishException( platform="Blogger", reason="API 통신 중 오류가 발생했습니다." ) from e - def _get_platform_name(self) -> str: - """플랫폼 이름 반환""" - return "Blogger" - - def _validate_content( - self, title: str, content: str, tags: Optional[List[str]] = None - ) -> None: - """ - 공통 유효성 검사 로직 - """ + def validate_api_content(self, title: str, content: str, labels: List[str] = None) -> None: + """API 전용 유효성 검사""" if not title or not title.strip(): raise BlogContentValidationException("title", "제목이 비어있습니다") - if not content or not content.strip(): raise BlogContentValidationException("content", "내용이 비어있습니다") - - # 태그 유효성 검사도 필요에 따라 추가 - # if not tags or not isinstance(tags, list): - # raise BlogContentValidationException("tags", "태그는 리스트 형태여야 합니다") - - def __del__(self): - """ - 리소스 정리 - API 기반 서비스는 별도 정리 불필요 - 부모 클래스의 __del__이 WebDriver 정리를 처리 - """ - super().__del__() + # Blogger는 태그가 선택사항 \ No newline at end of file diff --git a/apps/pre-processing-service/app/service/crawl_service.py b/apps/pre-processing-service/app/service/crawl_service.py index 4122bb2e..8543658e 100644 --- a/apps/pre-processing-service/app/service/crawl_service.py +++ b/apps/pre-processing-service/app/service/crawl_service.py @@ -1,5 +1,5 @@ import time -from app.utils.crawler_utils import DetailCrawler +from app.service.crawlers.detail_crawler import DetailCrawler from app.errors.CustomException import InvalidItemDataException from app.model.schemas import RequestSadaguCrawl from loguru import logger @@ -37,9 +37,6 @@ async def crawl_product_detail(self, request: RequestSadaguCrawl) -> dict: # 응답 데이터 구성 response_data = { - "job_id": request.job_id, - "schedule_id": request.schedule_id, - "schedule_his_id": request.schedule_his_id, "tag": request.tag, "product_url": str(request.product_url), "product_detail": product_detail, diff --git a/apps/pre-processing-service/app/service/crawlers/__init__.py b/apps/pre-processing-service/app/service/crawlers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/apps/pre-processing-service/app/service/crawlers/base_crawler.py b/apps/pre-processing-service/app/service/crawlers/base_crawler.py new file mode 100644 index 00000000..dc495843 --- /dev/null +++ b/apps/pre-processing-service/app/service/crawlers/base_crawler.py @@ -0,0 +1,56 @@ +import httpx +import time +from abc import ABC, abstractmethod +from bs4 import BeautifulSoup +from loguru import logger +from app.utils.crawling_util import CrawlingUtil + + +class BaseCrawler(ABC): + """크롤러 기본 클래스""" + + def __init__(self, use_selenium: bool = True, headless: bool = True): + self.base_url = "https://ssadagu.kr" + self.use_selenium = use_selenium + + if use_selenium: + self._setup_selenium(headless) + else: + self._setup_httpx() + + def _setup_selenium(self, headless: bool): + """Selenium WebDriver 초기화""" + try: + self.crawling_util = CrawlingUtil(headless=headless) + self.driver = self.crawling_util.get_driver() + self.wait = self.crawling_util.get_wait() + logger.info("Selenium WebDriver 초기화 완료") + except Exception as e: + logger.warning(f"Selenium 초기화 실패, httpx로 대체: {e}") + self.use_selenium = False + self._setup_httpx() + + def _setup_httpx(self): + """httpx 클라이언트 초기화""" + self.client = httpx.AsyncClient( + headers={ + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" + }, + timeout=30.0, + ) + logger.info("httpx 클라이언트 초기화 완료") + + async def close(self): + """리소스 정리""" + if self.use_selenium and hasattr(self, 'crawling_util'): + try: + self.crawling_util.close() + logger.info("Selenium WebDriver 종료 완료") + except Exception as e: + logger.warning(f"Selenium WebDriver 종료 중 오류: {e}") + elif hasattr(self, 'client'): + try: + await self.client.aclose() + logger.info("httpx 클라이언트 종료 완료") + except Exception as e: + logger.warning(f"httpx 클라이언트 종료 중 오류: {e}") \ No newline at end of file diff --git a/apps/pre-processing-service/app/utils/crawler_utils.py b/apps/pre-processing-service/app/service/crawlers/detail_crawler.py similarity index 55% rename from apps/pre-processing-service/app/utils/crawler_utils.py rename to apps/pre-processing-service/app/service/crawlers/detail_crawler.py index 5c593b9f..83829f5a 100644 --- a/apps/pre-processing-service/app/utils/crawler_utils.py +++ b/apps/pre-processing-service/app/service/crawlers/detail_crawler.py @@ -1,185 +1,9 @@ -import urllib.parse -import httpx -import re import time +import re from bs4 import BeautifulSoup -from selenium import webdriver -from selenium.webdriver.chrome.options import Options -from selenium.webdriver.common.by import By -from selenium.webdriver.support.ui import WebDriverWait -from selenium.common.exceptions import TimeoutException, NoSuchElementException +from .search_crawler import SearchCrawler from loguru import logger - -class SearchCrawler: - def __init__(self, use_selenium=True): - self.base_url = "https://ssadagu.kr" - self.use_selenium = use_selenium - - if use_selenium: - self._setup_selenium() - else: - self._setup_httpx() - - def _setup_selenium(self): - """Selenium WebDriver 초기화""" - chrome_options = Options() - chrome_options.add_argument("--headless") - chrome_options.add_argument("--no-sandbox") - chrome_options.add_argument("--disable-dev-shm-usage") - chrome_options.add_argument("--disable-gpu") - chrome_options.add_argument("--window-size=1920,1080") - chrome_options.add_argument( - "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" - ) - - try: - self.driver = webdriver.Chrome(options=chrome_options) - self.wait = WebDriverWait(self.driver, 10) - logger.info("Selenium WebDriver 초기화 완료") - except Exception as e: - logger.warning(f"Selenium 초기화 실패, httpx로 대체: {e}") - self.use_selenium = False - self._setup_httpx() - - def _setup_httpx(self): - """httpx 클라이언트 초기화""" - self.client = httpx.AsyncClient( - headers={ - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" - }, - timeout=30.0, - ) - logger.info("httpx 클라이언트 초기화 완료") - - async def search_products_selenium(self, keyword: str) -> list[dict]: - """Selenium을 사용한 상품 검색""" - encoded_keyword = urllib.parse.quote(keyword) - search_url = f"{self.base_url}/shop/search.php?ss_tx={encoded_keyword}" - - try: - logger.info( - f"Selenium 상품 검색 시작: keyword='{keyword}', url='{search_url}'" - ) - self.driver.get(search_url) - time.sleep(5) - - product_links = [] - link_elements = self.driver.find_elements(By.TAG_NAME, "a") - - for element in link_elements: - href = element.get_attribute("href") - if ( - href - and "view.php" in href - and ("platform=1688" in href or "num_iid" in href) - ): - try: - title = element.get_attribute("title") or element.text.strip() - if title: - product_links.append({"url": href, "title": title}) - except: - product_links.append({"url": href, "title": "Unknown Title"}) - - # 중복 제거 - seen_urls = set() - unique_products = [] - for product in product_links: - if product["url"] not in seen_urls: - seen_urls.add(product["url"]) - unique_products.append(product) - - logger.info( - f"Selenium으로 발견한 상품 링크: {len(unique_products)}개 (중복 제거 전: {len(product_links)}개)" - ) - return unique_products[:20] - - except Exception as e: - logger.error(f"Selenium 검색 오류: keyword='{keyword}', error='{e}'") - return [] - - async def search_products_httpx(self, keyword: str) -> list[dict]: - """httpx를 사용한 상품 검색""" - encoded_keyword = urllib.parse.quote(keyword) - search_url = f"{self.base_url}/shop/search.php?ss_tx={encoded_keyword}" - - try: - logger.info( - f"httpx 상품 검색 시작: keyword='{keyword}', url='{search_url}'" - ) - response = await self.client.get(search_url) - response.raise_for_status() - soup = BeautifulSoup(response.content, "html.parser") - - product_links = [] - all_links = soup.find_all("a", href=True) - - for link in all_links: - href = link["href"] - if "view.php" in href and ( - "platform=1688" in href or "num_iid" in href - ): - full_url = ( - f"{self.base_url}{href}" if href.startswith("/") else href - ) - title = ( - link.get("title", "") - or link.get_text(strip=True) - or "Unknown Title" - ) - - product_links.append({"url": full_url, "title": title}) - - logger.info(f"httpx로 발견한 상품 링크: {len(product_links)}개") - return product_links[:20] - - except Exception as e: - logger.error(f"httpx 검색 오류: keyword='{keyword}', error='{e}'") - return [] - - async def get_basic_product_info(self, product_url: str) -> dict: - """기본 상품 정보만 크롤링""" - try: - logger.debug(f"기본 상품 정보 크롤링 시작: url='{product_url}'") - - if self.use_selenium: - self.driver.get(product_url) - self.wait.until( - lambda driver: driver.execute_script("return document.readyState") - == "complete" - ) - soup = BeautifulSoup(self.driver.page_source, "html.parser") - else: - response = await self.client.get(product_url) - response.raise_for_status() - soup = BeautifulSoup(response.content, "html.parser") - - title_element = soup.find("h1", {"id": "kakaotitle"}) - title = title_element.get_text(strip=True) if title_element else "제목 없음" - - logger.debug(f"기본 상품 정보 크롤링 완료: title='{title[:50]}'") - return {"url": product_url, "title": title} - - except Exception as e: - logger.error(f"기본 상품 크롤링 오류: url='{product_url}', error='{e}'") - return None - - async def close(self): - """리소스 정리""" - if self.use_selenium and hasattr(self, "driver"): - try: - self.driver.quit() - logger.info("Selenium WebDriver 종료 완료") - except Exception as e: - logger.warning(f"Selenium WebDriver 종료 중 오류: {e}") - elif hasattr(self, "client"): - try: - await self.client.aclose() - logger.info("httpx 클라이언트 종료 완료") - except Exception as e: - logger.warning(f"httpx 클라이언트 종료 중 오류: {e}") - - class DetailCrawler(SearchCrawler): """SearchCrawler를 확장한 상세 크롤링 클래스""" @@ -404,4 +228,4 @@ def _extract_images(self, soup: BeautifulSoup) -> list[str]: logger.debug(f"이미지 URL 추출: {src}") logger.info(f"총 {len(images)}개 이미지 URL 추출 완료") - return images + return images \ No newline at end of file diff --git a/apps/pre-processing-service/app/service/crawlers/search_crawler.py b/apps/pre-processing-service/app/service/crawlers/search_crawler.py new file mode 100644 index 00000000..41610a2d --- /dev/null +++ b/apps/pre-processing-service/app/service/crawlers/search_crawler.py @@ -0,0 +1,136 @@ +import urllib.parse +import time +from .base_crawler import BaseCrawler +from loguru import logger +from bs4 import BeautifulSoup +from selenium.webdriver.common.by import By + +class SearchCrawler(BaseCrawler): + """상품 검색 전용 크롤러""" + + async def search_products_selenium(self, keyword: str) -> list[dict]: + """Selenium을 사용한 상품 검색""" + encoded_keyword = urllib.parse.quote(keyword) + search_url = f"{self.base_url}/shop/search.php?ss_tx={encoded_keyword}" + + try: + logger.info( + f"Selenium 상품 검색 시작: keyword='{keyword}', url='{search_url}'" + ) + self.driver.get(search_url) + time.sleep(5) + + product_links = [] + link_elements = self.driver.find_elements(By.TAG_NAME, "a") + + for element in link_elements: + href = element.get_attribute("href") + if ( + href + and "view.php" in href + and ("platform=1688" in href or "num_iid" in href) + ): + try: + title = element.get_attribute("title") or element.text.strip() + if title: + product_links.append({"url": href, "title": title}) + except: + product_links.append({"url": href, "title": "Unknown Title"}) + + # 중복 제거 + seen_urls = set() + unique_products = [] + for product in product_links: + if product["url"] not in seen_urls: + seen_urls.add(product["url"]) + unique_products.append(product) + + logger.info( + f"Selenium으로 발견한 상품 링크: {len(unique_products)}개 (중복 제거 전: {len(product_links)}개)" + ) + return unique_products[:20] + + except Exception as e: + logger.error(f"Selenium 검색 오류: keyword='{keyword}', error='{e}'") + return [] + + async def search_products_httpx(self, keyword: str) -> list[dict]: + """httpx를 사용한 상품 검색""" + encoded_keyword = urllib.parse.quote(keyword) + search_url = f"{self.base_url}/shop/search.php?ss_tx={encoded_keyword}" + + try: + logger.info( + f"httpx 상품 검색 시작: keyword='{keyword}', url='{search_url}'" + ) + response = await self.client.get(search_url) + response.raise_for_status() + soup = BeautifulSoup(response.content, "html.parser") + + product_links = [] + all_links = soup.find_all("a", href=True) + + for link in all_links: + href = link["href"] + if "view.php" in href and ( + "platform=1688" in href or "num_iid" in href + ): + full_url = ( + f"{self.base_url}{href}" if href.startswith("/") else href + ) + title = ( + link.get("title", "") + or link.get_text(strip=True) + or "Unknown Title" + ) + + product_links.append({"url": full_url, "title": title}) + + logger.info(f"httpx로 발견한 상품 링크: {len(product_links)}개") + return product_links[:20] + + except Exception as e: + logger.error(f"httpx 검색 오류: keyword='{keyword}', error='{e}'") + return [] + + async def get_basic_product_info(self, product_url: str) -> dict: + """기본 상품 정보만 크롤링""" + try: + logger.debug(f"기본 상품 정보 크롤링 시작: url='{product_url}'") + + if self.use_selenium: + self.driver.get(product_url) + self.wait.until( + lambda driver: driver.execute_script("return document.readyState") + == "complete" + ) + soup = BeautifulSoup(self.driver.page_source, "html.parser") + else: + response = await self.client.get(product_url) + response.raise_for_status() + soup = BeautifulSoup(response.content, "html.parser") + + title_element = soup.find("h1", {"id": "kakaotitle"}) + title = title_element.get_text(strip=True) if title_element else "제목 없음" + + logger.debug(f"기본 상품 정보 크롤링 완료: title='{title[:50]}'") + return {"url": product_url, "title": title} + + except Exception as e: + logger.error(f"기본 상품 크롤링 오류: url='{product_url}', error='{e}'") + return None + + async def close(self): + """리소스 정리""" + if self.use_selenium and hasattr(self, "driver"): + try: + self.driver.quit() + logger.info("Selenium WebDriver 종료 완료") + except Exception as e: + logger.warning(f"Selenium WebDriver 종료 중 오류: {e}") + elif hasattr(self, "client"): + try: + await self.client.aclose() + logger.info("httpx 클라이언트 종료 완료") + except Exception as e: + logger.warning(f"httpx 클라이언트 종료 중 오류: {e}") \ No newline at end of file diff --git a/apps/pre-processing-service/app/service/search_service.py b/apps/pre-processing-service/app/service/search_service.py index 4cb1bf99..f7255e61 100644 --- a/apps/pre-processing-service/app/service/search_service.py +++ b/apps/pre-processing-service/app/service/search_service.py @@ -1,4 +1,4 @@ -from app.utils.crawler_utils import SearchCrawler +from app.service.crawlers.search_crawler import SearchCrawler from app.errors.CustomException import InvalidItemDataException from ..model.schemas import RequestSadaguSearch from loguru import logger diff --git a/apps/pre-processing-service/app/service/similarity_service.py b/apps/pre-processing-service/app/service/similarity_service.py index dbd2b762..9015bd94 100644 --- a/apps/pre-processing-service/app/service/similarity_service.py +++ b/apps/pre-processing-service/app/service/similarity_service.py @@ -27,9 +27,6 @@ def select_product_by_similarity(self, request: RequestSadaguSimilarity) -> dict f"매칭된 상품과 검색 결과가 모두 없음: keyword='{keyword}'" ) return { - "job_id": request.job_id, - "schedule_id": request.schedule_id, - "schedule_his_id": request.schedule_his_id, "keyword": keyword, "selected_product": None, "reason": "매칭된 상품과 검색 결과가 모두 없음", @@ -87,9 +84,6 @@ def select_product_by_similarity(self, request: RequestSadaguSimilarity) -> dict ) return { - "job_id": request.job_id, - "schedule_id": request.schedule_id, - "schedule_his_id": request.schedule_his_id, "keyword": keyword, "selected_product": selected_product, "reason": f"단일 상품 - 유사도: {similarity:.4f} ({analysis_mode})", @@ -124,9 +118,6 @@ def select_product_by_similarity(self, request: RequestSadaguSimilarity) -> dict f"최고 유사도 미달: similarity={best_result['similarity']:.4f} < threshold={similarity_threshold}" ) return { - "job_id": request.job_id, - "schedule_id": request.schedule_id, - "schedule_his_id": request.schedule_his_id, "keyword": keyword, "selected_product": None, "reason": f"최고 유사도({best_result['similarity']:.4f}) < 기준({similarity_threshold})", @@ -161,9 +152,6 @@ def select_product_by_similarity(self, request: RequestSadaguSimilarity) -> dict ) return { - "job_id": request.job_id, - "schedule_id": request.schedule_id, - "schedule_his_id": request.schedule_his_id, "keyword": keyword, "selected_product": selected_product, "reason": reason, diff --git a/apps/pre-processing-service/app/test/test_keyword.py b/apps/pre-processing-service/app/test/test_keyword.py index 2a96796e..095b6607 100644 --- a/apps/pre-processing-service/app/test/test_keyword.py +++ b/apps/pre-processing-service/app/test/test_keyword.py @@ -4,11 +4,6 @@ client = TestClient(app) -JOB_ID = 1 -SCHEDULE_ID = 1 -SCHEDULE_HIS_ID = 1 - - def test_read_root(): response = client.get("/keywords/") assert response.status_code == 200 @@ -26,9 +21,6 @@ def test_read_root(): ) def test_search(tag, category, start_date, end_date): body = { - "job_id": JOB_ID, - "schedule_id": SCHEDULE_ID, - "schedule_his_id": SCHEDULE_HIS_ID, # 오타 수정 "tag": tag, "category": category, "start_date": start_date, @@ -39,9 +31,6 @@ def test_search(tag, category, start_date, end_date): assert response.status_code == 200 response_data = response.json() - assert response_data["job_id"] == body["job_id"] - assert response_data["schedule_id"] == body["schedule_id"] - assert response_data["schedule_his_id"] == body["schedule_his_id"] # 오타 수정 assert response_data["status"] == "success" assert "keyword" in response_data assert isinstance(response_data["total_keyword"], dict) diff --git a/apps/pre-processing-service/app/test/test_match_service.py b/apps/pre-processing-service/app/test/test_match_service.py index 7750cd3d..3f50ffad 100644 --- a/apps/pre-processing-service/app/test/test_match_service.py +++ b/apps/pre-processing-service/app/test/test_match_service.py @@ -23,9 +23,6 @@ def test_match_success(): ] body = { - "job_id": 1, - "schedule_id": 1, - "schedule_his_id": 1, "keyword": "반지", "search_results": sample_search_results, } @@ -35,7 +32,6 @@ def test_match_success(): assert response.status_code == 200 data = response.json() - assert data["job_id"] == body["job_id"] assert data["keyword"] == body["keyword"] assert data["status"] == "success" assert isinstance(data["matched_products"], list) @@ -51,9 +47,6 @@ def test_match_success(): def test_match_no_results(): """검색 결과가 없는 경우""" body = { - "job_id": 2, - "schedule_id": 2, - "schedule_his_id": 2, "keyword": "반지", "search_results": [], } @@ -80,9 +73,6 @@ def test_match_no_matches(): ] body = { - "job_id": 3, - "schedule_id": 3, - "schedule_his_id": 3, "keyword": "반지", "search_results": sample_search_results, } diff --git a/apps/pre-processing-service/app/test/test_sadagu_crawl.py b/apps/pre-processing-service/app/test/test_sadagu_crawl.py index 6c6ad84a..b419b5c6 100644 --- a/apps/pre-processing-service/app/test/test_sadagu_crawl.py +++ b/apps/pre-processing-service/app/test/test_sadagu_crawl.py @@ -7,9 +7,6 @@ def test_crawl_success(): body = { - "job_id": 1, # 문자열 -> 숫자로 수정 - "schedule_id": 1, # 문자열 -> 숫자로 수정 - "schedule_his_id": 1, "tag": "detail", "product_url": "https://ssadagu.kr/shop/view.php?platform=1688&num_iid=886788894790", "use_selenium": False, @@ -21,8 +18,6 @@ def test_crawl_success(): assert response.status_code == 200 data = response.json() - assert data["job_id"] == body["job_id"] - assert data["schedule_id"] == body["schedule_id"] assert data["product_url"] == body["product_url"] assert "product_detail" in data @@ -39,7 +34,7 @@ def test_crawl_success(): # "include_images": False, # } # -# response = client.post("/products/crawl", json=body) +# response = client.post("/products/crawlers", json=body) # print(f"Response: {response.json()}") # # assert response.status_code == 200 @@ -62,7 +57,7 @@ def test_crawl_success(): # "include_images": False, # } # -# response = client.post("/products/crawl", json=body) +# response = client.post("/products/crawlers", json=body) # print(f"Response: {response.json()}") # # assert response.status_code in (400, 422, 500) @@ -79,7 +74,7 @@ def test_crawl_success(): # "include_images": True, # } # -# response = client.post("/products/crawl", json=body) +# response = client.post("/products/crawlers", json=body) # print(f"Response: {response.json()}") # # assert response.status_code == 200 diff --git a/apps/pre-processing-service/app/test/test_search_service.py b/apps/pre-processing-service/app/test/test_search_service.py index fc64c9cd..d5d3a618 100644 --- a/apps/pre-processing-service/app/test/test_search_service.py +++ b/apps/pre-processing-service/app/test/test_search_service.py @@ -7,14 +7,13 @@ def test_search_success(): """상품 검색 성공 테스트""" - body = {"job_id": 1, "schedule_id": 1, "schedule_his_id": 1, "keyword": "반지"} + body = {"keyword": "반지"} response = client.post("/products/search", json=body) print(f"Search Response: {response.json()}") assert response.status_code == 200 data = response.json() - assert data["job_id"] == body["job_id"] assert data["keyword"] == body["keyword"] assert data["status"] == "success" assert isinstance(data["search_results"], list) @@ -22,7 +21,7 @@ def test_search_success(): def test_search_empty_keyword(): """빈 키워드 검색 테스트""" - body = {"job_id": 2, "schedule_id": 2, "schedule_his_id": 2, "keyword": ""} + body = {"keyword": ""} response = client.post("/products/search", json=body) print(f"Empty keyword response: {response.json()}") @@ -36,9 +35,6 @@ def test_search_empty_keyword(): def test_search_nonexistent_keyword(): """존재하지 않는 키워드 검색""" body = { - "job_id": 3, - "schedule_id": 3, - "schedule_his_id": 3, "keyword": "zxcvbnmasdfghjklqwertyuiop123456789", } diff --git a/apps/pre-processing-service/app/test/test_similarity_service.py b/apps/pre-processing-service/app/test/test_similarity_service.py index cb84d3c3..5eeba78d 100644 --- a/apps/pre-processing-service/app/test/test_similarity_service.py +++ b/apps/pre-processing-service/app/test/test_similarity_service.py @@ -29,9 +29,6 @@ def test_similarity_with_matched_products(): ] body = { - "job_id": 1, - "schedule_id": 1, - "schedule_his_id": 1, "keyword": "반지", "matched_products": matched_products, } @@ -41,7 +38,6 @@ def test_similarity_with_matched_products(): assert response.status_code == 200 data = response.json() - assert data["job_id"] == body["job_id"] assert data["keyword"] == body["keyword"] assert data["status"] == "success" @@ -65,9 +61,6 @@ def test_similarity_fallback_to_search_results(): ] body = { - "job_id": 2, - "schedule_id": 2, - "schedule_his_id": 2, "keyword": "반지", "matched_products": [], # 매칭된 상품 없음 "search_results": search_results, # 폴백용 @@ -100,9 +93,6 @@ def test_similarity_single_candidate(): ] body = { - "job_id": 3, - "schedule_id": 3, - "schedule_his_id": 3, "keyword": "반지", "matched_products": single_product, } @@ -122,9 +112,6 @@ def test_similarity_single_candidate(): def test_similarity_no_candidates(): """후보가 없는 경우""" body = { - "job_id": 4, - "schedule_id": 4, - "schedule_his_id": 4, "keyword": "반지", "matched_products": [], "search_results": [], diff --git a/apps/pre-processing-service/app/utils/crawling_util.py b/apps/pre-processing-service/app/utils/crawling_util.py index 8ec47518..ca9d0405 100644 --- a/apps/pre-processing-service/app/utils/crawling_util.py +++ b/apps/pre-processing-service/app/utils/crawling_util.py @@ -1,59 +1,70 @@ from selenium import webdriver from selenium.webdriver.chrome.options import Options from selenium.webdriver.support.ui import WebDriverWait +from typing import Optional class CrawlingUtil: + """ + 공통 Selenium WebDriver 유틸리티 + 블로그 포스팅과 상품 크롤링 모두 지원 + """ - def __init__(self): - self.options = self._get_chrome_options() - self.driver = None - - def _get_chrome_options(self): + def __init__(self, headless: bool = False, for_blog_posting: bool = False): """ - 크롬 옵션 설정 - 1. 헤드리스 모드 비활성화 (네이버 탐지 우회) - 2. 샌드박스 비활성화 - 3. GPU 비활성화 - 4. 완전한 사용자 에이전트 설정 - 5. 자동화 탐지 우회 설정 + :param headless: 헤드리스 모드 사용 여부 + :param for_blog_posting: 블로그 포스팅용 설정 사용 여부 """ + self.headless = headless + self.for_blog_posting = for_blog_posting + self.options = self._get_chrome_options() + self.driver = None + def _get_chrome_options(self) -> Options: + """크롬 옵션 설정""" options = Options() - options.add_argument( - "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36" - ) - # options.add_argument('--headless') 백그라운드 실행시 주석 해제 + # 기본 설정 options.add_argument("--no-sandbox") options.add_argument("--disable-dev-shm-usage") options.add_argument("--disable-gpu") options.add_argument("--disable-extensions") - options.add_experimental_option("excludeSwitches", ["enable-automation"]) - options.add_experimental_option("useAutomationExtension", False) - options.add_argument("--disable-blink-features=AutomationControlled") - return options + # 헤드리스 모드 설정 + if self.headless: + options.add_argument("--headless") + options.add_argument("--window-size=1920,1080") - def get_driver(self): - """ - 셀레니움 웹 드라이버 반환 - :return: 셀레니움 웹 드라이버 - """ + # 블로그 포스팅용 특별 설정 (네이버 탐지 우회) + if self.for_blog_posting: + options.add_argument( + "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/131.0.0.0 Safari/537.36" + ) + options.add_experimental_option("excludeSwitches", ["enable-automation"]) + options.add_experimental_option("useAutomationExtension", False) + options.add_argument("--disable-blink-features=AutomationControlled") + else: + # 일반 크롤링용 설정 + options.add_argument( + "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" + ) + return options + + def get_driver(self) -> webdriver.Chrome: + """셀레니움 웹 드라이버 반환""" if self.driver is None: self.driver = webdriver.Chrome(options=self.options) - return self.driver - def get_wait(self, timeout: int = 15): - """ - WebDriverWait 객체 반환 - :param timeout: 대기 시간 (초) - :return: WebDriverWait 객체 - """ - + def get_wait(self, timeout: int = 15) -> WebDriverWait: + """WebDriverWait 객체 반환""" if self.driver is None: self.get_driver() - return WebDriverWait(self.driver, timeout) + + def close(self): + """드라이버 종료""" + if self.driver: + self.driver.quit() + self.driver = None \ No newline at end of file From 1acd33c8a3aff9b99bf3276ed17739f64522ba5f Mon Sep 17 00:00:00 2001 From: thkim7 Date: Tue, 16 Sep 2025 11:06:31 +0900 Subject: [PATCH 2/3] =?UTF-8?q?chore:=20=EC=BD=94=EB=93=9C=EC=97=90=20?= =?UTF-8?q?=EC=9E=88=EB=8A=94=20job=5Fid=20=EC=A0=84=EB=B6=80=20=EC=A0=9C?= =?UTF-8?q?=EA=B1=B0=201.=20middleware=20=EC=88=98=EC=A0=95=ED=95=B4?= =?UTF-8?q?=EC=95=BC=ED=95=A8(=EA=B2=BD=EB=AF=BC)=202.=20pytest=20all=20pa?= =?UTF-8?q?ss?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../app/api/endpoints/test.py | 13 ++++--------- .../app/model/schemas.py | 18 ------------------ .../app/service/crawl_service.py | 6 +++--- .../app/service/search_service.py | 8 +------- .../app/service/similarity_service.py | 7 ++----- 5 files changed, 10 insertions(+), 42 deletions(-) diff --git a/apps/pre-processing-service/app/api/endpoints/test.py b/apps/pre-processing-service/app/api/endpoints/test.py index 9e17a7c4..91977a3f 100644 --- a/apps/pre-processing-service/app/api/endpoints/test.py +++ b/apps/pre-processing-service/app/api/endpoints/test.py @@ -62,11 +62,6 @@ def with_meta(data: Mapping[str, Any], meta: Mapping[str, Any]) -> Dict[str, Any @router.get("/tester", response_model=None) async def processing_tester(): - meta = { - "job_id": 1, - "schedule_id": 1, - "schedule_his_id": 1, # ✅ 타이포 수정 - } request_dict = { "tag": "naver", "category": "50000000", @@ -74,7 +69,7 @@ async def processing_tester(): "end_date": "2025-09-02", } # 네이버 키워드 검색 - naver_request = RequestNaverSearch(**with_meta(meta, request_dict)) + naver_request = RequestNaverSearch(**with_meta(request_dict)) response_data = await keyword_search(naver_request) keyword = response_data.get("keyword") loguru.logger.info(keyword) @@ -84,21 +79,21 @@ async def processing_tester(): } # 싸다구 상품 검색 - sadagu_request = RequestSadaguSearch(**with_meta(meta, keyword)) + sadagu_request = RequestSadaguSearch(**with_meta(keyword)) search_service = SearchService() keyword_result = await search_service.search_products(sadagu_request) loguru.logger.info(keyword_result) # 싸다구 상품 매치 keyword["search_results"] = keyword_result.get("search_results") - keyword_match_request = RequestSadaguMatch(**with_meta(meta, keyword)) + keyword_match_request = RequestSadaguMatch(**with_meta(keyword)) match_service = MatchService() keyword_match_response = match_service.match_products(keyword_match_request) loguru.logger.info(keyword_match_response) # 싸다구 상품 유사도 분석 keyword["matched_products"] = keyword_match_response.get("matched_products") - keyword_similarity_request = RequestSadaguSimilarity(**with_meta(meta, keyword)) + keyword_similarity_request = RequestSadaguSimilarity(**with_meta(keyword)) # similarity_service = SimilarityService() # keyword_similarity_response = similarity_service.select_product_by_similarity( # keyword_similarity_request diff --git a/apps/pre-processing-service/app/model/schemas.py b/apps/pre-processing-service/app/model/schemas.py index 52775416..9581ad0f 100644 --- a/apps/pre-processing-service/app/model/schemas.py +++ b/apps/pre-processing-service/app/model/schemas.py @@ -5,29 +5,11 @@ # 기본 요청 class RequestBase(BaseModel): - # job_id: int = Field( - # ..., title="작업 ID", description="현재 실행 중인 작업의 고유 식별자" - # ) - # schedule_id: int = Field( - # ..., title="스케줄 ID", description="예약된 스케줄의 고유 식별자" - # ) - # schedule_his_id: Optional[int] = Field( - # None, title="스케줄 히스토리 ID", description="스케줄 실행 이력의 고유 식별자" - # ) pass # 기본 응답 class ResponseBase(BaseModel): - # job_id: int = Field( - # ..., title="작업 ID", description="현재 실행 중인 작업의 고유 식별자" - # ) - # schedule_id: int = Field( - # ..., title="스케줄 ID", description="예약된 스케줄의 고유 식별자" - # ) - # schedule_his_id: Optional[int] = Field( - # None, title="스케줄 히스토리 ID", description="스케줄 실행 이력의 고유 식별자" - # ) status: str = Field(..., title="상태", description="요청 처리 상태") pass diff --git a/apps/pre-processing-service/app/service/crawl_service.py b/apps/pre-processing-service/app/service/crawl_service.py index 8543658e..7d6a8d1a 100644 --- a/apps/pre-processing-service/app/service/crawl_service.py +++ b/apps/pre-processing-service/app/service/crawl_service.py @@ -18,7 +18,7 @@ async def crawl_product_detail(self, request: RequestSadaguCrawl) -> dict: try: logger.info( - f"상품 상세 크롤링 서비스 시작: job_id={request.job_id}, schedule_id={request.schedule_id}, product_url={request.product_url}" + f"상품 상세 크롤링 서비스 시작: product_url={request.product_url}" ) # 상세 정보 크롤링 실행 @@ -45,13 +45,13 @@ async def crawl_product_detail(self, request: RequestSadaguCrawl) -> dict: } logger.info( - f"상품 상세 크롤링 서비스 완료: job_id={request.job_id}, status=success" + f"상품 상세 크롤링 서비스 완료: status=success" ) return response_data except Exception as e: logger.error( - f"크롤링 서비스 오류: job_id={request.job_id}, product_url={request.product_url}, error='{e}'" + f"크롤링 서비스 오류: product_url={request.product_url}, error='{e}'" ) raise InvalidItemDataException() finally: diff --git a/apps/pre-processing-service/app/service/search_service.py b/apps/pre-processing-service/app/service/search_service.py index f7255e61..ec4ca59c 100644 --- a/apps/pre-processing-service/app/service/search_service.py +++ b/apps/pre-processing-service/app/service/search_service.py @@ -30,9 +30,6 @@ async def search_products(self, request: RequestSadaguSearch) -> dict: if not search_results: logger.warning(f"검색 결과가 없습니다: keyword='{keyword}'") return { - # "job_id": request.job_id, - # "schedule_id": request.schedule_id, - # "schedule_his_id": request.schedule_his_id, "keyword": keyword, "search_results": [], "status": "success", @@ -91,9 +88,6 @@ async def search_products(self, request: RequestSadaguSearch) -> dict: ) return { - # "job_id": request.job_id, - # "schedule_id": request.schedule_id, - # "schedule_his_id": request.schedule_his_id, "keyword": keyword, "search_results": enriched_results, "status": "success", @@ -101,7 +95,7 @@ async def search_products(self, request: RequestSadaguSearch) -> dict: except Exception as e: logger.error( - f"검색 서비스 오류: job_id={request.job_id}, keyword='{keyword}', error='{e}'" + f"검색 서비스 오류: keyword='{keyword}', error='{e}'" ) raise InvalidItemDataException() diff --git a/apps/pre-processing-service/app/service/similarity_service.py b/apps/pre-processing-service/app/service/similarity_service.py index 9015bd94..0241fca3 100644 --- a/apps/pre-processing-service/app/service/similarity_service.py +++ b/apps/pre-processing-service/app/service/similarity_service.py @@ -17,7 +17,7 @@ def select_product_by_similarity(self, request: RequestSadaguSimilarity) -> dict fallback_products = request.search_results or [] logger.info( - f"유사도 분석 서비스 시작: job_id={request.job_id}, keyword='{keyword}', matched_count={len(candidates) if candidates else 0}, fallback_count={len(fallback_products)}" + f"유사도 분석 서비스 시작: keyword='{keyword}', matched_count={len(candidates) if candidates else 0}, fallback_count={len(fallback_products)}" ) # 매칭된 상품이 없으면 전체 검색 결과로 폴백 @@ -64,9 +64,6 @@ def select_product_by_similarity(self, request: RequestSadaguSimilarity) -> dict f"단일 상품 유사도 미달: similarity={similarity:.4f} < threshold={similarity_threshold}" ) return { - "job_id": request.job_id, - "schedule_id": request.schedule_id, - "schedule_his_id": request.schedule_his_id, "keyword": keyword, "selected_product": None, "reason": f"단일 상품 유사도({similarity:.4f}) < 기준({similarity_threshold})", @@ -160,6 +157,6 @@ def select_product_by_similarity(self, request: RequestSadaguSimilarity) -> dict except Exception as e: logger.error( - f"유사도 분석 서비스 오류: job_id={request.job_id}, keyword='{keyword}', error='{e}'" + f"유사도 분석 서비스 오류: keyword='{keyword}', error='{e}'" ) raise InvalidItemDataException() From 5700b777962528fa83a7cdf6a5f49dc2ed7bf90c Mon Sep 17 00:00:00 2001 From: thkim7 Date: Tue, 16 Sep 2025 11:13:44 +0900 Subject: [PATCH 3/3] chore: poetry run black . --- .../app/api/endpoints/blog.py | 6 ++++-- .../app/api/endpoints/keywords.py | 2 ++ apps/pre-processing-service/app/api/router.py | 1 + .../app/service/blog/base_blog_post_service.py | 13 ++++++++----- .../service/blog/blogger_blog_post_adapter.py | 6 ++++-- .../service/blog/blogger_blog_post_service.py | 18 +++++++++--------- .../app/service/crawl_service.py | 4 +--- .../app/service/crawlers/base_crawler.py | 6 +++--- .../app/service/crawlers/detail_crawler.py | 3 ++- .../app/service/crawlers/search_crawler.py | 3 ++- .../app/service/search_service.py | 4 +--- .../app/service/similarity_service.py | 4 +--- .../app/test/test_keyword.py | 1 + .../app/utils/crawling_util.py | 2 +- 14 files changed, 40 insertions(+), 33 deletions(-) diff --git a/apps/pre-processing-service/app/api/endpoints/blog.py b/apps/pre-processing-service/app/api/endpoints/blog.py index 158faf20..85da62b2 100644 --- a/apps/pre-processing-service/app/api/endpoints/blog.py +++ b/apps/pre-processing-service/app/api/endpoints/blog.py @@ -4,7 +4,9 @@ from ...model.schemas import * from app.service.blog.tistory_blog_post_service import TistoryBlogPostService from app.service.blog.naver_blog_post_service import NaverBlogPostService -from ...service.blog.blogger_blog_post_adapter import BloggerBlogPostAdapter # 수정된 import +from ...service.blog.blogger_blog_post_adapter import ( + BloggerBlogPostAdapter, +) # 수정된 import router = APIRouter() @@ -74,4 +76,4 @@ async def publish(request: RequestBlogPublish): "블로거 블로그 포스팅에 실패했습니다.", status_code=500 ) - return ResponseBlogPublish(status="success", metadata=result) \ No newline at end of file + return ResponseBlogPublish(status="success", metadata=result) diff --git a/apps/pre-processing-service/app/api/endpoints/keywords.py b/apps/pre-processing-service/app/api/endpoints/keywords.py index 43c0049b..a1028391 100644 --- a/apps/pre-processing-service/app/api/endpoints/keywords.py +++ b/apps/pre-processing-service/app/api/endpoints/keywords.py @@ -5,10 +5,12 @@ router = APIRouter() + @router.get("/") async def root(): return {"message": "keyword API"} + @router.post( "/search", response_model=ResponseNaverSearch, summary="네이버 키워드 검색" ) diff --git a/apps/pre-processing-service/app/api/router.py b/apps/pre-processing-service/app/api/router.py index b180c97e..99286cf6 100644 --- a/apps/pre-processing-service/app/api/router.py +++ b/apps/pre-processing-service/app/api/router.py @@ -17,6 +17,7 @@ # 모듈 테스터를 위한 endpoint -> 추후 삭제 예정 api_router.include_router(test.router, prefix="/tests", tags=["Test"]) + @api_router.get("/ping") async def root(): return {"message": "서버 실행중입니다."} diff --git a/apps/pre-processing-service/app/service/blog/base_blog_post_service.py b/apps/pre-processing-service/app/service/blog/base_blog_post_service.py index d6d6989b..f55bdba0 100644 --- a/apps/pre-processing-service/app/service/blog/base_blog_post_service.py +++ b/apps/pre-processing-service/app/service/blog/base_blog_post_service.py @@ -5,6 +5,7 @@ from app.errors.BlogPostingException import * from app.errors.CrawlingException import * + class BaseBlogPostService(ABC): """ 블로그 포스팅 서비스 추상 클래스 @@ -22,7 +23,7 @@ def __init__(self, use_webdriver=True): # 블로그 포스팅용 설정으로 초기화 self.crawling_service = CrawlingUtil( headless=False, # 네이버 탐지 우회를 위해 headless 비활성화 - for_blog_posting=True + for_blog_posting=True, ) self.web_driver = self.crawling_service.get_driver() self.wait_driver = self.crawling_service.get_wait() @@ -61,7 +62,9 @@ def _get_platform_name(self) -> str: pass @abstractmethod - def _validate_content(self, title: str, content: str, tags: Optional[List[str]] = None) -> None: + def _validate_content( + self, title: str, content: str, tags: Optional[List[str]] = None + ) -> None: """ 공통 유효성 검사 로직 :param title: 포스트 제목 @@ -100,10 +103,10 @@ def post_content(self, title: str, content: str, tags: List[str] = None) -> Dict "platform": self._get_platform_name(), "title": title, "content_length": len(content), - "tags": tags or [] + "tags": tags or [], } def __del__(self): """공통 리소스 정리""" - if hasattr(self, 'web_driver') and self.web_driver: - self.web_driver.quit() \ No newline at end of file + if hasattr(self, "web_driver") and self.web_driver: + self.web_driver.quit() diff --git a/apps/pre-processing-service/app/service/blog/blogger_blog_post_adapter.py b/apps/pre-processing-service/app/service/blog/blogger_blog_post_adapter.py index 1daba4af..717a102e 100644 --- a/apps/pre-processing-service/app/service/blog/blogger_blog_post_adapter.py +++ b/apps/pre-processing-service/app/service/blog/blogger_blog_post_adapter.py @@ -57,7 +57,9 @@ def _get_platform_name(self) -> str: """플랫폼 이름 반환""" return "Blogger" - def _validate_content(self, title: str, content: str, tags: Optional[List[str]] = None) -> None: + def _validate_content( + self, title: str, content: str, tags: Optional[List[str]] = None + ) -> None: """ API 전용 유효성 검사 호출 """ @@ -79,4 +81,4 @@ def __del__(self): API 서비스이므로 웹드라이버 정리가 불필요 """ # 웹드라이버가 없으므로 정리할 것이 없음 - pass \ No newline at end of file + pass diff --git a/apps/pre-processing-service/app/service/blog/blogger_blog_post_service.py b/apps/pre-processing-service/app/service/blog/blogger_blog_post_service.py index 86de82a6..8bdeb221 100644 --- a/apps/pre-processing-service/app/service/blog/blogger_blog_post_service.py +++ b/apps/pre-processing-service/app/service/blog/blogger_blog_post_service.py @@ -81,16 +81,14 @@ def authenticate_with_google_oauth(self) -> bool: except Exception as e: raise BloggerApiException("API 인증 실패", e) - def create_post_via_api(self, title: str, content: str, labels: List[str] = None) -> Dict: + def create_post_via_api( + self, title: str, content: str, labels: List[str] = None + ) -> Dict: """API를 통한 포스트 생성 (Selenium write_content와 완전히 다름)""" if not self.authenticated: self.authenticate_with_google_oauth() - post_data = { - "title": title, - "content": content, - "labels": labels or [] - } + post_data = {"title": title, "content": content, "labels": labels or []} try: result = ( @@ -102,17 +100,19 @@ def create_post_via_api(self, title: str, content: str, labels: List[str] = None return { "blogger_post_id": result.get("id"), "published_url": result.get("url"), - "status": "published" + "status": "published", } except Exception as e: raise BlogPostPublishException( platform="Blogger", reason="API 통신 중 오류가 발생했습니다." ) from e - def validate_api_content(self, title: str, content: str, labels: List[str] = None) -> None: + def validate_api_content( + self, title: str, content: str, labels: List[str] = None + ) -> None: """API 전용 유효성 검사""" if not title or not title.strip(): raise BlogContentValidationException("title", "제목이 비어있습니다") if not content or not content.strip(): raise BlogContentValidationException("content", "내용이 비어있습니다") - # Blogger는 태그가 선택사항 \ No newline at end of file + # Blogger는 태그가 선택사항 diff --git a/apps/pre-processing-service/app/service/crawl_service.py b/apps/pre-processing-service/app/service/crawl_service.py index 7d6a8d1a..548df05d 100644 --- a/apps/pre-processing-service/app/service/crawl_service.py +++ b/apps/pre-processing-service/app/service/crawl_service.py @@ -44,9 +44,7 @@ async def crawl_product_detail(self, request: RequestSadaguCrawl) -> dict: "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"), } - logger.info( - f"상품 상세 크롤링 서비스 완료: status=success" - ) + logger.info(f"상품 상세 크롤링 서비스 완료: status=success") return response_data except Exception as e: diff --git a/apps/pre-processing-service/app/service/crawlers/base_crawler.py b/apps/pre-processing-service/app/service/crawlers/base_crawler.py index dc495843..27934ab5 100644 --- a/apps/pre-processing-service/app/service/crawlers/base_crawler.py +++ b/apps/pre-processing-service/app/service/crawlers/base_crawler.py @@ -42,15 +42,15 @@ def _setup_httpx(self): async def close(self): """리소스 정리""" - if self.use_selenium and hasattr(self, 'crawling_util'): + if self.use_selenium and hasattr(self, "crawling_util"): try: self.crawling_util.close() logger.info("Selenium WebDriver 종료 완료") except Exception as e: logger.warning(f"Selenium WebDriver 종료 중 오류: {e}") - elif hasattr(self, 'client'): + elif hasattr(self, "client"): try: await self.client.aclose() logger.info("httpx 클라이언트 종료 완료") except Exception as e: - logger.warning(f"httpx 클라이언트 종료 중 오류: {e}") \ No newline at end of file + logger.warning(f"httpx 클라이언트 종료 중 오류: {e}") diff --git a/apps/pre-processing-service/app/service/crawlers/detail_crawler.py b/apps/pre-processing-service/app/service/crawlers/detail_crawler.py index 83829f5a..885fd2f0 100644 --- a/apps/pre-processing-service/app/service/crawlers/detail_crawler.py +++ b/apps/pre-processing-service/app/service/crawlers/detail_crawler.py @@ -4,6 +4,7 @@ from .search_crawler import SearchCrawler from loguru import logger + class DetailCrawler(SearchCrawler): """SearchCrawler를 확장한 상세 크롤링 클래스""" @@ -228,4 +229,4 @@ def _extract_images(self, soup: BeautifulSoup) -> list[str]: logger.debug(f"이미지 URL 추출: {src}") logger.info(f"총 {len(images)}개 이미지 URL 추출 완료") - return images \ No newline at end of file + return images diff --git a/apps/pre-processing-service/app/service/crawlers/search_crawler.py b/apps/pre-processing-service/app/service/crawlers/search_crawler.py index 41610a2d..a0d46e02 100644 --- a/apps/pre-processing-service/app/service/crawlers/search_crawler.py +++ b/apps/pre-processing-service/app/service/crawlers/search_crawler.py @@ -5,6 +5,7 @@ from bs4 import BeautifulSoup from selenium.webdriver.common.by import By + class SearchCrawler(BaseCrawler): """상품 검색 전용 크롤러""" @@ -133,4 +134,4 @@ async def close(self): await self.client.aclose() logger.info("httpx 클라이언트 종료 완료") except Exception as e: - logger.warning(f"httpx 클라이언트 종료 중 오류: {e}") \ No newline at end of file + logger.warning(f"httpx 클라이언트 종료 중 오류: {e}") diff --git a/apps/pre-processing-service/app/service/search_service.py b/apps/pre-processing-service/app/service/search_service.py index ec4ca59c..a71d6a8d 100644 --- a/apps/pre-processing-service/app/service/search_service.py +++ b/apps/pre-processing-service/app/service/search_service.py @@ -94,9 +94,7 @@ async def search_products(self, request: RequestSadaguSearch) -> dict: } except Exception as e: - logger.error( - f"검색 서비스 오류: keyword='{keyword}', error='{e}'" - ) + logger.error(f"검색 서비스 오류: keyword='{keyword}', error='{e}'") raise InvalidItemDataException() finally: diff --git a/apps/pre-processing-service/app/service/similarity_service.py b/apps/pre-processing-service/app/service/similarity_service.py index 0241fca3..c77aa8ba 100644 --- a/apps/pre-processing-service/app/service/similarity_service.py +++ b/apps/pre-processing-service/app/service/similarity_service.py @@ -156,7 +156,5 @@ def select_product_by_similarity(self, request: RequestSadaguSimilarity) -> dict } except Exception as e: - logger.error( - f"유사도 분석 서비스 오류: keyword='{keyword}', error='{e}'" - ) + logger.error(f"유사도 분석 서비스 오류: keyword='{keyword}', error='{e}'") raise InvalidItemDataException() diff --git a/apps/pre-processing-service/app/test/test_keyword.py b/apps/pre-processing-service/app/test/test_keyword.py index 095b6607..11bd69fa 100644 --- a/apps/pre-processing-service/app/test/test_keyword.py +++ b/apps/pre-processing-service/app/test/test_keyword.py @@ -4,6 +4,7 @@ client = TestClient(app) + def test_read_root(): response = client.get("/keywords/") assert response.status_code == 200 diff --git a/apps/pre-processing-service/app/utils/crawling_util.py b/apps/pre-processing-service/app/utils/crawling_util.py index ca9d0405..315df32a 100644 --- a/apps/pre-processing-service/app/utils/crawling_util.py +++ b/apps/pre-processing-service/app/utils/crawling_util.py @@ -67,4 +67,4 @@ def close(self): """드라이버 종료""" if self.driver: self.driver.quit() - self.driver = None \ No newline at end of file + self.driver = None