diff --git a/.github/workflows/ci-python.yml b/.github/workflows/ci-python.yml index ad50c51b..54d1ab1a 100644 --- a/.github/workflows/ci-python.yml +++ b/.github/workflows/ci-python.yml @@ -2,6 +2,8 @@ name: CI (Python/FastAPI) on: push: + branches: + - feature/onnx tags: - 'pre-processing-v*' pull_request: diff --git a/.github/workflows/deploy-java.yml b/.github/workflows/deploy-java.yml index 9c876f2f..eb2865d6 100644 --- a/.github/workflows/deploy-java.yml +++ b/.github/workflows/deploy-java.yml @@ -52,6 +52,16 @@ jobs: target: "~/app/docker/production/" overwrite: true + - name: Copy Caddyfile to EC2 + uses: appleboy/scp-action@v0.1.7 + with: + host: ${{ secrets.SERVER_HOST }} + username: ubuntu + key: ${{ secrets.SERVER_SSH_KEY }} + source: "docker/production/Caddyfile" + target: "~/app/docker/production/" + overwrite: true + - name: Deploy on EC2 uses: appleboy/ssh-action@v1.0.3 with: diff --git a/apps/pre-processing-service/app/api/endpoints/blog.py b/apps/pre-processing-service/app/api/endpoints/blog.py index 04ae0b14..85da62b2 100644 --- a/apps/pre-processing-service/app/api/endpoints/blog.py +++ b/apps/pre-processing-service/app/api/endpoints/blog.py @@ -4,16 +4,13 @@ from ...model.schemas import * from app.service.blog.tistory_blog_post_service import TistoryBlogPostService from app.service.blog.naver_blog_post_service import NaverBlogPostService -from ...service.blog.blogger_blog_post_service import BloggerBlogPostService +from ...service.blog.blogger_blog_post_adapter import ( + BloggerBlogPostAdapter, +) # 수정된 import router = APIRouter() -@router.get("/", summary="블로그 API 상태 확인") -async def root(): - return {"message": "blog API"} - - @router.post( "/rag/create", response_model=ResponseBlogCreate, @@ -49,9 +46,7 @@ async def publish(request: RequestBlogPublish): raise CustomException( "네이버 블로그 포스팅에 실패했습니다.", status_code=500 ) - return ResponseBlogPublish( - job_id=1, schedule_id=1, schedule_his_id=1, status="200", metadata=result - ) + return 
ResponseBlogPublish(status="success", metadata=result) elif request.tag == "tistory": tistory_service = TistoryBlogPostService() @@ -66,12 +61,10 @@ async def publish(request: RequestBlogPublish): "티스토리 블로그 포스팅에 실패했습니다.", status_code=500 ) - return ResponseBlogPublish( - job_id=1, schedule_id=1, schedule_his_id=1, status="200", metadata=result - ) + return ResponseBlogPublish(status="success", metadata=result) elif request.tag == "blogger": - blogger_service = BloggerBlogPostService() + blogger_service = BloggerBlogPostAdapter() # 수정: Adapter 사용 result = blogger_service.post_content( title=request.post_title, content=request.post_content, @@ -83,6 +76,4 @@ async def publish(request: RequestBlogPublish): "블로거 블로그 포스팅에 실패했습니다.", status_code=500 ) - return ResponseBlogPublish( - job_id=1, schedule_id=1, schedule_his_id=1, status="200", metadata=result - ) + return ResponseBlogPublish(status="success", metadata=result) diff --git a/apps/pre-processing-service/app/api/endpoints/keywords.py b/apps/pre-processing-service/app/api/endpoints/keywords.py index 2b407d6d..a1028391 100644 --- a/apps/pre-processing-service/app/api/endpoints/keywords.py +++ b/apps/pre-processing-service/app/api/endpoints/keywords.py @@ -6,11 +6,8 @@ router = APIRouter() -@router.get("/", summary="키워드 API 상태 확인") +@router.get("/") async def root(): - """ - 키워드 API가 정상 동작하는지 확인 - """ return {"message": "keyword API"} @@ -23,9 +20,6 @@ async def search(request: RequestNaverSearch): 요청 예시: { - "job_id": 1, - "schedule_id": 1, - "schedule_his_id": 1, "tag": "naver", "category": "50000000", "start_date": "2025-09-01", diff --git a/apps/pre-processing-service/app/api/endpoints/product.py b/apps/pre-processing-service/app/api/endpoints/product.py index ceb55c9d..ab309595 100644 --- a/apps/pre-processing-service/app/api/endpoints/product.py +++ b/apps/pre-processing-service/app/api/endpoints/product.py @@ -8,6 +8,7 @@ from ...service.crawl_service import CrawlService from ...service.search_service import 
SearchService from ...service.match_service import MatchService +from ...service.similarity_service import SimilarityService # from ...service.similarity_service import SimilarityService @@ -16,14 +17,6 @@ router = APIRouter() -@router.get("/", summary="상품 API 상태 확인") -async def root(): - """ - 상품 API 서버 상태 확인용 엔드포인트 - """ - return {"message": "product API"} - - @router.post("/search", response_model=ResponseSadaguSearch, summary="상품 검색") async def search(request: RequestSadaguSearch): """ @@ -62,33 +55,33 @@ async def match(request: RequestSadaguMatch): raise HTTPException(status_code=500, detail=str(e)) -# @router.post( -# "/similarity", response_model=ResponseSadaguSimilarity, summary="상품 유사도 분석" -# ) -# async def similarity(request: RequestSadaguSimilarity): -# """ -# 매칭된 상품들 중 키워드와의 유사도를 계산하여 최적의 상품을 선택합니다. -# """ -# try: -# similarity_service = SimilarityService() -# result = similarity_service.select_product_by_similarity(request) -# -# if not result: -# raise CustomException( -# 500, "유사도 분석에 실패했습니다.", "SIMILARITY_FAILED" -# ) -# -# return result -# except InvalidItemDataException as e: -# raise HTTPException(status_code=e.status_code, detail=e.detail) -# except Exception as e: -# raise HTTPException(status_code=500, detail=str(e)) +@router.post( + "/similarity", response_model=ResponseSadaguSimilarity, summary="상품 유사도 분석" +) +async def similarity(request: RequestSadaguSimilarity): + """ + 매칭된 상품들 중 키워드와의 유사도를 계산하여 최적의 상품을 선택합니다. 
+ """ + try: + similarity_service = SimilarityService() + result = similarity_service.select_product_by_similarity(request) + + if not result: + raise CustomException( + 500, "유사도 분석에 실패했습니다.", "SIMILARITY_FAILED" + ) + + return result + except InvalidItemDataException as e: + raise HTTPException(status_code=e.status_code, detail=e.detail) + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) @router.post( "/crawl", response_model=ResponseSadaguCrawl, summary="상품 상세 정보 크롤링" ) -async def crawl(request: Request, body: RequestSadaguCrawl): +async def crawl(body: RequestSadaguCrawl): """ 상품 상세 페이지를 크롤링하여 상세 정보를 수집합니다. """ diff --git a/apps/pre-processing-service/app/api/endpoints/test.py b/apps/pre-processing-service/app/api/endpoints/test.py index e26bd203..91977a3f 100644 --- a/apps/pre-processing-service/app/api/endpoints/test.py +++ b/apps/pre-processing-service/app/api/endpoints/test.py @@ -21,11 +21,6 @@ router = APIRouter() -@router.get("/") -async def root(): - return {"message": "테스트 API"} - - @router.get("/hello/{name}", tags=["hello"]) # @log_api_call async def say_hello(name: str): @@ -67,11 +62,6 @@ def with_meta(data: Mapping[str, Any], meta: Mapping[str, Any]) -> Dict[str, Any @router.get("/tester", response_model=None) async def processing_tester(): - meta = { - "job_id": 1, - "schedule_id": 1, - "schedule_his_id": 1, # ✅ 타이포 수정 - } request_dict = { "tag": "naver", "category": "50000000", @@ -79,7 +69,7 @@ async def processing_tester(): "end_date": "2025-09-02", } # 네이버 키워드 검색 - naver_request = RequestNaverSearch(**with_meta(meta, request_dict)) + naver_request = RequestNaverSearch(**with_meta(request_dict)) response_data = await keyword_search(naver_request) keyword = response_data.get("keyword") loguru.logger.info(keyword) @@ -89,21 +79,21 @@ async def processing_tester(): } # 싸다구 상품 검색 - sadagu_request = RequestSadaguSearch(**with_meta(meta, keyword)) + sadagu_request = RequestSadaguSearch(**with_meta(keyword)) search_service 
= SearchService() keyword_result = await search_service.search_products(sadagu_request) loguru.logger.info(keyword_result) # 싸다구 상품 매치 keyword["search_results"] = keyword_result.get("search_results") - keyword_match_request = RequestSadaguMatch(**with_meta(meta, keyword)) + keyword_match_request = RequestSadaguMatch(**with_meta(keyword)) match_service = MatchService() keyword_match_response = match_service.match_products(keyword_match_request) loguru.logger.info(keyword_match_response) # 싸다구 상품 유사도 분석 keyword["matched_products"] = keyword_match_response.get("matched_products") - keyword_similarity_request = RequestSadaguSimilarity(**with_meta(meta, keyword)) + keyword_similarity_request = RequestSadaguSimilarity(**with_meta(keyword)) # similarity_service = SimilarityService() # keyword_similarity_response = similarity_service.select_product_by_similarity( # keyword_similarity_request diff --git a/apps/pre-processing-service/app/core/config.py b/apps/pre-processing-service/app/core/config.py index ed54cc69..69e29d35 100644 --- a/apps/pre-processing-service/app/core/config.py +++ b/apps/pre-processing-service/app/core/config.py @@ -80,6 +80,9 @@ class BaseSettingsConfig(BaseSettings): # MeCab 사전 경로 (자동 감지) mecab_path: Optional[str] = None + # 테스트/추가용 필드 + openai_api_key: Optional[str] = None # << 이 부분 추가 + def __init__(self, **kwargs): super().__init__(**kwargs) diff --git a/apps/pre-processing-service/app/model/schemas.py b/apps/pre-processing-service/app/model/schemas.py index 61720cb6..9581ad0f 100644 --- a/apps/pre-processing-service/app/model/schemas.py +++ b/apps/pre-processing-service/app/model/schemas.py @@ -5,29 +5,13 @@ # 기본 요청 class RequestBase(BaseModel): - job_id: int = Field( - ..., title="작업 ID", description="현재 실행 중인 작업의 고유 식별자" - ) - schedule_id: int = Field( - ..., title="스케줄 ID", description="예약된 스케줄의 고유 식별자" - ) - schedule_his_id: Optional[int] = Field( - None, title="스케줄 히스토리 ID", description="스케줄 실행 이력의 고유 식별자" - ) + pass # 기본 응답 class 
ResponseBase(BaseModel): - job_id: int = Field( ..., title="작업 ID", description="현재 실행 중인 작업의 고유 식별자" ) - schedule_id: int = Field( ..., title="스케줄 ID", description="예약된 스케줄의 고유 식별자" ) - schedule_his_id: Optional[int] = Field( None, title="스케줄 히스토리 ID", description="스케줄 실행 이력의 고유 식별자" ) status: str = Field(..., title="상태", description="요청 처리 상태") + pass # 네이버 키워드 추출 diff --git a/apps/pre-processing-service/app/service/blog/base_blog_post_service.py b/apps/pre-processing-service/app/service/blog/base_blog_post_service.py index ff4b2754..f55bdba0 100644 --- a/apps/pre-processing-service/app/service/blog/base_blog_post_service.py +++ b/apps/pre-processing-service/app/service/blog/base_blog_post_service.py @@ -1,5 +1,5 @@ from abc import ABC, abstractmethod -from typing import Dict, List, Optional +from typing import Dict, List, Optional from app.utils.crawling_util import CrawlingUtil from app.errors.BlogPostingException import * @@ -11,51 +11,39 @@ class BaseBlogPostService(ABC): """ 블로그 포스팅 서비스 추상 클래스 """ - def __init__(self, config_file="blog_config.json"): - """공통 초기화 로직""" - # Selenium 기반 서비스를 위한 초기화 - if self._requires_webdriver(): + def __init__(self, use_webdriver=True): + """ + 공통 초기화 로직 + :param use_webdriver: 웹드라이버 사용 여부 (API 서비스의 경우 False) + """ + self.use_webdriver = use_webdriver + + if self.use_webdriver: try: - self.crawling_service = CrawlingUtil() + # 블로그 포스팅용 설정으로 초기화 + self.crawling_service = CrawlingUtil( + headless=False, # 네이버 탐지 우회를 위해 headless 비활성화 + for_blog_posting=True, + ) self.web_driver = self.crawling_service.get_driver() self.wait_driver = self.crawling_service.get_wait() except Exception: raise WebDriverConnectionException() else: - # API 기반 서비스의 경우 WebDriver가 필요 없음 self.crawling_service = None self.web_driver = None self.wait_driver = None - # API 기반 서비스를 위한 초기화 - self.config_file = config_file - self.config = {} - self.current_upload_account = None - - # API 관련 속성들 (사용하지 않는 서비스에서는 None으로 유지) - self.blogger_service = None - self.blog_id = 
None - self.scopes = None - self._load_config() - def _requires_webdriver(self) -> bool: - """ - 서브클래스에서 WebDriver가 필요한지 여부를 반환 - 기본값은 True (Selenium 기반), API 기반 서비스에서는 False로 오버라이드 - """ - return True - @abstractmethod def _load_config(self) -> None: """플랫폼별 설정 로드""" pass + @abstractmethod def _login(self) -> None: - """ - 플랫폼별 로그인 구현 (API 기반 서비스의 경우 인증으로 대체) - 기본 구현은 아무것도 하지 않음 (API 서비스용) - """ + """플랫폼별 로그인 구현""" pass @abstractmethod @@ -83,6 +71,14 @@ def _validate_content( :param content: 포스트 내용 :param tags: 포스트 태그 리스트 """ + # if not title or not title.strip(): + # raise BlogContentValidationException("title", "제목이 비어있습니다") + # + # if not content or not content.strip(): + # raise BlogContentValidationException("content", "내용이 비어있습니다") + # + # if tags is None: + # raise BlogContentValidationException("tags", "태그가 비어있습니다") pass def post_content(self, title: str, content: str, tags: List[str] = None) -> Dict: @@ -96,7 +92,7 @@ def post_content(self, title: str, content: str, tags: List[str] = None) -> Dict # 1. 콘텐츠 유효성 검사 self._validate_content(title, content, tags) - # 2. 로그인 (Selenium 기반) 또는 인증 (API 기반) + # 2. 로그인 self._login() # 3. 
포스트 작성 및 발행 diff --git a/apps/pre-processing-service/app/service/blog/blogger_blog_post_adapter.py b/apps/pre-processing-service/app/service/blog/blogger_blog_post_adapter.py new file mode 100644 index 00000000..717a102e --- /dev/null +++ b/apps/pre-processing-service/app/service/blog/blogger_blog_post_adapter.py @@ -0,0 +1,84 @@ +from typing import Dict, List, Optional +from app.service.blog.base_blog_post_service import BaseBlogPostService +from app.service.blog.blogger_blog_post_service import BloggerApiService +from app.errors.BlogPostingException import * + + +class BloggerBlogPostAdapter(BaseBlogPostService): + """ + BaseBlogPostService와 호환되도록 BloggerApiService를 감싼 어댑터 + 현재 BaseBlogPostService 인터페이스와 호환 + """ + + def __init__(self, config_file="blog_config.json"): + # API 전용 서비스 (Adaptee) 먼저 초기화 + self.api_service = BloggerApiService(config_file=config_file) + + try: + # 부모 클래스의 웹드라이버 초기화를 시도하지만, 실패해도 무시 + # 이렇게 하면 부모의 다른 초기화 로직은 실행됨 + super().__init__() + except Exception: + # 웹드라이버 초기화 실패 시 API 서비스용으로 속성 설정 + self.crawling_service = None + self.web_driver = None + self.wait_driver = None + # 설정 로드는 직접 호출 + self._load_config() + + def _load_config(self) -> None: + """ + BloggerApiService 내부에서 이미 처리되므로 별도 구현 불필요 + """ + # API 서비스의 설정이 이미 로드되었으므로 추가 작업 없음 + pass + + def _login(self) -> None: + """ + Selenium 로그인과 달리, OAuth 인증으로 대체 + """ + try: + self.api_service.authenticate_with_google_oauth() + except Exception as e: + raise BlogLoginException("Blogger", f"OAuth 인증 실패: {str(e)}") + + def _write_content(self, title: str, content: str, tags: List[str] = None) -> None: + """ + API를 통한 포스트 작성 + """ + try: + result = self.api_service.create_post_via_api(title, content, labels=tags) + # 결과 로깅 + print(f"포스트 생성 완료: {result.get('published_url', 'URL 없음')}") + except Exception as e: + raise BlogPostPublishException("Blogger", f"포스트 작성 실패: {str(e)}") + + def _get_platform_name(self) -> str: + """플랫폼 이름 반환""" + return "Blogger" + + def _validate_content( + self, title: 
str, content: str, tags: Optional[List[str]] = None + ) -> None: + """ + API 전용 유효성 검사 호출 + """ + try: + # Optional을 List로 변환 (None인 경우 빈 리스트) + tags_list = tags if tags is not None else [] + self.api_service.validate_api_content(title, content, labels=tags_list) + except Exception as e: + # BloggerApiService의 예외를 BaseBlogPostService 호환 예외로 변환 + if "title" in str(e).lower(): + raise BlogContentValidationException("title", str(e)) + elif "content" in str(e).lower(): + raise BlogContentValidationException("content", str(e)) + else: + raise BlogContentValidationException("general", str(e)) + + def __del__(self): + """ + API 서비스이므로 웹드라이버 정리가 불필요 + """ + # 웹드라이버가 없으므로 정리할 것이 없음 + pass diff --git a/apps/pre-processing-service/app/service/blog/blogger_blog_post_service.py b/apps/pre-processing-service/app/service/blog/blogger_blog_post_service.py index 07e337d9..8bdeb221 100644 --- a/apps/pre-processing-service/app/service/blog/blogger_blog_post_service.py +++ b/apps/pre-processing-service/app/service/blog/blogger_blog_post_service.py @@ -1,39 +1,32 @@ import json import os import pickle -from typing import Dict, List, Optional - from googleapiclient.discovery import build from google.auth.transport.requests import Request from google_auth_oauthlib.flow import InstalledAppFlow - from app.errors.BlogPostingException import * -from app.service.blog.base_blog_post_service import BaseBlogPostService +from typing import Dict, List -class BloggerBlogPostService(BaseBlogPostService): +class BloggerApiService: """ - Blogger API를 사용하여 포스팅을 관리하는 서비스 + 호환되지 않는 Blogger API 서비스 (Adaptee) + 완전히 다른 초기화/인증 방식을 사용 """ def __init__(self, config_file="blog_config.json"): - # 부모 클래스 생성자 호출 (WebDriver는 None으로 설정됨) - super().__init__() - - # API 관련 추가 초기화 self.config_file = config_file + self.config = {} + self.current_upload_account = None self.blogger_service = None self.blog_id = None self.scopes = ["https://www.googleapis.com/auth/blogger"] + self.authenticated = False - def 
_requires_webdriver(self) -> bool: - """API 기반 서비스는 WebDriver가 필요하지 않음""" - return False + self._load_api_config() - def _load_config(self) -> None: - """ - 플랫폼별 설정 로드 - """ + def _load_api_config(self) -> None: + """API 전용 설정 로드""" try: with open(self.config_file, "r", encoding="utf-8") as f: self.config = json.load(f) @@ -48,16 +41,11 @@ def _load_config(self) -> None: self.config = default_config self.current_upload_account = self.config["upload_account"] - def _login(self) -> None: - """ - API 인증 (Selenium의 로그인을 대체) - """ - self._authenticate_api() + def authenticate_with_google_oauth(self) -> bool: + """Google OAuth 인증 (Selenium 로그인과 완전히 다름)""" + if self.authenticated: + return True - def _authenticate_api(self): - """ - API 인증 및 서비스 객체 생성 - """ token_file = f"token_{self.current_upload_account.replace('@', '_').replace('.', '_')}.pkl" try: @@ -85,22 +73,22 @@ def _authenticate_api(self): if blogs.get("items"): self.blog_id = blogs["items"][0]["id"] print(f"API 설정 완료 - 블로그: {blogs['items'][0]['name']}") + self.authenticated = True return True else: - print("블로그를 찾을 수 없습니다.") - return False + raise BloggerApiException("블로그를 찾을 수 없습니다") + except Exception as e: - print(f"API 인증/설정 실패: {e}") raise BloggerApiException("API 인증 실패", e) - def _write_content(self, title: str, content: str, tags: List[str] = None) -> None: - """ - API를 사용하여 포스팅 작성 - """ - if not self.blogger_service or not self.blog_id: - self._authenticate_api() + def create_post_via_api( + self, title: str, content: str, labels: List[str] = None + ) -> Dict: + """API를 통한 포스트 생성 (Selenium write_content와 완전히 다름)""" + if not self.authenticated: + self.authenticate_with_google_oauth() - post_data = {"title": title, "content": content, "labels": tags or []} + post_data = {"title": title, "content": content, "labels": labels or []} try: result = ( @@ -109,35 +97,22 @@ def _write_content(self, title: str, content: str, tags: List[str] = None) -> No .execute() ) - print(f"포스트 생성 완료: {result.get('url')}") + 
return { + "blogger_post_id": result.get("id"), + "published_url": result.get("url"), + "status": "published", + } except Exception as e: raise BlogPostPublishException( platform="Blogger", reason="API 통신 중 오류가 발생했습니다." ) from e - def _get_platform_name(self) -> str: - """플랫폼 이름 반환""" - return "Blogger" - - def _validate_content( - self, title: str, content: str, tags: Optional[List[str]] = None + def validate_api_content( + self, title: str, content: str, labels: List[str] = None ) -> None: - """ - 공통 유효성 검사 로직 - """ + """API 전용 유효성 검사""" if not title or not title.strip(): raise BlogContentValidationException("title", "제목이 비어있습니다") - if not content or not content.strip(): raise BlogContentValidationException("content", "내용이 비어있습니다") - - # 태그 유효성 검사도 필요에 따라 추가 - # if not tags or not isinstance(tags, list): - # raise BlogContentValidationException("tags", "태그는 리스트 형태여야 합니다") - - def __del__(self): - """ - 리소스 정리 - API 기반 서비스는 별도 정리 불필요 - 부모 클래스의 __del__이 WebDriver 정리를 처리 - """ - super().__del__() + # Blogger는 태그가 선택사항 diff --git a/apps/pre-processing-service/app/service/crawl_service.py b/apps/pre-processing-service/app/service/crawl_service.py index 4122bb2e..548df05d 100644 --- a/apps/pre-processing-service/app/service/crawl_service.py +++ b/apps/pre-processing-service/app/service/crawl_service.py @@ -1,5 +1,5 @@ import time -from app.utils.crawler_utils import DetailCrawler +from app.service.crawlers.detail_crawler import DetailCrawler from app.errors.CustomException import InvalidItemDataException from app.model.schemas import RequestSadaguCrawl from loguru import logger @@ -18,7 +18,7 @@ async def crawl_product_detail(self, request: RequestSadaguCrawl) -> dict: try: logger.info( - f"상품 상세 크롤링 서비스 시작: job_id={request.job_id}, schedule_id={request.schedule_id}, product_url={request.product_url}" + f"상품 상세 크롤링 서비스 시작: product_url={request.product_url}" ) # 상세 정보 크롤링 실행 @@ -37,9 +37,6 @@ async def crawl_product_detail(self, request: RequestSadaguCrawl) -> dict: # 
응답 데이터 구성 response_data = { - "job_id": request.job_id, - "schedule_id": request.schedule_id, - "schedule_his_id": request.schedule_his_id, "tag": request.tag, "product_url": str(request.product_url), "product_detail": product_detail, @@ -47,14 +44,12 @@ async def crawl_product_detail(self, request: RequestSadaguCrawl) -> dict: "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"), } - logger.info( - f"상품 상세 크롤링 서비스 완료: job_id={request.job_id}, status=success" - ) + logger.info(f"상품 상세 크롤링 서비스 완료: status=success") return response_data except Exception as e: logger.error( - f"크롤링 서비스 오류: job_id={request.job_id}, product_url={request.product_url}, error='{e}'" + f"크롤링 서비스 오류: product_url={request.product_url}, error='{e}'" ) raise InvalidItemDataException() finally: diff --git a/apps/pre-processing-service/app/service/crawlers/__init__.py b/apps/pre-processing-service/app/service/crawlers/__init__.py new file mode 100644 index 00000000..e69de29b diff --git a/apps/pre-processing-service/app/service/crawlers/base_crawler.py b/apps/pre-processing-service/app/service/crawlers/base_crawler.py new file mode 100644 index 00000000..27934ab5 --- /dev/null +++ b/apps/pre-processing-service/app/service/crawlers/base_crawler.py @@ -0,0 +1,56 @@ +import httpx +import time +from abc import ABC, abstractmethod +from bs4 import BeautifulSoup +from loguru import logger +from app.utils.crawling_util import CrawlingUtil + + +class BaseCrawler(ABC): + """크롤러 기본 클래스""" + + def __init__(self, use_selenium: bool = True, headless: bool = True): + self.base_url = "https://ssadagu.kr" + self.use_selenium = use_selenium + + if use_selenium: + self._setup_selenium(headless) + else: + self._setup_httpx() + + def _setup_selenium(self, headless: bool): + """Selenium WebDriver 초기화""" + try: + self.crawling_util = CrawlingUtil(headless=headless) + self.driver = self.crawling_util.get_driver() + self.wait = self.crawling_util.get_wait() + logger.info("Selenium WebDriver 초기화 완료") + except Exception as e: + 
logger.warning(f"Selenium 초기화 실패, httpx로 대체: {e}") + self.use_selenium = False + self._setup_httpx() + + def _setup_httpx(self): + """httpx 클라이언트 초기화""" + self.client = httpx.AsyncClient( + headers={ + "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" + }, + timeout=30.0, + ) + logger.info("httpx 클라이언트 초기화 완료") + + async def close(self): + """리소스 정리""" + if self.use_selenium and hasattr(self, "crawling_util"): + try: + self.crawling_util.close() + logger.info("Selenium WebDriver 종료 완료") + except Exception as e: + logger.warning(f"Selenium WebDriver 종료 중 오류: {e}") + elif hasattr(self, "client"): + try: + await self.client.aclose() + logger.info("httpx 클라이언트 종료 완료") + except Exception as e: + logger.warning(f"httpx 클라이언트 종료 중 오류: {e}") diff --git a/apps/pre-processing-service/app/utils/crawler_utils.py b/apps/pre-processing-service/app/service/crawlers/detail_crawler.py similarity index 55% rename from apps/pre-processing-service/app/utils/crawler_utils.py rename to apps/pre-processing-service/app/service/crawlers/detail_crawler.py index 5c593b9f..885fd2f0 100644 --- a/apps/pre-processing-service/app/utils/crawler_utils.py +++ b/apps/pre-processing-service/app/service/crawlers/detail_crawler.py @@ -1,185 +1,10 @@ -import urllib.parse -import httpx -import re import time +import re from bs4 import BeautifulSoup -from selenium import webdriver -from selenium.webdriver.chrome.options import Options -from selenium.webdriver.common.by import By -from selenium.webdriver.support.ui import WebDriverWait -from selenium.common.exceptions import TimeoutException, NoSuchElementException +from .search_crawler import SearchCrawler from loguru import logger -class SearchCrawler: - def __init__(self, use_selenium=True): - self.base_url = "https://ssadagu.kr" - self.use_selenium = use_selenium - - if use_selenium: - self._setup_selenium() - else: - self._setup_httpx() - - def _setup_selenium(self): - 
"""Selenium WebDriver 초기화""" - chrome_options = Options() - chrome_options.add_argument("--headless") - chrome_options.add_argument("--no-sandbox") - chrome_options.add_argument("--disable-dev-shm-usage") - chrome_options.add_argument("--disable-gpu") - chrome_options.add_argument("--window-size=1920,1080") - chrome_options.add_argument( - "--user-agent=Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" - ) - - try: - self.driver = webdriver.Chrome(options=chrome_options) - self.wait = WebDriverWait(self.driver, 10) - logger.info("Selenium WebDriver 초기화 완료") - except Exception as e: - logger.warning(f"Selenium 초기화 실패, httpx로 대체: {e}") - self.use_selenium = False - self._setup_httpx() - - def _setup_httpx(self): - """httpx 클라이언트 초기화""" - self.client = httpx.AsyncClient( - headers={ - "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36" - }, - timeout=30.0, - ) - logger.info("httpx 클라이언트 초기화 완료") - - async def search_products_selenium(self, keyword: str) -> list[dict]: - """Selenium을 사용한 상품 검색""" - encoded_keyword = urllib.parse.quote(keyword) - search_url = f"{self.base_url}/shop/search.php?ss_tx={encoded_keyword}" - - try: - logger.info( - f"Selenium 상품 검색 시작: keyword='{keyword}', url='{search_url}'" - ) - self.driver.get(search_url) - time.sleep(5) - - product_links = [] - link_elements = self.driver.find_elements(By.TAG_NAME, "a") - - for element in link_elements: - href = element.get_attribute("href") - if ( - href - and "view.php" in href - and ("platform=1688" in href or "num_iid" in href) - ): - try: - title = element.get_attribute("title") or element.text.strip() - if title: - product_links.append({"url": href, "title": title}) - except: - product_links.append({"url": href, "title": "Unknown Title"}) - - # 중복 제거 - seen_urls = set() - unique_products = [] - for product in product_links: - if product["url"] not in seen_urls: 
- seen_urls.add(product["url"]) - unique_products.append(product) - - logger.info( - f"Selenium으로 발견한 상품 링크: {len(unique_products)}개 (중복 제거 전: {len(product_links)}개)" - ) - return unique_products[:20] - - except Exception as e: - logger.error(f"Selenium 검색 오류: keyword='{keyword}', error='{e}'") - return [] - - async def search_products_httpx(self, keyword: str) -> list[dict]: - """httpx를 사용한 상품 검색""" - encoded_keyword = urllib.parse.quote(keyword) - search_url = f"{self.base_url}/shop/search.php?ss_tx={encoded_keyword}" - - try: - logger.info( - f"httpx 상품 검색 시작: keyword='{keyword}', url='{search_url}'" - ) - response = await self.client.get(search_url) - response.raise_for_status() - soup = BeautifulSoup(response.content, "html.parser") - - product_links = [] - all_links = soup.find_all("a", href=True) - - for link in all_links: - href = link["href"] - if "view.php" in href and ( - "platform=1688" in href or "num_iid" in href - ): - full_url = ( - f"{self.base_url}{href}" if href.startswith("/") else href - ) - title = ( - link.get("title", "") - or link.get_text(strip=True) - or "Unknown Title" - ) - - product_links.append({"url": full_url, "title": title}) - - logger.info(f"httpx로 발견한 상품 링크: {len(product_links)}개") - return product_links[:20] - - except Exception as e: - logger.error(f"httpx 검색 오류: keyword='{keyword}', error='{e}'") - return [] - - async def get_basic_product_info(self, product_url: str) -> dict: - """기본 상품 정보만 크롤링""" - try: - logger.debug(f"기본 상품 정보 크롤링 시작: url='{product_url}'") - - if self.use_selenium: - self.driver.get(product_url) - self.wait.until( - lambda driver: driver.execute_script("return document.readyState") - == "complete" - ) - soup = BeautifulSoup(self.driver.page_source, "html.parser") - else: - response = await self.client.get(product_url) - response.raise_for_status() - soup = BeautifulSoup(response.content, "html.parser") - - title_element = soup.find("h1", {"id": "kakaotitle"}) - title = title_element.get_text(strip=True) 
if title_element else "제목 없음" - - logger.debug(f"기본 상품 정보 크롤링 완료: title='{title[:50]}'") - return {"url": product_url, "title": title} - - except Exception as e: - logger.error(f"기본 상품 크롤링 오류: url='{product_url}', error='{e}'") - return None - - async def close(self): - """리소스 정리""" - if self.use_selenium and hasattr(self, "driver"): - try: - self.driver.quit() - logger.info("Selenium WebDriver 종료 완료") - except Exception as e: - logger.warning(f"Selenium WebDriver 종료 중 오류: {e}") - elif hasattr(self, "client"): - try: - await self.client.aclose() - logger.info("httpx 클라이언트 종료 완료") - except Exception as e: - logger.warning(f"httpx 클라이언트 종료 중 오류: {e}") - - class DetailCrawler(SearchCrawler): """SearchCrawler를 확장한 상세 크롤링 클래스""" diff --git a/apps/pre-processing-service/app/service/crawlers/search_crawler.py b/apps/pre-processing-service/app/service/crawlers/search_crawler.py new file mode 100644 index 00000000..a0d46e02 --- /dev/null +++ b/apps/pre-processing-service/app/service/crawlers/search_crawler.py @@ -0,0 +1,137 @@ +import urllib.parse +import time +from .base_crawler import BaseCrawler +from loguru import logger +from bs4 import BeautifulSoup +from selenium.webdriver.common.by import By + + +class SearchCrawler(BaseCrawler): + """상품 검색 전용 크롤러""" + + async def search_products_selenium(self, keyword: str) -> list[dict]: + """Selenium을 사용한 상품 검색""" + encoded_keyword = urllib.parse.quote(keyword) + search_url = f"{self.base_url}/shop/search.php?ss_tx={encoded_keyword}" + + try: + logger.info( + f"Selenium 상품 검색 시작: keyword='{keyword}', url='{search_url}'" + ) + self.driver.get(search_url) + time.sleep(5) + + product_links = [] + link_elements = self.driver.find_elements(By.TAG_NAME, "a") + + for element in link_elements: + href = element.get_attribute("href") + if ( + href + and "view.php" in href + and ("platform=1688" in href or "num_iid" in href) + ): + try: + title = element.get_attribute("title") or element.text.strip() + if title: + 
product_links.append({"url": href, "title": title}) + except: + product_links.append({"url": href, "title": "Unknown Title"}) + + # 중복 제거 + seen_urls = set() + unique_products = [] + for product in product_links: + if product["url"] not in seen_urls: + seen_urls.add(product["url"]) + unique_products.append(product) + + logger.info( + f"Selenium으로 발견한 상품 링크: {len(unique_products)}개 (중복 제거 전: {len(product_links)}개)" + ) + return unique_products[:20] + + except Exception as e: + logger.error(f"Selenium 검색 오류: keyword='{keyword}', error='{e}'") + return [] + + async def search_products_httpx(self, keyword: str) -> list[dict]: + """httpx를 사용한 상품 검색""" + encoded_keyword = urllib.parse.quote(keyword) + search_url = f"{self.base_url}/shop/search.php?ss_tx={encoded_keyword}" + + try: + logger.info( + f"httpx 상품 검색 시작: keyword='{keyword}', url='{search_url}'" + ) + response = await self.client.get(search_url) + response.raise_for_status() + soup = BeautifulSoup(response.content, "html.parser") + + product_links = [] + all_links = soup.find_all("a", href=True) + + for link in all_links: + href = link["href"] + if "view.php" in href and ( + "platform=1688" in href or "num_iid" in href + ): + full_url = ( + f"{self.base_url}{href}" if href.startswith("/") else href + ) + title = ( + link.get("title", "") + or link.get_text(strip=True) + or "Unknown Title" + ) + + product_links.append({"url": full_url, "title": title}) + + logger.info(f"httpx로 발견한 상품 링크: {len(product_links)}개") + return product_links[:20] + + except Exception as e: + logger.error(f"httpx 검색 오류: keyword='{keyword}', error='{e}'") + return [] + + async def get_basic_product_info(self, product_url: str) -> dict: + """기본 상품 정보만 크롤링""" + try: + logger.debug(f"기본 상품 정보 크롤링 시작: url='{product_url}'") + + if self.use_selenium: + self.driver.get(product_url) + self.wait.until( + lambda driver: driver.execute_script("return document.readyState") + == "complete" + ) + soup = BeautifulSoup(self.driver.page_source, 
"html.parser") + else: + response = await self.client.get(product_url) + response.raise_for_status() + soup = BeautifulSoup(response.content, "html.parser") + + title_element = soup.find("h1", {"id": "kakaotitle"}) + title = title_element.get_text(strip=True) if title_element else "제목 없음" + + logger.debug(f"기본 상품 정보 크롤링 완료: title='{title[:50]}'") + return {"url": product_url, "title": title} + + except Exception as e: + logger.error(f"기본 상품 크롤링 오류: url='{product_url}', error='{e}'") + return None + + async def close(self): + """리소스 정리""" + if self.use_selenium and hasattr(self, "driver"): + try: + self.driver.quit() + logger.info("Selenium WebDriver 종료 완료") + except Exception as e: + logger.warning(f"Selenium WebDriver 종료 중 오류: {e}") + elif hasattr(self, "client"): + try: + await self.client.aclose() + logger.info("httpx 클라이언트 종료 완료") + except Exception as e: + logger.warning(f"httpx 클라이언트 종료 중 오류: {e}") diff --git a/apps/pre-processing-service/app/service/keyword_service.py b/apps/pre-processing-service/app/service/keyword_service.py index 575767ee..f8065fa3 100644 --- a/apps/pre-processing-service/app/service/keyword_service.py +++ b/apps/pre-processing-service/app/service/keyword_service.py @@ -1,4 +1,3 @@ -# Pydantic 모델을 가져오기 위해 schemas 파일 import import json import random diff --git a/apps/pre-processing-service/app/service/match_service.py b/apps/pre-processing-service/app/service/match_service.py index 5816957a..9f340683 100644 --- a/apps/pre-processing-service/app/service/match_service.py +++ b/apps/pre-processing-service/app/service/match_service.py @@ -16,15 +16,16 @@ def match_products(self, request: RequestSadaguMatch) -> dict: products = request.search_results logger.info( - f"키워드 매칭 서비스 시작: job_id={request.job_id}, schedule_id={request.schedule_id}, keyword='{keyword}', products_count={len(products) if products else 0}" + # f"키워드 매칭 서비스 시작: job_id={request.job_id}, schedule_id={request.schedule_id}, keyword='{keyword}', products_count={len(products) if 
products else 0}" + f"keyword='{keyword}'" ) if not products: logger.warning(f"매칭할 상품이 없음: keyword='{keyword}'") return { - "job_id": request.job_id, - "schedule_id": request.schedule_id, - "schedule_his_id": request.schedule_his_id, + # "job_id": request.job_id, + # "schedule_id": request.schedule_id, + # "schedule_his_id": request.schedule_his_id, "keyword": keyword, "matched_products": [], "status": "success", @@ -80,9 +81,9 @@ def match_products(self, request: RequestSadaguMatch) -> dict: ) return { - "job_id": request.job_id, - "schedule_id": request.schedule_id, - "schedule_his_id": request.schedule_his_id, + # "job_id": request.job_id, + # "schedule_id": request.schedule_id, + # "schedule_his_id": request.schedule_his_id, "keyword": keyword, "matched_products": matched_products, "status": "success", @@ -90,6 +91,6 @@ def match_products(self, request: RequestSadaguMatch) -> dict: except Exception as e: logger.error( - f"매칭 서비스 오류: job_id={request.job_id}, keyword='{keyword}', error='{e}'" + f"매칭 서비스 오류: keyword='{keyword}', error='{e}'" ) raise InvalidItemDataException() diff --git a/apps/pre-processing-service/app/service/product_blog_posting_service.py b/apps/pre-processing-service/app/service/product_blog_posting_service.py new file mode 100644 index 00000000..fa947855 --- /dev/null +++ b/apps/pre-processing-service/app/service/product_blog_posting_service.py @@ -0,0 +1,362 @@ +import json +import logging +import os +from datetime import datetime +from typing import Dict, List, Optional, Any +from dataclasses import dataclass +from enum import Enum + +from openai import OpenAI +from dotenv import load_dotenv + +from app.service.blog.blogger_blog_post_adapter import BloggerBlogPostAdapter +from app.errors.BlogPostingException import * + +# 환경변수 로드 +load_dotenv('.env.dev') + +client = OpenAI() + +class PostingStatus(Enum): + PENDING = "pending" + PROCESSING = "processing" + SUCCESS = "success" + FAILED = "failed" + RETRY = "retry"
+ + +@dataclass +class ProductData: + """크롤링된 상품 데이터 모델""" + tag: str + product_url: str + title: str + price: int + rating: float + options: List[Dict[str, Any]] + material_info: Dict[str, str] + product_images: List[str] + crawled_at: str + + @classmethod + def from_dict(cls, data: Dict) -> 'ProductData': + """딕셔너리에서 ProductData 객체 생성""" + product_detail = data.get('product_detail', {}) + return cls( + tag=data.get('tag', ''), + product_url=product_detail.get('url', ''), + title=product_detail.get('title', ''), + price=product_detail.get('price', 0), + rating=product_detail.get('rating', 0.0), + options=product_detail.get('options', []), + material_info=product_detail.get('material_info', {}), + product_images=product_detail.get('product_images', []), + crawled_at=data.get('crawled_at', '') + ) + + +@dataclass +class BlogPostContent: + """생성된 블로그 포스트 콘텐츠""" + title: str + content: str + tags: List[str] + + +@dataclass +class BlogContentRequest: + """블로그 콘텐츠 생성 요청""" + content_style: str = "informative" # "informative", "promotional", "review" + target_keywords: List[str] = None + include_pricing: bool = True + include_specifications: bool = True + content_length: str = "medium" # "short", "medium", "long" + + +class ProductContentGenerator: + """GPT를 활용한 상품 블로그 콘텐츠 생성""" + + def __init__(self): + # 환경변수에서 OpenAI API 키 로드 + self.openai_api_key = os.getenv('OPENAI_API_KEY') + if not self.openai_api_key: + raise ValueError("OPENAI_API_KEY가 .env.dev 파일에 설정되지 않았습니다.") + + client.api_key = self.openai_api_key + + def generate_blog_content(self, product_data: ProductData, request: BlogContentRequest) -> BlogPostContent: + """상품 데이터를 기반으로 블로그 콘텐츠 생성""" + + # 1. 상품 정보 정리 + product_info = self._format_product_info(product_data, request) + + # 2. 프롬프트 생성 + prompt = self._create_blog_prompt(product_info, request) + + # 3. 
GPT를 통한 콘텐츠 생성 + try: + + response = client.chat.completions.create( + model="gpt-4o-mini", + messages=[ + { + "role": "system", + "content": "당신은 전문적인 블로그 콘텐츠 작성자입니다. 상품 리뷰와 정보성 콘텐츠를 매력적이고 SEO 친화적으로 작성합니다." + }, + { + "role": "user", + "content": prompt + } + ], + temperature=0.7, + max_tokens=2000 + ) + + generated_content = response.choices[0].message.content + + # 4. 콘텐츠 파싱 및 구조화 + return self._parse_generated_content(generated_content, product_data, request) + + except Exception as e: + logging.error(f"콘텐츠 생성 실패: {e}") + return self._create_fallback_content(product_data, request) + + def _format_product_info(self, product_data: ProductData, request: BlogContentRequest) -> str: + """상품 정보를 텍스트로 포맷팅""" + info_parts = [ + f"상품명: {product_data.title}", + ] + + # 가격 정보 추가 + if request.include_pricing and product_data.price: + info_parts.append(f"가격: {product_data.price:,}원") + + # 평점 정보 추가 + if product_data.rating: + info_parts.append(f"평점: {product_data.rating}/5.0") + + # 사양 정보 추가 + if request.include_specifications and product_data.material_info: + info_parts.append("\n상품 사양:") + for key, value in product_data.material_info.items(): + info_parts.append(f"- {key}: {value}") + + # 옵션 정보 추가 + if product_data.options: + info_parts.append(f"\n구매 옵션 ({len(product_data.options)}개):") + for i, option in enumerate(product_data.options[:5], 1): # 처음 5개만 + info_parts.append(f"{i}. 
{option.get('name', 'N/A')}") + + # 구매 링크 + if product_data.product_url: + info_parts.append(f"\n구매 링크: {product_data.product_url}") + + return "\n".join(info_parts) + + def _create_blog_prompt(self, product_info: str, request: BlogContentRequest) -> str: + """블로그 작성용 프롬프트 생성""" + + # 스타일별 가이드라인 + style_guidelines = { + "informative": "객관적이고 상세한 정보 제공 중심으로, 독자가 제품을 이해할 수 있도록 전문적으로 작성", + "promotional": "제품의 장점과 매력을 강조하며, 구매 의욕을 자극할 수 있도록 매력적으로 작성", + "review": "실제 사용 경험을 바탕으로 한 솔직한 평가와 추천 중심으로 작성" + } + + # 길이별 가이드라인 + length_guidelines = { + "short": "800자 내외의 간결한 내용", + "medium": "1200자 내외의 적당한 길이", + "long": "1500자 이상의 상세한 내용" + } + + style_guide = style_guidelines.get(request.content_style, style_guidelines["informative"]) + length_guide = length_guidelines.get(request.content_length, length_guidelines["medium"]) + + # 키워드 정보 + keywords_text = "" + if request.target_keywords: + keywords_text = f"\n포함할 키워드: {', '.join(request.target_keywords)}" + + prompt = f""" +다음 상품 정보를 바탕으로 매력적인 블로그 포스트를 작성해주세요. + +상품 정보: +{product_info} + +작성 가이드라인: +- 스타일: {style_guide} +- 길이: {length_guide} +- 톤: 친근하면서도 신뢰할 수 있는, 정보 제공 중심{keywords_text} + +작성 요구사항: +1. SEO 친화적이고 클릭하고 싶은 매력적인 제목 +2. 독자의 관심을 끄는 도입부 +3. 상품의 핵심 특징과 장점을 구체적으로 설명 +4. 실제 사용 시나리오나 활용 팁 +5. 구매 결정에 도움이 되는 정보 + +⚠️ 주의: +- 절대로 마지막에 '이 HTML 구조는…' 같은 자기 평가 문장을 추가하지 마세요. +- 출력 시 ```나 ```html 같은 코드 블록 구문을 포함하지 마세요. +- 오직 HTML 태그만 사용하여 구조화된 콘텐츠를 작성해주세요. +(예:
,
{product_data.title}에 대한 상세한 정보를 소개합니다.
+ +판매가: {product_data.price:,}원
+""" + + if product_data.material_info: + content += "상품 구매는 여기에서 가능합니다.
+""" + + return BlogPostContent( + title=title, + content=content, + tags=[product_data.tag] if product_data.tag else ["상품정보"] + ) + +class ProductBlogPostingService: + """상품 데이터를 Blogger에 포스팅하는 메인 서비스""" + + def __init__(self): + self.content_generator = ProductContentGenerator() + self.blogger_service = BloggerBlogPostAdapter() + + def post_product_to_blogger(self, product_data: ProductData, request: BlogContentRequest) -> dict: + """상품 데이터를 Blogger에 포스팅""" + try: + # 1. GPT를 통한 콘텐츠 생성 + blog_content = self.content_generator.generate_blog_content(product_data, request) + + # 2. Blogger에 포스팅 + self.blogger_service.post_content( + title=blog_content.title, + content=blog_content.content, + tags=blog_content.tags + ) + + # 3. 성공 결과 반환 + return { + "status": "success", + "platform": "blogger", + "title": blog_content.title, + "tags": blog_content.tags, + "posted_at": datetime.now().isoformat(), + "product_tag": product_data.tag + } + + except Exception as e: + logging.error(f"Blogger 포스팅 실패: {e}") + # ProductData 객체 기준으로 처리 + return { + "status": "failed", + "error": str(e), + "platform": "blogger", + "attempted_at": datetime.now().isoformat(), + "product_tag": getattr(product_data, "tag", "unknown") + } + + # def batch_post_products(self, products_data: List[Dict], request: BlogContentRequest) -> List[Dict[str, Any]]: + # """여러 상품을 일괄 포스팅""" + # results = [] + # + # for product_data in products_data: + # result = self.post_product_to_blogger(product_data, request) + # results.append(result) + # + # # API 호출 제한을 고려한 딜레이 + # import time + # time.sleep(3) # 3초 대기 + # + # return results \ No newline at end of file diff --git a/apps/pre-processing-service/app/service/search_service.py b/apps/pre-processing-service/app/service/search_service.py index 6fb09c0f..a71d6a8d 100644 --- a/apps/pre-processing-service/app/service/search_service.py +++ b/apps/pre-processing-service/app/service/search_service.py @@ -1,4 +1,4 @@ -from app.utils.crawler_utils import SearchCrawler 
+from app.service.crawlers.search_crawler import SearchCrawler from app.errors.CustomException import InvalidItemDataException from ..model.schemas import RequestSadaguSearch from loguru import logger @@ -17,7 +17,8 @@ async def search_products(self, request: RequestSadaguSearch) -> dict: try: logger.info( - f"상품 검색 서비스 시작: job_id={request.job_id}, schedule_id={request.schedule_id}, keyword='{keyword}'" + # f"상품 검색 서비스 시작: job_id={request.job_id}, schedule_id={request.schedule_id}, keyword='{keyword}'" + f"keyword='{keyword}'" ) # Selenium 또는 httpx로 상품 검색 @@ -29,9 +30,6 @@ async def search_products(self, request: RequestSadaguSearch) -> dict: if not search_results: logger.warning(f"검색 결과가 없습니다: keyword='{keyword}'") return { - "job_id": request.job_id, - "schedule_id": request.schedule_id, - "schedule_his_id": request.schedule_his_id, "keyword": keyword, "search_results": [], "status": "success", @@ -90,18 +88,13 @@ async def search_products(self, request: RequestSadaguSearch) -> dict: ) return { - "job_id": request.job_id, - "schedule_id": request.schedule_id, - "schedule_his_id": request.schedule_his_id, "keyword": keyword, "search_results": enriched_results, "status": "success", } except Exception as e: - logger.error( - f"검색 서비스 오류: job_id={request.job_id}, keyword='{keyword}', error='{e}'" - ) + logger.error(f"검색 서비스 오류: keyword='{keyword}', error='{e}'") raise InvalidItemDataException() finally: diff --git a/apps/pre-processing-service/app/service/similarity_service.py b/apps/pre-processing-service/app/service/similarity_service.py index 0e245da1..c77aa8ba 100644 --- a/apps/pre-processing-service/app/service/similarity_service.py +++ b/apps/pre-processing-service/app/service/similarity_service.py @@ -1,177 +1,160 @@ -# from app.utils.similarity_analyzer import SimilarityAnalyzer -# from app.errors.CustomException import InvalidItemDataException -# from ..model.schemas import RequestSadaguSimilarity -# from loguru import logger -# -# -# class 
SimilarityService: -# def __init__(self): -# pass -# -# def select_product_by_similarity(self, request: RequestSadaguSimilarity) -> dict: -# """ -# BERT 기반 유사도 분석 후 상품 선택 - 4단계 -# """ -# keyword = request.keyword -# candidates = request.matched_products -# fallback_products = request.search_results or [] -# -# logger.info( -# f"유사도 분석 서비스 시작: job_id={request.job_id}, keyword='{keyword}', matched_count={len(candidates) if candidates else 0}, fallback_count={len(fallback_products)}" -# ) -# -# # 매칭된 상품이 없으면 전체 검색 결과로 폴백 -# if not candidates: -# if not fallback_products: -# logger.warning( -# f"매칭된 상품과 검색 결과가 모두 없음: keyword='{keyword}'" -# ) -# return { -# "job_id": request.job_id, -# "schedule_id": request.schedule_id, -# "schedule_his_id": request.schedule_his_id, -# "keyword": keyword, -# "selected_product": None, -# "reason": "매칭된 상품과 검색 결과가 모두 없음", -# "status": "success", -# } -# -# logger.info("매칭된 상품 없음 → 전체 검색 결과에서 유사도 분석") -# candidates = fallback_products -# analysis_mode = "fallback_similarity_only" -# else: -# analysis_mode = "matched_products" -# -# try: -# analyzer = SimilarityAnalyzer() -# -# logger.info( -# f"키워드 '{keyword}'와 {len(candidates)}개 상품의 유사도 분석 시작... 
(모드: {analysis_mode})" -# ) -# -# # 한 개만 있으면 바로 선택 -# if len(candidates) == 1: -# selected_product = candidates[0] -# -# logger.info("단일 후보 상품 - 유사도 검증 진행") -# # 유사도 계산 -# similarity = analyzer.calculate_similarity( -# keyword, selected_product["title"] -# ) -# -# # 폴백 모드에서는 임계값 검증 -# if analysis_mode == "fallback_similarity_only": -# similarity_threshold = 0.3 -# if similarity < similarity_threshold: -# logger.warning( -# f"단일 상품 유사도 미달: similarity={similarity:.4f} < threshold={similarity_threshold}" -# ) -# return { -# "job_id": request.job_id, -# "schedule_id": request.schedule_id, -# "schedule_his_id": request.schedule_his_id, -# "keyword": keyword, -# "selected_product": None, -# "reason": f"단일 상품 유사도({similarity:.4f}) < 기준({similarity_threshold})", -# "status": "success", -# } -# -# selected_product["similarity_info"] = { -# "similarity_score": float(similarity), -# "analysis_type": "single_candidate", -# "analysis_mode": analysis_mode, -# } -# -# logger.success( -# f"단일 상품 선택 완료: title='{selected_product['title'][:30]}', similarity={similarity:.4f}" -# ) -# -# return { -# "job_id": request.job_id, -# "schedule_id": request.schedule_id, -# "schedule_his_id": request.schedule_his_id, -# "keyword": keyword, -# "selected_product": selected_product, -# "reason": f"단일 상품 - 유사도: {similarity:.4f} ({analysis_mode})", -# "status": "success", -# } -# -# # 여러 개가 있으면 유사도 비교 -# logger.info("여러 상품 중 최고 유사도로 선택...") -# -# # 제목만 추출해서 배치 분석 -# titles = [product["title"] for product in candidates] -# similarity_results = analyzer.analyze_similarity_batch(keyword, titles) -# -# # 결과 출력 -# logger.info("유사도 분석 결과:") -# for i, result in enumerate(similarity_results[:5]): # 상위 5개만 로그 -# logger.info( -# f" {i+1}위: {result['title'][:40]} | 유사도: {result['similarity']:.4f}" -# ) -# -# # 최고 유사도 선택 -# best_result = similarity_results[0] -# selected_product = candidates[best_result["index"]].copy() -# -# # 폴백 모드에서는 임계값 검증 -# similarity_threshold = 0.3 -# if ( -# analysis_mode == 
"fallback_similarity_only" -# and best_result["similarity"] < similarity_threshold -# ): -# logger.warning( -# f"최고 유사도 미달: similarity={best_result['similarity']:.4f} < threshold={similarity_threshold}" -# ) -# return { -# "job_id": request.job_id, -# "schedule_id": request.schedule_id, -# "schedule_his_id": request.schedule_his_id, -# "keyword": keyword, -# "selected_product": None, -# "reason": f"최고 유사도({best_result['similarity']:.4f}) < 기준({similarity_threshold})", -# "status": "success", -# } -# -# # 유사도 정보 추가 -# selected_product["similarity_info"] = { -# "similarity_score": best_result["similarity"], -# "analysis_type": "multi_candidate_bert", -# "analysis_mode": analysis_mode, -# "rank": 1, -# "total_candidates": len(candidates), -# } -# -# # 매칭 모드에서는 종합 점수도 계산 -# if analysis_mode == "matched_products" and "match_info" in selected_product: -# match_score = selected_product["match_info"]["match_score"] -# similarity_score = best_result["similarity"] -# # 가중치: 매칭 40%, 유사도 60% -# final_score = match_score * 0.4 + similarity_score * 0.6 -# selected_product["final_score"] = final_score -# reason = f"종합점수({final_score:.4f}) = 매칭({match_score:.4f})*0.4 + 유사도({similarity_score:.4f})*0.6" -# logger.info( -# f"종합 점수 계산: match_score={match_score:.4f}, similarity_score={similarity_score:.4f}, final_score={final_score:.4f}" -# ) -# else: -# reason = f"유사도({best_result['similarity']:.4f}) 기준 선택 ({analysis_mode})" -# -# logger.success( -# f"상품 선택 완료: title='{selected_product['title'][:30]}', {reason}" -# ) -# -# return { -# "job_id": request.job_id, -# "schedule_id": request.schedule_id, -# "schedule_his_id": request.schedule_his_id, -# "keyword": keyword, -# "selected_product": selected_product, -# "reason": reason, -# "status": "success", -# } -# -# except Exception as e: -# logger.error( -# f"유사도 분석 서비스 오류: job_id={request.job_id}, keyword='{keyword}', error='{e}'" -# ) -# raise InvalidItemDataException() +from app.utils.similarity_analyzer import SimilarityAnalyzerONNX 
+from app.errors.CustomException import InvalidItemDataException +from ..model.schemas import RequestSadaguSimilarity +from loguru import logger + + +class SimilarityService: + def __init__(self): + pass + + def select_product_by_similarity(self, request: RequestSadaguSimilarity) -> dict: + """ + BERT 기반 유사도 분석 후 상품 선택 - 4단계 + """ + keyword = request.keyword + candidates = request.matched_products + fallback_products = request.search_results or [] + + logger.info( + f"유사도 분석 서비스 시작: keyword='{keyword}', matched_count={len(candidates) if candidates else 0}, fallback_count={len(fallback_products)}" + ) + + # 매칭된 상품이 없으면 전체 검색 결과로 폴백 + if not candidates: + if not fallback_products: + logger.warning( + f"매칭된 상품과 검색 결과가 모두 없음: keyword='{keyword}'" + ) + return { + "keyword": keyword, + "selected_product": None, + "reason": "매칭된 상품과 검색 결과가 모두 없음", + "status": "success", + } + + logger.info("매칭된 상품 없음 → 전체 검색 결과에서 유사도 분석") + candidates = fallback_products + analysis_mode = "fallback_similarity_only" + else: + analysis_mode = "matched_products" + + try: + analyzer = SimilarityAnalyzerONNX() + + logger.info( + f"키워드 '{keyword}'와 {len(candidates)}개 상품의 유사도 분석 시작... 
(모드: {analysis_mode})" + ) + + # 한 개만 있으면 바로 선택 + if len(candidates) == 1: + selected_product = candidates[0] + + logger.info("단일 후보 상품 - 유사도 검증 진행") + # 유사도 계산 + similarity = analyzer.calculate_similarity( + keyword, selected_product["title"] + ) + + # 폴백 모드에서는 임계값 검증 + if analysis_mode == "fallback_similarity_only": + similarity_threshold = 0.3 + if similarity < similarity_threshold: + logger.warning( + f"단일 상품 유사도 미달: similarity={similarity:.4f} < threshold={similarity_threshold}" + ) + return { + "keyword": keyword, + "selected_product": None, + "reason": f"단일 상품 유사도({similarity:.4f}) < 기준({similarity_threshold})", + "status": "success", + } + + selected_product["similarity_info"] = { + "similarity_score": float(similarity), + "analysis_type": "single_candidate", + "analysis_mode": analysis_mode, + } + + logger.success( + f"단일 상품 선택 완료: title='{selected_product['title'][:30]}', similarity={similarity:.4f}" + ) + + return { + "keyword": keyword, + "selected_product": selected_product, + "reason": f"단일 상품 - 유사도: {similarity:.4f} ({analysis_mode})", + "status": "success", + } + + # 여러 개가 있으면 유사도 비교 + logger.info("여러 상품 중 최고 유사도로 선택...") + + # 제목만 추출해서 배치 분석 + titles = [product["title"] for product in candidates] + similarity_results = analyzer.analyze_similarity_batch(keyword, titles) + + # 결과 출력 + logger.info("유사도 분석 결과:") + for i, result in enumerate(similarity_results[:5]): # 상위 5개만 로그 + logger.info( + f" {i+1}위: {result['title'][:40]} | 유사도: {result['similarity']:.4f}" + ) + + # 최고 유사도 선택 + best_result = similarity_results[0] + selected_product = candidates[best_result["index"]].copy() + + # 폴백 모드에서는 임계값 검증 + similarity_threshold = 0.3 + if ( + analysis_mode == "fallback_similarity_only" + and best_result["similarity"] < similarity_threshold + ): + logger.warning( + f"최고 유사도 미달: similarity={best_result['similarity']:.4f} < threshold={similarity_threshold}" + ) + return { + "keyword": keyword, + "selected_product": None, + "reason": f"최고 
유사도({best_result['similarity']:.4f}) < 기준({similarity_threshold})", + "status": "success", + } + + # 유사도 정보 추가 + selected_product["similarity_info"] = { + "similarity_score": best_result["similarity"], + "analysis_type": "multi_candidate_bert", + "analysis_mode": analysis_mode, + "rank": 1, + "total_candidates": len(candidates), + } + + # 매칭 모드에서는 종합 점수도 계산 + if analysis_mode == "matched_products" and "match_info" in selected_product: + match_score = selected_product["match_info"]["match_score"] + similarity_score = best_result["similarity"] + # 가중치: 매칭 40%, 유사도 60% + final_score = match_score * 0.4 + similarity_score * 0.6 + selected_product["final_score"] = final_score + reason = f"종합점수({final_score:.4f}) = 매칭({match_score:.4f})*0.4 + 유사도({similarity_score:.4f})*0.6" + logger.info( + f"종합 점수 계산: match_score={match_score:.4f}, similarity_score={similarity_score:.4f}, final_score={final_score:.4f}" + ) + else: + reason = f"유사도({best_result['similarity']:.4f}) 기준 선택 ({analysis_mode})" + + logger.success( + f"상품 선택 완료: title='{selected_product['title'][:30]}', {reason}" + ) + + return { + "keyword": keyword, + "selected_product": selected_product, + "reason": reason, + "status": "success", + } + + except Exception as e: + logger.error(f"유사도 분석 서비스 오류: keyword='{keyword}', error='{e}'") + raise InvalidItemDataException() diff --git a/apps/pre-processing-service/app/test/test_keyword.py b/apps/pre-processing-service/app/test/test_keyword.py index 2a96796e..11bd69fa 100644 --- a/apps/pre-processing-service/app/test/test_keyword.py +++ b/apps/pre-processing-service/app/test/test_keyword.py @@ -4,10 +4,6 @@ client = TestClient(app) -JOB_ID = 1 -SCHEDULE_ID = 1 -SCHEDULE_HIS_ID = 1 - def test_read_root(): response = client.get("/keywords/") @@ -26,9 +22,6 @@ def test_read_root(): ) def test_search(tag, category, start_date, end_date): body = { - "job_id": JOB_ID, - "schedule_id": SCHEDULE_ID, - "schedule_his_id": SCHEDULE_HIS_ID, # 오타 수정 "tag": tag, "category": category, 
"start_date": start_date, @@ -39,9 +32,6 @@ def test_search(tag, category, start_date, end_date): assert response.status_code == 200 response_data = response.json() - assert response_data["job_id"] == body["job_id"] - assert response_data["schedule_id"] == body["schedule_id"] - assert response_data["schedule_his_id"] == body["schedule_his_id"] # 오타 수정 assert response_data["status"] == "success" assert "keyword" in response_data assert isinstance(response_data["total_keyword"], dict) diff --git a/apps/pre-processing-service/app/test/test_match_service.py b/apps/pre-processing-service/app/test/test_match_service.py index 7750cd3d..3f50ffad 100644 --- a/apps/pre-processing-service/app/test/test_match_service.py +++ b/apps/pre-processing-service/app/test/test_match_service.py @@ -23,9 +23,6 @@ def test_match_success(): ] body = { - "job_id": 1, - "schedule_id": 1, - "schedule_his_id": 1, "keyword": "반지", "search_results": sample_search_results, } @@ -35,7 +32,6 @@ def test_match_success(): assert response.status_code == 200 data = response.json() - assert data["job_id"] == body["job_id"] assert data["keyword"] == body["keyword"] assert data["status"] == "success" assert isinstance(data["matched_products"], list) @@ -51,9 +47,6 @@ def test_match_success(): def test_match_no_results(): """검색 결과가 없는 경우""" body = { - "job_id": 2, - "schedule_id": 2, - "schedule_his_id": 2, "keyword": "반지", "search_results": [], } @@ -80,9 +73,6 @@ def test_match_no_matches(): ] body = { - "job_id": 3, - "schedule_id": 3, - "schedule_his_id": 3, "keyword": "반지", "search_results": sample_search_results, } diff --git a/apps/pre-processing-service/app/test/test_product_blog_posting_service.py b/apps/pre-processing-service/app/test/test_product_blog_posting_service.py new file mode 100644 index 00000000..2757eb14 --- /dev/null +++ b/apps/pre-processing-service/app/test/test_product_blog_posting_service.py @@ -0,0 +1,80 @@ +import pytest +from app.service.product_blog_posting_service import ( + 
ProductBlogPostingService, BlogContentRequest, ProductData +) + +# 샘플 데이터 +sample_product_data = { + "tag": "test001", + "product_url": "https://ssadagu.kr/shop/view.php?platform=1688&num_iid=902500949447", + "status": "success", + "product_detail": { + "url": "https://ssadagu.kr/shop/view.php?platform=1688&num_iid=902500949447", + "title": "코닝 적용 가능한 애플 13 강화 필름 iphone16/15promax 휴대 전화 필름 애플 11 안티-peep 및 먼지없는 빈", + "price": 430, + "rating": 5.0, + "options": [ + {"name": "먼지 없는 창고 2차 필름 [코닝글라스 방폭丨초투명]", "stock": 0}, + {"name": "먼지 없는 창고 2차 필름 [코닝글라스 방폭丨훔쳐보기 방지]", "stock": 0} + ], + "material_info": { + "상표": "다른", + "재료": "강화 유리", + "필름 종류": "전막", + "크기": "애플 16프로맥스( 6.9inch )", + "적용 모델": "iPhone13 Pro Max" + }, + "product_images": [] + }, + "crawled_at": "2025-09-16 11:49:24" +} + + +@pytest.fixture +def blog_service(): + return ProductBlogPostingService() + + +def test_generate_blog_content(blog_service): + """GPT를 통한 블로그 콘텐츠 생성 테스트""" + request = BlogContentRequest( + content_style="informative", + target_keywords=["아이폰", "강화필름", "보호필름", "스마트폰액세서리"], + include_pricing=True, + content_length="medium" + ) + + product_obj = ProductData.from_dict(sample_product_data) + + # 순수 콘텐츠 생성만 테스트 + blog_content = blog_service.content_generator.generate_blog_content(product_obj, request) + + assert blog_content.title + assert "