diff --git a/.github/workflows/ci-java.yml b/.github/workflows/ci-java.yml
index efe949c6..fd55f05f 100644
--- a/.github/workflows/ci-java.yml
+++ b/.github/workflows/ci-java.yml
@@ -78,7 +78,7 @@ jobs:
         run: |
           ./gradlew unitTest
           ./gradlew integrationTest
-          if [ "${{ github.base_ref }}" = "main" ]; then
+          if [ "${{ github.base_ref }}" = "main" ] || [[ "${{ github.ref }}" == refs/tags/* ]]; then
             ./gradlew e2eTest
           fi
         working-directory: apps/user-service
@@ -153,33 +153,57 @@ jobs:
           echo "=== Image Layer Analysis ==="
           docker history ghcr.io/${{ env.REPO_LC }}/user-service:${{ needs.set-image-tag.outputs.image-tag }} --human --no-trunc
 
-#  swagger-docs:
-#    name: Deploy Swagger Documentation
-#    runs-on: ubuntu-latest
-#    needs:
-#      - build
-#      - set-image-tag
-#    if: startsWith(github.ref, 'refs/tags/user-service-v')
-#
-#    steps:
-#      - name: Checkout repository
-#        uses: actions/checkout@v4
-#
-#      - name: Download OpenAPI spec artifacts
-#        uses: actions/download-artifact@v4
-#        with:
-#          name: openapi-spec-${{ github.run_id }}-${{ github.run_attempt }}
-#          path: ./openapi-spec
-#
-#      - name: Generate Swagger UI
-#        uses: Legion2/swagger-ui-action@v1
-#        with:
-#          output: user-service-swagger-ui-${{ needs.set-image-tag.outputs.image-tag }}
-#          spec-file: openapi-spec/openapi3.yaml
-#
-#      - name: Deploy to GitHub Pages
-#        uses: peaceiris/actions-gh-pages@v3
-#        with:
-#          github_token: ${{ secrets.GITHUB_TOKEN }}
-#          publish_dir: ./user-service-swagger-ui-${{ needs.set-image-tag.outputs.image-tag }}
-#          destination_dir: user-service/${{ needs.set-image-tag.outputs.image-tag }}
\ No newline at end of file
+  swagger-docs:
+    name: Deploy Swagger Documentation
+    runs-on: ubuntu-latest
+    needs:
+      - build
+      - set-image-tag
+    if: startsWith(github.ref, 'refs/tags/user-service-v')
+
+    steps:
+      - name: Checkout repository
+        uses: actions/checkout@v4
+
+      - name: Download OpenAPI spec artifacts
+        uses: actions/download-artifact@v4
+        with:
+          name: openapi-spec-${{ github.run_id }}-${{ github.run_attempt }}
+          path: ./openapi-spec
+
+      - name: Check OpenAPI spec file exists
+        id: check-openapi
+        run: |
+          if [ -f "./openapi-spec/openapi3.yaml" ]; then
+            echo "openapi_exists=true" >> $GITHUB_OUTPUT
+            echo "✅ OpenAPI spec file found"
+            ls -la ./openapi-spec/
+          else
+            echo "openapi_exists=false" >> $GITHUB_OUTPUT
+            echo "❌ OpenAPI spec file not found"
+            echo "Available files:"
+            ls -la ./openapi-spec/ || echo "No openapi-spec directory found"
+            find . -name "*.yaml" -o -name "*.yml" -o -name "*.json" | grep -i openapi || echo "No OpenAPI files found"
+          fi
+
+      - name: Generate Swagger UI
+        if: steps.check-openapi.outputs.openapi_exists == 'true'
+        uses: Legion2/swagger-ui-action@v1
+        with:
+          output: user-service-swagger-ui-${{ needs.set-image-tag.outputs.image-tag }}
+          spec-file: openapi-spec/openapi3.yaml
+
+      - name: Deploy to GitHub Pages
+        if: steps.check-openapi.outputs.openapi_exists == 'true'
+        uses: peaceiris/actions-gh-pages@v3
+        with:
+          github_token: ${{ secrets.GITHUB_TOKEN }}
+          publish_dir: ./user-service-swagger-ui-${{ needs.set-image-tag.outputs.image-tag }}
+          destination_dir: user-service/${{ needs.set-image-tag.outputs.image-tag }}
+
+      - name: Skip deployment notice
+        if: steps.check-openapi.outputs.openapi_exists == 'false'
+        run: |
+          echo "⏭️ Skipping Swagger documentation deployment"
+          echo "Reason: OpenAPI spec file not found at ./openapi-spec/openapi3.yaml"
+          echo "Please check your build configuration to ensure OpenAPI spec is generated"
\ No newline at end of file
diff --git a/apps/pre-processing-service/app/api/endpoints/product.py b/apps/pre-processing-service/app/api/endpoints/product.py
index 32a4dcbe..2812ef79 100644
--- a/apps/pre-processing-service/app/api/endpoints/product.py
+++ b/apps/pre-processing-service/app/api/endpoints/product.py
@@ -6,6 +6,7 @@
     CustomException,
 )
 from ...service.crawl_service import CrawlService
+from ...service.s3_upload_service import S3UploadService
 from ...service.search_service import SearchService
 from ...service.match_service import MatchService
 from ...service.similarity_service import SimilarityService
@@ -60,11 +61,11 @@ async def match(request: RequestSadaguMatch):
 )
 async def similarity(request: RequestSadaguSimilarity):
     """
-    매칭된 상품들 중 키워드와의 유사도를 계산하여 최적의 상품을 선택합니다.
+    매칭된 상품들 중 키워드와의 유사도를 계산하여 상위 10개 상품을 선택합니다.
     """
     try:
         similarity_service = SimilarityService()
-        response_data = similarity_service.select_product_by_similarity(request)
+        response_data = similarity_service.select_top_products_by_similarity(request)
 
         if not response_data:
             raise CustomException(
@@ -99,3 +100,24 @@ async def crawl(body: RequestSadaguCrawl):
         raise HTTPException(status_code=e.status_code, detail=e.detail)
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/s3-upload", response_model=ResponseS3Upload, summary="S3 이미지 업로드")
+async def s3_upload(request: RequestS3Upload):
+    """
+    크롤링 완료 후 별도로 호출하여 이미지들을 S3 저장소에 업로드합니다.
+ """ + try: + s3_upload_service = S3UploadService() + response_data = await s3_upload_service.upload_crawled_products_to_s3(request) + + if not response_data: + raise CustomException( + 500, "S3 이미지 업로드에 실패했습니다.", "S3_UPLOAD_FAILED" + ) + + return response_data + except InvalidItemDataException as e: + raise HTTPException(status_code=e.status_code, detail=e.detail) + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) diff --git a/apps/pre-processing-service/app/model/schemas.py b/apps/pre-processing-service/app/model/schemas.py index 36bef959..549ba7b5 100644 --- a/apps/pre-processing-service/app/model/schemas.py +++ b/apps/pre-processing-service/app/model/schemas.py @@ -110,8 +110,10 @@ class SadaguSimilarityData(BaseModel): keyword: str = Field( ..., title="분석 키워드", description="유사도 분석에 사용된 키워드" ) - selected_product: Optional[Dict] = Field( - None, title="선택된 상품", description="유사도 분석 결과 선택된 상품" + top_products: List[Dict] = Field( + default_factory=list, + title="선택된 상품들", + description="유사도 분석 결과 선택된 상위 상품 목록", ) reason: Optional[str] = Field( None, title="선택 이유", description="상품 선택 근거 및 점수 정보" @@ -129,16 +131,23 @@ class ResponseSadaguSimilarity(ResponseBase[SadaguSimilarityData]): class RequestSadaguCrawl(RequestBase): - product_url: HttpUrl = Field( + product_urls: List[HttpUrl] = Field( ..., title="상품 URL", description="크롤링할 상품 페이지의 URL" ) # 응답 데이터 모델 class SadaguCrawlData(BaseModel): - product_url: str = Field(..., title="상품 URL", description="크롤링된 상품 URL") - product_detail: Optional[Dict] = Field( - None, title="상품 상세정보", description="크롤링된 상품의 상세 정보" + crawled_products: List[Dict] = Field( + ..., + title="크롤링된 상품들", + description="크롤링된 상품들의 상세 정보 목록 (URL 포함)", + ) + success_count: int = Field( + ..., title="성공 개수", description="성공적으로 크롤링된 상품 개수" + ) + fail_count: int = Field( + ..., title="실패 개수", description="크롤링에 실패한 상품 개수" ) crawled_at: Optional[str] = Field( None, title="크롤링 시간", description="크롤링 완료 시간" @@ -152,6 +161,81 @@ class ResponseSadaguCrawl(ResponseBase[SadaguCrawlData]): pass +# ============== S3 이미지 업로드 ============== + + +class RequestS3Upload(RequestBase): + keyword: str = Field( + ..., title="검색 키워드", description="폴더명 생성용 키워드" + ) # 추가 + crawled_products: List[Dict] = Field( + ..., + title="크롤링된 상품 데이터", + description="이전 단계에서 크롤링된 상품들의 데이터", + ) + base_folder: Optional[str] = Field( + "product", title="기본 폴더", description="S3 내 기본 저장 폴더 경로" + ) + + +# S3 업로드된 이미지 정보 +class S3ImageInfo(BaseModel): + index: int = Field(..., title="이미지 순번", description="상품 내 이미지 순번") + original_url: str = Field( + ..., title="원본 URL", description="크롤링된 원본 이미지 URL" + ) + s3_url: str = Field(..., title="S3 URL", description="S3에서 접근 가능한 URL") + + +# 상품별 S3 업로드 결과 +class ProductS3UploadResult(BaseModel): + product_index: int = Field(..., title="상품 순번", description="크롤링 순번") + product_title: str = Field(..., title="상품 제목", description="상품명") + status: str = Field(..., title="업로드 상태", description="completed/skipped/error") + uploaded_images: List[S3ImageInfo] = Field( + default_factory=list, title="업로드 성공 이미지" + ) + success_count: int = Field( + ..., title="성공 개수", description="업로드 성공한 이미지 수" + ) + fail_count: int = Field( + ..., title="실패 개수", description="업로드 실패한 이미지 수" + ) + + +# S3 업로드 요약 정보 +class S3UploadSummary(BaseModel): + total_products: int = Field( + ..., title="총 상품 수", description="처리 대상 상품 총 개수" + ) + total_success_images: int = Field( + ..., title="성공 이미지 수", description="업로드 성공한 이미지 총 개수" + ) + total_fail_images: int = Field( + ..., title="실패 
이미지 수", description="업로드 실패한 이미지 총 개수" + ) + + +# 응답 데이터 모델 +class S3UploadData(BaseModel): + upload_results: List[ProductS3UploadResult] = Field( + ..., title="업로드 결과", description="각 상품의 S3 업로드 결과" + ) + summary: S3UploadSummary = Field( + ..., title="업로드 요약", description="전체 업로드 결과 요약" + ) + uploaded_at: str = Field( + ..., title="업로드 완료 시간", description="S3 업로드 완료 시간" + ) + + +# 최종 응답 모델 +class ResponseS3Upload(ResponseBase[S3UploadData]): + """S3 이미지 업로드 API 응답""" + + pass + + # ============== 블로그 콘텐츠 생성 ============== @@ -193,6 +277,7 @@ class RequestBlogPublish(RequestBase): tag: str = Field(..., title="블로그 태그", description="블로그 플랫폼 종류") blog_id: str = Field(..., description="블로그 아이디") blog_pw: str = Field(..., description="블로그 비밀번호") + blog_name: Optional[str] = Field(None, description="블로그 이름") post_title: str = Field(..., description="포스팅 제목") post_content: str = Field(..., description="포스팅 내용") post_tags: List[str] = Field(default_factory=list, description="포스팅 태그 목록") diff --git a/apps/pre-processing-service/app/service/blog/base_blog_post_service.py b/apps/pre-processing-service/app/service/blog/base_blog_post_service.py index f55bdba0..8bc9c9a8 100644 --- a/apps/pre-processing-service/app/service/blog/base_blog_post_service.py +++ b/apps/pre-processing-service/app/service/blog/base_blog_post_service.py @@ -47,12 +47,13 @@ def _login(self) -> None: pass @abstractmethod - def _write_content(self, title: str, content: str, tags: List[str] = None) -> None: + def _write_content(self, title: str, content: str, tags: List[str] = None) -> str: """ 플랫폼별 포스팅 작성 구현 :param title: 포스트 제목 :param content: 포스트 내용 :param tags: 포스트 태그 리스트 + :return: 발행된 블로그 포스트 URL """ pass @@ -96,14 +97,15 @@ def post_content(self, title: str, content: str, tags: List[str] = None) -> Dict self._login() # 3. 포스트 작성 및 발행 - self._write_content(title, content, tags) + post_url = self._write_content(title, content, tags) # 4. 결과 반환 return { - "platform": self._get_platform_name(), - "title": title, - "content_length": len(content), + "tag": self._get_platform_name(), + "post_title": title, "tags": tags or [], + "publish_success": True, + "post_url": post_url, } def __del__(self): diff --git a/apps/pre-processing-service/app/service/blog/blog_publish_service.py b/apps/pre-processing-service/app/service/blog/blog_publish_service.py index b7727cce..0848f123 100644 --- a/apps/pre-processing-service/app/service/blog/blog_publish_service.py +++ b/apps/pre-processing-service/app/service/blog/blog_publish_service.py @@ -1,4 +1,4 @@ -from typing import Dict +from typing import Dict, Optional from app.errors.CustomException import CustomException from app.model.schemas import RequestBlogPublish from app.service.blog.blog_service_factory import BlogServiceFactory @@ -10,31 +10,37 @@ class BlogPublishService: def __init__(self): self.factory = BlogServiceFactory() - def publish_content(self, request: RequestBlogPublish) -> Dict: + def publish_content( + self, + request: RequestBlogPublish, + ) -> Dict: """ 생성된 블로그 콘텐츠를 배포합니다. 
+ + Args: + request: 블로그 발행 요청 데이터 + blog_id: 블로그 아이디 + blog_password: 블로그 비밀번호 """ try: # 팩토리를 통해 적절한 서비스 생성 - blog_service = self.factory.create_service(request.tag) + blog_service = self.factory.create_service( + request.tag, + blog_id=request.blog_id, + blog_password=request.blog_pw, + blog_name=request.blog_name, + ) # 공통 인터페이스로 포스팅 실행 - blog_service.post_content( + response_data = blog_service.post_content( title=request.post_title, content=request.post_content, tags=request.post_tags, ) - # 올바른 응답 데이터를 직접 구성 - response_data = { - "tag": request.tag, - "post_title": request.post_title, - "publish_success": True, # 포스팅 성공 가정 - } - if not response_data: raise CustomException( - f"{request.tag} 블로그 포스팅에 실패했습니다.", status_code=500 + 500, f"{request.tag} 블로그 포스팅에 실패했습니다.", "POSTING_FAIL" ) return response_data @@ -45,5 +51,5 @@ def publish_content(self, request: RequestBlogPublish) -> Dict: except Exception as e: # 예상치 못한 예외 처리 raise CustomException( - f"블로그 포스팅 중 오류가 발생했습니다: {str(e)}", status_code=500 + 500, f"블로그 포스팅 중 오류가 발생했습니다: {str(e)}", "ERROR" ) diff --git a/apps/pre-processing-service/app/service/blog/blog_service_factory.py b/apps/pre-processing-service/app/service/blog/blog_service_factory.py index b6bc6883..4759b5ab 100644 --- a/apps/pre-processing-service/app/service/blog/blog_service_factory.py +++ b/apps/pre-processing-service/app/service/blog/blog_service_factory.py @@ -1,4 +1,4 @@ -from typing import Dict, Type +from typing import Dict, Type, Optional from app.service.blog.base_blog_post_service import BaseBlogPostService from app.service.blog.naver_blog_post_service import NaverBlogPostService from app.service.blog.tistory_blog_post_service import TistoryBlogPostService @@ -11,15 +11,26 @@ class BlogServiceFactory: # 서비스 타입별 클래스 매핑 _services: Dict[str, Type[BaseBlogPostService]] = { - "naver": NaverBlogPostService, - "tistory": TistoryBlogPostService, + "naver_blog": NaverBlogPostService, + "tistory_blog": TistoryBlogPostService, "blogger": BloggerBlogPostAdapter, } @classmethod - def create_service(cls, platform: str) -> BaseBlogPostService: + def create_service( + cls, + platform: str, + blog_id: str, + blog_password: str, + blog_name: Optional[str] = None, + ) -> BaseBlogPostService: """ 플랫폼에 따른 블로그 서비스 인스턴스 생성 + + Args: + platform: 블로그 플랫폼 (naver, tistory, blogger) + blog_id: 블로그 아이디 + blog_password: 블로그 비밀번호 """ service_class = cls._services.get(platform.lower()) @@ -30,7 +41,18 @@ def create_service(cls, platform: str) -> BaseBlogPostService: status_code=400, ) - return service_class() + # 각 서비스의 설정을 의존성 주입 + if platform.lower() == "tistory_blog": + if not blog_name: + raise CustomException( + 200, + "티스토리 블로그가 존재하지않습니다.", + "NOT_FOUND_BLOG", + ) + return service_class(blog_id, blog_password, blog_name) + if platform.lower() == "blogger": + return service_class() + return service_class(blog_id, blog_password) @classmethod def get_supported_platforms(cls) -> list: diff --git a/apps/pre-processing-service/app/service/blog/blogger_blog_post_adapter.py b/apps/pre-processing-service/app/service/blog/blogger_blog_post_adapter.py index 717a102e..3f4a67e9 100644 --- a/apps/pre-processing-service/app/service/blog/blogger_blog_post_adapter.py +++ b/apps/pre-processing-service/app/service/blog/blogger_blog_post_adapter.py @@ -55,7 +55,7 @@ def _write_content(self, title: str, content: str, tags: List[str] = None) -> No def _get_platform_name(self) -> str: """플랫폼 이름 반환""" - return "Blogger" + return "BLOGGER" def _validate_content( self, title: str, content: str, tags: 
Optional[List[str]] = None diff --git a/apps/pre-processing-service/app/service/blog/naver_blog_post_service.py b/apps/pre-processing-service/app/service/blog/naver_blog_post_service.py index 0e33a9fd..702211a4 100644 --- a/apps/pre-processing-service/app/service/blog/naver_blog_post_service.py +++ b/apps/pre-processing-service/app/service/blog/naver_blog_post_service.py @@ -1,4 +1,3 @@ -import os import time import pyperclip @@ -15,13 +14,28 @@ class NaverBlogPostService(BaseBlogPostService): """네이버 블로그 포스팅 서비스 구현""" + def __init__(self, blog_id: str, blog_password: str, use_webdriver=True): + """네이버 블로그 서비스 초기화 + + Args: + blog_id: 네이버 아이디 + blog_password: 네이버 비밀번호 + use_webdriver: 웹드라이버 사용 여부 + """ + self.blog_id = blog_id + self.blog_password = blog_password + print(blog_id) + print(blog_password) + super().__init__(use_webdriver) + def _load_config(self) -> None: """네이버 블로그 설정 로드""" - - self.id = os.getenv("NAVER_ID", "all2641") - self.password = os.getenv("NAVER_PASSWORD", "cjh83520*") + self.id = self.blog_id + self.password = self.blog_password self.login_url = "https://nid.naver.com/nidlogin.login" self.post_content_url = f"https://blog.naver.com/PostWriteForm.naver?blogId={self.id}&Redirect=Write&redirect=Write&widgetTypeCall=true&noTrackingCode=true&directAccess=false" + # print(self.id) + # print(self.password) def _get_platform_name(self) -> str: return "NAVER_BLOG" @@ -93,7 +107,7 @@ def _login(self) -> None: except Exception as e: raise BlogLoginException("네이버 블로그", f"예상치 못한 오류: {str(e)}") - def _write_content(self, title: str, content: str, tags: List[str] = None) -> None: + def _write_content(self, title: str, content: str, tags: List[str] = None) -> str: """네이버 블로그 포스팅 작성 구현""" from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as EC @@ -193,8 +207,10 @@ def _write_content(self, title: str, content: str, tags: List[str] = None) -> No self.web_driver.execute_script("arguments[0].click();", final_btn) except TimeoutException: raise BlogElementInteractionException("최종 발행 버튼", "버튼 클릭") + time.sleep(5) - # 발행 완료 확인 + # 발행 완료 확인 및 URL 가져오기 + blog_url = None try: self.wait_driver.until( EC.any_of( @@ -204,8 +220,36 @@ def _write_content(self, title: str, content: str, tags: List[str] = None) -> No EC.url_contains("entry.naver"), ) ) + # 현재 URL 가져오기 + current_url = self.web_driver.current_url + + # PostView URL인 경우 해당 URL을 반환 + if "PostView.naver" in current_url or "entry.naver" in current_url: + blog_url = current_url + # postList인 경우 가장 최근 포스트 URL 찾기 + elif "postList" in current_url: + try: + # 가장 최근 포스트 링크 찾기 + recent_post = self.wait_driver.until( + EC.element_to_be_clickable( + ( + By.CSS_SELECTOR, + ".post_area .post_item:first-child .title_area a", + ) + ) + ) + blog_url = recent_post.get_attribute("href") + except TimeoutException: + # 대안으로 현재 URL 사용 + blog_url = current_url + else: + blog_url = current_url + except TimeoutException: - pass + # 발행 완료를 확인할 수 없는 경우 현재 URL 사용 + blog_url = self.web_driver.current_url + print(f"blog_url: {blog_url}") + return blog_url except (BlogElementInteractionException, BlogPostPublishException): raise diff --git a/apps/pre-processing-service/app/service/blog/tistory_blog_post_service.py b/apps/pre-processing-service/app/service/blog/tistory_blog_post_service.py index cc830bac..0b4d98d0 100644 --- a/apps/pre-processing-service/app/service/blog/tistory_blog_post_service.py +++ b/apps/pre-processing-service/app/service/blog/tistory_blog_post_service.py @@ -1,5 +1,8 @@ import os import time 
+import json +import requests +from datetime import datetime from selenium.webdriver.common.by import By from selenium.webdriver.support import expected_conditions as EC @@ -13,12 +16,27 @@ class TistoryBlogPostService(BaseBlogPostService): """티스토리 블로그 포스팅 서비스""" + def __init__( + self, blog_id: str, blog_password: str, blog_name: str, use_webdriver=True + ): + """네이버 블로그 서비스 초기화 + + Args: + blog_id: 네이버 아이디 + blog_password: 네이버 비밀번호 + use_webdriver: 웹드라이버 사용 여부 + """ + self.blog_id = blog_id + self.blog_password = blog_password + self.blog_name = blog_name + super().__init__(use_webdriver) + def _load_config(self) -> None: """티스토리 블로그 설정 로드""" - self.blog_name = os.getenv("TISTORY_BLOG_NAME", "hoons2641") - self.id = os.getenv("TISTORY_ID", "fair_05@nate.com") - self.password = os.getenv("TISTORY_PASSWORD", "kdyn264105*") + self.blog_name = self.blog_name + self.id = self.blog_id + self.password = self.blog_password self.login_url = "https://accounts.kakao.com/login/?continue=https%3A%2F%2Fkauth.kakao.com%2Foauth%2Fauthorize%3Fclient_id%3D3e6ddd834b023f24221217e370daed18%26state%3DaHR0cHM6Ly93d3cudGlzdG9yeS5jb20v%26redirect_uri%3Dhttps%253A%252F%252Fwww.tistory.com%252Fauth%252Fkakao%252Fredirect%26response_type%3Dcode%26auth_tran_id%3Dslj3F.mFC~2JNOiCOGi5HdGPKOA.Pce4l5tiS~3fZkInLGuEG3tMq~xZkxx4%26ka%3Dsdk%252F2.7.3%2520os%252Fjavascript%2520sdk_type%252Fjavascript%2520lang%252Fko-KR%2520device%252FMacIntel%2520origin%252Fhttps%25253A%25252F%25252Fwww.tistory.com%26is_popup%3Dfalse%26through_account%3Dtrue&talk_login=hidden#login" self.post_content_url = f"https://{self.blog_name}.tistory.com/manage/newpost" @@ -90,7 +108,60 @@ def _login(self) -> None: except Exception as e: raise BlogLoginException("티스토리 블로그", f"예상치 못한 오류: {str(e)}") - def _write_content(self, title: str, content: str, tags: List[str] = None) -> None: + def _get_post_url_from_api(self, title: str) -> str: + """API를 통해 제목이 일치하는 가장 최근 포스트의 URL을 가져옴""" + try: + # 현재 세션의 쿠키를 가져와서 API 요청에 사용 + cookies = self.web_driver.get_cookies() + session_cookies = {} + for cookie in cookies: + session_cookies[cookie["name"]] = cookie["value"] + + # 포스트 목록 API 호출 + api_url = f"https://{self.blog_name}.tistory.com/manage/posts.json" + params = { + "category": "-3", + "page": "1", + "searchKeyword": "", + "searchType": "title", + "visibility": "all", + } + + response = requests.get(api_url, params=params, cookies=session_cookies) + + if response.status_code == 200: + data = response.json() + items = data.get("items", []) + + # 제목이 일치하는 포스트들 찾기 + matching_posts = [item for item in items if item["title"] == title] + + if matching_posts: + # created 시간으로 정렬하여 가장 최근 포스트 찾기 + latest_post = max( + matching_posts, + key=lambda x: datetime.strptime(x["created"], "%Y-%m-%d %H:%M"), + ) + return latest_post["permalink"] + else: + # 매칭되는 포스트가 없으면 가장 최근 포스트 반환 + if items: + latest_post = max( + items, + key=lambda x: datetime.strptime( + x["created"], "%Y-%m-%d %H:%M" + ), + ) + return latest_post["permalink"] + + # API 호출 실패 시 블로그 메인 URL 반환 + return f"https://{self.blog_name}.tistory.com" + + except Exception: + # 오류 발생 시 블로그 메인 URL 반환 + return f"https://{self.blog_name}.tistory.com" + + def _write_content(self, title: str, content: str, tags: List[str] = None) -> str: """티스토리 블로그 포스팅 작성 구현""" try: @@ -231,6 +302,11 @@ def _write_content(self, title: str, content: str, tags: List[str] = None) -> No "티스토리 블로그", "발행 과정에서 오류가 발생했습니다" ) + # 발행 완료 확인 및 URL 가져오기 + time.sleep(3) # 발행 완료 대기 + blog_url = self._get_post_url_from_api(title) + return blog_url + 
except (BlogElementInteractionException, BlogPostPublishException): raise except TimeoutException: diff --git a/apps/pre-processing-service/app/service/crawl_service.py b/apps/pre-processing-service/app/service/crawl_service.py index df90ba01..e8785f64 100644 --- a/apps/pre-processing-service/app/service/crawl_service.py +++ b/apps/pre-processing-service/app/service/crawl_service.py @@ -1,4 +1,5 @@ import time +import asyncio from app.service.crawlers.detail_crawler import DetailCrawler from app.errors.CustomException import InvalidItemDataException from app.model.schemas import RequestSadaguCrawl @@ -12,45 +13,133 @@ def __init__(self): async def crawl_product_detail(self, request: RequestSadaguCrawl) -> dict: """ - 선택된 상품의 상세 정보를 크롤링하는 비즈니스 로직입니다. (5단계) - 상품 URL을 입력받아 상세 정보를 크롤링하여 딕셔너리로 반환합니다. + 선택된 상품들의 상세 정보를 크롤링하는 비즈니스 로직입니다. (5단계) + 여러 상품 URL을 입력받아 순차적으로 상세 정보를 크롤링하여 딕셔너리로 반환합니다. """ - crawler = DetailCrawler(use_selenium=True) + product_urls = [str(url) for url in request.product_urls] + + logger.info(f"상품 상세 크롤링 서비스 시작: 총 {len(product_urls)}개 상품") + + crawled_products = [] + success_count = 0 + fail_count = 0 try: - logger.info( - f"상품 상세 크롤링 서비스 시작: product_url={request.product_url}" + # 각 상품을 순차적으로 크롤링 (안정성 확보) + for i, product_url in enumerate(product_urls, 1): + logger.info(f"상품 {i}/{len(product_urls)} 크롤링 시작: {product_url}") + + crawler = DetailCrawler(use_selenium=True) + + try: + # 상세 정보 크롤링 실행 + product_detail = await crawler.crawl_detail(product_url) + + if product_detail: + product_title = product_detail.get("title", "Unknown")[:50] + logger.success( + f"상품 {i} 크롤링 성공: title='{product_title}', price={product_detail.get('price', 0)}" + ) + + # 성공한 상품 추가 + crawled_products.append( + { + "index": i, + "url": product_url, + "product_detail": product_detail, + "status": "success", + "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"), + } + ) + success_count += 1 + else: + logger.error(f"상품 {i} 크롤링 실패: 상세 정보 없음") + crawled_products.append( + { + "index": i, + "url": product_url, + "product_detail": None, + "status": "failed", + "error": "상세 정보 없음", + "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"), + } + ) + fail_count += 1 + + except Exception as e: + logger.error( + f"상품 {i} 크롤링 오류: url={product_url}, error='{e}'" + ) + crawled_products.append( + { + "index": i, + "url": product_url, + "product_detail": None, + "status": "failed", + "error": str(e), + "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"), + } + ) + fail_count += 1 + + finally: + # 각 크롤러 개별 정리 + await crawler.close() + + # 상품간 간격 (서버 부하 방지) + if i < len(product_urls): + await asyncio.sleep(1) + + logger.success( + f"전체 크롤링 완료: 총 {len(product_urls)}개, 성공 {success_count}개, 실패 {fail_count}개" ) - # 상세 정보 크롤링 실행 - product_detail = await crawler.crawl_detail( - product_url=str(request.product_url), include_images=False + # 응답 데이터 구성 + data = { + "crawled_products": crawled_products, + "success_count": success_count, + "fail_count": fail_count, + "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"), + } + + logger.info( + f"상품 상세 크롤링 서비스 완료: success_rate={success_count}/{len(product_urls)}" ) + return Response.ok(data) + + except Exception as e: + logger.error(f"배치 크롤링 서비스 오류: error='{e}'") + raise InvalidItemDataException() + + # 기존 단일 크롤링 메서드도 유지 (하위 호환성) + async def crawl_single_product_detail(self, product_url: str) -> dict: + """ + 단일 상품 크롤링 (하위 호환성용) + """ + crawler = DetailCrawler(use_selenium=True) + + try: + logger.info(f"단일 상품 크롤링 시작: {product_url}") + + product_detail = await crawler.crawl_detail(product_url) if not 
product_detail: - logger.error(f"상품 상세 정보 크롤링 실패: url={request.product_url}") + logger.error(f"상품 상세 정보 크롤링 실패: url={product_url}") raise InvalidItemDataException() product_title = product_detail.get("title", "Unknown")[:50] - logger.success( - f"크롤링 완료: title='{product_title}', price={product_detail.get('price', 0)}, options_count={len(product_detail.get('options', []))}" - ) + logger.success(f"크롤링 완료: title='{product_title}'") - # 응답 데이터 구성 data = { - "product_url": str(request.product_url), + "product_url": product_url, "product_detail": product_detail, "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"), } - logger.info(f"상품 상세 크롤링 서비스 완료: status=success") return Response.ok(data) except Exception as e: - logger.error( - f"크롤링 서비스 오류: product_url={request.product_url}, error='{e}'" - ) + logger.error(f"단일 크롤링 오류: url={product_url}, error='{e}'") raise InvalidItemDataException() finally: await crawler.close() - logger.debug("크롤러 리소스 정리 완료") diff --git a/apps/pre-processing-service/app/service/crawlers/detail_crawler.py b/apps/pre-processing-service/app/service/crawlers/detail_crawler.py index 885fd2f0..f01ed53a 100644 --- a/apps/pre-processing-service/app/service/crawlers/detail_crawler.py +++ b/apps/pre-processing-service/app/service/crawlers/detail_crawler.py @@ -8,19 +8,17 @@ class DetailCrawler(SearchCrawler): """SearchCrawler를 확장한 상세 크롤링 클래스""" - async def crawl_detail( - self, product_url: str, include_images: bool = False - ) -> dict: - """상품 상세 정보 크롤링""" + async def crawl_detail(self, product_url: str) -> dict: + """상품 상세 정보 크롤링 (이미지 항상 포함)""" try: - logger.info( - f"상품 상세 크롤링 시작: url='{product_url}', include_images={include_images}" - ) + logger.info(f"상품 상세 크롤링 시작: url='{product_url}'") - if self.use_selenium: - soup = await self._get_soup_selenium(product_url) - else: - soup = await self._get_soup_httpx(product_url) + # HTML 가져오기 + soup = ( + await self._get_soup_selenium(product_url) + if self.use_selenium + else await self._get_soup_httpx(product_url) + ) # 기본 정보 추출 title = self._extract_title(soup) @@ -29,6 +27,15 @@ async def crawl_detail( options = self._extract_options(soup) material_info = self._extract_material_info(soup) + # 이미지 정보 추출 (항상 실행) + logger.info("이미지 정보 추출 중...") + page_images = self._extract_images(soup) + option_images = [ + opt["image_url"] for opt in options if opt.get("image_url") + ] + # 중복 제거 후 합치기 + all_images = list(set(page_images + option_images)) + product_data = { "url": product_url, "title": title, @@ -37,23 +44,13 @@ async def crawl_detail( "options": options, "material_info": material_info, "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"), + "product_images": [{"original_url": url} for url in all_images], } + logger.info(f"추출된 이미지: {len(all_images)}개") logger.info( - f"기본 상품 정보 추출 완료: title='{title[:50]}', price={price}, rating={rating}, options_count={len(options)}" + f"상품 상세 크롤링 완료: title='{title[:50]}', price={price}, rating={rating}, options_count={len(options)}" ) - - if include_images: - logger.info("이미지 정보 추출 중...") - product_images = self._extract_images(soup) - product_data["product_images"] = [ - {"original_url": img_url} for img_url in product_images - ] - logger.info(f"추출된 이미지: {len(product_images)}개") - else: - product_data["product_images"] = [] - - logger.info(f"상품 상세 크롤링 완료: url='{product_url}'") return product_data except Exception as e: @@ -89,54 +86,43 @@ async def _get_soup_httpx(self, product_url: str) -> BeautifulSoup: raise Exception(f"HTTP 요청 실패: {e}") def _extract_title(self, soup: BeautifulSoup) -> str: - """제목 추출""" 
title_element = soup.find("h1", {"id": "kakaotitle"}) title = title_element.get_text(strip=True) if title_element else "제목 없음" logger.debug(f"제목 추출: '{title[:50]}'") return title def _extract_price(self, soup: BeautifulSoup) -> int: - """가격 추출""" price = 0 - price_selectors = [ + selectors = [ "span.price.gsItemPriceKWR", ".pdt_price span.price", "span.price", ".price", ] - - for selector in price_selectors: - price_element = soup.select_one(selector) - if price_element: - price_text = ( - price_element.get_text(strip=True) - .replace(",", "") - .replace("원", "") - ) - price_match = re.search(r"(\d+)", price_text) - if price_match: - price = int(price_match.group(1)) - logger.debug(f"가격 추출 성공: {price}원 (selector: {selector})") + for sel in selectors: + el = soup.select_one(sel) + if el: + text = el.get_text(strip=True).replace(",", "").replace("원", "") + match = re.search(r"(\d+)", text) + if match: + price = int(match.group(1)) + logger.debug(f"가격 추출 성공: {price}원 (selector: {sel})") break - if price == 0: logger.debug("가격 추출 실패 - 0원으로 설정") - return price def _extract_rating(self, soup: BeautifulSoup) -> float: - """평점 추출""" rating = 0.0 - star_containers = [ + containers = [ soup.find("a", class_="start"), soup.find("div", class_=re.compile(r"star|rating")), soup.find("a", href="#reviews_wrap"), ] - - for container in star_containers: - if container: - star_imgs = container.find_all("img") - for img in star_imgs: + for cont in containers: + if cont: + imgs = cont.find_all("img") + for img in imgs: src = img.get("src", "") if "icon_star.svg" in src: rating += 1 @@ -145,88 +131,57 @@ def _extract_rating(self, soup: BeautifulSoup) -> float: if rating > 0: logger.debug(f"평점 추출 성공: {rating}점") break - if rating == 0.0: logger.debug("평점 추출 실패 - 0.0점으로 설정") - return rating def _extract_options(self, soup: BeautifulSoup) -> list[dict]: - """상품 옵션 추출""" options = [] sku_list = soup.find("ul", {"id": "skubox"}) - if sku_list: - option_items = sku_list.find_all("li", class_=re.compile(r"imgWrapper")) - logger.debug(f"옵션 항목 발견: {len(option_items)}개") - - for item in option_items: - title_element = item.find("a", title=True) - if title_element: - option_name = title_element.get("title", "").strip() - - # 재고 정보 추출 + items = sku_list.find_all("li", class_=re.compile(r"imgWrapper")) + for item in items: + title_el = item.find("a", title=True) + if title_el: + name = title_el.get("title", "").strip() stock = 0 - item_text = item.get_text() - stock_match = re.search(r"재고\s*:\s*(\d+)", item_text) + stock_match = re.search(r"재고\s*:\s*(\d+)", item.get_text()) if stock_match: stock = int(stock_match.group(1)) - - # 이미지 URL 추출 - img_element = item.find("img", class_="colorSpec_hashPic") - image_url = "" - if img_element and img_element.get("src"): - image_url = img_element["src"] - - if option_name: + img_el = item.find("img", class_="colorSpec_hashPic") + img_url = img_el["src"] if img_el and img_el.get("src") else "" + if name: options.append( - { - "name": option_name, - "stock": stock, - "image_url": image_url, - } + {"name": name, "stock": stock, "image_url": img_url} ) - logger.debug(f"옵션 추출: name='{option_name}', stock={stock}") - logger.info(f"총 {len(options)}개 옵션 추출 완료") return options def _extract_material_info(self, soup: BeautifulSoup) -> dict: - """소재 정보 추출""" material_info = {} - info_items = soup.find_all("div", class_="pro-info-item") - - for item in info_items: - title_element = item.find("div", class_="pro-info-title") - info_element = item.find("div", class_="pro-info-info") - - if 
title_element and info_element: - title = title_element.get_text(strip=True) - info = info_element.get_text(strip=True) - material_info[title] = info - logger.debug(f"소재 정보 추출: {title}='{info}'") - + items = soup.find_all("div", class_="pro-info-item") + for item in items: + title_el = item.find("div", class_="pro-info-title") + info_el = item.find("div", class_="pro-info-info") + if title_el and info_el: + material_info[title_el.get_text(strip=True)] = info_el.get_text( + strip=True + ) logger.info(f"총 {len(material_info)}개 소재 정보 추출 완료") return material_info def _extract_images(self, soup: BeautifulSoup) -> list[str]: - """상품 이미지 추출""" images = [] + # img_translate_x 패턴 img_elements = soup.find_all("img", {"id": re.compile(r"img_translate_\d+")}) - for img in img_elements: - src = img.get("src", "") - if src: - if src.startswith("//"): - src = "https:" + src - elif src.startswith("/"): - src = self.base_url + src - elif src.startswith("http"): - pass - else: - continue - images.append(src) - logger.debug(f"이미지 URL 추출: {src}") - + src = img.get("src") or img.get("data-src") + if not src: + continue + if src.startswith("//"): + src = "https:" + src + elif src.startswith("/"): + src = self.base_url + src + images.append(src) logger.info(f"총 {len(images)}개 이미지 URL 추출 완료") return images diff --git a/apps/pre-processing-service/app/service/crawlers/search_crawler.py b/apps/pre-processing-service/app/service/crawlers/search_crawler.py index a0d46e02..1bc36fc5 100644 --- a/apps/pre-processing-service/app/service/crawlers/search_crawler.py +++ b/apps/pre-processing-service/app/service/crawlers/search_crawler.py @@ -49,7 +49,7 @@ async def search_products_selenium(self, keyword: str) -> list[dict]: logger.info( f"Selenium으로 발견한 상품 링크: {len(unique_products)}개 (중복 제거 전: {len(product_links)}개)" ) - return unique_products[:20] + return unique_products[:40] except Exception as e: logger.error(f"Selenium 검색 오류: keyword='{keyword}', error='{e}'") @@ -88,7 +88,7 @@ async def search_products_httpx(self, keyword: str) -> list[dict]: product_links.append({"url": full_url, "title": title}) logger.info(f"httpx로 발견한 상품 링크: {len(product_links)}개") - return product_links[:20] + return product_links[:40] except Exception as e: logger.error(f"httpx 검색 오류: keyword='{keyword}', error='{e}'") diff --git a/apps/pre-processing-service/app/service/s3_upload_service.py b/apps/pre-processing-service/app/service/s3_upload_service.py new file mode 100644 index 00000000..1c024a63 --- /dev/null +++ b/apps/pre-processing-service/app/service/s3_upload_service.py @@ -0,0 +1,125 @@ +import time +import asyncio +import aiohttp +from typing import List, Dict +from loguru import logger +from app.errors.CustomException import InvalidItemDataException +from app.model.schemas import RequestS3Upload +from app.utils.s3_upload_util import S3UploadUtil +from app.utils.response import Response + + +class S3UploadService: + """6단계: 크롤링된 상품 이미지들과 데이터를 S3에 업로드하는 서비스""" + + def __init__(self): + self.s3_util = S3UploadUtil() + + async def upload_crawled_products_to_s3(self, request: RequestS3Upload) -> dict: + """ + 크롤링된 상품들의 이미지와 데이터를 S3에 업로드하는 비즈니스 로직 (6단계) + """ + keyword = request.keyword # 키워드 추가 + crawled_products = request.crawled_products + base_folder = ( + request.base_folder or "product" + ) # 🔸 기본값 변경: product-images → product + + logger.info( + f"S3 업로드 서비스 시작: keyword='{keyword}', {len(crawled_products)}개 상품" + ) + + upload_results = [] + total_success_images = 0 + total_fail_images = 0 + + try: + # HTTP 세션을 사용한 이미지 다운로드 + async 
with aiohttp.ClientSession() as session: + + # 각 상품별로 순차 업로드 + for product_info in crawled_products: + product_index = product_info.get("index", 0) + product_detail = product_info.get("product_detail") + + logger.info( + f"상품 {product_index}/{len(crawled_products)} S3 업로드 시작" + ) + + # 크롤링 실패한 상품은 스킵 + if not product_detail or product_info.get("status") != "success": + logger.warning( + f"상품 {product_index}: 크롤링 실패로 인한 업로드 스킵" + ) + upload_results.append( + { + "product_index": product_index, + "product_title": "Unknown", + "status": "skipped", + "folder_s3_url": None, + "uploaded_images": [], + "success_count": 0, + "fail_count": 0, + } + ) + continue + + try: + # 상품 이미지 + 데이터 업로드 (키워드 전달 추가!) + # 🔸 전체 크롤링 데이터를 전달 (product_detail이 아닌 product_info 전체) + upload_result = await self.s3_util.upload_single_product_images( + session, + product_info, + product_index, + keyword, + base_folder, # product_detail → product_info + ) + + upload_results.append(upload_result) + total_success_images += upload_result["success_count"] + total_fail_images += upload_result["fail_count"] + + logger.success( + f"상품 {product_index} S3 업로드 완료: 성공 {upload_result['success_count']}개, " + f"실패 {upload_result['fail_count']}개" + ) + + except Exception as e: + logger.error(f"상품 {product_index} S3 업로드 오류: {e}") + upload_results.append( + { + "product_index": product_index, + "product_title": product_detail.get("title", "Unknown"), + "status": "error", + "folder_s3_url": None, + "uploaded_images": [], + "success_count": 0, + "fail_count": 0, + } + ) + + # 상품간 간격 (서버 부하 방지) + if product_index < len(crawled_products): + await asyncio.sleep(1) + + logger.success( + f"S3 업로드 서비스 완료: 총 성공 이미지 {total_success_images}개, 총 실패 이미지 {total_fail_images}개" + ) + + # 간소화된 응답 데이터 구성 + data = { + "upload_results": upload_results, + "summary": { + "total_products": len(crawled_products), + "total_success_images": total_success_images, + "total_fail_images": total_fail_images, + }, + "uploaded_at": time.strftime("%Y-%m-%d %H:%M:%S"), + } + + message = f"S3 업로드 완료: {total_success_images}개 이미지 업로드 성공, 상품 데이터 JSON 파일 포함" + return Response.ok(data, message) + + except Exception as e: + logger.error(f"S3 업로드 서비스 전체 오류: {e}") + raise InvalidItemDataException() diff --git a/apps/pre-processing-service/app/service/search_service.py b/apps/pre-processing-service/app/service/search_service.py index 171bd57f..070f6cc2 100644 --- a/apps/pre-processing-service/app/service/search_service.py +++ b/apps/pre-processing-service/app/service/search_service.py @@ -77,9 +77,9 @@ async def search_products(self, request: RequestSadaguSearch) -> dict: logger.debug(f"상품 {i + 1}: 제목 추출 실패, 제외") continue - # 최대 20개까지만 처리 - if len(enriched_results) >= 20: - logger.info("최대 20개 상품 수집 완료") + # 최대 40개까지 처리 + if len(enriched_results) >= 40: + logger.info("최대 40개 상품 수집 완료") break except Exception as e: diff --git a/apps/pre-processing-service/app/service/similarity_service.py b/apps/pre-processing-service/app/service/similarity_service.py index 516b0c63..cf943279 100644 --- a/apps/pre-processing-service/app/service/similarity_service.py +++ b/apps/pre-processing-service/app/service/similarity_service.py @@ -9,16 +9,19 @@ class SimilarityService: def __init__(self): pass - def select_product_by_similarity(self, request: RequestSadaguSimilarity) -> dict: + def select_top_products_by_similarity( + self, request: RequestSadaguSimilarity + ) -> dict: """ - BERT 기반 유사도 분석 후 상품 선택 - 4단계 + 형태소 분석 후 Top 10 선택 (10개 이하면 유사도 분석 생략) """ keyword = request.keyword candidates = 
request.matched_products fallback_products = request.search_results or [] + top_count = 10 # Top 10 개수 설정 logger.info( - f"유사도 분석 서비스 시작: keyword='{keyword}', matched_count={len(candidates) if candidates else 0}, fallback_count={len(fallback_products)}" + f"상품 선택 서비스 시작 (Top {top_count}): keyword='{keyword}', matched_count={len(candidates) if candidates else 0}, fallback_count={len(fallback_products)}" ) # 매칭된 상품이 없으면 전체 검색 결과로 폴백 @@ -30,130 +33,151 @@ def select_product_by_similarity(self, request: RequestSadaguSimilarity) -> dict data = { "keyword": keyword, - "selected_product": None, + "top_products": [], "reason": "매칭된 상품과 검색 결과가 모두 없음", } return Response.ok(data, "매칭된 상품과 검색 결과가 모두 없습니다.") - logger.info("매칭된 상품 없음 → 전체 검색 결과에서 유사도 분석") + logger.info("매칭된 상품 없음 → 전체 검색 결과에서 유사도 분석 진행") candidates = fallback_products analysis_mode = "fallback_similarity_only" + skip_similarity = False else: analysis_mode = "matched_products" + # 형태소 분석 결과가 10개 이하면 유사도 분석 생략 + skip_similarity = len(candidates) <= top_count try: - analyzer = SimilarityAnalyzerONNX() - - logger.info( - f"키워드 '{keyword}'와 {len(candidates)}개 상품의 유사도 분석 시작... (모드: {analysis_mode})" - ) - - # 한 개만 있으면 바로 선택 - if len(candidates) == 1: - selected_product = candidates[0] - - logger.info("단일 후보 상품 - 유사도 검증 진행") - # 유사도 계산 - similarity = analyzer.calculate_similarity( - keyword, selected_product["title"] + # 형태소 분석 결과가 10개 이하면 유사도 분석 생략하고 바로 반환 + if skip_similarity and analysis_mode == "matched_products": + logger.info( + f"형태소 분석 결과가 {len(candidates)}개로 {top_count}개 이하 - 유사도 분석 생략" ) - # 폴백 모드에서는 임계값 검증 - if analysis_mode == "fallback_similarity_only": - similarity_threshold = 0.3 - if similarity < similarity_threshold: - logger.warning( - f"단일 상품 유사도 미달: similarity={similarity:.4f} < threshold={similarity_threshold}" - ) - data = { - "keyword": keyword, - "selected_product": None, - "reason": f"단일 상품 유사도({similarity:.4f}) < 기준({similarity_threshold})", - } - return Response.ok( - data, "단일 상품 유사도 미달 되어 상품이 존재하지않습니다." - ) - - selected_product["similarity_info"] = { - "similarity_score": float(similarity), - "analysis_type": "single_candidate", - "analysis_mode": analysis_mode, - } + # 매칭 스코어 기준으로 정렬된 상태 유지 (이미 match_service에서 정렬됨) + top_products = [] + for i, product in enumerate(candidates): + enhanced_product = product.copy() + enhanced_product["rank"] = i + 1 + enhanced_product["selection_info"] = { + "selection_type": "match_only", + "match_score": product.get("match_info", {}).get( + "match_score", 0.0 + ), + "reason": "형태소 분석만으로 선택 (유사도 분석 생략)", + "total_candidates": len(candidates), + } + top_products.append(enhanced_product) logger.success( - f"단일 상품 선택 완료: title='{selected_product['title'][:30]}', similarity={similarity:.4f}" + f"형태소 분석만으로 상품 선택 완료: keyword='{keyword}', selected_count={len(top_products)}" ) + data = { "keyword": keyword, - "selected_product": selected_product, - "reason": f"단일 상품 - 유사도: {similarity:.4f} ({analysis_mode})", + "top_products": top_products, + "reason": f"형태소 분석 결과 {len(candidates)}개 - 유사도 분석 생략", } return Response.ok(data) - # 여러 개가 있으면 유사도 비교 - logger.info("여러 상품 중 최고 유사도로 선택...") + # 유사도 분석 필요한 경우 (매칭 결과가 10개 초과이거나 폴백 모드) + analyzer = SimilarityAnalyzerONNX() + + logger.info( + f"키워드 '{keyword}'와 {len(candidates)}개 상품의 유사도 분석 시작... 
(모드: {analysis_mode})" + ) - # 제목만 추출해서 배치 분석 + # 모든 후보에 대해 유사도 계산 titles = [product["title"] for product in candidates] similarity_results = analyzer.analyze_similarity_batch(keyword, titles) - # 결과 출력 - logger.info("유사도 분석 결과:") - for i, result in enumerate(similarity_results[:5]): # 상위 5개만 로그 - logger.info( - f" {i+1}위: {result['title'][:40]} | 유사도: {result['similarity']:.4f}" - ) + # 유사도 정보 추가 및 Top 10 선택 + enhanced_products = [] + similarity_threshold = ( + 0.3 if analysis_mode == "fallback_similarity_only" else 0.0 + ) - # 최고 유사도 선택 - best_result = similarity_results[0] - selected_product = candidates[best_result["index"]].copy() + for i, result in enumerate(similarity_results): + product = candidates[result["index"]].copy() - # 폴백 모드에서는 임계값 검증 - similarity_threshold = 0.3 - if ( - analysis_mode == "fallback_similarity_only" - and best_result["similarity"] < similarity_threshold - ): - logger.warning( - f"최고 유사도 미달: similarity={best_result['similarity']:.4f} < threshold={similarity_threshold}" - ) - data = { - "keyword": keyword, - "selected_product": None, - "reason": f"최고 유사도({best_result['similarity']:.4f}) < 기준({similarity_threshold})", + # 폴백 모드에서는 임계값 검증 + if ( + analysis_mode == "fallback_similarity_only" + and result["similarity"] < similarity_threshold + ): + logger.debug( + f"상품 {i + 1} 유사도 미달로 제외: similarity={result['similarity']:.4f} < threshold={similarity_threshold}" + ) + continue + + product["similarity_info"] = { + "similarity_score": result["similarity"], + "analysis_type": "batch_similarity", + "analysis_mode": analysis_mode, } - return Response.ok(data, "최고 유사도가 기준보다 미달 되었습니다.") - - # 유사도 정보 추가 - selected_product["similarity_info"] = { - "similarity_score": best_result["similarity"], - "analysis_type": "multi_candidate_bert", - "analysis_mode": analysis_mode, - "rank": 1, - "total_candidates": len(candidates), - } - # 매칭 모드에서는 종합 점수도 계산 - if analysis_mode == "matched_products" and "match_info" in selected_product: - match_score = selected_product["match_info"]["match_score"] - similarity_score = best_result["similarity"] - # 가중치: 매칭 40%, 유사도 60% - final_score = match_score * 0.4 + similarity_score * 0.6 - selected_product["final_score"] = final_score - reason = f"종합점수({final_score:.4f}) = 매칭({match_score:.4f})*0.4 + 유사도({similarity_score:.4f})*0.6" - logger.info( - f"종합 점수 계산: match_score={match_score:.4f}, similarity_score={similarity_score:.4f}, final_score={final_score:.4f}" + # 매칭 모드에서는 종합 점수 계산 + if analysis_mode == "matched_products" and "match_info" in product: + match_score = product["match_info"]["match_score"] + similarity_score = result["similarity"] + # 가중치: 매칭 40%, 유사도 60% + final_score = match_score * 0.4 + similarity_score * 0.6 + product["final_score"] = final_score + product["selection_info"] = { + "selection_type": "match_and_similarity", + "match_score": match_score, + "similarity_score": similarity_score, + "final_score": final_score, + "reason": f"종합점수({final_score:.4f}) = 매칭({match_score:.4f})*0.4 + 유사도({similarity_score:.4f})*0.6", + } + else: + product["selection_info"] = { + "selection_type": "similarity_only", + "similarity_score": result["similarity"], + "reason": f"유사도({result['similarity']:.4f}) 기준 선택 ({analysis_mode})", + } + + enhanced_products.append(product) + + # 종합 점수 또는 유사도 기준으로 재정렬 + if analysis_mode == "matched_products": + enhanced_products.sort( + key=lambda x: x.get( + "final_score", x["similarity_info"]["similarity_score"] + ), + reverse=True, ) else: - reason = f"유사도({best_result['similarity']:.4f}) 기준 선택 
({analysis_mode})" + enhanced_products.sort( + key=lambda x: x["similarity_info"]["similarity_score"], reverse=True + ) + + # Top 10 선택 + top_products = enhanced_products[:top_count] + + # 순위 정보 추가 + for i, product in enumerate(top_products): + product["rank"] = i + 1 logger.success( - f"상품 선택 완료: title='{selected_product['title'][:30]}', {reason}" + f"유사도 분석 완료: keyword='{keyword}', total_analyzed={len(candidates)}, valid_results={len(enhanced_products)}, top_selected={len(top_products)}" ) + + if top_products: + best_product = top_products[0] + if "final_score" in best_product: + logger.info( + f"1위 상품: title='{best_product['title'][:30]}', final_score={best_product['final_score']:.4f}" + ) + else: + logger.info( + f"1위 상품: title='{best_product['title'][:30]}', similarity={best_product['similarity_info']['similarity_score']:.4f}" + ) + data = { "keyword": keyword, - "selected_product": selected_product, - "reason": reason, + "top_products": top_products, + "reason": f"유사도 분석 후 상위 {len(top_products)}개 선택 ({analysis_mode})", } return Response.ok(data) diff --git a/apps/pre-processing-service/app/utils/s3_upload_util.py b/apps/pre-processing-service/app/utils/s3_upload_util.py new file mode 100644 index 00000000..0aaa5ace --- /dev/null +++ b/apps/pre-processing-service/app/utils/s3_upload_util.py @@ -0,0 +1,281 @@ +import os +import json +import boto3 +import aiohttp +import asyncio +from datetime import datetime +from urllib.parse import urlparse +from typing import Dict, Optional +from loguru import logger + + +class S3UploadUtil: + """S3 업로드 전용 유틸리티 클래스""" + + def __init__(self): + # 환경변수에서 AWS 설정 읽기 + self.aws_access_key = os.getenv("AWS_ACCESS_KEY_ID") + self.aws_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY") + self.bucket_name = os.getenv("S3_BUCKET_NAME", "icebang4-dev-bucket") + self.region = os.getenv("AWS_REGION", "ap-northeast-2") + + if not self.aws_access_key or not self.aws_secret_key: + raise ValueError( + "AWS_ACCESS_KEY_ID와 AWS_SECRET_ACCESS_KEY 환경변수가 필요합니다" + ) + + self.base_url = f"https://{self.bucket_name}.s3.{self.region}.amazonaws.com" + + # S3 클라이언트 초기화 + self.s3_client = boto3.client( + "s3", + aws_access_key_id=self.aws_access_key, + aws_secret_access_key=self.aws_secret_key, + region_name=self.region, + ) + + logger.info( + f"S3 클라이언트 초기화 완료: bucket={self.bucket_name}, region={self.region}" + ) + + async def download_image( + self, session: aiohttp.ClientSession, image_url: str + ) -> Optional[bytes]: + """이미지 URL에서 이미지 데이터 다운로드""" + try: + logger.debug(f"이미지 다운로드 시작: {image_url}") + + async with session.get( + image_url, timeout=aiohttp.ClientTimeout(total=30) + ) as response: + if response.status == 200: + image_data = await response.read() + logger.debug(f"이미지 다운로드 완료: {len(image_data)} bytes") + return image_data + else: + logger.warning( + f"이미지 다운로드 실패: {image_url}, status={response.status}" + ) + return None + + except Exception as e: + logger.error(f"이미지 다운로드 오류: {image_url}, error={e}") + return None + + def get_file_extension(self, image_url: str) -> str: + """URL에서 파일 확장자 추출""" + parsed = urlparse(image_url) + path = parsed.path.lower() + + # 일반적인 이미지 확장자 확인 + for ext in [".jpg", ".jpeg", ".png", ".gif", ".webp"]: + if ext in path: + return ext + + # 기본값 + return ".jpg" + + def get_content_type(self, file_extension: str) -> str: + """파일 확장자에 따른 Content-Type 반환""" + content_types = { + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".png": "image/png", + ".gif": "image/gif", + ".webp": "image/webp", + } + return content_types.get(file_extension, 
"image/jpeg") + + def upload_to_s3( + self, data: bytes, s3_key: str, content_type: str = "image/jpeg" + ) -> bool: + """S3에 데이터 업로드 (이미지 또는 JSON)""" + try: + logger.debug(f"S3 업로드 시작: key={s3_key}") + + self.s3_client.put_object( + Bucket=self.bucket_name, + Key=s3_key, + Body=data, + ContentType=content_type, + ) + + logger.debug(f"S3 업로드 완료: key={s3_key}") + return True + + except Exception as e: + logger.error(f"S3 업로드 오류: key={s3_key}, error={e}") + return False + + def upload_json_to_s3(self, json_data: Dict, s3_key: str) -> bool: + """JSON 데이터를 S3에 업로드""" + try: + json_str = json.dumps(json_data, ensure_ascii=False, indent=2) + json_bytes = json_str.encode("utf-8") + + return self.upload_to_s3(json_bytes, s3_key, "application/json") + + except Exception as e: + logger.error(f"JSON S3 업로드 오류: key={s3_key}, error={e}") + return False + + def generate_product_folder_name(self, product_index: int, keyword: str) -> str: + """상품별 폴더명 생성 (시간_키워드_번호)""" + # 키워드에서 특수문자 제거 + safe_keyword = ( + keyword.replace("/", "-") + .replace("\\", "-") + .replace(" ", "_") + .replace(":", "-") + .replace("*", "-") + .replace("?", "-") + .replace('"', "-") + .replace("<", "-") + .replace(">", "-") + .replace("|", "-")[:20] # 길이 제한 + ) + + # 날짜 형식: 20250922 + date_str = datetime.now().strftime("%Y%m%d") + + # 폴더명: 20250922_키워드_1 + folder_name = f"{date_str}_{safe_keyword}_{product_index}" + + return folder_name + + def generate_s3_key( + self, + base_folder: str, + folder_name: str, + file_name: str, + ) -> str: + """S3 키 생성""" + # 최종 S3 키: product/20250922_산리오_1/image_001.jpg 또는 product_data.json + s3_key = f"{base_folder}/{folder_name}/{file_name}" + return s3_key + + def get_s3_url(self, s3_key: str) -> str: + """S3 키에서 접근 가능한 URL 생성""" + return f"{self.base_url}/{s3_key}" + + async def upload_single_product_images( + self, + session: aiohttp.ClientSession, + product_info: Dict, # 🔸 이름 변경: product_data → product_info (전체 크롤링 데이터) + product_index: int, + keyword: str, # 키워드 파라미터 추가 + base_folder: str = "product", # 🔸 기본 폴더 변경: product-images → product + ) -> Dict: + """단일 상품의 모든 데이터(이미지 + JSON)를 S3에 업로드""" + + # 🔸 전체 크롤링 데이터에서 필요한 정보 추출 + product_detail = product_info.get("product_detail", {}) + product_title = product_detail.get("title", "Unknown") + product_images = product_detail.get("product_images", []) + + uploaded_images = [] + + logger.info( + f"상품 {product_index} 업로드 시작: {len(product_images)}개 이미지, keyword='{keyword}'" + ) + + # 키워드 기반 폴더명 한 번만 생성 + folder_name = self.generate_product_folder_name(product_index, keyword) + + fail_count = 0 + folder_s3_url = f"{self.base_url}/{base_folder}/{folder_name}" + + # 🆕 1. 먼저 상품 데이터 JSON 파일 업로드 + try: + # 전체 크롤링 데이터를 JSON으로 저장 (S3 업로드 메타데이터 추가) + product_data_with_meta = { + **product_info, # 전체 크롤링 데이터 (index, url, product_detail, status, crawled_at 포함) + "s3_upload_keyword": keyword, # 추가 메타데이터 + "s3_uploaded_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + } + + json_s3_key = self.generate_s3_key( + base_folder, folder_name, "product_data.json" + ) + + if self.upload_json_to_s3(product_data_with_meta, json_s3_key): + logger.success(f"상품 {product_index} JSON 데이터 업로드 완료") + else: + logger.error(f"상품 {product_index} JSON 데이터 업로드 실패") + + except Exception as e: + logger.error(f"상품 {product_index} JSON 업로드 오류: {e}") + + # 2. 
이미지 업로드 (기존 로직) + if not product_images: + logger.warning(f"상품 {product_index}: 업로드할 이미지가 없음") + return { + "product_index": product_index, + "product_title": product_title, + "status": "no_images", + "folder_s3_url": folder_s3_url, + "uploaded_images": uploaded_images, + "success_count": 0, + "fail_count": 0, + } + + # 각 이미지 업로드 + for img_idx, img_info in enumerate(product_images, 1): + original_url = img_info.get("original_url", "") + + if not original_url: + logger.warning(f"상품 {product_index}, 이미지 {img_idx}: URL이 없음") + fail_count += 1 + continue + + try: + # 이미지 다운로드 + image_data = await self.download_image(session, original_url) + + if not image_data: + fail_count += 1 + continue + + # S3 키 생성 (키워드 기반 폴더명 사용) + file_extension = self.get_file_extension(original_url) + image_file_name = f"image_{img_idx:03d}{file_extension}" + s3_key = self.generate_s3_key(base_folder, folder_name, image_file_name) + + # S3 업로드 + content_type = self.get_content_type(file_extension) + + if self.upload_to_s3(image_data, s3_key, content_type): + s3_url = self.get_s3_url(s3_key) + uploaded_images.append( + { + "index": img_idx, + "original_url": original_url, + "s3_url": s3_url, + } + ) + + logger.debug(f"상품 {product_index}, 이미지 {img_idx} 업로드 완료") + else: + fail_count += 1 + + except Exception as e: + logger.error(f"상품 {product_index}, 이미지 {img_idx} 처리 오류: {e}") + fail_count += 1 + + # 이미지 간 간격 (서버 부하 방지) + await asyncio.sleep(0.5) + + logger.success( + f"상품 {product_index} 업로드 완료: 성공 {len(uploaded_images)}개, 실패 {fail_count}개, folder='{folder_name}'" + ) + + return { + "product_index": product_index, + "product_title": product_title, + "status": "completed", + "folder_s3_url": folder_s3_url, # 🔸 폴더 전체를 가리킴 (이미지 + JSON 포함) + "json_s3_url": f"{folder_s3_url}/product_data.json", # 🆕 JSON 파일 직접 링크 + "uploaded_images": uploaded_images, + "success_count": len(uploaded_images), + "fail_count": fail_count, + } diff --git a/apps/pre-processing-service/poetry.lock b/apps/pre-processing-service/poetry.lock index ca5c20ab..f02855bc 100644 --- a/apps/pre-processing-service/poetry.lock +++ b/apps/pre-processing-service/poetry.lock @@ -321,6 +321,46 @@ d = ["aiohttp (>=3.10)"] jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] uvloop = ["uvloop (>=0.15.2)"] +[[package]] +name = "boto3" +version = "1.40.35" +description = "The AWS SDK for Python" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "boto3-1.40.35-py3-none-any.whl", hash = "sha256:f4c1b01dd61e7733b453bca38b004ce030e26ee36e7a3d4a9e45a730b67bc38d"}, + {file = "boto3-1.40.35.tar.gz", hash = "sha256:d718df3591c829bcca4c498abb7b09d64d1eecc4e5a2b6cef14b476501211b8a"}, +] + +[package.dependencies] +botocore = ">=1.40.35,<1.41.0" +jmespath = ">=0.7.1,<2.0.0" +s3transfer = ">=0.14.0,<0.15.0" + +[package.extras] +crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] + +[[package]] +name = "botocore" +version = "1.40.35" +description = "Low-level, data-driven core of boto 3." 
+optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "botocore-1.40.35-py3-none-any.whl", hash = "sha256:c545de2cbbce161f54ca589fbb677bae14cdbfac7d5f1a27f6a620cb057c26f4"}, + {file = "botocore-1.40.35.tar.gz", hash = "sha256:67e062752ff579c8cc25f30f9c3a84c72d692516a41a9ee1cf17735767ca78be"}, +] + +[package.dependencies] +jmespath = ">=0.7.1,<2.0.0" +python-dateutil = ">=2.1,<3.0.0" +urllib3 = {version = ">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version >= \"3.10\""} + +[package.extras] +crt = ["awscrt (==0.27.6)"] + [[package]] name = "bs4" version = "0.0.2" @@ -1320,6 +1360,18 @@ files = [ {file = "jiter-0.11.0.tar.gz", hash = "sha256:1d9637eaf8c1d6a63d6562f2a6e5ab3af946c66037eb1b894e8fad75422266e4"}, ] +[[package]] +name = "jmespath" +version = "1.0.1" +description = "JSON Matching Expressions" +optional = false +python-versions = ">=3.7" +groups = ["main"] +files = [ + {file = "jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980"}, + {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, +] + [[package]] name = "joblib" version = "1.5.2" @@ -1693,14 +1745,14 @@ sympy = "*" [[package]] name = "openai" -version = "1.108.0" +version = "1.108.1" description = "The official Python library for the openai API" optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "openai-1.108.0-py3-none-any.whl", hash = "sha256:31f2e58230e2703f13ddbb50c285f39dacf7fca64ab19882fd8a7a0b2bccd781"}, - {file = "openai-1.108.0.tar.gz", hash = "sha256:e859c64e4202d7f5956f19280eee92bb281f211c41cdd5be9e63bf51a024ff72"}, + {file = "openai-1.108.1-py3-none-any.whl", hash = "sha256:952fc027e300b2ac23be92b064eac136a2bc58274cec16f5d2906c361340d59b"}, + {file = "openai-1.108.1.tar.gz", hash = "sha256:6648468c1aec4eacfa554001e933a9fa075f57bacfc27588c2e34456cee9fef9"}, ] [package.dependencies] @@ -1793,14 +1845,14 @@ testing = ["coverage", "pytest", "pytest-benchmark"] [[package]] name = "poetry-core" -version = "2.2.0" +version = "2.2.1" description = "Poetry PEP 517 Build Backend" optional = false python-versions = "<4.0,>=3.9" groups = ["main"] files = [ - {file = "poetry_core-2.2.0-py3-none-any.whl", hash = "sha256:0edea81d07e88cbd407369eef753c722da8ff1338f554788dc04636e756318fc"}, - {file = "poetry_core-2.2.0.tar.gz", hash = "sha256:b4033b71b99717a942030e074fec7e3082e5fde7a8ed10f02cd2413bdf940b1f"}, + {file = "poetry_core-2.2.1-py3-none-any.whl", hash = "sha256:bdfce710edc10bfcf9ab35041605c480829be4ab23f5bc01202cfe5db8f125ab"}, + {file = "poetry_core-2.2.1.tar.gz", hash = "sha256:97e50d8593c8729d3f49364b428583e044087ee3def1e010c6496db76bd65ac5"}, ] [[package]] @@ -2288,14 +2340,14 @@ rsa = ["cryptography"] [[package]] name = "pyparsing" -version = "3.2.4" +version = "3.2.5" description = "pyparsing - Classes and methods to define and execute parsing grammars" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "pyparsing-3.2.4-py3-none-any.whl", hash = "sha256:91d0fcde680d42cd031daf3a6ba20da3107e08a75de50da58360e7d94ab24d36"}, - {file = "pyparsing-3.2.4.tar.gz", hash = "sha256:fff89494f45559d0f2ce46613b419f632bbb6afbdaed49696d322bcf98a58e99"}, + {file = "pyparsing-3.2.5-py3-none-any.whl", hash = "sha256:e38a4f02064cf41fe6593d328d0512495ad1f3d8a91c4f73fc401b3079a59a5e"}, + {file = "pyparsing-3.2.5.tar.gz", hash = "sha256:2df8d5b7b2802ef88e8d016a2eb9c7aeaa923529cd251ed0fe4608275d4105b6"}, ] [package.extras] 
@@ -2364,6 +2416,21 @@ pygments = ">=2.7.2" [package.extras] dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "requests", "setuptools", "xmlschema"] +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +description = "Extensions to the standard Python datetime module" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +groups = ["main"] +files = [ + {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, + {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, +] + +[package.dependencies] +six = ">=1.5" + [[package]] name = "python-dotenv" version = "1.1.1" @@ -2638,6 +2705,24 @@ files = [ [package.dependencies] pyasn1 = ">=0.1.3" +[[package]] +name = "s3transfer" +version = "0.14.0" +description = "An Amazon S3 Transfer Manager" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "s3transfer-0.14.0-py3-none-any.whl", hash = "sha256:ea3b790c7077558ed1f02a3072fb3cb992bbbd253392f4b6e9e8976941c7d456"}, + {file = "s3transfer-0.14.0.tar.gz", hash = "sha256:eff12264e7c8b4985074ccce27a3b38a485bb7f7422cc8046fee9be4983e4125"}, +] + +[package.dependencies] +botocore = ">=1.37.4,<2.0a.0" + +[package.extras] +crt = ["botocore[crt] (>=1.37.4,<2.0a.0)"] + [[package]] name = "safetensors" version = "0.6.2" @@ -2832,6 +2917,18 @@ typing_extensions = ">=4.14.0,<4.15.0" urllib3 = {version = ">=2.5.0,<3.0", extras = ["socks"]} websocket-client = ">=1.8.0,<1.9.0" +[[package]] +name = "six" +version = "1.17.0" +description = "Python 2 and 3 compatibility utilities" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +groups = ["main"] +files = [ + {file = "six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274"}, + {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"}, +] + [[package]] name = "sniffio" version = "1.3.1" @@ -3015,31 +3112,31 @@ files = [ [[package]] name = "tokenizers" -version = "0.22.0" +version = "0.22.1" description = "" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "tokenizers-0.22.0-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:eaa9620122a3fb99b943f864af95ed14c8dfc0f47afa3b404ac8c16b3f2bb484"}, - {file = "tokenizers-0.22.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:71784b9ab5bf0ff3075bceeb198149d2c5e068549c0d18fe32d06ba0deb63f79"}, - {file = "tokenizers-0.22.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec5b71f668a8076802b0241a42387d48289f25435b86b769ae1837cad4172a17"}, - {file = "tokenizers-0.22.0-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ea8562fa7498850d02a16178105b58803ea825b50dc9094d60549a7ed63654bb"}, - {file = "tokenizers-0.22.0-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4136e1558a9ef2e2f1de1555dcd573e1cbc4a320c1a06c4107a3d46dc8ac6e4b"}, - {file = "tokenizers-0.22.0-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cdf5954de3962a5fd9781dc12048d24a1a6f1f5df038c6e95db328cd22964206"}, - {file = "tokenizers-0.22.0-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8337ca75d0731fc4860e6204cc24bb36a67d9736142aa06ed320943b50b1e7ed"}, - {file = "tokenizers-0.22.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:a89264e26f63c449d8cded9061adea7b5de53ba2346fc7e87311f7e4117c1cc8"}, - {file = "tokenizers-0.22.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:790bad50a1b59d4c21592f9c3cf5e5cf9c3c7ce7e1a23a739f13e01fb1be377a"}, - {file = "tokenizers-0.22.0-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:76cf6757c73a10ef10bf06fa937c0ec7393d90432f543f49adc8cab3fb6f26cb"}, - {file = "tokenizers-0.22.0-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:1626cb186e143720c62c6c6b5371e62bbc10af60481388c0da89bc903f37ea0c"}, - {file = "tokenizers-0.22.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:da589a61cbfea18ae267723d6b029b84598dc8ca78db9951d8f5beff72d8507c"}, - {file = "tokenizers-0.22.0-cp39-abi3-win32.whl", hash = "sha256:dbf9d6851bddae3e046fedfb166f47743c1c7bd11c640f0691dd35ef0bcad3be"}, - {file = "tokenizers-0.22.0-cp39-abi3-win_amd64.whl", hash = "sha256:c78174859eeaee96021f248a56c801e36bfb6bd5b067f2e95aa82445ca324f00"}, - {file = "tokenizers-0.22.0.tar.gz", hash = "sha256:2e33b98525be8453f355927f3cab312c36cd3e44f4d7e9e97da2fa94d0a49dcb"}, + {file = "tokenizers-0.22.1-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:59fdb013df17455e5f950b4b834a7b3ee2e0271e6378ccb33aa74d178b513c73"}, + {file = "tokenizers-0.22.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:8d4e484f7b0827021ac5f9f71d4794aaef62b979ab7608593da22b1d2e3c4edc"}, + {file = "tokenizers-0.22.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:19d2962dd28bc67c1f205ab180578a78eef89ac60ca7ef7cbe9635a46a56422a"}, + {file = "tokenizers-0.22.1-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:38201f15cdb1f8a6843e6563e6e79f4abd053394992b9bbdf5213ea3469b4ae7"}, + {file = "tokenizers-0.22.1-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d1cbe5454c9a15df1b3443c726063d930c16f047a3cc724b9e6e1a91140e5a21"}, + {file = "tokenizers-0.22.1-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e7d094ae6312d69cc2a872b54b91b309f4f6fbce871ef28eb27b52a98e4d0214"}, + {file = "tokenizers-0.22.1-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:afd7594a56656ace95cdd6df4cca2e4059d294c5cfb1679c57824b605556cb2f"}, + {file = "tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e2ef6063d7a84994129732b47e7915e8710f27f99f3a3260b8a38fc7ccd083f4"}, + {file = "tokenizers-0.22.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ba0a64f450b9ef412c98f6bcd2a50c6df6e2443b560024a09fa6a03189726879"}, + {file = "tokenizers-0.22.1-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:331d6d149fa9c7d632cde4490fb8bbb12337fa3a0232e77892be656464f4b446"}, + {file = "tokenizers-0.22.1-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:607989f2ea68a46cb1dfbaf3e3aabdf3f21d8748312dbeb6263d1b3b66c5010a"}, + {file = "tokenizers-0.22.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a0f307d490295717726598ef6fa4f24af9d484809223bbc253b201c740a06390"}, + {file = "tokenizers-0.22.1-cp39-abi3-win32.whl", hash = "sha256:b5120eed1442765cd90b903bb6cfef781fd8fe64e34ccaecbae4c619b7b12a82"}, + {file = "tokenizers-0.22.1-cp39-abi3-win_amd64.whl", hash = "sha256:65fd6e3fb11ca1e78a6a93602490f134d1fdeb13bcef99389d5102ea318ed138"}, + {file = "tokenizers-0.22.1.tar.gz", hash = "sha256:61de6522785310a309b3407bac22d99c4db5dba349935e99e4d15ea2226af2d9"}, ] [package.dependencies] -huggingface-hub = ">=0.16.4,<1.0" +huggingface-hub = ">=0.16.4,<2.0" [package.extras] dev = ["tokenizers[testing]"] @@ -3070,14 +3167,14 @@ telegram = 
["requests"] [[package]] name = "transformers" -version = "4.56.1" +version = "4.56.2" description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow" optional = false python-versions = ">=3.9.0" groups = ["main"] files = [ - {file = "transformers-4.56.1-py3-none-any.whl", hash = "sha256:1697af6addfb6ddbce9618b763f4b52d5a756f6da4899ffd1b4febf58b779248"}, - {file = "transformers-4.56.1.tar.gz", hash = "sha256:0d88b1089a563996fc5f2c34502f10516cad3ea1aa89f179f522b54c8311fe74"}, + {file = "transformers-4.56.2-py3-none-any.whl", hash = "sha256:79c03d0e85b26cb573c109ff9eafa96f3c8d4febfd8a0774e8bba32702dd6dde"}, + {file = "transformers-4.56.2.tar.gz", hash = "sha256:5e7c623e2d7494105c726dd10f6f90c2c99a55ebe86eef7233765abd0cb1c529"}, ] [package.dependencies] @@ -3094,23 +3191,23 @@ tqdm = ">=4.27" [package.extras] accelerate = ["accelerate (>=0.26.0)"] -all = ["Pillow (>=10.0.1,<=15.0)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "accelerate (>=0.26.0)", "av", "codecarbon (>=2.8.1)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "jinja2 (>=3.1.0)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "kernels (>=0.6.1,<=0.9)", "librosa", "mistral-common[opencv] (>=1.6.3)", "num2words", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision"] +all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av", "codecarbon (>=2.8.1)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "jinja2 (>=3.1.0)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "kernels (>=0.6.1,<=0.9)", "librosa", "mistral-common[opencv] (>=1.6.3)", "num2words", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision"] audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] benchmark = ["optimum-benchmark (>=0.3.0)"] chat-template = ["jinja2 (>=3.1.0)"] codecarbon = ["codecarbon (>=2.8.1)"] deepspeed = ["accelerate (>=0.26.0)", "deepspeed (>=0.9.3)"] -deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "libcst", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "optuna", "parameterized (>=0.9)", "protobuf", "psutil", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] -dev = ["GitPython (<3.1.19)", "GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "accelerate (>=0.26.0)", "av", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "cookiecutter (==1.7.3)", 
"datasets (>=2.15.0)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "jinja2 (>=3.1.0)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "kernels (>=0.6.1,<=0.9)", "libcst", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "num2words", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict_core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic_lite (>=1.0.7)", "urllib3 (<2.0.0)"] -dev-tensorflow = ["GitPython (<3.1.19)", "GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "onnxconverter-common", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "tf2onnx", "timeout-decorator", "tokenizers (>=0.22.0,<=0.23.0)", "urllib3 (<2.0.0)"] -dev-torch = ["GitPython (<3.1.19)", "GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "kenlm", "kernels (>=0.6.1,<=0.9)", "libcst", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "num2words", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score 
(!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict_core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic_lite (>=1.0.7)", "urllib3 (<2.0.0)"] +deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "libcst", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "optuna", "parameterized (>=0.9)", "protobuf", "psutil", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] +dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "jinja2 (>=3.1.0)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "kernels (>=0.6.1,<=0.9)", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "num2words", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] +dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.22.0,<=0.23.0)", "urllib3 
(<2.0.0)"] +dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "kenlm", "kernels (>=0.6.1,<=0.9)", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "num2words", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)", "scipy (<1.13.0)"] flax-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] ftfy = ["ftfy"] -hf-xet = ["hf_xet"] +hf-xet = ["hf-xet"] hub-kernels = ["kernels (>=0.6.1,<=0.9)"] integrations = ["kernels (>=0.6.1,<=0.9)", "optuna", "ray[tune] (>=2.7.0)", "sigopt"] -ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0,<1.3.1)", "sudachidict_core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic_lite (>=1.0.7)"] +ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0,<1.3.1)", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)"] mistral-common = ["mistral-common[opencv] (>=1.6.3)"] modelcreation = ["cookiecutter (==1.7.3)"] natten = ["natten (>=0.14.6,<0.15.0)"] @@ -3129,7 +3226,7 @@ serving = ["accelerate (>=0.26.0)", "fastapi", "openai (>=1.98.0)", "pydantic (> sigopt = ["sigopt"] sklearn = ["scikit-learn"] speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] -testing = ["GitPython (<3.1.19)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "libcst", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "parameterized (>=0.9)", "psutil", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] +testing = ["GitPython (<3.1.19)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "libcst", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "parameterized (>=0.9)", "psutil", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", 
"timeout-decorator"] tf = ["keras-nlp (>=0.3.1,<0.14.0)", "onnxconverter-common", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx"] tf-cpu = ["keras (>2.9,<2.16)", "keras-nlp (>=0.3.1,<0.14.0)", "onnxconverter-common", "tensorflow-cpu (>2.9,<2.16)", "tensorflow-probability (<0.24)", "tensorflow-text (<2.16)", "tf2onnx"] tf-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] @@ -3139,7 +3236,7 @@ tokenizers = ["tokenizers (>=0.22.0,<=0.23.0)"] torch = ["accelerate (>=0.26.0)", "torch (>=2.2)"] torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"] -torchhub = ["filelock", "huggingface-hub (>=0.34.0,<1.0)", "importlib_metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "tqdm (>=4.27)"] +torchhub = ["filelock", "huggingface-hub (>=0.34.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "tqdm (>=4.27)"] video = ["av"] vision = ["Pillow (>=10.0.1,<=15.0)"] @@ -3429,4 +3526,4 @@ propcache = ">=0.2.1" [metadata] lock-version = "2.1" python-versions = ">=3.11,<3.14" -content-hash = "42274fd00aedabf70dc419acd06e2f25b5c69b58b7bf76eef2ea7a9df6470b2c" +content-hash = "fe9799a3d3a101e05d75d5e193c6e9e4ef17a7581cb273f41101e12129f80a2f" diff --git a/apps/pre-processing-service/pyproject.toml b/apps/pre-processing-service/pyproject.toml index 84a957b9..8cb11c0f 100644 --- a/apps/pre-processing-service/pyproject.toml +++ b/apps/pre-processing-service/pyproject.toml @@ -38,6 +38,7 @@ openai = "^1.107.3" aiohttp = "^3.12.15" prometheus-client = "^0.23.1" prometheus-fastapi-instrumentator = "^7.1.0" +boto3 = "^1.40.35" [build-system] requires = ["poetry-core>=2.0.0,<3.0.0"] diff --git a/apps/user-service/build.gradle b/apps/user-service/build.gradle index 23c23124..4c5cb671 100644 --- a/apps/user-service/build.gradle +++ b/apps/user-service/build.gradle @@ -63,6 +63,7 @@ dependencies { implementation "io.micrometer:micrometer-tracing" implementation 'io.micrometer:micrometer-registry-prometheus' implementation "org.springframework.boot:spring-boot-starter-actuator" + implementation "io.micrometer:context-propagation" // Lombok compileOnly 'org.projectlombok:lombok:1.18.30' diff --git a/apps/user-service/src/main/java/site/icebang/domain/WorkflowLogInsertExampleController.java b/apps/user-service/src/main/java/site/icebang/domain/WorkflowLogInsertExampleController.java deleted file mode 100644 index c3e225b7..00000000 --- a/apps/user-service/src/main/java/site/icebang/domain/WorkflowLogInsertExampleController.java +++ /dev/null @@ -1,34 +0,0 @@ -package site.icebang.domain; - -import org.slf4j.Logger; -import org.slf4j.LoggerFactory; -import org.slf4j.MDC; -import org.springframework.web.bind.annotation.GetMapping; -import org.springframework.web.bind.annotation.RequestMapping; -import org.springframework.web.bind.annotation.RestController; - -import lombok.extern.slf4j.Slf4j; - -import site.icebang.common.dto.ApiResponse; - -@RestController -@RequestMapping("/v0/check-execution-log-insert") -@Slf4j -public class WorkflowLogInsertExampleController { - private static final Logger workflowLogger = LoggerFactory.getLogger("WORKFLOW_HISTORY"); - - @GetMapping("") - public ApiResponse test() { - 
log.info("@@"); - // MDC.put("traceId", UUID.randomUUID().toString()); - MDC.put("sourceId", "o1"); - MDC.put("executionType", "WORKFLOW"); - // MDC.put("sourceId", "test-controller"); - - // 이 로그는 DB에 저장됨 - workflowLogger.info("SLF4J로 찍은 워크플로우 로그"); - - MDC.clear(); - return ApiResponse.success("hi"); - } -} diff --git a/apps/user-service/src/main/java/site/icebang/domain/workflow/controller/WorkflowController.java b/apps/user-service/src/main/java/site/icebang/domain/workflow/controller/WorkflowController.java index 348058ee..9cd5933b 100644 --- a/apps/user-service/src/main/java/site/icebang/domain/workflow/controller/WorkflowController.java +++ b/apps/user-service/src/main/java/site/icebang/domain/workflow/controller/WorkflowController.java @@ -1,7 +1,5 @@ package site.icebang.domain.workflow.controller; -import java.util.concurrent.CompletableFuture; - import org.springframework.http.ResponseEntity; import org.springframework.web.bind.annotation.*; @@ -31,7 +29,7 @@ public ApiResponse> getWorkflowList( @PostMapping("/{workflowId}/run") public ResponseEntity runWorkflow(@PathVariable Long workflowId) { // HTTP 요청/응답 스레드를 블로킹하지 않도록 비동기 실행 - CompletableFuture.runAsync(() -> workflowExecutionService.executeWorkflow(workflowId)); + workflowExecutionService.executeWorkflow(workflowId); return ResponseEntity.accepted().build(); } } diff --git a/apps/user-service/src/main/java/site/icebang/domain/workflow/manager/ExecutionMdcManager.java b/apps/user-service/src/main/java/site/icebang/domain/workflow/manager/ExecutionMdcManager.java new file mode 100644 index 00000000..e61faa75 --- /dev/null +++ b/apps/user-service/src/main/java/site/icebang/domain/workflow/manager/ExecutionMdcManager.java @@ -0,0 +1,30 @@ +package site.icebang.domain.workflow.manager; + +import org.slf4j.MDC; +import org.springframework.stereotype.Component; + +@Component +public class ExecutionMdcManager { + private static final String SOURCE_ID = "sourceId"; + private static final String EXECUTION_TYPE = "executionType"; + + public void setWorkflowContext(Long workflowId) { + MDC.put(SOURCE_ID, workflowId.toString()); + MDC.put(EXECUTION_TYPE, "WORKFLOW"); + } + + public void setJobContext(Long jobRunId) { + MDC.put(SOURCE_ID, jobRunId.toString()); + MDC.put(EXECUTION_TYPE, "JOB"); + } + + public void setTaskContext(Long taskRunId) { + MDC.put(SOURCE_ID, taskRunId.toString()); + MDC.put(EXECUTION_TYPE, "TASK"); + } + + public void clearExecutionContext() { + MDC.remove(SOURCE_ID); + MDC.remove(EXECUTION_TYPE); + } +} diff --git a/apps/user-service/src/main/java/site/icebang/domain/workflow/service/WorkflowExecutionService.java b/apps/user-service/src/main/java/site/icebang/domain/workflow/service/WorkflowExecutionService.java index 60da5863..c6be9ac9 100644 --- a/apps/user-service/src/main/java/site/icebang/domain/workflow/service/WorkflowExecutionService.java +++ b/apps/user-service/src/main/java/site/icebang/domain/workflow/service/WorkflowExecutionService.java @@ -5,6 +5,9 @@ import java.util.Map; import java.util.stream.Collectors; +import org.slf4j.Logger; +import org.slf4j.LoggerFactory; +import org.springframework.scheduling.annotation.Async; import org.springframework.stereotype.Service; import org.springframework.transaction.annotation.Transactional; @@ -23,6 +26,7 @@ import site.icebang.domain.execution.model.TaskRun; import site.icebang.domain.execution.model.WorkflowRun; import site.icebang.domain.workflow.dto.TaskDto; +import site.icebang.domain.workflow.manager.ExecutionMdcManager; import 
site.icebang.domain.workflow.mapper.JobMapper; import site.icebang.domain.workflow.model.Job; import site.icebang.domain.workflow.model.Task; @@ -33,7 +37,7 @@ @Service @RequiredArgsConstructor public class WorkflowExecutionService { - + private static final Logger workflowLogger = LoggerFactory.getLogger("WORKFLOW_HISTORY"); private final JobMapper jobMapper; private final WorkflowRunMapper workflowRunMapper; private final JobRunMapper jobRunMapper; @@ -41,55 +45,73 @@ public class WorkflowExecutionService { private final Map taskRunners; private final ObjectMapper objectMapper; private final List bodyBuilders; + private final ExecutionMdcManager mdcManager; @Transactional + @Async("traceExecutor") public void executeWorkflow(Long workflowId) { - log.info("========== 워크플로우 실행 시작: WorkflowId={} ==========", workflowId); - WorkflowRun workflowRun = WorkflowRun.start(workflowId); - workflowRunMapper.insert(workflowRun); - - Map workflowContext = new HashMap<>(); - List jobs = jobMapper.findJobsByWorkflowId(workflowId); - log.info("총 {}개의 Job을 순차적으로 실행합니다.", jobs.size()); - - for (Job job : jobs) { - JobRun jobRun = JobRun.start(workflowRun.getId(), job.getId()); - jobRunMapper.insert(jobRun); - log.info( - "---------- Job 실행 시작: JobId={}, JobRunId={} ----------", job.getId(), jobRun.getId()); - - boolean jobSucceeded = executeTasksForJob(jobRun, workflowContext); - - jobRun.finish(jobSucceeded ? "SUCCESS" : "FAILED"); - jobRunMapper.update(jobRun); - - if (!jobSucceeded) { - workflowRun.finish("FAILED"); - workflowRunMapper.update(workflowRun); - log.error("Job 실패로 인해 워크플로우 실행을 중단합니다: WorkflowRunId={}", workflowRun.getId()); - return; + mdcManager.setWorkflowContext(workflowId); + + try { + workflowLogger.info("========== 워크플로우 실행 시작: WorkflowId={} ==========", workflowId); + + WorkflowRun workflowRun = WorkflowRun.start(workflowId); + workflowRunMapper.insert(workflowRun); + + Map workflowContext = new HashMap<>(); + List jobs = jobMapper.findJobsByWorkflowId(workflowId); + workflowLogger.info("총 {}개의 Job을 순차적으로 실행합니다.", jobs.size()); + + for (Job job : jobs) { + JobRun jobRun = JobRun.start(workflowRun.getId(), job.getId()); + jobRunMapper.insert(jobRun); + + // Job 컨텍스트로 전환 + mdcManager.setJobContext(jobRun.getId()); + workflowLogger.info( + "---------- Job 실행 시작: JobId={}, JobRunId={} ----------", job.getId(), jobRun.getId()); + + boolean jobSucceeded = executeTasksForJob(jobRun, workflowContext); + + jobRun.finish(jobSucceeded ? "SUCCESS" : "FAILED"); + jobRunMapper.update(jobRun); + + if (!jobSucceeded) { + workflowRun.finish("FAILED"); + workflowRunMapper.update(workflowRun); + workflowLogger.error("Job 실패로 인해 워크플로우 실행을 중단합니다: WorkflowRunId={}", workflowRun.getId()); + return; + } + + workflowLogger.info("---------- Job 실행 성공: JobRunId={} ----------", jobRun.getId()); + + // 다시 워크플로우 컨텍스트로 복원 + mdcManager.setWorkflowContext(workflowId); } - log.info("---------- Job 실행 성공: JobRunId={} ----------", jobRun.getId()); - } - workflowRun.finish("SUCCESS"); - workflowRunMapper.update(workflowRun); - log.info("========== 워크플로우 실행 성공: WorkflowRunId={} ==========", workflowRun.getId()); + workflowRun.finish("SUCCESS"); + workflowRunMapper.update(workflowRun); + workflowLogger.info( + "========== 워크플로우 실행 성공: WorkflowRunId={} ==========", workflowRun.getId()); + + } finally { + mdcManager.clearExecutionContext(); + } } private boolean executeTasksForJob(JobRun jobRun, Map workflowContext) { - // 📌 Mapper로부터 TaskDto 리스트를 조회합니다. 
List taskDtos = jobMapper.findTasksByJobId(jobRun.getJobId()); - - // 📌 convertToTask 메소드를 사용하여 Task 모델 리스트로 변환합니다. List tasks = taskDtos.stream().map(this::convertToTask).collect(Collectors.toList()); - log.info("Job (JobRunId={}) 내 총 {}개의 Task를 실행합니다.", jobRun.getId(), tasks.size()); + workflowLogger.info("Job (JobRunId={}) 내 총 {}개의 Task를 실행합니다.", jobRun.getId(), tasks.size()); for (Task task : tasks) { TaskRun taskRun = TaskRun.start(jobRun.getId(), task.getId()); taskRunMapper.insert(taskRun); - log.info("Task 실행 시작: TaskId={}, TaskRunId={}", task.getId(), taskRun.getId()); + + // Task 컨텍스트로 전환 + mdcManager.setTaskContext(taskRun.getId()); + workflowLogger.info("Task 실행 시작: TaskId={}, TaskRunId={}", task.getId(), taskRun.getId()); String runnerBeanName = task.getType().toLowerCase() + "TaskRunner"; TaskRunner runner = taskRunners.get(runnerBeanName); @@ -97,7 +119,8 @@ private boolean executeTasksForJob(JobRun jobRun, Map workflow if (runner == null) { taskRun.finish("FAILED", "지원하지 않는 Task 타입: " + task.getType()); taskRunMapper.update(taskRun); - log.error("Task 실행 실패 (미지원 타입): TaskRunId={}, Type={}", taskRun.getId(), task.getType()); + workflowLogger.error("Task 실행 실패 (미지원 타입): Type={}", task.getType()); + mdcManager.setJobContext(jobRun.getId()); // Job 컨텍스트로 복원 return false; } @@ -113,21 +136,26 @@ private boolean executeTasksForJob(JobRun jobRun, Map workflow taskRunMapper.update(taskRun); if (result.isFailure()) { - log.error("Task 실행 실패: TaskRunId={}, Message={}", taskRun.getId(), result.message()); + workflowLogger.error("Task 실행 실패: Message={}", result.message()); + mdcManager.setJobContext(jobRun.getId()); // Job 컨텍스트로 복원 return false; } try { JsonNode resultJson = objectMapper.readTree(result.message()); workflowContext.put(task.getName(), resultJson); - // TODO: task_io_data 테이블에 requestBody(INPUT)와 resultJson(OUTPUT) 저장 } catch (JsonProcessingException e) { - log.error("Task 결과 JSON 파싱 실패: TaskRunId={}", taskRun.getId(), e); + workflowLogger.error("Task 결과 JSON 파싱 실패"); taskRun.finish("FAILED", "결과 JSON 파싱 실패"); taskRunMapper.update(taskRun); + mdcManager.setJobContext(jobRun.getId()); // Job 컨텍스트로 복원 return false; } - log.info("Task 실행 성공: TaskRunId={}", taskRun.getId()); + + workflowLogger.info("Task 실행 성공: TaskRunId={}", taskRun.getId()); + + // 다시 Job 컨텍스트로 복원 + mdcManager.setJobContext(jobRun.getId()); } return true; } diff --git a/apps/user-service/src/main/java/site/icebang/global/config/asnyc/AsyncConfig.java b/apps/user-service/src/main/java/site/icebang/global/config/asnyc/AsyncConfig.java new file mode 100644 index 00000000..8d664028 --- /dev/null +++ b/apps/user-service/src/main/java/site/icebang/global/config/asnyc/AsyncConfig.java @@ -0,0 +1,24 @@ +package site.icebang.global.config.asnyc; + +import org.springframework.context.annotation.Bean; +import org.springframework.context.annotation.Configuration; +import org.springframework.core.task.support.ContextPropagatingTaskDecorator; +import org.springframework.scheduling.annotation.EnableAsync; +import org.springframework.scheduling.concurrent.ThreadPoolTaskExecutor; + +@Configuration +@EnableAsync +public class AsyncConfig { + + @Bean("traceExecutor") + public ThreadPoolTaskExecutor traceExecutor() { + ThreadPoolTaskExecutor executor = new ThreadPoolTaskExecutor(); + executor.setCorePoolSize(10); + executor.setMaxPoolSize(50); + executor.setQueueCapacity(100); + executor.setTaskDecorator(new ContextPropagatingTaskDecorator()); // 필수 + executor.setThreadNamePrefix("trace-"); + executor.initialize(); + return 
executor; + } +}
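
The new traceExecutor pairs @Async("traceExecutor") with Spring's ContextPropagatingTaskDecorator, which is what the added io.micrometer:context-propagation dependency enables: when runWorkflow submits executeWorkflow, the decorator snapshots every ThreadLocal that has a registered io.micrometer.context.ThreadLocalAccessor (for example the Micrometer tracing/observation context behind the request's traceId) and restores it around the task on the trace- worker thread. The sourceId/executionType MDC keys themselves are set inside the async method by ExecutionMdcManager, so they do not depend on that propagation. As a rough illustrative sketch, not part of this change set, a hand-rolled decorator that copies only the SLF4J MDC across the boundary could look like the following; it is shown for comparison only, since arbitrary MDC keys are propagated by ContextPropagatingTaskDecorator only when a matching accessor is registered:

import java.util.Map;

import org.slf4j.MDC;
import org.springframework.core.task.TaskDecorator;

// Illustrative sketch only (not in this PR): copies the submitting thread's MDC onto the
// worker thread and restores the worker's previous MDC afterwards.
// ContextPropagatingTaskDecorator performs the same capture/restore, but for every
// ThreadLocal that has a registered io.micrometer.context.ThreadLocalAccessor,
// not just the SLF4J MDC.
public class MdcCopyingTaskDecorator implements TaskDecorator {

  @Override
  public Runnable decorate(Runnable runnable) {
    // Captured on the caller (e.g. HTTP request) thread at submit time
    Map<String, String> captured = MDC.getCopyOfContextMap();
    return () -> {
      Map<String, String> previous = MDC.getCopyOfContextMap();
      if (captured != null) {
        MDC.setContextMap(captured);
      } else {
        MDC.clear();
      }
      try {
        runnable.run();
      } finally {
        // Restore whatever the pooled worker thread had before this task
        if (previous != null) {
          MDC.setContextMap(previous);
        } else {
          MDC.clear();
        }
      }
    };
  }
}

Swapping this decorator into the traceExecutor bean would carry only MDC values; the Micrometer-backed ContextPropagatingTaskDecorator used in the change is the more general choice when tracing context should also follow the workflow run onto the async thread.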