diff --git a/apps/pre-processing-service/app/api/endpoints/product.py b/apps/pre-processing-service/app/api/endpoints/product.py index 32a4dcbe..2812ef79 100644 --- a/apps/pre-processing-service/app/api/endpoints/product.py +++ b/apps/pre-processing-service/app/api/endpoints/product.py @@ -6,6 +6,7 @@ CustomException, ) from ...service.crawl_service import CrawlService +from ...service.s3_upload_service import S3UploadService from ...service.search_service import SearchService from ...service.match_service import MatchService from ...service.similarity_service import SimilarityService @@ -60,11 +61,11 @@ async def match(request: RequestSadaguMatch): ) async def similarity(request: RequestSadaguSimilarity): """ - 매칭된 상품들 중 키워드와의 유사도를 계산하여 최적의 상품을 선택합니다. + 매칭된 상품들 중 키워드와의 유사도를 계산하여 상위 10개 상품을 선택합니다. """ try: similarity_service = SimilarityService() - response_data = similarity_service.select_product_by_similarity(request) + response_data = similarity_service.select_top_products_by_similarity(request) if not response_data: raise CustomException( @@ -99,3 +100,24 @@ async def crawl(body: RequestSadaguCrawl): raise HTTPException(status_code=e.status_code, detail=e.detail) except Exception as e: raise HTTPException(status_code=500, detail=str(e)) + + +@router.post("/s3-upload", response_model=ResponseS3Upload, summary="S3 이미지 업로드") +async def s3_upload(request: RequestS3Upload): + """ + 크롤링 완료 후 별도로 호출하여 이미지들을 S3 저장소에 업로드합니다. + """ + try: + s3_upload_service = S3UploadService() + response_data = await s3_upload_service.upload_crawled_products_to_s3(request) + + if not response_data: + raise CustomException( + 500, "S3 이미지 업로드에 실패했습니다.", "S3_UPLOAD_FAILED" + ) + + return response_data + except InvalidItemDataException as e: + raise HTTPException(status_code=e.status_code, detail=e.detail) + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) diff --git a/apps/pre-processing-service/app/model/schemas.py b/apps/pre-processing-service/app/model/schemas.py index 36bef959..ebf19478 100644 --- a/apps/pre-processing-service/app/model/schemas.py +++ b/apps/pre-processing-service/app/model/schemas.py @@ -110,8 +110,10 @@ class SadaguSimilarityData(BaseModel): keyword: str = Field( ..., title="분석 키워드", description="유사도 분석에 사용된 키워드" ) - selected_product: Optional[Dict] = Field( - None, title="선택된 상품", description="유사도 분석 결과 선택된 상품" + top_products: List[Dict] = Field( + default_factory=list, + title="선택된 상품들", + description="유사도 분석 결과 선택된 상위 상품 목록", ) reason: Optional[str] = Field( None, title="선택 이유", description="상품 선택 근거 및 점수 정보" @@ -129,16 +131,23 @@ class ResponseSadaguSimilarity(ResponseBase[SadaguSimilarityData]): class RequestSadaguCrawl(RequestBase): - product_url: HttpUrl = Field( + product_urls: List[HttpUrl] = Field( ..., title="상품 URL", description="크롤링할 상품 페이지의 URL" ) # 응답 데이터 모델 class SadaguCrawlData(BaseModel): - product_url: str = Field(..., title="상품 URL", description="크롤링된 상품 URL") - product_detail: Optional[Dict] = Field( - None, title="상품 상세정보", description="크롤링된 상품의 상세 정보" + crawled_products: List[Dict] = Field( + ..., + title="크롤링된 상품들", + description="크롤링된 상품들의 상세 정보 목록 (URL 포함)", + ) + success_count: int = Field( + ..., title="성공 개수", description="성공적으로 크롤링된 상품 개수" + ) + fail_count: int = Field( + ..., title="실패 개수", description="크롤링에 실패한 상품 개수" ) crawled_at: Optional[str] = Field( None, title="크롤링 시간", description="크롤링 완료 시간" @@ -152,6 +161,81 @@ class ResponseSadaguCrawl(ResponseBase[SadaguCrawlData]): pass +# ============== S3 이미지 업로드 ============== + + 
+class RequestS3Upload(RequestBase): + keyword: str = Field( + ..., title="검색 키워드", description="폴더명 생성용 키워드" + ) # 추가 + crawled_products: List[Dict] = Field( + ..., + title="크롤링된 상품 데이터", + description="이전 단계에서 크롤링된 상품들의 데이터", + ) + base_folder: Optional[str] = Field( + "product", title="기본 폴더", description="S3 내 기본 저장 폴더 경로" + ) + + +# S3 업로드된 이미지 정보 +class S3ImageInfo(BaseModel): + index: int = Field(..., title="이미지 순번", description="상품 내 이미지 순번") + original_url: str = Field( + ..., title="원본 URL", description="크롤링된 원본 이미지 URL" + ) + s3_url: str = Field(..., title="S3 URL", description="S3에서 접근 가능한 URL") + + +# 상품별 S3 업로드 결과 +class ProductS3UploadResult(BaseModel): + product_index: int = Field(..., title="상품 순번", description="크롤링 순번") + product_title: str = Field(..., title="상품 제목", description="상품명") + status: str = Field(..., title="업로드 상태", description="completed/skipped/error") + uploaded_images: List[S3ImageInfo] = Field( + default_factory=list, title="업로드 성공 이미지" + ) + success_count: int = Field( + ..., title="성공 개수", description="업로드 성공한 이미지 수" + ) + fail_count: int = Field( + ..., title="실패 개수", description="업로드 실패한 이미지 수" + ) + + +# S3 업로드 요약 정보 +class S3UploadSummary(BaseModel): + total_products: int = Field( + ..., title="총 상품 수", description="처리 대상 상품 총 개수" + ) + total_success_images: int = Field( + ..., title="성공 이미지 수", description="업로드 성공한 이미지 총 개수" + ) + total_fail_images: int = Field( + ..., title="실패 이미지 수", description="업로드 실패한 이미지 총 개수" + ) + + +# 응답 데이터 모델 +class S3UploadData(BaseModel): + upload_results: List[ProductS3UploadResult] = Field( + ..., title="업로드 결과", description="각 상품의 S3 업로드 결과" + ) + summary: S3UploadSummary = Field( + ..., title="업로드 요약", description="전체 업로드 결과 요약" + ) + uploaded_at: str = Field( + ..., title="업로드 완료 시간", description="S3 업로드 완료 시간" + ) + + +# 최종 응답 모델 +class ResponseS3Upload(ResponseBase[S3UploadData]): + """S3 이미지 업로드 API 응답""" + + pass + + # ============== 블로그 콘텐츠 생성 ============== diff --git a/apps/pre-processing-service/app/service/crawl_service.py b/apps/pre-processing-service/app/service/crawl_service.py index af8f91bc..e8785f64 100644 --- a/apps/pre-processing-service/app/service/crawl_service.py +++ b/apps/pre-processing-service/app/service/crawl_service.py @@ -1,4 +1,5 @@ import time +import asyncio from app.service.crawlers.detail_crawler import DetailCrawler from app.errors.CustomException import InvalidItemDataException from app.model.schemas import RequestSadaguCrawl @@ -12,45 +13,133 @@ def __init__(self): async def crawl_product_detail(self, request: RequestSadaguCrawl) -> dict: """ - 선택된 상품의 상세 정보를 크롤링하는 비즈니스 로직입니다. (5단계) - 상품 URL을 입력받아 상세 정보를 크롤링하여 딕셔너리로 반환합니다. + 선택된 상품들의 상세 정보를 크롤링하는 비즈니스 로직입니다. (5단계) + 여러 상품 URL을 입력받아 순차적으로 상세 정보를 크롤링하여 딕셔너리로 반환합니다. 
""" - crawler = DetailCrawler(use_selenium=True) + product_urls = [str(url) for url in request.product_urls] + + logger.info(f"상품 상세 크롤링 서비스 시작: 총 {len(product_urls)}개 상품") + + crawled_products = [] + success_count = 0 + fail_count = 0 try: - logger.info( - f"상품 상세 크롤링 서비스 시작: product_url={request.product_url}" + # 각 상품을 순차적으로 크롤링 (안정성 확보) + for i, product_url in enumerate(product_urls, 1): + logger.info(f"상품 {i}/{len(product_urls)} 크롤링 시작: {product_url}") + + crawler = DetailCrawler(use_selenium=True) + + try: + # 상세 정보 크롤링 실행 + product_detail = await crawler.crawl_detail(product_url) + + if product_detail: + product_title = product_detail.get("title", "Unknown")[:50] + logger.success( + f"상품 {i} 크롤링 성공: title='{product_title}', price={product_detail.get('price', 0)}" + ) + + # 성공한 상품 추가 + crawled_products.append( + { + "index": i, + "url": product_url, + "product_detail": product_detail, + "status": "success", + "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"), + } + ) + success_count += 1 + else: + logger.error(f"상품 {i} 크롤링 실패: 상세 정보 없음") + crawled_products.append( + { + "index": i, + "url": product_url, + "product_detail": None, + "status": "failed", + "error": "상세 정보 없음", + "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"), + } + ) + fail_count += 1 + + except Exception as e: + logger.error( + f"상품 {i} 크롤링 오류: url={product_url}, error='{e}'" + ) + crawled_products.append( + { + "index": i, + "url": product_url, + "product_detail": None, + "status": "failed", + "error": str(e), + "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"), + } + ) + fail_count += 1 + + finally: + # 각 크롤러 개별 정리 + await crawler.close() + + # 상품간 간격 (서버 부하 방지) + if i < len(product_urls): + await asyncio.sleep(1) + + logger.success( + f"전체 크롤링 완료: 총 {len(product_urls)}개, 성공 {success_count}개, 실패 {fail_count}개" ) - # 상세 정보 크롤링 실행 - product_detail = await crawler.crawl_detail( - product_url=str(request.product_url) + # 응답 데이터 구성 + data = { + "crawled_products": crawled_products, + "success_count": success_count, + "fail_count": fail_count, + "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"), + } + + logger.info( + f"상품 상세 크롤링 서비스 완료: success_rate={success_count}/{len(product_urls)}" ) + return Response.ok(data) + + except Exception as e: + logger.error(f"배치 크롤링 서비스 오류: error='{e}'") + raise InvalidItemDataException() + + # 기존 단일 크롤링 메서드도 유지 (하위 호환성) + async def crawl_single_product_detail(self, product_url: str) -> dict: + """ + 단일 상품 크롤링 (하위 호환성용) + """ + crawler = DetailCrawler(use_selenium=True) + + try: + logger.info(f"단일 상품 크롤링 시작: {product_url}") + + product_detail = await crawler.crawl_detail(product_url) if not product_detail: - logger.error(f"상품 상세 정보 크롤링 실패: url={request.product_url}") + logger.error(f"상품 상세 정보 크롤링 실패: url={product_url}") raise InvalidItemDataException() product_title = product_detail.get("title", "Unknown")[:50] - logger.success( - f"크롤링 완료: title='{product_title}', price={product_detail.get('price', 0)}, options_count={len(product_detail.get('options', []))}" - ) + logger.success(f"크롤링 완료: title='{product_title}'") - # 응답 데이터 구성 data = { - "product_url": str(request.product_url), + "product_url": product_url, "product_detail": product_detail, "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"), } - logger.info(f"상품 상세 크롤링 서비스 완료: status=success") return Response.ok(data) except Exception as e: - logger.error( - f"크롤링 서비스 오류: product_url={request.product_url}, error='{e}'" - ) + logger.error(f"단일 크롤링 오류: url={product_url}, error='{e}'") raise InvalidItemDataException() finally: await 
crawler.close() - logger.debug("크롤러 리소스 정리 완료") diff --git a/apps/pre-processing-service/app/service/crawlers/search_crawler.py b/apps/pre-processing-service/app/service/crawlers/search_crawler.py index a0d46e02..1bc36fc5 100644 --- a/apps/pre-processing-service/app/service/crawlers/search_crawler.py +++ b/apps/pre-processing-service/app/service/crawlers/search_crawler.py @@ -49,7 +49,7 @@ async def search_products_selenium(self, keyword: str) -> list[dict]: logger.info( f"Selenium으로 발견한 상품 링크: {len(unique_products)}개 (중복 제거 전: {len(product_links)}개)" ) - return unique_products[:20] + return unique_products[:40] except Exception as e: logger.error(f"Selenium 검색 오류: keyword='{keyword}', error='{e}'") @@ -88,7 +88,7 @@ async def search_products_httpx(self, keyword: str) -> list[dict]: product_links.append({"url": full_url, "title": title}) logger.info(f"httpx로 발견한 상품 링크: {len(product_links)}개") - return product_links[:20] + return product_links[:40] except Exception as e: logger.error(f"httpx 검색 오류: keyword='{keyword}', error='{e}'") diff --git a/apps/pre-processing-service/app/service/s3_upload_service.py b/apps/pre-processing-service/app/service/s3_upload_service.py new file mode 100644 index 00000000..1c024a63 --- /dev/null +++ b/apps/pre-processing-service/app/service/s3_upload_service.py @@ -0,0 +1,125 @@ +import time +import asyncio +import aiohttp +from typing import List, Dict +from loguru import logger +from app.errors.CustomException import InvalidItemDataException +from app.model.schemas import RequestS3Upload +from app.utils.s3_upload_util import S3UploadUtil +from app.utils.response import Response + + +class S3UploadService: + """6단계: 크롤링된 상품 이미지들과 데이터를 S3에 업로드하는 서비스""" + + def __init__(self): + self.s3_util = S3UploadUtil() + + async def upload_crawled_products_to_s3(self, request: RequestS3Upload) -> dict: + """ + 크롤링된 상품들의 이미지와 데이터를 S3에 업로드하는 비즈니스 로직 (6단계) + """ + keyword = request.keyword # 키워드 추가 + crawled_products = request.crawled_products + base_folder = ( + request.base_folder or "product" + ) # 🔸 기본값 변경: product-images → product + + logger.info( + f"S3 업로드 서비스 시작: keyword='{keyword}', {len(crawled_products)}개 상품" + ) + + upload_results = [] + total_success_images = 0 + total_fail_images = 0 + + try: + # HTTP 세션을 사용한 이미지 다운로드 + async with aiohttp.ClientSession() as session: + + # 각 상품별로 순차 업로드 + for product_info in crawled_products: + product_index = product_info.get("index", 0) + product_detail = product_info.get("product_detail") + + logger.info( + f"상품 {product_index}/{len(crawled_products)} S3 업로드 시작" + ) + + # 크롤링 실패한 상품은 스킵 + if not product_detail or product_info.get("status") != "success": + logger.warning( + f"상품 {product_index}: 크롤링 실패로 인한 업로드 스킵" + ) + upload_results.append( + { + "product_index": product_index, + "product_title": "Unknown", + "status": "skipped", + "folder_s3_url": None, + "uploaded_images": [], + "success_count": 0, + "fail_count": 0, + } + ) + continue + + try: + # 상품 이미지 + 데이터 업로드 (키워드 전달 추가!) 
+ # 🔸 전체 크롤링 데이터를 전달 (product_detail이 아닌 product_info 전체) + upload_result = await self.s3_util.upload_single_product_images( + session, + product_info, + product_index, + keyword, + base_folder, # product_detail → product_info + ) + + upload_results.append(upload_result) + total_success_images += upload_result["success_count"] + total_fail_images += upload_result["fail_count"] + + logger.success( + f"상품 {product_index} S3 업로드 완료: 성공 {upload_result['success_count']}개, " + f"실패 {upload_result['fail_count']}개" + ) + + except Exception as e: + logger.error(f"상품 {product_index} S3 업로드 오류: {e}") + upload_results.append( + { + "product_index": product_index, + "product_title": product_detail.get("title", "Unknown"), + "status": "error", + "folder_s3_url": None, + "uploaded_images": [], + "success_count": 0, + "fail_count": 0, + } + ) + + # 상품간 간격 (서버 부하 방지) + if product_index < len(crawled_products): + await asyncio.sleep(1) + + logger.success( + f"S3 업로드 서비스 완료: 총 성공 이미지 {total_success_images}개, 총 실패 이미지 {total_fail_images}개" + ) + + # 간소화된 응답 데이터 구성 + data = { + "upload_results": upload_results, + "summary": { + "total_products": len(crawled_products), + "total_success_images": total_success_images, + "total_fail_images": total_fail_images, + }, + "uploaded_at": time.strftime("%Y-%m-%d %H:%M:%S"), + } + + message = f"S3 업로드 완료: {total_success_images}개 이미지 업로드 성공, 상품 데이터 JSON 파일 포함" + return Response.ok(data, message) + + except Exception as e: + logger.error(f"S3 업로드 서비스 전체 오류: {e}") + raise InvalidItemDataException() diff --git a/apps/pre-processing-service/app/service/search_service.py b/apps/pre-processing-service/app/service/search_service.py index 171bd57f..070f6cc2 100644 --- a/apps/pre-processing-service/app/service/search_service.py +++ b/apps/pre-processing-service/app/service/search_service.py @@ -77,9 +77,9 @@ async def search_products(self, request: RequestSadaguSearch) -> dict: logger.debug(f"상품 {i + 1}: 제목 추출 실패, 제외") continue - # 최대 20개까지만 처리 - if len(enriched_results) >= 20: - logger.info("최대 20개 상품 수집 완료") + # 최대 40개까지 처리 + if len(enriched_results) >= 40: + logger.info("최대 40개 상품 수집 완료") break except Exception as e: diff --git a/apps/pre-processing-service/app/service/similarity_service.py b/apps/pre-processing-service/app/service/similarity_service.py index 516b0c63..cf943279 100644 --- a/apps/pre-processing-service/app/service/similarity_service.py +++ b/apps/pre-processing-service/app/service/similarity_service.py @@ -9,16 +9,19 @@ class SimilarityService: def __init__(self): pass - def select_product_by_similarity(self, request: RequestSadaguSimilarity) -> dict: + def select_top_products_by_similarity( + self, request: RequestSadaguSimilarity + ) -> dict: """ - BERT 기반 유사도 분석 후 상품 선택 - 4단계 + 형태소 분석 후 Top 10 선택 (10개 이하면 유사도 분석 생략) """ keyword = request.keyword candidates = request.matched_products fallback_products = request.search_results or [] + top_count = 10 # Top 10 개수 설정 logger.info( - f"유사도 분석 서비스 시작: keyword='{keyword}', matched_count={len(candidates) if candidates else 0}, fallback_count={len(fallback_products)}" + f"상품 선택 서비스 시작 (Top {top_count}): keyword='{keyword}', matched_count={len(candidates) if candidates else 0}, fallback_count={len(fallback_products)}" ) # 매칭된 상품이 없으면 전체 검색 결과로 폴백 @@ -30,130 +33,151 @@ def select_product_by_similarity(self, request: RequestSadaguSimilarity) -> dict data = { "keyword": keyword, - "selected_product": None, + "top_products": [], "reason": "매칭된 상품과 검색 결과가 모두 없음", } return Response.ok(data, "매칭된 상품과 검색 결과가 모두 없습니다.") - 
logger.info("매칭된 상품 없음 → 전체 검색 결과에서 유사도 분석") + logger.info("매칭된 상품 없음 → 전체 검색 결과에서 유사도 분석 진행") candidates = fallback_products analysis_mode = "fallback_similarity_only" + skip_similarity = False else: analysis_mode = "matched_products" + # 형태소 분석 결과가 10개 이하면 유사도 분석 생략 + skip_similarity = len(candidates) <= top_count try: - analyzer = SimilarityAnalyzerONNX() - - logger.info( - f"키워드 '{keyword}'와 {len(candidates)}개 상품의 유사도 분석 시작... (모드: {analysis_mode})" - ) - - # 한 개만 있으면 바로 선택 - if len(candidates) == 1: - selected_product = candidates[0] - - logger.info("단일 후보 상품 - 유사도 검증 진행") - # 유사도 계산 - similarity = analyzer.calculate_similarity( - keyword, selected_product["title"] + # 형태소 분석 결과가 10개 이하면 유사도 분석 생략하고 바로 반환 + if skip_similarity and analysis_mode == "matched_products": + logger.info( + f"형태소 분석 결과가 {len(candidates)}개로 {top_count}개 이하 - 유사도 분석 생략" ) - # 폴백 모드에서는 임계값 검증 - if analysis_mode == "fallback_similarity_only": - similarity_threshold = 0.3 - if similarity < similarity_threshold: - logger.warning( - f"단일 상품 유사도 미달: similarity={similarity:.4f} < threshold={similarity_threshold}" - ) - data = { - "keyword": keyword, - "selected_product": None, - "reason": f"단일 상품 유사도({similarity:.4f}) < 기준({similarity_threshold})", - } - return Response.ok( - data, "단일 상품 유사도 미달 되어 상품이 존재하지않습니다." - ) - - selected_product["similarity_info"] = { - "similarity_score": float(similarity), - "analysis_type": "single_candidate", - "analysis_mode": analysis_mode, - } + # 매칭 스코어 기준으로 정렬된 상태 유지 (이미 match_service에서 정렬됨) + top_products = [] + for i, product in enumerate(candidates): + enhanced_product = product.copy() + enhanced_product["rank"] = i + 1 + enhanced_product["selection_info"] = { + "selection_type": "match_only", + "match_score": product.get("match_info", {}).get( + "match_score", 0.0 + ), + "reason": "형태소 분석만으로 선택 (유사도 분석 생략)", + "total_candidates": len(candidates), + } + top_products.append(enhanced_product) logger.success( - f"단일 상품 선택 완료: title='{selected_product['title'][:30]}', similarity={similarity:.4f}" + f"형태소 분석만으로 상품 선택 완료: keyword='{keyword}', selected_count={len(top_products)}" ) + data = { "keyword": keyword, - "selected_product": selected_product, - "reason": f"단일 상품 - 유사도: {similarity:.4f} ({analysis_mode})", + "top_products": top_products, + "reason": f"형태소 분석 결과 {len(candidates)}개 - 유사도 분석 생략", } return Response.ok(data) - # 여러 개가 있으면 유사도 비교 - logger.info("여러 상품 중 최고 유사도로 선택...") + # 유사도 분석 필요한 경우 (매칭 결과가 10개 초과이거나 폴백 모드) + analyzer = SimilarityAnalyzerONNX() + + logger.info( + f"키워드 '{keyword}'와 {len(candidates)}개 상품의 유사도 분석 시작... 
(모드: {analysis_mode})" + ) - # 제목만 추출해서 배치 분석 + # 모든 후보에 대해 유사도 계산 titles = [product["title"] for product in candidates] similarity_results = analyzer.analyze_similarity_batch(keyword, titles) - # 결과 출력 - logger.info("유사도 분석 결과:") - for i, result in enumerate(similarity_results[:5]): # 상위 5개만 로그 - logger.info( - f" {i+1}위: {result['title'][:40]} | 유사도: {result['similarity']:.4f}" - ) + # 유사도 정보 추가 및 Top 10 선택 + enhanced_products = [] + similarity_threshold = ( + 0.3 if analysis_mode == "fallback_similarity_only" else 0.0 + ) - # 최고 유사도 선택 - best_result = similarity_results[0] - selected_product = candidates[best_result["index"]].copy() + for i, result in enumerate(similarity_results): + product = candidates[result["index"]].copy() - # 폴백 모드에서는 임계값 검증 - similarity_threshold = 0.3 - if ( - analysis_mode == "fallback_similarity_only" - and best_result["similarity"] < similarity_threshold - ): - logger.warning( - f"최고 유사도 미달: similarity={best_result['similarity']:.4f} < threshold={similarity_threshold}" - ) - data = { - "keyword": keyword, - "selected_product": None, - "reason": f"최고 유사도({best_result['similarity']:.4f}) < 기준({similarity_threshold})", + # 폴백 모드에서는 임계값 검증 + if ( + analysis_mode == "fallback_similarity_only" + and result["similarity"] < similarity_threshold + ): + logger.debug( + f"상품 {i + 1} 유사도 미달로 제외: similarity={result['similarity']:.4f} < threshold={similarity_threshold}" + ) + continue + + product["similarity_info"] = { + "similarity_score": result["similarity"], + "analysis_type": "batch_similarity", + "analysis_mode": analysis_mode, } - return Response.ok(data, "최고 유사도가 기준보다 미달 되었습니다.") - - # 유사도 정보 추가 - selected_product["similarity_info"] = { - "similarity_score": best_result["similarity"], - "analysis_type": "multi_candidate_bert", - "analysis_mode": analysis_mode, - "rank": 1, - "total_candidates": len(candidates), - } - # 매칭 모드에서는 종합 점수도 계산 - if analysis_mode == "matched_products" and "match_info" in selected_product: - match_score = selected_product["match_info"]["match_score"] - similarity_score = best_result["similarity"] - # 가중치: 매칭 40%, 유사도 60% - final_score = match_score * 0.4 + similarity_score * 0.6 - selected_product["final_score"] = final_score - reason = f"종합점수({final_score:.4f}) = 매칭({match_score:.4f})*0.4 + 유사도({similarity_score:.4f})*0.6" - logger.info( - f"종합 점수 계산: match_score={match_score:.4f}, similarity_score={similarity_score:.4f}, final_score={final_score:.4f}" + # 매칭 모드에서는 종합 점수 계산 + if analysis_mode == "matched_products" and "match_info" in product: + match_score = product["match_info"]["match_score"] + similarity_score = result["similarity"] + # 가중치: 매칭 40%, 유사도 60% + final_score = match_score * 0.4 + similarity_score * 0.6 + product["final_score"] = final_score + product["selection_info"] = { + "selection_type": "match_and_similarity", + "match_score": match_score, + "similarity_score": similarity_score, + "final_score": final_score, + "reason": f"종합점수({final_score:.4f}) = 매칭({match_score:.4f})*0.4 + 유사도({similarity_score:.4f})*0.6", + } + else: + product["selection_info"] = { + "selection_type": "similarity_only", + "similarity_score": result["similarity"], + "reason": f"유사도({result['similarity']:.4f}) 기준 선택 ({analysis_mode})", + } + + enhanced_products.append(product) + + # 종합 점수 또는 유사도 기준으로 재정렬 + if analysis_mode == "matched_products": + enhanced_products.sort( + key=lambda x: x.get( + "final_score", x["similarity_info"]["similarity_score"] + ), + reverse=True, ) else: - reason = f"유사도({best_result['similarity']:.4f}) 기준 선택 
({analysis_mode})" + enhanced_products.sort( + key=lambda x: x["similarity_info"]["similarity_score"], reverse=True + ) + + # Top 10 선택 + top_products = enhanced_products[:top_count] + + # 순위 정보 추가 + for i, product in enumerate(top_products): + product["rank"] = i + 1 logger.success( - f"상품 선택 완료: title='{selected_product['title'][:30]}', {reason}" + f"유사도 분석 완료: keyword='{keyword}', total_analyzed={len(candidates)}, valid_results={len(enhanced_products)}, top_selected={len(top_products)}" ) + + if top_products: + best_product = top_products[0] + if "final_score" in best_product: + logger.info( + f"1위 상품: title='{best_product['title'][:30]}', final_score={best_product['final_score']:.4f}" + ) + else: + logger.info( + f"1위 상품: title='{best_product['title'][:30]}', similarity={best_product['similarity_info']['similarity_score']:.4f}" + ) + data = { "keyword": keyword, - "selected_product": selected_product, - "reason": reason, + "top_products": top_products, + "reason": f"유사도 분석 후 상위 {len(top_products)}개 선택 ({analysis_mode})", } return Response.ok(data) diff --git a/apps/pre-processing-service/app/utils/s3_upload_util.py b/apps/pre-processing-service/app/utils/s3_upload_util.py new file mode 100644 index 00000000..0aaa5ace --- /dev/null +++ b/apps/pre-processing-service/app/utils/s3_upload_util.py @@ -0,0 +1,281 @@ +import os +import json +import boto3 +import aiohttp +import asyncio +from datetime import datetime +from urllib.parse import urlparse +from typing import Dict, Optional +from loguru import logger + + +class S3UploadUtil: + """S3 업로드 전용 유틸리티 클래스""" + + def __init__(self): + # 환경변수에서 AWS 설정 읽기 + self.aws_access_key = os.getenv("AWS_ACCESS_KEY_ID") + self.aws_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY") + self.bucket_name = os.getenv("S3_BUCKET_NAME", "icebang4-dev-bucket") + self.region = os.getenv("AWS_REGION", "ap-northeast-2") + + if not self.aws_access_key or not self.aws_secret_key: + raise ValueError( + "AWS_ACCESS_KEY_ID와 AWS_SECRET_ACCESS_KEY 환경변수가 필요합니다" + ) + + self.base_url = f"https://{self.bucket_name}.s3.{self.region}.amazonaws.com" + + # S3 클라이언트 초기화 + self.s3_client = boto3.client( + "s3", + aws_access_key_id=self.aws_access_key, + aws_secret_access_key=self.aws_secret_key, + region_name=self.region, + ) + + logger.info( + f"S3 클라이언트 초기화 완료: bucket={self.bucket_name}, region={self.region}" + ) + + async def download_image( + self, session: aiohttp.ClientSession, image_url: str + ) -> Optional[bytes]: + """이미지 URL에서 이미지 데이터 다운로드""" + try: + logger.debug(f"이미지 다운로드 시작: {image_url}") + + async with session.get( + image_url, timeout=aiohttp.ClientTimeout(total=30) + ) as response: + if response.status == 200: + image_data = await response.read() + logger.debug(f"이미지 다운로드 완료: {len(image_data)} bytes") + return image_data + else: + logger.warning( + f"이미지 다운로드 실패: {image_url}, status={response.status}" + ) + return None + + except Exception as e: + logger.error(f"이미지 다운로드 오류: {image_url}, error={e}") + return None + + def get_file_extension(self, image_url: str) -> str: + """URL에서 파일 확장자 추출""" + parsed = urlparse(image_url) + path = parsed.path.lower() + + # 일반적인 이미지 확장자 확인 + for ext in [".jpg", ".jpeg", ".png", ".gif", ".webp"]: + if ext in path: + return ext + + # 기본값 + return ".jpg" + + def get_content_type(self, file_extension: str) -> str: + """파일 확장자에 따른 Content-Type 반환""" + content_types = { + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".png": "image/png", + ".gif": "image/gif", + ".webp": "image/webp", + } + return content_types.get(file_extension, 
"image/jpeg") + + def upload_to_s3( + self, data: bytes, s3_key: str, content_type: str = "image/jpeg" + ) -> bool: + """S3에 데이터 업로드 (이미지 또는 JSON)""" + try: + logger.debug(f"S3 업로드 시작: key={s3_key}") + + self.s3_client.put_object( + Bucket=self.bucket_name, + Key=s3_key, + Body=data, + ContentType=content_type, + ) + + logger.debug(f"S3 업로드 완료: key={s3_key}") + return True + + except Exception as e: + logger.error(f"S3 업로드 오류: key={s3_key}, error={e}") + return False + + def upload_json_to_s3(self, json_data: Dict, s3_key: str) -> bool: + """JSON 데이터를 S3에 업로드""" + try: + json_str = json.dumps(json_data, ensure_ascii=False, indent=2) + json_bytes = json_str.encode("utf-8") + + return self.upload_to_s3(json_bytes, s3_key, "application/json") + + except Exception as e: + logger.error(f"JSON S3 업로드 오류: key={s3_key}, error={e}") + return False + + def generate_product_folder_name(self, product_index: int, keyword: str) -> str: + """상품별 폴더명 생성 (시간_키워드_번호)""" + # 키워드에서 특수문자 제거 + safe_keyword = ( + keyword.replace("/", "-") + .replace("\\", "-") + .replace(" ", "_") + .replace(":", "-") + .replace("*", "-") + .replace("?", "-") + .replace('"', "-") + .replace("<", "-") + .replace(">", "-") + .replace("|", "-")[:20] # 길이 제한 + ) + + # 날짜 형식: 20250922 + date_str = datetime.now().strftime("%Y%m%d") + + # 폴더명: 20250922_키워드_1 + folder_name = f"{date_str}_{safe_keyword}_{product_index}" + + return folder_name + + def generate_s3_key( + self, + base_folder: str, + folder_name: str, + file_name: str, + ) -> str: + """S3 키 생성""" + # 최종 S3 키: product/20250922_산리오_1/image_001.jpg 또는 product_data.json + s3_key = f"{base_folder}/{folder_name}/{file_name}" + return s3_key + + def get_s3_url(self, s3_key: str) -> str: + """S3 키에서 접근 가능한 URL 생성""" + return f"{self.base_url}/{s3_key}" + + async def upload_single_product_images( + self, + session: aiohttp.ClientSession, + product_info: Dict, # 🔸 이름 변경: product_data → product_info (전체 크롤링 데이터) + product_index: int, + keyword: str, # 키워드 파라미터 추가 + base_folder: str = "product", # 🔸 기본 폴더 변경: product-images → product + ) -> Dict: + """단일 상품의 모든 데이터(이미지 + JSON)를 S3에 업로드""" + + # 🔸 전체 크롤링 데이터에서 필요한 정보 추출 + product_detail = product_info.get("product_detail", {}) + product_title = product_detail.get("title", "Unknown") + product_images = product_detail.get("product_images", []) + + uploaded_images = [] + + logger.info( + f"상품 {product_index} 업로드 시작: {len(product_images)}개 이미지, keyword='{keyword}'" + ) + + # 키워드 기반 폴더명 한 번만 생성 + folder_name = self.generate_product_folder_name(product_index, keyword) + + fail_count = 0 + folder_s3_url = f"{self.base_url}/{base_folder}/{folder_name}" + + # 🆕 1. 먼저 상품 데이터 JSON 파일 업로드 + try: + # 전체 크롤링 데이터를 JSON으로 저장 (S3 업로드 메타데이터 추가) + product_data_with_meta = { + **product_info, # 전체 크롤링 데이터 (index, url, product_detail, status, crawled_at 포함) + "s3_upload_keyword": keyword, # 추가 메타데이터 + "s3_uploaded_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), + } + + json_s3_key = self.generate_s3_key( + base_folder, folder_name, "product_data.json" + ) + + if self.upload_json_to_s3(product_data_with_meta, json_s3_key): + logger.success(f"상품 {product_index} JSON 데이터 업로드 완료") + else: + logger.error(f"상품 {product_index} JSON 데이터 업로드 실패") + + except Exception as e: + logger.error(f"상품 {product_index} JSON 업로드 오류: {e}") + + # 2. 
이미지 업로드 (기존 로직) + if not product_images: + logger.warning(f"상품 {product_index}: 업로드할 이미지가 없음") + return { + "product_index": product_index, + "product_title": product_title, + "status": "no_images", + "folder_s3_url": folder_s3_url, + "uploaded_images": uploaded_images, + "success_count": 0, + "fail_count": 0, + } + + # 각 이미지 업로드 + for img_idx, img_info in enumerate(product_images, 1): + original_url = img_info.get("original_url", "") + + if not original_url: + logger.warning(f"상품 {product_index}, 이미지 {img_idx}: URL이 없음") + fail_count += 1 + continue + + try: + # 이미지 다운로드 + image_data = await self.download_image(session, original_url) + + if not image_data: + fail_count += 1 + continue + + # S3 키 생성 (키워드 기반 폴더명 사용) + file_extension = self.get_file_extension(original_url) + image_file_name = f"image_{img_idx:03d}{file_extension}" + s3_key = self.generate_s3_key(base_folder, folder_name, image_file_name) + + # S3 업로드 + content_type = self.get_content_type(file_extension) + + if self.upload_to_s3(image_data, s3_key, content_type): + s3_url = self.get_s3_url(s3_key) + uploaded_images.append( + { + "index": img_idx, + "original_url": original_url, + "s3_url": s3_url, + } + ) + + logger.debug(f"상품 {product_index}, 이미지 {img_idx} 업로드 완료") + else: + fail_count += 1 + + except Exception as e: + logger.error(f"상품 {product_index}, 이미지 {img_idx} 처리 오류: {e}") + fail_count += 1 + + # 이미지 간 간격 (서버 부하 방지) + await asyncio.sleep(0.5) + + logger.success( + f"상품 {product_index} 업로드 완료: 성공 {len(uploaded_images)}개, 실패 {fail_count}개, folder='{folder_name}'" + ) + + return { + "product_index": product_index, + "product_title": product_title, + "status": "completed", + "folder_s3_url": folder_s3_url, # 🔸 폴더 전체를 가리킴 (이미지 + JSON 포함) + "json_s3_url": f"{folder_s3_url}/product_data.json", # 🆕 JSON 파일 직접 링크 + "uploaded_images": uploaded_images, + "success_count": len(uploaded_images), + "fail_count": fail_count, + } diff --git a/apps/pre-processing-service/poetry.lock b/apps/pre-processing-service/poetry.lock index ca5c20ab..f02855bc 100644 --- a/apps/pre-processing-service/poetry.lock +++ b/apps/pre-processing-service/poetry.lock @@ -321,6 +321,46 @@ d = ["aiohttp (>=3.10)"] jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] uvloop = ["uvloop (>=0.15.2)"] +[[package]] +name = "boto3" +version = "1.40.35" +description = "The AWS SDK for Python" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "boto3-1.40.35-py3-none-any.whl", hash = "sha256:f4c1b01dd61e7733b453bca38b004ce030e26ee36e7a3d4a9e45a730b67bc38d"}, + {file = "boto3-1.40.35.tar.gz", hash = "sha256:d718df3591c829bcca4c498abb7b09d64d1eecc4e5a2b6cef14b476501211b8a"}, +] + +[package.dependencies] +botocore = ">=1.40.35,<1.41.0" +jmespath = ">=0.7.1,<2.0.0" +s3transfer = ">=0.14.0,<0.15.0" + +[package.extras] +crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] + +[[package]] +name = "botocore" +version = "1.40.35" +description = "Low-level, data-driven core of boto 3." 
+optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "botocore-1.40.35-py3-none-any.whl", hash = "sha256:c545de2cbbce161f54ca589fbb677bae14cdbfac7d5f1a27f6a620cb057c26f4"}, + {file = "botocore-1.40.35.tar.gz", hash = "sha256:67e062752ff579c8cc25f30f9c3a84c72d692516a41a9ee1cf17735767ca78be"}, +] + +[package.dependencies] +jmespath = ">=0.7.1,<2.0.0" +python-dateutil = ">=2.1,<3.0.0" +urllib3 = {version = ">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version >= \"3.10\""} + +[package.extras] +crt = ["awscrt (==0.27.6)"] + [[package]] name = "bs4" version = "0.0.2" @@ -1320,6 +1360,18 @@ files = [ {file = "jiter-0.11.0.tar.gz", hash = "sha256:1d9637eaf8c1d6a63d6562f2a6e5ab3af946c66037eb1b894e8fad75422266e4"}, ] +[[package]] +name = "jmespath" +version = "1.0.1" +description = "JSON Matching Expressions" +optional = false +python-versions = ">=3.7" +groups = ["main"] +files = [ + {file = "jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980"}, + {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, +] + [[package]] name = "joblib" version = "1.5.2" @@ -1693,14 +1745,14 @@ sympy = "*" [[package]] name = "openai" -version = "1.108.0" +version = "1.108.1" description = "The official Python library for the openai API" optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "openai-1.108.0-py3-none-any.whl", hash = "sha256:31f2e58230e2703f13ddbb50c285f39dacf7fca64ab19882fd8a7a0b2bccd781"}, - {file = "openai-1.108.0.tar.gz", hash = "sha256:e859c64e4202d7f5956f19280eee92bb281f211c41cdd5be9e63bf51a024ff72"}, + {file = "openai-1.108.1-py3-none-any.whl", hash = "sha256:952fc027e300b2ac23be92b064eac136a2bc58274cec16f5d2906c361340d59b"}, + {file = "openai-1.108.1.tar.gz", hash = "sha256:6648468c1aec4eacfa554001e933a9fa075f57bacfc27588c2e34456cee9fef9"}, ] [package.dependencies] @@ -1793,14 +1845,14 @@ testing = ["coverage", "pytest", "pytest-benchmark"] [[package]] name = "poetry-core" -version = "2.2.0" +version = "2.2.1" description = "Poetry PEP 517 Build Backend" optional = false python-versions = "<4.0,>=3.9" groups = ["main"] files = [ - {file = "poetry_core-2.2.0-py3-none-any.whl", hash = "sha256:0edea81d07e88cbd407369eef753c722da8ff1338f554788dc04636e756318fc"}, - {file = "poetry_core-2.2.0.tar.gz", hash = "sha256:b4033b71b99717a942030e074fec7e3082e5fde7a8ed10f02cd2413bdf940b1f"}, + {file = "poetry_core-2.2.1-py3-none-any.whl", hash = "sha256:bdfce710edc10bfcf9ab35041605c480829be4ab23f5bc01202cfe5db8f125ab"}, + {file = "poetry_core-2.2.1.tar.gz", hash = "sha256:97e50d8593c8729d3f49364b428583e044087ee3def1e010c6496db76bd65ac5"}, ] [[package]] @@ -2288,14 +2340,14 @@ rsa = ["cryptography"] [[package]] name = "pyparsing" -version = "3.2.4" +version = "3.2.5" description = "pyparsing - Classes and methods to define and execute parsing grammars" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "pyparsing-3.2.4-py3-none-any.whl", hash = "sha256:91d0fcde680d42cd031daf3a6ba20da3107e08a75de50da58360e7d94ab24d36"}, - {file = "pyparsing-3.2.4.tar.gz", hash = "sha256:fff89494f45559d0f2ce46613b419f632bbb6afbdaed49696d322bcf98a58e99"}, + {file = "pyparsing-3.2.5-py3-none-any.whl", hash = "sha256:e38a4f02064cf41fe6593d328d0512495ad1f3d8a91c4f73fc401b3079a59a5e"}, + {file = "pyparsing-3.2.5.tar.gz", hash = "sha256:2df8d5b7b2802ef88e8d016a2eb9c7aeaa923529cd251ed0fe4608275d4105b6"}, ] [package.extras] 
@@ -2364,6 +2416,21 @@ pygments = ">=2.7.2" [package.extras] dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "requests", "setuptools", "xmlschema"] +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +description = "Extensions to the standard Python datetime module" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +groups = ["main"] +files = [ + {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, + {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, +] + +[package.dependencies] +six = ">=1.5" + [[package]] name = "python-dotenv" version = "1.1.1" @@ -2638,6 +2705,24 @@ files = [ [package.dependencies] pyasn1 = ">=0.1.3" +[[package]] +name = "s3transfer" +version = "0.14.0" +description = "An Amazon S3 Transfer Manager" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "s3transfer-0.14.0-py3-none-any.whl", hash = "sha256:ea3b790c7077558ed1f02a3072fb3cb992bbbd253392f4b6e9e8976941c7d456"}, + {file = "s3transfer-0.14.0.tar.gz", hash = "sha256:eff12264e7c8b4985074ccce27a3b38a485bb7f7422cc8046fee9be4983e4125"}, +] + +[package.dependencies] +botocore = ">=1.37.4,<2.0a.0" + +[package.extras] +crt = ["botocore[crt] (>=1.37.4,<2.0a.0)"] + [[package]] name = "safetensors" version = "0.6.2" @@ -2832,6 +2917,18 @@ typing_extensions = ">=4.14.0,<4.15.0" urllib3 = {version = ">=2.5.0,<3.0", extras = ["socks"]} websocket-client = ">=1.8.0,<1.9.0" +[[package]] +name = "six" +version = "1.17.0" +description = "Python 2 and 3 compatibility utilities" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +groups = ["main"] +files = [ + {file = "six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274"}, + {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"}, +] + [[package]] name = "sniffio" version = "1.3.1" @@ -3015,31 +3112,31 @@ files = [ [[package]] name = "tokenizers" -version = "0.22.0" +version = "0.22.1" description = "" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "tokenizers-0.22.0-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:eaa9620122a3fb99b943f864af95ed14c8dfc0f47afa3b404ac8c16b3f2bb484"}, - {file = "tokenizers-0.22.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:71784b9ab5bf0ff3075bceeb198149d2c5e068549c0d18fe32d06ba0deb63f79"}, - {file = "tokenizers-0.22.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec5b71f668a8076802b0241a42387d48289f25435b86b769ae1837cad4172a17"}, - {file = "tokenizers-0.22.0-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ea8562fa7498850d02a16178105b58803ea825b50dc9094d60549a7ed63654bb"}, - {file = "tokenizers-0.22.0-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4136e1558a9ef2e2f1de1555dcd573e1cbc4a320c1a06c4107a3d46dc8ac6e4b"}, - {file = "tokenizers-0.22.0-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cdf5954de3962a5fd9781dc12048d24a1a6f1f5df038c6e95db328cd22964206"}, - {file = "tokenizers-0.22.0-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8337ca75d0731fc4860e6204cc24bb36a67d9736142aa06ed320943b50b1e7ed"}, - {file = "tokenizers-0.22.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:a89264e26f63c449d8cded9061adea7b5de53ba2346fc7e87311f7e4117c1cc8"}, - {file = "tokenizers-0.22.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:790bad50a1b59d4c21592f9c3cf5e5cf9c3c7ce7e1a23a739f13e01fb1be377a"}, - {file = "tokenizers-0.22.0-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:76cf6757c73a10ef10bf06fa937c0ec7393d90432f543f49adc8cab3fb6f26cb"}, - {file = "tokenizers-0.22.0-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:1626cb186e143720c62c6c6b5371e62bbc10af60481388c0da89bc903f37ea0c"}, - {file = "tokenizers-0.22.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:da589a61cbfea18ae267723d6b029b84598dc8ca78db9951d8f5beff72d8507c"}, - {file = "tokenizers-0.22.0-cp39-abi3-win32.whl", hash = "sha256:dbf9d6851bddae3e046fedfb166f47743c1c7bd11c640f0691dd35ef0bcad3be"}, - {file = "tokenizers-0.22.0-cp39-abi3-win_amd64.whl", hash = "sha256:c78174859eeaee96021f248a56c801e36bfb6bd5b067f2e95aa82445ca324f00"}, - {file = "tokenizers-0.22.0.tar.gz", hash = "sha256:2e33b98525be8453f355927f3cab312c36cd3e44f4d7e9e97da2fa94d0a49dcb"}, + {file = "tokenizers-0.22.1-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:59fdb013df17455e5f950b4b834a7b3ee2e0271e6378ccb33aa74d178b513c73"}, + {file = "tokenizers-0.22.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:8d4e484f7b0827021ac5f9f71d4794aaef62b979ab7608593da22b1d2e3c4edc"}, + {file = "tokenizers-0.22.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:19d2962dd28bc67c1f205ab180578a78eef89ac60ca7ef7cbe9635a46a56422a"}, + {file = "tokenizers-0.22.1-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:38201f15cdb1f8a6843e6563e6e79f4abd053394992b9bbdf5213ea3469b4ae7"}, + {file = "tokenizers-0.22.1-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d1cbe5454c9a15df1b3443c726063d930c16f047a3cc724b9e6e1a91140e5a21"}, + {file = "tokenizers-0.22.1-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e7d094ae6312d69cc2a872b54b91b309f4f6fbce871ef28eb27b52a98e4d0214"}, + {file = "tokenizers-0.22.1-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:afd7594a56656ace95cdd6df4cca2e4059d294c5cfb1679c57824b605556cb2f"}, + {file = "tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e2ef6063d7a84994129732b47e7915e8710f27f99f3a3260b8a38fc7ccd083f4"}, + {file = "tokenizers-0.22.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ba0a64f450b9ef412c98f6bcd2a50c6df6e2443b560024a09fa6a03189726879"}, + {file = "tokenizers-0.22.1-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:331d6d149fa9c7d632cde4490fb8bbb12337fa3a0232e77892be656464f4b446"}, + {file = "tokenizers-0.22.1-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:607989f2ea68a46cb1dfbaf3e3aabdf3f21d8748312dbeb6263d1b3b66c5010a"}, + {file = "tokenizers-0.22.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a0f307d490295717726598ef6fa4f24af9d484809223bbc253b201c740a06390"}, + {file = "tokenizers-0.22.1-cp39-abi3-win32.whl", hash = "sha256:b5120eed1442765cd90b903bb6cfef781fd8fe64e34ccaecbae4c619b7b12a82"}, + {file = "tokenizers-0.22.1-cp39-abi3-win_amd64.whl", hash = "sha256:65fd6e3fb11ca1e78a6a93602490f134d1fdeb13bcef99389d5102ea318ed138"}, + {file = "tokenizers-0.22.1.tar.gz", hash = "sha256:61de6522785310a309b3407bac22d99c4db5dba349935e99e4d15ea2226af2d9"}, ] [package.dependencies] -huggingface-hub = ">=0.16.4,<1.0" +huggingface-hub = ">=0.16.4,<2.0" [package.extras] dev = ["tokenizers[testing]"] @@ -3070,14 +3167,14 @@ telegram = 
["requests"] [[package]] name = "transformers" -version = "4.56.1" +version = "4.56.2" description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow" optional = false python-versions = ">=3.9.0" groups = ["main"] files = [ - {file = "transformers-4.56.1-py3-none-any.whl", hash = "sha256:1697af6addfb6ddbce9618b763f4b52d5a756f6da4899ffd1b4febf58b779248"}, - {file = "transformers-4.56.1.tar.gz", hash = "sha256:0d88b1089a563996fc5f2c34502f10516cad3ea1aa89f179f522b54c8311fe74"}, + {file = "transformers-4.56.2-py3-none-any.whl", hash = "sha256:79c03d0e85b26cb573c109ff9eafa96f3c8d4febfd8a0774e8bba32702dd6dde"}, + {file = "transformers-4.56.2.tar.gz", hash = "sha256:5e7c623e2d7494105c726dd10f6f90c2c99a55ebe86eef7233765abd0cb1c529"}, ] [package.dependencies] @@ -3094,23 +3191,23 @@ tqdm = ">=4.27" [package.extras] accelerate = ["accelerate (>=0.26.0)"] -all = ["Pillow (>=10.0.1,<=15.0)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "accelerate (>=0.26.0)", "av", "codecarbon (>=2.8.1)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "jinja2 (>=3.1.0)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "kernels (>=0.6.1,<=0.9)", "librosa", "mistral-common[opencv] (>=1.6.3)", "num2words", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision"] +all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av", "codecarbon (>=2.8.1)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "jinja2 (>=3.1.0)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "kernels (>=0.6.1,<=0.9)", "librosa", "mistral-common[opencv] (>=1.6.3)", "num2words", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision"] audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] benchmark = ["optimum-benchmark (>=0.3.0)"] chat-template = ["jinja2 (>=3.1.0)"] codecarbon = ["codecarbon (>=2.8.1)"] deepspeed = ["accelerate (>=0.26.0)", "deepspeed (>=0.9.3)"] -deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "libcst", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "optuna", "parameterized (>=0.9)", "protobuf", "psutil", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] -dev = ["GitPython (<3.1.19)", "GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "accelerate (>=0.26.0)", "av", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "cookiecutter (==1.7.3)", 
"datasets (>=2.15.0)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "jinja2 (>=3.1.0)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "kernels (>=0.6.1,<=0.9)", "libcst", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "num2words", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict_core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic_lite (>=1.0.7)", "urllib3 (<2.0.0)"] -dev-tensorflow = ["GitPython (<3.1.19)", "GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "onnxconverter-common", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "tf2onnx", "timeout-decorator", "tokenizers (>=0.22.0,<=0.23.0)", "urllib3 (<2.0.0)"] -dev-torch = ["GitPython (<3.1.19)", "GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "kenlm", "kernels (>=0.6.1,<=0.9)", "libcst", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "num2words", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score 
(!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict_core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic_lite (>=1.0.7)", "urllib3 (<2.0.0)"] +deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "libcst", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "optuna", "parameterized (>=0.9)", "protobuf", "psutil", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] +dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "jinja2 (>=3.1.0)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "kernels (>=0.6.1,<=0.9)", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "num2words", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] +dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.22.0,<=0.23.0)", "urllib3 
(<2.0.0)"] +dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "kenlm", "kernels (>=0.6.1,<=0.9)", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "num2words", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)", "scipy (<1.13.0)"] flax-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] ftfy = ["ftfy"] -hf-xet = ["hf_xet"] +hf-xet = ["hf-xet"] hub-kernels = ["kernels (>=0.6.1,<=0.9)"] integrations = ["kernels (>=0.6.1,<=0.9)", "optuna", "ray[tune] (>=2.7.0)", "sigopt"] -ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0,<1.3.1)", "sudachidict_core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic_lite (>=1.0.7)"] +ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0,<1.3.1)", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)"] mistral-common = ["mistral-common[opencv] (>=1.6.3)"] modelcreation = ["cookiecutter (==1.7.3)"] natten = ["natten (>=0.14.6,<0.15.0)"] @@ -3129,7 +3226,7 @@ serving = ["accelerate (>=0.26.0)", "fastapi", "openai (>=1.98.0)", "pydantic (> sigopt = ["sigopt"] sklearn = ["scikit-learn"] speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] -testing = ["GitPython (<3.1.19)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "libcst", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "parameterized (>=0.9)", "psutil", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] +testing = ["GitPython (<3.1.19)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "libcst", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "parameterized (>=0.9)", "psutil", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", 
"timeout-decorator"] tf = ["keras-nlp (>=0.3.1,<0.14.0)", "onnxconverter-common", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx"] tf-cpu = ["keras (>2.9,<2.16)", "keras-nlp (>=0.3.1,<0.14.0)", "onnxconverter-common", "tensorflow-cpu (>2.9,<2.16)", "tensorflow-probability (<0.24)", "tensorflow-text (<2.16)", "tf2onnx"] tf-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] @@ -3139,7 +3236,7 @@ tokenizers = ["tokenizers (>=0.22.0,<=0.23.0)"] torch = ["accelerate (>=0.26.0)", "torch (>=2.2)"] torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"] -torchhub = ["filelock", "huggingface-hub (>=0.34.0,<1.0)", "importlib_metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "tqdm (>=4.27)"] +torchhub = ["filelock", "huggingface-hub (>=0.34.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "tqdm (>=4.27)"] video = ["av"] vision = ["Pillow (>=10.0.1,<=15.0)"] @@ -3429,4 +3526,4 @@ propcache = ">=0.2.1" [metadata] lock-version = "2.1" python-versions = ">=3.11,<3.14" -content-hash = "42274fd00aedabf70dc419acd06e2f25b5c69b58b7bf76eef2ea7a9df6470b2c" +content-hash = "fe9799a3d3a101e05d75d5e193c6e9e4ef17a7581cb273f41101e12129f80a2f" diff --git a/apps/pre-processing-service/pyproject.toml b/apps/pre-processing-service/pyproject.toml index 84a957b9..8cb11c0f 100644 --- a/apps/pre-processing-service/pyproject.toml +++ b/apps/pre-processing-service/pyproject.toml @@ -38,6 +38,7 @@ openai = "^1.107.3" aiohttp = "^3.12.15" prometheus-client = "^0.23.1" prometheus-fastapi-instrumentator = "^7.1.0" +boto3 = "^1.40.35" [build-system] requires = ["poetry-core>=2.0.0,<3.0.0"]