From 6b4b84e0691b9fed252ca1b9451d0660e32cc5a7 Mon Sep 17 00:00:00 2001 From: thkim7 Date: Mon, 22 Sep 2025 02:22:38 +0900 Subject: [PATCH 1/6] =?UTF-8?q?chore:=20=EC=8B=B8=EB=8B=A4=EA=B5=AC?= =?UTF-8?q?=EB=AA=B0=20=EC=83=81=ED=92=88=20=EA=B2=80=EC=83=89=20=EB=AC=BC?= =?UTF-8?q?=ED=92=88=2020=EA=B0=9C=20->=2040=EA=B0=9C?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../app/service/crawlers/search_crawler.py | 4 ++-- apps/pre-processing-service/app/service/search_service.py | 6 +++--- 2 files changed, 5 insertions(+), 5 deletions(-) diff --git a/apps/pre-processing-service/app/service/crawlers/search_crawler.py b/apps/pre-processing-service/app/service/crawlers/search_crawler.py index a0d46e02..1bc36fc5 100644 --- a/apps/pre-processing-service/app/service/crawlers/search_crawler.py +++ b/apps/pre-processing-service/app/service/crawlers/search_crawler.py @@ -49,7 +49,7 @@ async def search_products_selenium(self, keyword: str) -> list[dict]: logger.info( f"Selenium으로 발견한 상품 링크: {len(unique_products)}개 (중복 제거 전: {len(product_links)}개)" ) - return unique_products[:20] + return unique_products[:40] except Exception as e: logger.error(f"Selenium 검색 오류: keyword='{keyword}', error='{e}'") @@ -88,7 +88,7 @@ async def search_products_httpx(self, keyword: str) -> list[dict]: product_links.append({"url": full_url, "title": title}) logger.info(f"httpx로 발견한 상품 링크: {len(product_links)}개") - return product_links[:20] + return product_links[:40] except Exception as e: logger.error(f"httpx 검색 오류: keyword='{keyword}', error='{e}'") diff --git a/apps/pre-processing-service/app/service/search_service.py b/apps/pre-processing-service/app/service/search_service.py index 171bd57f..070f6cc2 100644 --- a/apps/pre-processing-service/app/service/search_service.py +++ b/apps/pre-processing-service/app/service/search_service.py @@ -77,9 +77,9 @@ async def search_products(self, request: RequestSadaguSearch) -> dict: logger.debug(f"상품 {i + 1}: 제목 추출 실패, 제외") continue - # 최대 20개까지만 처리 - if len(enriched_results) >= 20: - logger.info("최대 20개 상품 수집 완료") + # 최대 40개까지 처리 + if len(enriched_results) >= 40: + logger.info("최대 40개 상품 수집 완료") break except Exception as e: From 58ccb7dce5dafd31f99969f1670a9611854b6e5f Mon Sep 17 00:00:00 2001 From: thkim7 Date: Mon, 22 Sep 2025 03:09:55 +0900 Subject: [PATCH 2/6] =?UTF-8?q?chore:=20=EC=8B=B8=EB=8B=A4=EA=B5=AC?= =?UTF-8?q?=EB=AA=B0=20=EC=B5=9C=EC=A2=85=EC=A0=81=EC=9C=BC=EB=A1=9C=20top?= =?UTF-8?q?10=EC=9D=84=20=ED=81=AC=EB=A1=A4=EB=A7=81=ED=95=98=EB=8F=84?= =?UTF-8?q?=EB=A1=9D=20=EB=A1=9C=EC=A7=81=20=EC=88=98=EC=A0=95?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../app/api/endpoints/product.py | 4 +- .../app/model/schemas.py | 17 +- .../app/service/crawl_service.py | 121 +++++++++-- .../app/service/similarity_service.py | 196 +++++++++--------- 4 files changed, 213 insertions(+), 125 deletions(-) diff --git a/apps/pre-processing-service/app/api/endpoints/product.py b/apps/pre-processing-service/app/api/endpoints/product.py index 32a4dcbe..0d558b1c 100644 --- a/apps/pre-processing-service/app/api/endpoints/product.py +++ b/apps/pre-processing-service/app/api/endpoints/product.py @@ -60,11 +60,11 @@ async def match(request: RequestSadaguMatch): ) async def similarity(request: RequestSadaguSimilarity): """ - 매칭된 상품들 중 키워드와의 유사도를 계산하여 최적의 상품을 선택합니다. + 매칭된 상품들 중 키워드와의 유사도를 계산하여 상위 10개 상품을 선택합니다. 
""" try: similarity_service = SimilarityService() - response_data = similarity_service.select_product_by_similarity(request) + response_data = similarity_service.select_top_products_by_similarity(request) if not response_data: raise CustomException( diff --git a/apps/pre-processing-service/app/model/schemas.py b/apps/pre-processing-service/app/model/schemas.py index 36bef959..ea14a0f4 100644 --- a/apps/pre-processing-service/app/model/schemas.py +++ b/apps/pre-processing-service/app/model/schemas.py @@ -110,8 +110,8 @@ class SadaguSimilarityData(BaseModel): keyword: str = Field( ..., title="분석 키워드", description="유사도 분석에 사용된 키워드" ) - selected_product: Optional[Dict] = Field( - None, title="선택된 상품", description="유사도 분석 결과 선택된 상품" + top_products: List[Dict] = Field( + default_factory=list, title="선택된 상품들", description="유사도 분석 결과 선택된 상위 상품 목록" ) reason: Optional[str] = Field( None, title="선택 이유", description="상품 선택 근거 및 점수 정보" @@ -129,16 +129,21 @@ class ResponseSadaguSimilarity(ResponseBase[SadaguSimilarityData]): class RequestSadaguCrawl(RequestBase): - product_url: HttpUrl = Field( + product_urls: List[HttpUrl] = Field( ..., title="상품 URL", description="크롤링할 상품 페이지의 URL" ) # 응답 데이터 모델 class SadaguCrawlData(BaseModel): - product_url: str = Field(..., title="상품 URL", description="크롤링된 상품 URL") - product_detail: Optional[Dict] = Field( - None, title="상품 상세정보", description="크롤링된 상품의 상세 정보" + crawled_products: List[Dict] = Field( + ..., title="크롤링된 상품들", description="크롤링된 상품들의 상세 정보 목록 (URL 포함)" + ) + success_count: int = Field( + ..., title="성공 개수", description="성공적으로 크롤링된 상품 개수" + ) + fail_count: int = Field( + ..., title="실패 개수", description="크롤링에 실패한 상품 개수" ) crawled_at: Optional[str] = Field( None, title="크롤링 시간", description="크롤링 완료 시간" diff --git a/apps/pre-processing-service/app/service/crawl_service.py b/apps/pre-processing-service/app/service/crawl_service.py index af8f91bc..311ae42e 100644 --- a/apps/pre-processing-service/app/service/crawl_service.py +++ b/apps/pre-processing-service/app/service/crawl_service.py @@ -1,4 +1,5 @@ import time +import asyncio from app.service.crawlers.detail_crawler import DetailCrawler from app.errors.CustomException import InvalidItemDataException from app.model.schemas import RequestSadaguCrawl @@ -12,45 +13,123 @@ def __init__(self): async def crawl_product_detail(self, request: RequestSadaguCrawl) -> dict: """ - 선택된 상품의 상세 정보를 크롤링하는 비즈니스 로직입니다. (5단계) - 상품 URL을 입력받아 상세 정보를 크롤링하여 딕셔너리로 반환합니다. + 선택된 상품들의 상세 정보를 크롤링하는 비즈니스 로직입니다. (5단계) + 여러 상품 URL을 입력받아 순차적으로 상세 정보를 크롤링하여 딕셔너리로 반환합니다. 
""" - crawler = DetailCrawler(use_selenium=True) + product_urls = [str(url) for url in request.product_urls] + + logger.info(f"상품 상세 크롤링 서비스 시작: 총 {len(product_urls)}개 상품") + + crawled_products = [] + success_count = 0 + fail_count = 0 try: - logger.info( - f"상품 상세 크롤링 서비스 시작: product_url={request.product_url}" - ) + # 각 상품을 순차적으로 크롤링 (안정성 확보) + for i, product_url in enumerate(product_urls, 1): + logger.info(f"상품 {i}/{len(product_urls)} 크롤링 시작: {product_url}") + + crawler = DetailCrawler(use_selenium=True) + + try: + # 상세 정보 크롤링 실행 + product_detail = await crawler.crawl_detail(product_url) + + if product_detail: + product_title = product_detail.get("title", "Unknown")[:50] + logger.success( + f"상품 {i} 크롤링 성공: title='{product_title}', price={product_detail.get('price', 0)}" + ) + + # 성공한 상품 추가 + crawled_products.append({ + "index": i, + "url": product_url, + "product_detail": product_detail, + "status": "success", + "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S") + }) + success_count += 1 + else: + logger.error(f"상품 {i} 크롤링 실패: 상세 정보 없음") + crawled_products.append({ + "index": i, + "url": product_url, + "product_detail": None, + "status": "failed", + "error": "상세 정보 없음", + "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S") + }) + fail_count += 1 + + except Exception as e: + logger.error(f"상품 {i} 크롤링 오류: url={product_url}, error='{e}'") + crawled_products.append({ + "index": i, + "url": product_url, + "product_detail": None, + "status": "failed", + "error": str(e), + "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S") + }) + fail_count += 1 - # 상세 정보 크롤링 실행 - product_detail = await crawler.crawl_detail( - product_url=str(request.product_url) + finally: + # 각 크롤러 개별 정리 + await crawler.close() + + # 상품간 간격 (서버 부하 방지) + if i < len(product_urls): + await asyncio.sleep(1) + + logger.success( + f"전체 크롤링 완료: 총 {len(product_urls)}개, 성공 {success_count}개, 실패 {fail_count}개" ) + # 응답 데이터 구성 + data = { + "crawled_products": crawled_products, + "success_count": success_count, + "fail_count": fail_count, + "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"), + } + + logger.info(f"상품 상세 크롤링 서비스 완료: success_rate={success_count}/{len(product_urls)}") + return Response.ok(data) + + except Exception as e: + logger.error(f"배치 크롤링 서비스 오류: error='{e}'") + raise InvalidItemDataException() + + # 기존 단일 크롤링 메서드도 유지 (하위 호환성) + async def crawl_single_product_detail(self, product_url: str) -> dict: + """ + 단일 상품 크롤링 (하위 호환성용) + """ + crawler = DetailCrawler(use_selenium=True) + + try: + logger.info(f"단일 상품 크롤링 시작: {product_url}") + + product_detail = await crawler.crawl_detail(product_url) + if not product_detail: - logger.error(f"상품 상세 정보 크롤링 실패: url={request.product_url}") + logger.error(f"상품 상세 정보 크롤링 실패: url={product_url}") raise InvalidItemDataException() product_title = product_detail.get("title", "Unknown")[:50] - logger.success( - f"크롤링 완료: title='{product_title}', price={product_detail.get('price', 0)}, options_count={len(product_detail.get('options', []))}" - ) + logger.success(f"크롤링 완료: title='{product_title}'") - # 응답 데이터 구성 data = { - "product_url": str(request.product_url), + "product_url": product_url, "product_detail": product_detail, "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"), } - logger.info(f"상품 상세 크롤링 서비스 완료: status=success") return Response.ok(data) except Exception as e: - logger.error( - f"크롤링 서비스 오류: product_url={request.product_url}, error='{e}'" - ) + logger.error(f"단일 크롤링 오류: url={product_url}, error='{e}'") raise InvalidItemDataException() finally: - await crawler.close() - logger.debug("크롤러 
리소스 정리 완료") + await crawler.close() \ No newline at end of file diff --git a/apps/pre-processing-service/app/service/similarity_service.py b/apps/pre-processing-service/app/service/similarity_service.py index 516b0c63..2974640e 100644 --- a/apps/pre-processing-service/app/service/similarity_service.py +++ b/apps/pre-processing-service/app/service/similarity_service.py @@ -9,16 +9,17 @@ class SimilarityService: def __init__(self): pass - def select_product_by_similarity(self, request: RequestSadaguSimilarity) -> dict: + def select_top_products_by_similarity(self, request: RequestSadaguSimilarity) -> dict: """ - BERT 기반 유사도 분석 후 상품 선택 - 4단계 + 형태소 분석 후 Top 10 선택 (10개 이하면 유사도 분석 생략) """ keyword = request.keyword candidates = request.matched_products fallback_products = request.search_results or [] + top_count = 10 # Top 10 개수 설정 logger.info( - f"유사도 분석 서비스 시작: keyword='{keyword}', matched_count={len(candidates) if candidates else 0}, fallback_count={len(fallback_products)}" + f"상품 선택 서비스 시작 (Top {top_count}): keyword='{keyword}', matched_count={len(candidates) if candidates else 0}, fallback_count={len(fallback_products)}" ) # 매칭된 상품이 없으면 전체 검색 결과로 폴백 @@ -30,133 +31,136 @@ def select_product_by_similarity(self, request: RequestSadaguSimilarity) -> dict data = { "keyword": keyword, - "selected_product": None, + "top_products": [], "reason": "매칭된 상품과 검색 결과가 모두 없음", } return Response.ok(data, "매칭된 상품과 검색 결과가 모두 없습니다.") - logger.info("매칭된 상품 없음 → 전체 검색 결과에서 유사도 분석") + logger.info("매칭된 상품 없음 → 전체 검색 결과에서 유사도 분석 진행") candidates = fallback_products analysis_mode = "fallback_similarity_only" + skip_similarity = False else: analysis_mode = "matched_products" + # 형태소 분석 결과가 10개 이하면 유사도 분석 생략 + skip_similarity = len(candidates) <= top_count try: - analyzer = SimilarityAnalyzerONNX() - - logger.info( - f"키워드 '{keyword}'와 {len(candidates)}개 상품의 유사도 분석 시작... (모드: {analysis_mode})" - ) - - # 한 개만 있으면 바로 선택 - if len(candidates) == 1: - selected_product = candidates[0] - - logger.info("단일 후보 상품 - 유사도 검증 진행") - # 유사도 계산 - similarity = analyzer.calculate_similarity( - keyword, selected_product["title"] - ) - - # 폴백 모드에서는 임계값 검증 - if analysis_mode == "fallback_similarity_only": - similarity_threshold = 0.3 - if similarity < similarity_threshold: - logger.warning( - f"단일 상품 유사도 미달: similarity={similarity:.4f} < threshold={similarity_threshold}" - ) - data = { - "keyword": keyword, - "selected_product": None, - "reason": f"단일 상품 유사도({similarity:.4f}) < 기준({similarity_threshold})", - } - return Response.ok( - data, "단일 상품 유사도 미달 되어 상품이 존재하지않습니다." 
- ) - - selected_product["similarity_info"] = { - "similarity_score": float(similarity), - "analysis_type": "single_candidate", - "analysis_mode": analysis_mode, - } + # 형태소 분석 결과가 10개 이하면 유사도 분석 생략하고 바로 반환 + if skip_similarity and analysis_mode == "matched_products": + logger.info(f"형태소 분석 결과가 {len(candidates)}개로 {top_count}개 이하 - 유사도 분석 생략") + + # 매칭 스코어 기준으로 정렬된 상태 유지 (이미 match_service에서 정렬됨) + top_products = [] + for i, product in enumerate(candidates): + enhanced_product = product.copy() + enhanced_product["rank"] = i + 1 + enhanced_product["selection_info"] = { + "selection_type": "match_only", + "match_score": product.get("match_info", {}).get("match_score", 0.0), + "reason": "형태소 분석만으로 선택 (유사도 분석 생략)", + "total_candidates": len(candidates), + } + top_products.append(enhanced_product) logger.success( - f"단일 상품 선택 완료: title='{selected_product['title'][:30]}', similarity={similarity:.4f}" + f"형태소 분석만으로 상품 선택 완료: keyword='{keyword}', selected_count={len(top_products)}" ) + data = { "keyword": keyword, - "selected_product": selected_product, - "reason": f"단일 상품 - 유사도: {similarity:.4f} ({analysis_mode})", + "top_products": top_products, + "reason": f"형태소 분석 결과 {len(candidates)}개 - 유사도 분석 생략", } return Response.ok(data) - # 여러 개가 있으면 유사도 비교 - logger.info("여러 상품 중 최고 유사도로 선택...") + # 유사도 분석 필요한 경우 (매칭 결과가 10개 초과이거나 폴백 모드) + analyzer = SimilarityAnalyzerONNX() - # 제목만 추출해서 배치 분석 + logger.info( + f"키워드 '{keyword}'와 {len(candidates)}개 상품의 유사도 분석 시작... (모드: {analysis_mode})" + ) + + # 모든 후보에 대해 유사도 계산 titles = [product["title"] for product in candidates] similarity_results = analyzer.analyze_similarity_batch(keyword, titles) - # 결과 출력 - logger.info("유사도 분석 결과:") - for i, result in enumerate(similarity_results[:5]): # 상위 5개만 로그 - logger.info( - f" {i+1}위: {result['title'][:40]} | 유사도: {result['similarity']:.4f}" - ) + # 유사도 정보 추가 및 Top 10 선택 + enhanced_products = [] + similarity_threshold = 0.3 if analysis_mode == "fallback_similarity_only" else 0.0 - # 최고 유사도 선택 - best_result = similarity_results[0] - selected_product = candidates[best_result["index"]].copy() + for i, result in enumerate(similarity_results): + product = candidates[result["index"]].copy() - # 폴백 모드에서는 임계값 검증 - similarity_threshold = 0.3 - if ( - analysis_mode == "fallback_similarity_only" - and best_result["similarity"] < similarity_threshold - ): - logger.warning( - f"최고 유사도 미달: similarity={best_result['similarity']:.4f} < threshold={similarity_threshold}" - ) - data = { - "keyword": keyword, - "selected_product": None, - "reason": f"최고 유사도({best_result['similarity']:.4f}) < 기준({similarity_threshold})", + # 폴백 모드에서는 임계값 검증 + if analysis_mode == "fallback_similarity_only" and result["similarity"] < similarity_threshold: + logger.debug( + f"상품 {i + 1} 유사도 미달로 제외: similarity={result['similarity']:.4f} < threshold={similarity_threshold}") + continue + + product["similarity_info"] = { + "similarity_score": result["similarity"], + "analysis_type": "batch_similarity", + "analysis_mode": analysis_mode, } - return Response.ok(data, "최고 유사도가 기준보다 미달 되었습니다.") - - # 유사도 정보 추가 - selected_product["similarity_info"] = { - "similarity_score": best_result["similarity"], - "analysis_type": "multi_candidate_bert", - "analysis_mode": analysis_mode, - "rank": 1, - "total_candidates": len(candidates), - } - # 매칭 모드에서는 종합 점수도 계산 - if analysis_mode == "matched_products" and "match_info" in selected_product: - match_score = selected_product["match_info"]["match_score"] - similarity_score = best_result["similarity"] - # 가중치: 매칭 40%, 유사도 60% - 
final_score = match_score * 0.4 + similarity_score * 0.6 - selected_product["final_score"] = final_score - reason = f"종합점수({final_score:.4f}) = 매칭({match_score:.4f})*0.4 + 유사도({similarity_score:.4f})*0.6" - logger.info( - f"종합 점수 계산: match_score={match_score:.4f}, similarity_score={similarity_score:.4f}, final_score={final_score:.4f}" - ) + # 매칭 모드에서는 종합 점수 계산 + if analysis_mode == "matched_products" and "match_info" in product: + match_score = product["match_info"]["match_score"] + similarity_score = result["similarity"] + # 가중치: 매칭 40%, 유사도 60% + final_score = match_score * 0.4 + similarity_score * 0.6 + product["final_score"] = final_score + product["selection_info"] = { + "selection_type": "match_and_similarity", + "match_score": match_score, + "similarity_score": similarity_score, + "final_score": final_score, + "reason": f"종합점수({final_score:.4f}) = 매칭({match_score:.4f})*0.4 + 유사도({similarity_score:.4f})*0.6", + } + else: + product["selection_info"] = { + "selection_type": "similarity_only", + "similarity_score": result["similarity"], + "reason": f"유사도({result['similarity']:.4f}) 기준 선택 ({analysis_mode})", + } + + enhanced_products.append(product) + + # 종합 점수 또는 유사도 기준으로 재정렬 + if analysis_mode == "matched_products": + enhanced_products.sort(key=lambda x: x.get("final_score", x["similarity_info"]["similarity_score"]), + reverse=True) else: - reason = f"유사도({best_result['similarity']:.4f}) 기준 선택 ({analysis_mode})" + enhanced_products.sort(key=lambda x: x["similarity_info"]["similarity_score"], reverse=True) + + # Top 10 선택 + top_products = enhanced_products[:top_count] + + # 순위 정보 추가 + for i, product in enumerate(top_products): + product["rank"] = i + 1 logger.success( - f"상품 선택 완료: title='{selected_product['title'][:30]}', {reason}" + f"유사도 분석 완료: keyword='{keyword}', total_analyzed={len(candidates)}, valid_results={len(enhanced_products)}, top_selected={len(top_products)}" ) + + if top_products: + best_product = top_products[0] + if "final_score" in best_product: + logger.info( + f"1위 상품: title='{best_product['title'][:30]}', final_score={best_product['final_score']:.4f}") + else: + logger.info( + f"1위 상품: title='{best_product['title'][:30]}', similarity={best_product['similarity_info']['similarity_score']:.4f}") + data = { "keyword": keyword, - "selected_product": selected_product, - "reason": reason, + "top_products": top_products, + "reason": f"유사도 분석 후 상위 {len(top_products)}개 선택 ({analysis_mode})", } return Response.ok(data) except Exception as e: logger.error(f"유사도 분석 서비스 오류: keyword='{keyword}', error='{e}'") - raise InvalidItemDataException() + raise InvalidItemDataException() \ No newline at end of file From 7e89bd32cd4d7939df7b7d5aec39ab38509b80e7 Mon Sep 17 00:00:00 2001 From: thkim7 Date: Mon, 22 Sep 2025 04:02:20 +0900 Subject: [PATCH 3/6] =?UTF-8?q?feat:=20=ED=81=AC=EB=A1=A4=EB=A7=81=20?= =?UTF-8?q?=ED=9B=84=20=EC=9D=B4=EB=AF=B8=EC=A7=80=20=EC=A0=84=EB=B6=80=20?= =?UTF-8?q?s3=EC=97=90=20=EC=97=85=EB=A1=9C=EB=93=9C=20=ED=95=98=EB=8A=94?= =?UTF-8?q?=20=EB=A1=9C=EC=A7=81=20=EC=B6=94=EA=B0=80?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../app/api/endpoints/product.py | 21 ++ .../app/model/schemas.py | 60 +++++ .../app/service/s3_upload_service.py | 153 ++++++++++++ .../app/utils/s3_upload_util.py | 231 ++++++++++++++++++ apps/pre-processing-service/poetry.lock | 175 ++++++++++--- apps/pre-processing-service/pyproject.toml | 1 + 6 files changed, 602 insertions(+), 39 deletions(-) create mode 100644 
apps/pre-processing-service/app/service/s3_upload_service.py create mode 100644 apps/pre-processing-service/app/utils/s3_upload_util.py diff --git a/apps/pre-processing-service/app/api/endpoints/product.py b/apps/pre-processing-service/app/api/endpoints/product.py index 0d558b1c..0accbf71 100644 --- a/apps/pre-processing-service/app/api/endpoints/product.py +++ b/apps/pre-processing-service/app/api/endpoints/product.py @@ -6,6 +6,7 @@ CustomException, ) from ...service.crawl_service import CrawlService +from ...service.s3_upload_service import S3UploadService from ...service.search_service import SearchService from ...service.match_service import MatchService from ...service.similarity_service import SimilarityService @@ -99,3 +100,23 @@ async def crawl(body: RequestSadaguCrawl): raise HTTPException(status_code=e.status_code, detail=e.detail) except Exception as e: raise HTTPException(status_code=500, detail=str(e)) + +@router.post( + "/s3-upload", response_model=ResponseS3Upload, summary="S3 이미지 업로드" +) +async def s3_upload(request: RequestS3Upload): + """ + 크롤링 완료 후 별도로 호출하여 이미지들을 S3 저장소에 업로드합니다. + """ + try: + s3_upload_service = S3UploadService() + response_data = await s3_upload_service.upload_crawled_products_to_s3(request) + + if not response_data: + raise CustomException(500, "S3 이미지 업로드에 실패했습니다.", "S3_UPLOAD_FAILED") + + return response_data + except InvalidItemDataException as e: + raise HTTPException(status_code=e.status_code, detail=e.detail) + except Exception as e: + raise HTTPException(status_code=500, detail=str(e)) \ No newline at end of file diff --git a/apps/pre-processing-service/app/model/schemas.py b/apps/pre-processing-service/app/model/schemas.py index ea14a0f4..9acef5d2 100644 --- a/apps/pre-processing-service/app/model/schemas.py +++ b/apps/pre-processing-service/app/model/schemas.py @@ -156,6 +156,66 @@ class ResponseSadaguCrawl(ResponseBase[SadaguCrawlData]): pass +# ============== 6단계: S3 이미지 업로드 ============== + +class RequestS3Upload(RequestBase): + crawled_products: List[Dict] = Field( + ..., title="크롤링된 상품 데이터", description="이전 단계에서 크롤링된 상품들의 데이터" + ) + base_folder: Optional[str] = Field( + "product-images", title="기본 폴더", description="S3 내 기본 저장 폴더 경로" + ) + + +# S3 업로드된 이미지 정보 +class S3ImageInfo(BaseModel): + index: int = Field(..., title="이미지 순번", description="상품 내 이미지 순번") + original_url: str = Field(..., title="원본 URL", description="크롤링된 원본 이미지 URL") + s3_key: str = Field(..., title="S3 키", description="S3 저장소 내 파일 키") + s3_url: str = Field(..., title="S3 URL", description="S3에서 접근 가능한 URL") + file_size: int = Field(..., title="파일 크기", description="업로드된 파일 크기 (bytes)") + content_type: str = Field(..., title="콘텐츠 타입", description="파일의 MIME 타입") + + +# 상품별 S3 업로드 결과 +class ProductS3UploadResult(BaseModel): + product_index: int = Field(..., title="상품 순번", description="크롤링 순번") + product_title: str = Field(..., title="상품 제목", description="상품명") + product_url: str = Field(..., title="상품 URL", description="상품 페이지 URL") + status: str = Field(..., title="업로드 상태", description="completed/skipped/error") + upload_folder: Optional[str] = Field(None, title="업로드 폴더", description="S3 내 상품별 폴더 경로") + folder_s3_url: Optional[str] = Field(None, title="폴더 S3 URL", description="S3 폴더 접근 URL") + uploaded_images: List[S3ImageInfo] = Field(default_factory=list, title="업로드 성공 이미지") + failed_images: List[Dict] = Field(default_factory=list, title="업로드 실패 이미지") + success_count: int = Field(..., title="성공 개수", description="업로드 성공한 이미지 수") + fail_count: int = Field(..., title="실패 
개수", description="업로드 실패한 이미지 수") + reason: Optional[str] = Field(None, title="건너뜀 사유", description="업로드를 건너뛴 이유") + error: Optional[str] = Field(None, title="오류 메시지", description="업로드 중 발생한 오류") + + +# S3 업로드 요약 정보 +class S3UploadSummary(BaseModel): + total_products: int = Field(..., title="총 상품 수", description="처리 대상 상품 총 개수") + processed_products: int = Field(..., title="처리된 상품 수", description="실제 처리된 상품 수") + skipped_products: int = Field(..., title="건너뛴 상품 수", description="크롤링 실패로 건너뛴 상품 수") + total_success_images: int = Field(..., title="성공 이미지 수", description="업로드 성공한 이미지 총 개수") + total_fail_images: int = Field(..., title="실패 이미지 수", description="업로드 실패한 이미지 총 개수") + success_rate: str = Field(..., title="성공률", description="이미지 업로드 성공률 (성공/전체)") + + +# 응답 데이터 모델 +class S3UploadData(BaseModel): + upload_results: List[ProductS3UploadResult] = Field(..., title="업로드 결과", description="각 상품의 S3 업로드 결과") + summary: S3UploadSummary = Field(..., title="업로드 요약", description="전체 업로드 결과 요약") + base_folder: str = Field(..., title="기본 폴더", description="S3 업로드에 사용된 기본 폴더") + uploaded_at: str = Field(..., title="업로드 완료 시간", description="S3 업로드 완료 시간") + + +# 최종 응답 모델 +class ResponseS3Upload(ResponseBase[S3UploadData]): + """S3 이미지 업로드 API 응답""" + pass + # ============== 블로그 콘텐츠 생성 ============== diff --git a/apps/pre-processing-service/app/service/s3_upload_service.py b/apps/pre-processing-service/app/service/s3_upload_service.py new file mode 100644 index 00000000..033a11a6 --- /dev/null +++ b/apps/pre-processing-service/app/service/s3_upload_service.py @@ -0,0 +1,153 @@ +import time +import asyncio +import aiohttp +from typing import List, Dict +from loguru import logger +from app.errors.CustomException import InvalidItemDataException +from app.model.schemas import RequestS3Upload +from app.utils.s3_upload_util import S3UploadUtil +from app.utils.response import Response + + +class S3UploadService: + """6단계: 크롤링된 상품 이미지들을 S3에 업로드하는 서비스""" + + def __init__(self): + self.s3_util = S3UploadUtil() + + async def upload_crawled_products_to_s3(self, request: RequestS3Upload) -> dict: + """ + 크롤링된 상품들의 이미지를 S3에 업로드하는 비즈니스 로직 (6단계) + """ + crawled_products = request.crawled_products + base_folder = request.base_folder or "product-images" + + logger.info(f"S3 업로드 서비스 시작: {len(crawled_products)}개 상품") + + upload_results = [] + total_success_images = 0 + total_fail_images = 0 + processed_products = 0 + + try: + # HTTP 세션을 사용한 이미지 다운로드 + async with aiohttp.ClientSession() as session: + + # 각 상품별로 순차 업로드 + for product_info in crawled_products: + product_index = product_info.get("index", 0) + product_detail = product_info.get("product_detail") + + logger.info(f"상품 {product_index}/{len(crawled_products)} S3 업로드 시작") + + # 크롤링 실패한 상품은 스킵 + if not product_detail or product_info.get("status") != "success": + logger.warning(f"상품 {product_index}: 크롤링 실패로 인한 업로드 스킵") + upload_results.append({ + "product_index": product_index, + "product_title": "Unknown", + "product_url": product_info.get("url", ""), + "status": "skipped", + "reason": "크롤링 실패", + "success_count": 0, + "fail_count": 0, + "uploaded_images": [], + "failed_images": [] + }) + continue + + try: + # 상품 이미지 업로드 (유틸리티 사용) + upload_result = await self.s3_util.upload_single_product_images( + session, product_detail, product_index, base_folder + ) + + upload_results.append(upload_result) + total_success_images += upload_result["success_count"] + total_fail_images += upload_result["fail_count"] + processed_products += 1 + + logger.success( + f"상품 {product_index} S3 
업로드 완료: 성공 {upload_result['success_count']}개, " + f"실패 {upload_result['fail_count']}개" + ) + + except Exception as e: + logger.error(f"상품 {product_index} S3 업로드 오류: {e}") + upload_results.append({ + "product_index": product_index, + "product_title": product_detail.get("title", "Unknown"), + "product_url": product_detail.get("url", ""), + "status": "error", + "error": str(e), + "success_count": 0, + "fail_count": 0, + "uploaded_images": [], + "failed_images": [] + }) + + # 상품간 간격 (서버 부하 방지) + if product_index < len(crawled_products): + await asyncio.sleep(1) + + logger.success( + f"S3 업로드 서비스 완료: 처리된 상품 {processed_products}개, " + f"총 성공 이미지 {total_success_images}개, 총 실패 이미지 {total_fail_images}개" + ) + + # 응답 데이터 구성 + data = { + "upload_results": upload_results, + "summary": { + "total_products": len(crawled_products), + "processed_products": processed_products, + "skipped_products": len(crawled_products) - processed_products, + "total_success_images": total_success_images, + "total_fail_images": total_fail_images, + "success_rate": f"{total_success_images}/{total_success_images + total_fail_images}" if ( + total_success_images + total_fail_images) > 0 else "0/0" + }, + "base_folder": base_folder, + "uploaded_at": time.strftime("%Y-%m-%d %H:%M:%S"), + } + + message = f"S3 업로드 완료: {total_success_images}개 이미지 업로드 성공" + return Response.ok(data, message) + + except Exception as e: + logger.error(f"S3 업로드 서비스 전체 오류: {e}") + raise InvalidItemDataException() + + async def get_upload_status(self, upload_results: List[Dict]) -> Dict: + """ + 업로드 결과 상태 요약 (선택적 기능) + """ + try: + total_products = len(upload_results) + successful_products = len([r for r in upload_results if r.get("status") == "completed"]) + failed_products = len([r for r in upload_results if r.get("status") in ["error", "skipped"]]) + + total_images = sum(r.get("success_count", 0) + r.get("fail_count", 0) for r in upload_results) + successful_images = sum(r.get("success_count", 0) for r in upload_results) + failed_images = sum(r.get("fail_count", 0) for r in upload_results) + + status_summary = { + "products": { + "total": total_products, + "successful": successful_products, + "failed": failed_products, + "success_rate": f"{successful_products}/{total_products}" + }, + "images": { + "total": total_images, + "successful": successful_images, + "failed": failed_images, + "success_rate": f"{successful_images}/{total_images}" if total_images > 0 else "0/0" + } + } + + return status_summary + + except Exception as e: + logger.error(f"업로드 상태 요약 오류: {e}") + return {} \ No newline at end of file diff --git a/apps/pre-processing-service/app/utils/s3_upload_util.py b/apps/pre-processing-service/app/utils/s3_upload_util.py new file mode 100644 index 00000000..98088ae3 --- /dev/null +++ b/apps/pre-processing-service/app/utils/s3_upload_util.py @@ -0,0 +1,231 @@ +import os +import boto3 +import aiohttp +import asyncio +from datetime import datetime +from urllib.parse import urlparse +from typing import List, Dict, Optional, Tuple +from loguru import logger + + +class S3UploadUtil: + """S3 업로드 전용 유틸리티 클래스""" + + def __init__(self): + # 환경변수에서 AWS 설정 읽기 + self.aws_access_key = os.getenv("AWS_ACCESS_KEY_ID") + self.aws_secret_key = os.getenv("AWS_SECRET_ACCESS_KEY") + self.bucket_name = os.getenv("S3_BUCKET_NAME", "icebang4-dev-bucket") + self.region = os.getenv("AWS_REGION", "ap-northeast-2") + + if not self.aws_access_key or not self.aws_secret_key: + raise ValueError("AWS_ACCESS_KEY_ID와 AWS_SECRET_ACCESS_KEY 환경변수가 필요합니다") + + self.base_url = 
f"https://{self.bucket_name}.s3.{self.region}.amazonaws.com" + + # S3 클라이언트 초기화 + self.s3_client = boto3.client( + 's3', + aws_access_key_id=self.aws_access_key, + aws_secret_access_key=self.aws_secret_key, + region_name=self.region + ) + + logger.info(f"S3 클라이언트 초기화 완료: bucket={self.bucket_name}, region={self.region}") + + async def download_image(self, session: aiohttp.ClientSession, image_url: str) -> Optional[bytes]: + """이미지 URL에서 이미지 데이터 다운로드""" + try: + logger.debug(f"이미지 다운로드 시작: {image_url}") + + async with session.get(image_url, timeout=aiohttp.ClientTimeout(total=30)) as response: + if response.status == 200: + image_data = await response.read() + logger.debug(f"이미지 다운로드 완료: {len(image_data)} bytes") + return image_data + else: + logger.warning(f"이미지 다운로드 실패: {image_url}, status={response.status}") + return None + + except Exception as e: + logger.error(f"이미지 다운로드 오류: {image_url}, error={e}") + return None + + def get_file_extension(self, image_url: str) -> str: + """URL에서 파일 확장자 추출""" + parsed = urlparse(image_url) + path = parsed.path.lower() + + # 일반적인 이미지 확장자 확인 + for ext in ['.jpg', '.jpeg', '.png', '.gif', '.webp']: + if ext in path: + return ext + + # 기본값 + return '.jpg' + + def get_content_type(self, file_extension: str) -> str: + """파일 확장자에 따른 Content-Type 반환""" + content_types = { + '.jpg': 'image/jpeg', + '.jpeg': 'image/jpeg', + '.png': 'image/png', + '.gif': 'image/gif', + '.webp': 'image/webp' + } + return content_types.get(file_extension, 'image/jpeg') + + def upload_to_s3(self, image_data: bytes, s3_key: str, content_type: str = "image/jpeg") -> bool: + """S3에 이미지 업로드""" + try: + logger.debug(f"S3 업로드 시작: key={s3_key}") + + self.s3_client.put_object( + Bucket=self.bucket_name, + Key=s3_key, + Body=image_data, + ContentType=content_type, + ) + + logger.debug(f"S3 업로드 완료: key={s3_key}") + return True + + except Exception as e: + logger.error(f"S3 업로드 오류: key={s3_key}, error={e}") + return False + + def generate_s3_key(self, base_folder: str, product_index: int, product_title: str, + image_index: int, file_extension: str) -> str: + """S3 키 생성""" + # 상품 제목에서 특수문자 제거 + safe_title = product_title.replace("/", "-").replace("\\", "-").replace(" ", "_")[:30] + + # 타임스탬프 + 상품 정보로 폴더명 생성 + timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") + folder_name = f"{timestamp}_product_{product_index}_{safe_title}" + + # 최종 S3 키 + s3_key = f"{base_folder}/{folder_name}/image_{image_index:03d}{file_extension}" + + return s3_key + + def get_s3_url(self, s3_key: str) -> str: + """S3 키에서 접근 가능한 URL 생성""" + return f"{self.base_url}/{s3_key}" + + async def upload_single_product_images(self, + session: aiohttp.ClientSession, + product_data: Dict, + product_index: int, + base_folder: str = "product-images") -> Dict: + """단일 상품의 모든 이미지를 S3에 업로드""" + + product_title = product_data.get("title", "Unknown") + product_url = product_data.get("url", "") + product_images = product_data.get("product_images", []) + + uploaded_images = [] + failed_images = [] + + if not product_images: + logger.warning(f"상품 {product_index}: 업로드할 이미지가 없음") + return { + "product_index": product_index, + "product_title": product_title, + "product_url": product_url, + "status": "no_images", + "uploaded_images": uploaded_images, + "failed_images": failed_images, + "success_count": 0, + "fail_count": 0, + "upload_folder": None, + "folder_s3_url": None + } + + logger.info(f"상품 {product_index} 이미지 업로드 시작: {len(product_images)}개 이미지") + + # 각 이미지 업로드 + for img_idx, img_info in enumerate(product_images, 1): + original_url = 
img_info.get("original_url", "") + + if not original_url: + logger.warning(f"상품 {product_index}, 이미지 {img_idx}: URL이 없음") + failed_images.append({ + "index": img_idx, + "original_url": original_url, + "error": "URL이 없음" + }) + continue + + try: + # 이미지 다운로드 + image_data = await self.download_image(session, original_url) + + if not image_data: + failed_images.append({ + "index": img_idx, + "original_url": original_url, + "error": "다운로드 실패" + }) + continue + + # S3 키 생성 + file_extension = self.get_file_extension(original_url) + s3_key = self.generate_s3_key( + base_folder, product_index, product_title, img_idx, file_extension + ) + + # S3 업로드 + content_type = self.get_content_type(file_extension) + + if self.upload_to_s3(image_data, s3_key, content_type): + s3_url = self.get_s3_url(s3_key) + uploaded_images.append({ + "index": img_idx, + "original_url": original_url, + "s3_key": s3_key, + "s3_url": s3_url, + "file_size": len(image_data), + "content_type": content_type + }) + logger.debug(f"상품 {product_index}, 이미지 {img_idx} 업로드 완료") + else: + failed_images.append({ + "index": img_idx, + "original_url": original_url, + "error": "S3 업로드 실패" + }) + + except Exception as e: + logger.error(f"상품 {product_index}, 이미지 {img_idx} 처리 오류: {e}") + failed_images.append({ + "index": img_idx, + "original_url": original_url, + "error": str(e) + }) + + # 이미지 간 간격 (서버 부하 방지) + await asyncio.sleep(0.5) + + # 업로드 폴더 정보 계산 + upload_folder = None + folder_s3_url = None + if uploaded_images: + first_s3_key = uploaded_images[0]["s3_key"] + upload_folder = "/".join(first_s3_key.split("/")[:-1]) # 파일명 제거 + folder_s3_url = f"{self.base_url}/{upload_folder}" + + logger.success(f"상품 {product_index} 업로드 완료: 성공 {len(uploaded_images)}개, 실패 {len(failed_images)}개") + + return { + "product_index": product_index, + "product_title": product_title, + "product_url": product_url, + "status": "completed", + "upload_folder": upload_folder, + "folder_s3_url": folder_s3_url, + "uploaded_images": uploaded_images, + "failed_images": failed_images, + "success_count": len(uploaded_images), + "fail_count": len(failed_images) + } \ No newline at end of file diff --git a/apps/pre-processing-service/poetry.lock b/apps/pre-processing-service/poetry.lock index ca5c20ab..f02855bc 100644 --- a/apps/pre-processing-service/poetry.lock +++ b/apps/pre-processing-service/poetry.lock @@ -321,6 +321,46 @@ d = ["aiohttp (>=3.10)"] jupyter = ["ipython (>=7.8.0)", "tokenize-rt (>=3.2.0)"] uvloop = ["uvloop (>=0.15.2)"] +[[package]] +name = "boto3" +version = "1.40.35" +description = "The AWS SDK for Python" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "boto3-1.40.35-py3-none-any.whl", hash = "sha256:f4c1b01dd61e7733b453bca38b004ce030e26ee36e7a3d4a9e45a730b67bc38d"}, + {file = "boto3-1.40.35.tar.gz", hash = "sha256:d718df3591c829bcca4c498abb7b09d64d1eecc4e5a2b6cef14b476501211b8a"}, +] + +[package.dependencies] +botocore = ">=1.40.35,<1.41.0" +jmespath = ">=0.7.1,<2.0.0" +s3transfer = ">=0.14.0,<0.15.0" + +[package.extras] +crt = ["botocore[crt] (>=1.21.0,<2.0a0)"] + +[[package]] +name = "botocore" +version = "1.40.35" +description = "Low-level, data-driven core of boto 3." 
+optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "botocore-1.40.35-py3-none-any.whl", hash = "sha256:c545de2cbbce161f54ca589fbb677bae14cdbfac7d5f1a27f6a620cb057c26f4"}, + {file = "botocore-1.40.35.tar.gz", hash = "sha256:67e062752ff579c8cc25f30f9c3a84c72d692516a41a9ee1cf17735767ca78be"}, +] + +[package.dependencies] +jmespath = ">=0.7.1,<2.0.0" +python-dateutil = ">=2.1,<3.0.0" +urllib3 = {version = ">=1.25.4,<2.2.0 || >2.2.0,<3", markers = "python_version >= \"3.10\""} + +[package.extras] +crt = ["awscrt (==0.27.6)"] + [[package]] name = "bs4" version = "0.0.2" @@ -1320,6 +1360,18 @@ files = [ {file = "jiter-0.11.0.tar.gz", hash = "sha256:1d9637eaf8c1d6a63d6562f2a6e5ab3af946c66037eb1b894e8fad75422266e4"}, ] +[[package]] +name = "jmespath" +version = "1.0.1" +description = "JSON Matching Expressions" +optional = false +python-versions = ">=3.7" +groups = ["main"] +files = [ + {file = "jmespath-1.0.1-py3-none-any.whl", hash = "sha256:02e2e4cc71b5bcab88332eebf907519190dd9e6e82107fa7f83b1003a6252980"}, + {file = "jmespath-1.0.1.tar.gz", hash = "sha256:90261b206d6defd58fdd5e85f478bf633a2901798906be2ad389150c5c60edbe"}, +] + [[package]] name = "joblib" version = "1.5.2" @@ -1693,14 +1745,14 @@ sympy = "*" [[package]] name = "openai" -version = "1.108.0" +version = "1.108.1" description = "The official Python library for the openai API" optional = false python-versions = ">=3.8" groups = ["main"] files = [ - {file = "openai-1.108.0-py3-none-any.whl", hash = "sha256:31f2e58230e2703f13ddbb50c285f39dacf7fca64ab19882fd8a7a0b2bccd781"}, - {file = "openai-1.108.0.tar.gz", hash = "sha256:e859c64e4202d7f5956f19280eee92bb281f211c41cdd5be9e63bf51a024ff72"}, + {file = "openai-1.108.1-py3-none-any.whl", hash = "sha256:952fc027e300b2ac23be92b064eac136a2bc58274cec16f5d2906c361340d59b"}, + {file = "openai-1.108.1.tar.gz", hash = "sha256:6648468c1aec4eacfa554001e933a9fa075f57bacfc27588c2e34456cee9fef9"}, ] [package.dependencies] @@ -1793,14 +1845,14 @@ testing = ["coverage", "pytest", "pytest-benchmark"] [[package]] name = "poetry-core" -version = "2.2.0" +version = "2.2.1" description = "Poetry PEP 517 Build Backend" optional = false python-versions = "<4.0,>=3.9" groups = ["main"] files = [ - {file = "poetry_core-2.2.0-py3-none-any.whl", hash = "sha256:0edea81d07e88cbd407369eef753c722da8ff1338f554788dc04636e756318fc"}, - {file = "poetry_core-2.2.0.tar.gz", hash = "sha256:b4033b71b99717a942030e074fec7e3082e5fde7a8ed10f02cd2413bdf940b1f"}, + {file = "poetry_core-2.2.1-py3-none-any.whl", hash = "sha256:bdfce710edc10bfcf9ab35041605c480829be4ab23f5bc01202cfe5db8f125ab"}, + {file = "poetry_core-2.2.1.tar.gz", hash = "sha256:97e50d8593c8729d3f49364b428583e044087ee3def1e010c6496db76bd65ac5"}, ] [[package]] @@ -2288,14 +2340,14 @@ rsa = ["cryptography"] [[package]] name = "pyparsing" -version = "3.2.4" +version = "3.2.5" description = "pyparsing - Classes and methods to define and execute parsing grammars" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "pyparsing-3.2.4-py3-none-any.whl", hash = "sha256:91d0fcde680d42cd031daf3a6ba20da3107e08a75de50da58360e7d94ab24d36"}, - {file = "pyparsing-3.2.4.tar.gz", hash = "sha256:fff89494f45559d0f2ce46613b419f632bbb6afbdaed49696d322bcf98a58e99"}, + {file = "pyparsing-3.2.5-py3-none-any.whl", hash = "sha256:e38a4f02064cf41fe6593d328d0512495ad1f3d8a91c4f73fc401b3079a59a5e"}, + {file = "pyparsing-3.2.5.tar.gz", hash = "sha256:2df8d5b7b2802ef88e8d016a2eb9c7aeaa923529cd251ed0fe4608275d4105b6"}, ] [package.extras] 
@@ -2364,6 +2416,21 @@ pygments = ">=2.7.2" [package.extras] dev = ["argcomplete", "attrs (>=19.2)", "hypothesis (>=3.56)", "mock", "requests", "setuptools", "xmlschema"] +[[package]] +name = "python-dateutil" +version = "2.9.0.post0" +description = "Extensions to the standard Python datetime module" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +groups = ["main"] +files = [ + {file = "python-dateutil-2.9.0.post0.tar.gz", hash = "sha256:37dd54208da7e1cd875388217d5e00ebd4179249f90fb72437e91a35459a0ad3"}, + {file = "python_dateutil-2.9.0.post0-py2.py3-none-any.whl", hash = "sha256:a8b2bc7bffae282281c8140a97d3aa9c14da0b136dfe83f850eea9a5f7470427"}, +] + +[package.dependencies] +six = ">=1.5" + [[package]] name = "python-dotenv" version = "1.1.1" @@ -2638,6 +2705,24 @@ files = [ [package.dependencies] pyasn1 = ">=0.1.3" +[[package]] +name = "s3transfer" +version = "0.14.0" +description = "An Amazon S3 Transfer Manager" +optional = false +python-versions = ">=3.9" +groups = ["main"] +files = [ + {file = "s3transfer-0.14.0-py3-none-any.whl", hash = "sha256:ea3b790c7077558ed1f02a3072fb3cb992bbbd253392f4b6e9e8976941c7d456"}, + {file = "s3transfer-0.14.0.tar.gz", hash = "sha256:eff12264e7c8b4985074ccce27a3b38a485bb7f7422cc8046fee9be4983e4125"}, +] + +[package.dependencies] +botocore = ">=1.37.4,<2.0a.0" + +[package.extras] +crt = ["botocore[crt] (>=1.37.4,<2.0a.0)"] + [[package]] name = "safetensors" version = "0.6.2" @@ -2832,6 +2917,18 @@ typing_extensions = ">=4.14.0,<4.15.0" urllib3 = {version = ">=2.5.0,<3.0", extras = ["socks"]} websocket-client = ">=1.8.0,<1.9.0" +[[package]] +name = "six" +version = "1.17.0" +description = "Python 2 and 3 compatibility utilities" +optional = false +python-versions = "!=3.0.*,!=3.1.*,!=3.2.*,>=2.7" +groups = ["main"] +files = [ + {file = "six-1.17.0-py2.py3-none-any.whl", hash = "sha256:4721f391ed90541fddacab5acf947aa0d3dc7d27b2e1e8eda2be8970586c3274"}, + {file = "six-1.17.0.tar.gz", hash = "sha256:ff70335d468e7eb6ec65b95b99d3a2836546063f63acc5171de367e834932a81"}, +] + [[package]] name = "sniffio" version = "1.3.1" @@ -3015,31 +3112,31 @@ files = [ [[package]] name = "tokenizers" -version = "0.22.0" +version = "0.22.1" description = "" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "tokenizers-0.22.0-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:eaa9620122a3fb99b943f864af95ed14c8dfc0f47afa3b404ac8c16b3f2bb484"}, - {file = "tokenizers-0.22.0-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:71784b9ab5bf0ff3075bceeb198149d2c5e068549c0d18fe32d06ba0deb63f79"}, - {file = "tokenizers-0.22.0-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:ec5b71f668a8076802b0241a42387d48289f25435b86b769ae1837cad4172a17"}, - {file = "tokenizers-0.22.0-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:ea8562fa7498850d02a16178105b58803ea825b50dc9094d60549a7ed63654bb"}, - {file = "tokenizers-0.22.0-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:4136e1558a9ef2e2f1de1555dcd573e1cbc4a320c1a06c4107a3d46dc8ac6e4b"}, - {file = "tokenizers-0.22.0-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:cdf5954de3962a5fd9781dc12048d24a1a6f1f5df038c6e95db328cd22964206"}, - {file = "tokenizers-0.22.0-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:8337ca75d0731fc4860e6204cc24bb36a67d9736142aa06ed320943b50b1e7ed"}, - {file = "tokenizers-0.22.0-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = 
"sha256:a89264e26f63c449d8cded9061adea7b5de53ba2346fc7e87311f7e4117c1cc8"}, - {file = "tokenizers-0.22.0-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:790bad50a1b59d4c21592f9c3cf5e5cf9c3c7ce7e1a23a739f13e01fb1be377a"}, - {file = "tokenizers-0.22.0-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:76cf6757c73a10ef10bf06fa937c0ec7393d90432f543f49adc8cab3fb6f26cb"}, - {file = "tokenizers-0.22.0-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:1626cb186e143720c62c6c6b5371e62bbc10af60481388c0da89bc903f37ea0c"}, - {file = "tokenizers-0.22.0-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:da589a61cbfea18ae267723d6b029b84598dc8ca78db9951d8f5beff72d8507c"}, - {file = "tokenizers-0.22.0-cp39-abi3-win32.whl", hash = "sha256:dbf9d6851bddae3e046fedfb166f47743c1c7bd11c640f0691dd35ef0bcad3be"}, - {file = "tokenizers-0.22.0-cp39-abi3-win_amd64.whl", hash = "sha256:c78174859eeaee96021f248a56c801e36bfb6bd5b067f2e95aa82445ca324f00"}, - {file = "tokenizers-0.22.0.tar.gz", hash = "sha256:2e33b98525be8453f355927f3cab312c36cd3e44f4d7e9e97da2fa94d0a49dcb"}, + {file = "tokenizers-0.22.1-cp39-abi3-macosx_10_12_x86_64.whl", hash = "sha256:59fdb013df17455e5f950b4b834a7b3ee2e0271e6378ccb33aa74d178b513c73"}, + {file = "tokenizers-0.22.1-cp39-abi3-macosx_11_0_arm64.whl", hash = "sha256:8d4e484f7b0827021ac5f9f71d4794aaef62b979ab7608593da22b1d2e3c4edc"}, + {file = "tokenizers-0.22.1-cp39-abi3-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:19d2962dd28bc67c1f205ab180578a78eef89ac60ca7ef7cbe9635a46a56422a"}, + {file = "tokenizers-0.22.1-cp39-abi3-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:38201f15cdb1f8a6843e6563e6e79f4abd053394992b9bbdf5213ea3469b4ae7"}, + {file = "tokenizers-0.22.1-cp39-abi3-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:d1cbe5454c9a15df1b3443c726063d930c16f047a3cc724b9e6e1a91140e5a21"}, + {file = "tokenizers-0.22.1-cp39-abi3-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:e7d094ae6312d69cc2a872b54b91b309f4f6fbce871ef28eb27b52a98e4d0214"}, + {file = "tokenizers-0.22.1-cp39-abi3-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:afd7594a56656ace95cdd6df4cca2e4059d294c5cfb1679c57824b605556cb2f"}, + {file = "tokenizers-0.22.1-cp39-abi3-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:e2ef6063d7a84994129732b47e7915e8710f27f99f3a3260b8a38fc7ccd083f4"}, + {file = "tokenizers-0.22.1-cp39-abi3-musllinux_1_2_aarch64.whl", hash = "sha256:ba0a64f450b9ef412c98f6bcd2a50c6df6e2443b560024a09fa6a03189726879"}, + {file = "tokenizers-0.22.1-cp39-abi3-musllinux_1_2_armv7l.whl", hash = "sha256:331d6d149fa9c7d632cde4490fb8bbb12337fa3a0232e77892be656464f4b446"}, + {file = "tokenizers-0.22.1-cp39-abi3-musllinux_1_2_i686.whl", hash = "sha256:607989f2ea68a46cb1dfbaf3e3aabdf3f21d8748312dbeb6263d1b3b66c5010a"}, + {file = "tokenizers-0.22.1-cp39-abi3-musllinux_1_2_x86_64.whl", hash = "sha256:a0f307d490295717726598ef6fa4f24af9d484809223bbc253b201c740a06390"}, + {file = "tokenizers-0.22.1-cp39-abi3-win32.whl", hash = "sha256:b5120eed1442765cd90b903bb6cfef781fd8fe64e34ccaecbae4c619b7b12a82"}, + {file = "tokenizers-0.22.1-cp39-abi3-win_amd64.whl", hash = "sha256:65fd6e3fb11ca1e78a6a93602490f134d1fdeb13bcef99389d5102ea318ed138"}, + {file = "tokenizers-0.22.1.tar.gz", hash = "sha256:61de6522785310a309b3407bac22d99c4db5dba349935e99e4d15ea2226af2d9"}, ] [package.dependencies] -huggingface-hub = ">=0.16.4,<1.0" +huggingface-hub = ">=0.16.4,<2.0" [package.extras] dev = ["tokenizers[testing]"] @@ -3070,14 +3167,14 @@ telegram = 
["requests"] [[package]] name = "transformers" -version = "4.56.1" +version = "4.56.2" description = "State-of-the-art Machine Learning for JAX, PyTorch and TensorFlow" optional = false python-versions = ">=3.9.0" groups = ["main"] files = [ - {file = "transformers-4.56.1-py3-none-any.whl", hash = "sha256:1697af6addfb6ddbce9618b763f4b52d5a756f6da4899ffd1b4febf58b779248"}, - {file = "transformers-4.56.1.tar.gz", hash = "sha256:0d88b1089a563996fc5f2c34502f10516cad3ea1aa89f179f522b54c8311fe74"}, + {file = "transformers-4.56.2-py3-none-any.whl", hash = "sha256:79c03d0e85b26cb573c109ff9eafa96f3c8d4febfd8a0774e8bba32702dd6dde"}, + {file = "transformers-4.56.2.tar.gz", hash = "sha256:5e7c623e2d7494105c726dd10f6f90c2c99a55ebe86eef7233765abd0cb1c529"}, ] [package.dependencies] @@ -3094,23 +3191,23 @@ tqdm = ">=4.27" [package.extras] accelerate = ["accelerate (>=0.26.0)"] -all = ["Pillow (>=10.0.1,<=15.0)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "accelerate (>=0.26.0)", "av", "codecarbon (>=2.8.1)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "jinja2 (>=3.1.0)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "kernels (>=0.6.1,<=0.9)", "librosa", "mistral-common[opencv] (>=1.6.3)", "num2words", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision"] +all = ["Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av", "codecarbon (>=2.8.1)", "flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "jinja2 (>=3.1.0)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "kernels (>=0.6.1,<=0.9)", "librosa", "mistral-common[opencv] (>=1.6.3)", "num2words", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "phonemizer", "protobuf", "pyctcdecode (>=0.4.0)", "ray[tune] (>=2.7.0)", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision"] audio = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] benchmark = ["optimum-benchmark (>=0.3.0)"] chat-template = ["jinja2 (>=3.1.0)"] codecarbon = ["codecarbon (>=2.8.1)"] deepspeed = ["accelerate (>=0.26.0)", "deepspeed (>=0.9.3)"] -deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "libcst", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "optuna", "parameterized (>=0.9)", "protobuf", "psutil", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] -dev = ["GitPython (<3.1.19)", "GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "accelerate (>=0.26.0)", "av", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "cookiecutter (==1.7.3)", 
"datasets (>=2.15.0)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "jinja2 (>=3.1.0)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "kernels (>=0.6.1,<=0.9)", "libcst", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "num2words", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict_core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic_lite (>=1.0.7)", "urllib3 (<2.0.0)"] -dev-tensorflow = ["GitPython (<3.1.19)", "GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "onnxconverter-common", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "tf2onnx", "timeout-decorator", "tokenizers (>=0.22.0,<=0.23.0)", "urllib3 (<2.0.0)"] -dev-torch = ["GitPython (<3.1.19)", "GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "kenlm", "kernels (>=0.6.1,<=0.9)", "libcst", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "num2words", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score 
(!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict_core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic_lite (>=1.0.7)", "urllib3 (<2.0.0)"] +deepspeed-testing = ["GitPython (<3.1.19)", "accelerate (>=0.26.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "deepspeed (>=0.9.3)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "libcst", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "optuna", "parameterized (>=0.9)", "protobuf", "psutil", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] +dev = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "av", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "flax (>=0.4.1,<=0.7.0)", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "jinja2 (>=3.1.0)", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "kernels (>=0.6.1,<=0.9)", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "num2words", "onnxconverter-common", "optax (>=0.0.8,<=0.1.4)", "optuna", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "scipy (<1.13.0)", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] +dev-tensorflow = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "kenlm", "keras-nlp (>=0.3.1,<0.14.0)", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "onnxconverter-common", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx", "timeout-decorator", "tokenizers (>=0.22.0,<=0.23.0)", "urllib3 
(<2.0.0)"] +dev-torch = ["GitPython (<3.1.19)", "Pillow (>=10.0.1,<=15.0)", "accelerate (>=0.26.0)", "beautifulsoup4", "codecarbon (>=2.8.1)", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "kenlm", "kernels (>=0.6.1,<=0.9)", "libcst", "librosa", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "num2words", "onnxruntime (>=1.4.0)", "onnxruntime-tools (>=1.4.2)", "optuna", "pandas (<2.3.0)", "parameterized (>=0.9)", "phonemizer", "protobuf", "psutil", "pyctcdecode (>=0.4.0)", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "ray[tune] (>=2.7.0)", "rhoknp (>=1.1.0,<1.3.1)", "rich", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "scikit-learn", "sentencepiece (>=0.1.91,!=0.1.92)", "sigopt", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "tensorboard", "timeout-decorator", "timm (!=1.0.18,<=1.0.19)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "torchaudio", "torchvision", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)", "urllib3 (<2.0.0)"] flax = ["flax (>=0.4.1,<=0.7.0)", "jax (>=0.4.1,<=0.4.13)", "jaxlib (>=0.4.1,<=0.4.13)", "optax (>=0.0.8,<=0.1.4)", "scipy (<1.13.0)"] flax-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] ftfy = ["ftfy"] -hf-xet = ["hf_xet"] +hf-xet = ["hf-xet"] hub-kernels = ["kernels (>=0.6.1,<=0.9)"] integrations = ["kernels (>=0.6.1,<=0.9)", "optuna", "ray[tune] (>=2.7.0)", "sigopt"] -ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0,<1.3.1)", "sudachidict_core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic_lite (>=1.0.7)"] +ja = ["fugashi (>=1.0)", "ipadic (>=1.0.0,<2.0)", "rhoknp (>=1.1.0,<1.3.1)", "sudachidict-core (>=20220729)", "sudachipy (>=0.6.6)", "unidic (>=1.0.2)", "unidic-lite (>=1.0.7)"] mistral-common = ["mistral-common[opencv] (>=1.6.3)"] modelcreation = ["cookiecutter (==1.7.3)"] natten = ["natten (>=0.14.6,<0.15.0)"] @@ -3129,7 +3226,7 @@ serving = ["accelerate (>=0.26.0)", "fastapi", "openai (>=1.98.0)", "pydantic (> sigopt = ["sigopt"] sklearn = ["scikit-learn"] speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] -testing = ["GitPython (<3.1.19)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "libcst", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "parameterized (>=0.9)", "psutil", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "timeout-decorator"] +testing = ["GitPython (<3.1.19)", "beautifulsoup4", "cookiecutter (==1.7.3)", "datasets (>=2.15.0)", "dill (<0.3.5)", "evaluate (>=0.2.0)", "faiss-cpu", "libcst", "mistral-common[opencv] (>=1.6.3)", "nltk (<=3.8.1)", "parameterized (>=0.9)", "psutil", "pydantic (>=2)", "pytest (>=7.2.0)", "pytest-asyncio", "pytest-order", "pytest-rerunfailures", "pytest-rich", "pytest-timeout", "pytest-xdist", "rjieba", "rouge-score (!=0.0.7,!=0.0.8,!=0.1,!=0.1.1)", "ruff (==0.11.2)", "sacrebleu (>=1.4.12,<2.0.0)", "sacremoses", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", 
"timeout-decorator"] tf = ["keras-nlp (>=0.3.1,<0.14.0)", "onnxconverter-common", "tensorflow (>2.9,<2.16)", "tensorflow-text (<2.16)", "tf2onnx"] tf-cpu = ["keras (>2.9,<2.16)", "keras-nlp (>=0.3.1,<0.14.0)", "onnxconverter-common", "tensorflow-cpu (>2.9,<2.16)", "tensorflow-probability (<0.24)", "tensorflow-text (<2.16)", "tf2onnx"] tf-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)"] @@ -3139,7 +3236,7 @@ tokenizers = ["tokenizers (>=0.22.0,<=0.23.0)"] torch = ["accelerate (>=0.26.0)", "torch (>=2.2)"] torch-speech = ["kenlm", "librosa", "phonemizer", "pyctcdecode (>=0.4.0)", "torchaudio"] torch-vision = ["Pillow (>=10.0.1,<=15.0)", "torchvision"] -torchhub = ["filelock", "huggingface-hub (>=0.34.0,<1.0)", "importlib_metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "tqdm (>=4.27)"] +torchhub = ["filelock", "huggingface-hub (>=0.34.0,<1.0)", "importlib-metadata", "numpy (>=1.17)", "packaging (>=20.0)", "protobuf", "regex (!=2019.12.17)", "requests", "sentencepiece (>=0.1.91,!=0.1.92)", "tokenizers (>=0.22.0,<=0.23.0)", "torch (>=2.2)", "tqdm (>=4.27)"] video = ["av"] vision = ["Pillow (>=10.0.1,<=15.0)"] @@ -3429,4 +3526,4 @@ propcache = ">=0.2.1" [metadata] lock-version = "2.1" python-versions = ">=3.11,<3.14" -content-hash = "42274fd00aedabf70dc419acd06e2f25b5c69b58b7bf76eef2ea7a9df6470b2c" +content-hash = "fe9799a3d3a101e05d75d5e193c6e9e4ef17a7581cb273f41101e12129f80a2f" diff --git a/apps/pre-processing-service/pyproject.toml b/apps/pre-processing-service/pyproject.toml index 84a957b9..8cb11c0f 100644 --- a/apps/pre-processing-service/pyproject.toml +++ b/apps/pre-processing-service/pyproject.toml @@ -38,6 +38,7 @@ openai = "^1.107.3" aiohttp = "^3.12.15" prometheus-client = "^0.23.1" prometheus-fastapi-instrumentator = "^7.1.0" +boto3 = "^1.40.35" [build-system] requires = ["poetry-core>=2.0.0,<3.0.0"] From 167d923136b639d15f8b4ba16810976070792ac9 Mon Sep 17 00:00:00 2001 From: thkim7 Date: Mon, 22 Sep 2025 04:03:03 +0900 Subject: [PATCH 4/6] chore: poetry run black . --- .../app/api/endpoints/product.py | 11 +- .../app/model/schemas.py | 91 ++++++++--- .../app/service/crawl_service.py | 60 ++++--- .../app/service/s3_upload_service.py | 88 +++++++---- .../app/service/similarity_service.py | 44 ++++-- .../app/utils/s3_upload_util.py | 146 +++++++++++------- 6 files changed, 286 insertions(+), 154 deletions(-) diff --git a/apps/pre-processing-service/app/api/endpoints/product.py b/apps/pre-processing-service/app/api/endpoints/product.py index 0accbf71..2812ef79 100644 --- a/apps/pre-processing-service/app/api/endpoints/product.py +++ b/apps/pre-processing-service/app/api/endpoints/product.py @@ -101,9 +101,8 @@ async def crawl(body: RequestSadaguCrawl): except Exception as e: raise HTTPException(status_code=500, detail=str(e)) -@router.post( - "/s3-upload", response_model=ResponseS3Upload, summary="S3 이미지 업로드" -) + +@router.post("/s3-upload", response_model=ResponseS3Upload, summary="S3 이미지 업로드") async def s3_upload(request: RequestS3Upload): """ 크롤링 완료 후 별도로 호출하여 이미지들을 S3 저장소에 업로드합니다. 
@@ -113,10 +112,12 @@ async def s3_upload(request: RequestS3Upload): response_data = await s3_upload_service.upload_crawled_products_to_s3(request) if not response_data: - raise CustomException(500, "S3 이미지 업로드에 실패했습니다.", "S3_UPLOAD_FAILED") + raise CustomException( + 500, "S3 이미지 업로드에 실패했습니다.", "S3_UPLOAD_FAILED" + ) return response_data except InvalidItemDataException as e: raise HTTPException(status_code=e.status_code, detail=e.detail) except Exception as e: - raise HTTPException(status_code=500, detail=str(e)) \ No newline at end of file + raise HTTPException(status_code=500, detail=str(e)) diff --git a/apps/pre-processing-service/app/model/schemas.py b/apps/pre-processing-service/app/model/schemas.py index 9acef5d2..cc182f84 100644 --- a/apps/pre-processing-service/app/model/schemas.py +++ b/apps/pre-processing-service/app/model/schemas.py @@ -111,7 +111,9 @@ class SadaguSimilarityData(BaseModel): ..., title="분석 키워드", description="유사도 분석에 사용된 키워드" ) top_products: List[Dict] = Field( - default_factory=list, title="선택된 상품들", description="유사도 분석 결과 선택된 상위 상품 목록" + default_factory=list, + title="선택된 상품들", + description="유사도 분석 결과 선택된 상위 상품 목록", ) reason: Optional[str] = Field( None, title="선택 이유", description="상품 선택 근거 및 점수 정보" @@ -137,7 +139,9 @@ class RequestSadaguCrawl(RequestBase): # 응답 데이터 모델 class SadaguCrawlData(BaseModel): crawled_products: List[Dict] = Field( - ..., title="크롤링된 상품들", description="크롤링된 상품들의 상세 정보 목록 (URL 포함)" + ..., + title="크롤링된 상품들", + description="크롤링된 상품들의 상세 정보 목록 (URL 포함)", ) success_count: int = Field( ..., title="성공 개수", description="성공적으로 크롤링된 상품 개수" @@ -156,11 +160,15 @@ class ResponseSadaguCrawl(ResponseBase[SadaguCrawlData]): pass + # ============== 6단계: S3 이미지 업로드 ============== + class RequestS3Upload(RequestBase): crawled_products: List[Dict] = Field( - ..., title="크롤링된 상품 데이터", description="이전 단계에서 크롤링된 상품들의 데이터" + ..., + title="크롤링된 상품 데이터", + description="이전 단계에서 크롤링된 상품들의 데이터", ) base_folder: Optional[str] = Field( "product-images", title="기본 폴더", description="S3 내 기본 저장 폴더 경로" @@ -170,10 +178,14 @@ class RequestS3Upload(RequestBase): # S3 업로드된 이미지 정보 class S3ImageInfo(BaseModel): index: int = Field(..., title="이미지 순번", description="상품 내 이미지 순번") - original_url: str = Field(..., title="원본 URL", description="크롤링된 원본 이미지 URL") + original_url: str = Field( + ..., title="원본 URL", description="크롤링된 원본 이미지 URL" + ) s3_key: str = Field(..., title="S3 키", description="S3 저장소 내 파일 키") s3_url: str = Field(..., title="S3 URL", description="S3에서 접근 가능한 URL") - file_size: int = Field(..., title="파일 크기", description="업로드된 파일 크기 (bytes)") + file_size: int = Field( + ..., title="파일 크기", description="업로드된 파일 크기 (bytes)" + ) content_type: str = Field(..., title="콘텐츠 타입", description="파일의 MIME 타입") @@ -183,37 +195,72 @@ class ProductS3UploadResult(BaseModel): product_title: str = Field(..., title="상품 제목", description="상품명") product_url: str = Field(..., title="상품 URL", description="상품 페이지 URL") status: str = Field(..., title="업로드 상태", description="completed/skipped/error") - upload_folder: Optional[str] = Field(None, title="업로드 폴더", description="S3 내 상품별 폴더 경로") - folder_s3_url: Optional[str] = Field(None, title="폴더 S3 URL", description="S3 폴더 접근 URL") - uploaded_images: List[S3ImageInfo] = Field(default_factory=list, title="업로드 성공 이미지") + upload_folder: Optional[str] = Field( + None, title="업로드 폴더", description="S3 내 상품별 폴더 경로" + ) + folder_s3_url: Optional[str] = Field( + None, title="폴더 S3 URL", description="S3 폴더 접근 URL" + ) + uploaded_images: 
List[S3ImageInfo] = Field( + default_factory=list, title="업로드 성공 이미지" + ) failed_images: List[Dict] = Field(default_factory=list, title="업로드 실패 이미지") - success_count: int = Field(..., title="성공 개수", description="업로드 성공한 이미지 수") - fail_count: int = Field(..., title="실패 개수", description="업로드 실패한 이미지 수") - reason: Optional[str] = Field(None, title="건너뜀 사유", description="업로드를 건너뛴 이유") - error: Optional[str] = Field(None, title="오류 메시지", description="업로드 중 발생한 오류") + success_count: int = Field( + ..., title="성공 개수", description="업로드 성공한 이미지 수" + ) + fail_count: int = Field( + ..., title="실패 개수", description="업로드 실패한 이미지 수" + ) + reason: Optional[str] = Field( + None, title="건너뜀 사유", description="업로드를 건너뛴 이유" + ) + error: Optional[str] = Field( + None, title="오류 메시지", description="업로드 중 발생한 오류" + ) # S3 업로드 요약 정보 class S3UploadSummary(BaseModel): - total_products: int = Field(..., title="총 상품 수", description="처리 대상 상품 총 개수") - processed_products: int = Field(..., title="처리된 상품 수", description="실제 처리된 상품 수") - skipped_products: int = Field(..., title="건너뛴 상품 수", description="크롤링 실패로 건너뛴 상품 수") - total_success_images: int = Field(..., title="성공 이미지 수", description="업로드 성공한 이미지 총 개수") - total_fail_images: int = Field(..., title="실패 이미지 수", description="업로드 실패한 이미지 총 개수") - success_rate: str = Field(..., title="성공률", description="이미지 업로드 성공률 (성공/전체)") + total_products: int = Field( + ..., title="총 상품 수", description="처리 대상 상품 총 개수" + ) + processed_products: int = Field( + ..., title="처리된 상품 수", description="실제 처리된 상품 수" + ) + skipped_products: int = Field( + ..., title="건너뛴 상품 수", description="크롤링 실패로 건너뛴 상품 수" + ) + total_success_images: int = Field( + ..., title="성공 이미지 수", description="업로드 성공한 이미지 총 개수" + ) + total_fail_images: int = Field( + ..., title="실패 이미지 수", description="업로드 실패한 이미지 총 개수" + ) + success_rate: str = Field( + ..., title="성공률", description="이미지 업로드 성공률 (성공/전체)" + ) # 응답 데이터 모델 class S3UploadData(BaseModel): - upload_results: List[ProductS3UploadResult] = Field(..., title="업로드 결과", description="각 상품의 S3 업로드 결과") - summary: S3UploadSummary = Field(..., title="업로드 요약", description="전체 업로드 결과 요약") - base_folder: str = Field(..., title="기본 폴더", description="S3 업로드에 사용된 기본 폴더") - uploaded_at: str = Field(..., title="업로드 완료 시간", description="S3 업로드 완료 시간") + upload_results: List[ProductS3UploadResult] = Field( + ..., title="업로드 결과", description="각 상품의 S3 업로드 결과" + ) + summary: S3UploadSummary = Field( + ..., title="업로드 요약", description="전체 업로드 결과 요약" + ) + base_folder: str = Field( + ..., title="기본 폴더", description="S3 업로드에 사용된 기본 폴더" + ) + uploaded_at: str = Field( + ..., title="업로드 완료 시간", description="S3 업로드 완료 시간" + ) # 최종 응답 모델 class ResponseS3Upload(ResponseBase[S3UploadData]): """S3 이미지 업로드 API 응답""" + pass diff --git a/apps/pre-processing-service/app/service/crawl_service.py b/apps/pre-processing-service/app/service/crawl_service.py index 311ae42e..e8785f64 100644 --- a/apps/pre-processing-service/app/service/crawl_service.py +++ b/apps/pre-processing-service/app/service/crawl_service.py @@ -42,36 +42,44 @@ async def crawl_product_detail(self, request: RequestSadaguCrawl) -> dict: ) # 성공한 상품 추가 - crawled_products.append({ - "index": i, - "url": product_url, - "product_detail": product_detail, - "status": "success", - "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S") - }) + crawled_products.append( + { + "index": i, + "url": product_url, + "product_detail": product_detail, + "status": "success", + "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"), + } + ) success_count += 1 
else: logger.error(f"상품 {i} 크롤링 실패: 상세 정보 없음") - crawled_products.append({ + crawled_products.append( + { + "index": i, + "url": product_url, + "product_detail": None, + "status": "failed", + "error": "상세 정보 없음", + "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"), + } + ) + fail_count += 1 + + except Exception as e: + logger.error( + f"상품 {i} 크롤링 오류: url={product_url}, error='{e}'" + ) + crawled_products.append( + { "index": i, "url": product_url, "product_detail": None, "status": "failed", - "error": "상세 정보 없음", - "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S") - }) - fail_count += 1 - - except Exception as e: - logger.error(f"상품 {i} 크롤링 오류: url={product_url}, error='{e}'") - crawled_products.append({ - "index": i, - "url": product_url, - "product_detail": None, - "status": "failed", - "error": str(e), - "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S") - }) + "error": str(e), + "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"), + } + ) fail_count += 1 finally: @@ -94,7 +102,9 @@ async def crawl_product_detail(self, request: RequestSadaguCrawl) -> dict: "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"), } - logger.info(f"상품 상세 크롤링 서비스 완료: success_rate={success_count}/{len(product_urls)}") + logger.info( + f"상품 상세 크롤링 서비스 완료: success_rate={success_count}/{len(product_urls)}" + ) return Response.ok(data) except Exception as e: @@ -132,4 +142,4 @@ async def crawl_single_product_detail(self, product_url: str) -> dict: logger.error(f"단일 크롤링 오류: url={product_url}, error='{e}'") raise InvalidItemDataException() finally: - await crawler.close() \ No newline at end of file + await crawler.close() diff --git a/apps/pre-processing-service/app/service/s3_upload_service.py b/apps/pre-processing-service/app/service/s3_upload_service.py index 033a11a6..9d6d1bef 100644 --- a/apps/pre-processing-service/app/service/s3_upload_service.py +++ b/apps/pre-processing-service/app/service/s3_upload_service.py @@ -38,22 +38,28 @@ async def upload_crawled_products_to_s3(self, request: RequestS3Upload) -> dict: product_index = product_info.get("index", 0) product_detail = product_info.get("product_detail") - logger.info(f"상품 {product_index}/{len(crawled_products)} S3 업로드 시작") + logger.info( + f"상품 {product_index}/{len(crawled_products)} S3 업로드 시작" + ) # 크롤링 실패한 상품은 스킵 if not product_detail or product_info.get("status") != "success": - logger.warning(f"상품 {product_index}: 크롤링 실패로 인한 업로드 스킵") - upload_results.append({ - "product_index": product_index, - "product_title": "Unknown", - "product_url": product_info.get("url", ""), - "status": "skipped", - "reason": "크롤링 실패", - "success_count": 0, - "fail_count": 0, - "uploaded_images": [], - "failed_images": [] - }) + logger.warning( + f"상품 {product_index}: 크롤링 실패로 인한 업로드 스킵" + ) + upload_results.append( + { + "product_index": product_index, + "product_title": "Unknown", + "product_url": product_info.get("url", ""), + "status": "skipped", + "reason": "크롤링 실패", + "success_count": 0, + "fail_count": 0, + "uploaded_images": [], + "failed_images": [], + } + ) continue try: @@ -74,17 +80,19 @@ async def upload_crawled_products_to_s3(self, request: RequestS3Upload) -> dict: except Exception as e: logger.error(f"상품 {product_index} S3 업로드 오류: {e}") - upload_results.append({ - "product_index": product_index, - "product_title": product_detail.get("title", "Unknown"), - "product_url": product_detail.get("url", ""), - "status": "error", - "error": str(e), - "success_count": 0, - "fail_count": 0, - "uploaded_images": [], - "failed_images": [] - }) + upload_results.append( + { + 
"product_index": product_index, + "product_title": product_detail.get("title", "Unknown"), + "product_url": product_detail.get("url", ""), + "status": "error", + "error": str(e), + "success_count": 0, + "fail_count": 0, + "uploaded_images": [], + "failed_images": [], + } + ) # 상품간 간격 (서버 부하 방지) if product_index < len(crawled_products): @@ -104,8 +112,11 @@ async def upload_crawled_products_to_s3(self, request: RequestS3Upload) -> dict: "skipped_products": len(crawled_products) - processed_products, "total_success_images": total_success_images, "total_fail_images": total_fail_images, - "success_rate": f"{total_success_images}/{total_success_images + total_fail_images}" if ( - total_success_images + total_fail_images) > 0 else "0/0" + "success_rate": ( + f"{total_success_images}/{total_success_images + total_fail_images}" + if (total_success_images + total_fail_images) > 0 + else "0/0" + ), }, "base_folder": base_folder, "uploaded_at": time.strftime("%Y-%m-%d %H:%M:%S"), @@ -124,10 +135,17 @@ async def get_upload_status(self, upload_results: List[Dict]) -> Dict: """ try: total_products = len(upload_results) - successful_products = len([r for r in upload_results if r.get("status") == "completed"]) - failed_products = len([r for r in upload_results if r.get("status") in ["error", "skipped"]]) + successful_products = len( + [r for r in upload_results if r.get("status") == "completed"] + ) + failed_products = len( + [r for r in upload_results if r.get("status") in ["error", "skipped"]] + ) - total_images = sum(r.get("success_count", 0) + r.get("fail_count", 0) for r in upload_results) + total_images = sum( + r.get("success_count", 0) + r.get("fail_count", 0) + for r in upload_results + ) successful_images = sum(r.get("success_count", 0) for r in upload_results) failed_images = sum(r.get("fail_count", 0) for r in upload_results) @@ -136,18 +154,22 @@ async def get_upload_status(self, upload_results: List[Dict]) -> Dict: "total": total_products, "successful": successful_products, "failed": failed_products, - "success_rate": f"{successful_products}/{total_products}" + "success_rate": f"{successful_products}/{total_products}", }, "images": { "total": total_images, "successful": successful_images, "failed": failed_images, - "success_rate": f"{successful_images}/{total_images}" if total_images > 0 else "0/0" - } + "success_rate": ( + f"{successful_images}/{total_images}" + if total_images > 0 + else "0/0" + ), + }, } return status_summary except Exception as e: logger.error(f"업로드 상태 요약 오류: {e}") - return {} \ No newline at end of file + return {} diff --git a/apps/pre-processing-service/app/service/similarity_service.py b/apps/pre-processing-service/app/service/similarity_service.py index 2974640e..cf943279 100644 --- a/apps/pre-processing-service/app/service/similarity_service.py +++ b/apps/pre-processing-service/app/service/similarity_service.py @@ -9,7 +9,9 @@ class SimilarityService: def __init__(self): pass - def select_top_products_by_similarity(self, request: RequestSadaguSimilarity) -> dict: + def select_top_products_by_similarity( + self, request: RequestSadaguSimilarity + ) -> dict: """ 형태소 분석 후 Top 10 선택 (10개 이하면 유사도 분석 생략) """ @@ -48,7 +50,9 @@ def select_top_products_by_similarity(self, request: RequestSadaguSimilarity) -> try: # 형태소 분석 결과가 10개 이하면 유사도 분석 생략하고 바로 반환 if skip_similarity and analysis_mode == "matched_products": - logger.info(f"형태소 분석 결과가 {len(candidates)}개로 {top_count}개 이하 - 유사도 분석 생략") + logger.info( + f"형태소 분석 결과가 {len(candidates)}개로 {top_count}개 이하 - 유사도 분석 생략" + ) # 매칭 
스코어 기준으로 정렬된 상태 유지 (이미 match_service에서 정렬됨) top_products = [] @@ -57,7 +61,9 @@ def select_top_products_by_similarity(self, request: RequestSadaguSimilarity) -> enhanced_product["rank"] = i + 1 enhanced_product["selection_info"] = { "selection_type": "match_only", - "match_score": product.get("match_info", {}).get("match_score", 0.0), + "match_score": product.get("match_info", {}).get( + "match_score", 0.0 + ), "reason": "형태소 분석만으로 선택 (유사도 분석 생략)", "total_candidates": len(candidates), } @@ -87,15 +93,21 @@ def select_top_products_by_similarity(self, request: RequestSadaguSimilarity) -> # 유사도 정보 추가 및 Top 10 선택 enhanced_products = [] - similarity_threshold = 0.3 if analysis_mode == "fallback_similarity_only" else 0.0 + similarity_threshold = ( + 0.3 if analysis_mode == "fallback_similarity_only" else 0.0 + ) for i, result in enumerate(similarity_results): product = candidates[result["index"]].copy() # 폴백 모드에서는 임계값 검증 - if analysis_mode == "fallback_similarity_only" and result["similarity"] < similarity_threshold: + if ( + analysis_mode == "fallback_similarity_only" + and result["similarity"] < similarity_threshold + ): logger.debug( - f"상품 {i + 1} 유사도 미달로 제외: similarity={result['similarity']:.4f} < threshold={similarity_threshold}") + f"상품 {i + 1} 유사도 미달로 제외: similarity={result['similarity']:.4f} < threshold={similarity_threshold}" + ) continue product["similarity_info"] = { @@ -129,10 +141,16 @@ def select_top_products_by_similarity(self, request: RequestSadaguSimilarity) -> # 종합 점수 또는 유사도 기준으로 재정렬 if analysis_mode == "matched_products": - enhanced_products.sort(key=lambda x: x.get("final_score", x["similarity_info"]["similarity_score"]), - reverse=True) + enhanced_products.sort( + key=lambda x: x.get( + "final_score", x["similarity_info"]["similarity_score"] + ), + reverse=True, + ) else: - enhanced_products.sort(key=lambda x: x["similarity_info"]["similarity_score"], reverse=True) + enhanced_products.sort( + key=lambda x: x["similarity_info"]["similarity_score"], reverse=True + ) # Top 10 선택 top_products = enhanced_products[:top_count] @@ -149,10 +167,12 @@ def select_top_products_by_similarity(self, request: RequestSadaguSimilarity) -> best_product = top_products[0] if "final_score" in best_product: logger.info( - f"1위 상품: title='{best_product['title'][:30]}', final_score={best_product['final_score']:.4f}") + f"1위 상품: title='{best_product['title'][:30]}', final_score={best_product['final_score']:.4f}" + ) else: logger.info( - f"1위 상품: title='{best_product['title'][:30]}', similarity={best_product['similarity_info']['similarity_score']:.4f}") + f"1위 상품: title='{best_product['title'][:30]}', similarity={best_product['similarity_info']['similarity_score']:.4f}" + ) data = { "keyword": keyword, @@ -163,4 +183,4 @@ def select_top_products_by_similarity(self, request: RequestSadaguSimilarity) -> except Exception as e: logger.error(f"유사도 분석 서비스 오류: keyword='{keyword}', error='{e}'") - raise InvalidItemDataException() \ No newline at end of file + raise InvalidItemDataException() diff --git a/apps/pre-processing-service/app/utils/s3_upload_util.py b/apps/pre-processing-service/app/utils/s3_upload_util.py index 98088ae3..9286231d 100644 --- a/apps/pre-processing-service/app/utils/s3_upload_util.py +++ b/apps/pre-processing-service/app/utils/s3_upload_util.py @@ -19,32 +19,42 @@ def __init__(self): self.region = os.getenv("AWS_REGION", "ap-northeast-2") if not self.aws_access_key or not self.aws_secret_key: - raise ValueError("AWS_ACCESS_KEY_ID와 AWS_SECRET_ACCESS_KEY 환경변수가 필요합니다") + raise 
ValueError( + "AWS_ACCESS_KEY_ID와 AWS_SECRET_ACCESS_KEY 환경변수가 필요합니다" + ) self.base_url = f"https://{self.bucket_name}.s3.{self.region}.amazonaws.com" # S3 클라이언트 초기화 self.s3_client = boto3.client( - 's3', + "s3", aws_access_key_id=self.aws_access_key, aws_secret_access_key=self.aws_secret_key, - region_name=self.region + region_name=self.region, ) - logger.info(f"S3 클라이언트 초기화 완료: bucket={self.bucket_name}, region={self.region}") + logger.info( + f"S3 클라이언트 초기화 완료: bucket={self.bucket_name}, region={self.region}" + ) - async def download_image(self, session: aiohttp.ClientSession, image_url: str) -> Optional[bytes]: + async def download_image( + self, session: aiohttp.ClientSession, image_url: str + ) -> Optional[bytes]: """이미지 URL에서 이미지 데이터 다운로드""" try: logger.debug(f"이미지 다운로드 시작: {image_url}") - async with session.get(image_url, timeout=aiohttp.ClientTimeout(total=30)) as response: + async with session.get( + image_url, timeout=aiohttp.ClientTimeout(total=30) + ) as response: if response.status == 200: image_data = await response.read() logger.debug(f"이미지 다운로드 완료: {len(image_data)} bytes") return image_data else: - logger.warning(f"이미지 다운로드 실패: {image_url}, status={response.status}") + logger.warning( + f"이미지 다운로드 실패: {image_url}, status={response.status}" + ) return None except Exception as e: @@ -57,25 +67,27 @@ def get_file_extension(self, image_url: str) -> str: path = parsed.path.lower() # 일반적인 이미지 확장자 확인 - for ext in ['.jpg', '.jpeg', '.png', '.gif', '.webp']: + for ext in [".jpg", ".jpeg", ".png", ".gif", ".webp"]: if ext in path: return ext # 기본값 - return '.jpg' + return ".jpg" def get_content_type(self, file_extension: str) -> str: """파일 확장자에 따른 Content-Type 반환""" content_types = { - '.jpg': 'image/jpeg', - '.jpeg': 'image/jpeg', - '.png': 'image/png', - '.gif': 'image/gif', - '.webp': 'image/webp' + ".jpg": "image/jpeg", + ".jpeg": "image/jpeg", + ".png": "image/png", + ".gif": "image/gif", + ".webp": "image/webp", } - return content_types.get(file_extension, 'image/jpeg') + return content_types.get(file_extension, "image/jpeg") - def upload_to_s3(self, image_data: bytes, s3_key: str, content_type: str = "image/jpeg") -> bool: + def upload_to_s3( + self, image_data: bytes, s3_key: str, content_type: str = "image/jpeg" + ) -> bool: """S3에 이미지 업로드""" try: logger.debug(f"S3 업로드 시작: key={s3_key}") @@ -94,11 +106,19 @@ def upload_to_s3(self, image_data: bytes, s3_key: str, content_type: str = "imag logger.error(f"S3 업로드 오류: key={s3_key}, error={e}") return False - def generate_s3_key(self, base_folder: str, product_index: int, product_title: str, - image_index: int, file_extension: str) -> str: + def generate_s3_key( + self, + base_folder: str, + product_index: int, + product_title: str, + image_index: int, + file_extension: str, + ) -> str: """S3 키 생성""" # 상품 제목에서 특수문자 제거 - safe_title = product_title.replace("/", "-").replace("\\", "-").replace(" ", "_")[:30] + safe_title = ( + product_title.replace("/", "-").replace("\\", "-").replace(" ", "_")[:30] + ) # 타임스탬프 + 상품 정보로 폴더명 생성 timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") @@ -113,11 +133,13 @@ def get_s3_url(self, s3_key: str) -> str: """S3 키에서 접근 가능한 URL 생성""" return f"{self.base_url}/{s3_key}" - async def upload_single_product_images(self, - session: aiohttp.ClientSession, - product_data: Dict, - product_index: int, - base_folder: str = "product-images") -> Dict: + async def upload_single_product_images( + self, + session: aiohttp.ClientSession, + product_data: Dict, + product_index: int, + base_folder: str = 
"product-images", + ) -> Dict: """단일 상품의 모든 이미지를 S3에 업로드""" product_title = product_data.get("title", "Unknown") @@ -139,10 +161,12 @@ async def upload_single_product_images(self, "success_count": 0, "fail_count": 0, "upload_folder": None, - "folder_s3_url": None + "folder_s3_url": None, } - logger.info(f"상품 {product_index} 이미지 업로드 시작: {len(product_images)}개 이미지") + logger.info( + f"상품 {product_index} 이미지 업로드 시작: {len(product_images)}개 이미지" + ) # 각 이미지 업로드 for img_idx, img_info in enumerate(product_images, 1): @@ -150,11 +174,13 @@ async def upload_single_product_images(self, if not original_url: logger.warning(f"상품 {product_index}, 이미지 {img_idx}: URL이 없음") - failed_images.append({ - "index": img_idx, - "original_url": original_url, - "error": "URL이 없음" - }) + failed_images.append( + { + "index": img_idx, + "original_url": original_url, + "error": "URL이 없음", + } + ) continue try: @@ -162,11 +188,13 @@ async def upload_single_product_images(self, image_data = await self.download_image(session, original_url) if not image_data: - failed_images.append({ - "index": img_idx, - "original_url": original_url, - "error": "다운로드 실패" - }) + failed_images.append( + { + "index": img_idx, + "original_url": original_url, + "error": "다운로드 실패", + } + ) continue # S3 키 생성 @@ -180,29 +208,31 @@ async def upload_single_product_images(self, if self.upload_to_s3(image_data, s3_key, content_type): s3_url = self.get_s3_url(s3_key) - uploaded_images.append({ - "index": img_idx, - "original_url": original_url, - "s3_key": s3_key, - "s3_url": s3_url, - "file_size": len(image_data), - "content_type": content_type - }) + uploaded_images.append( + { + "index": img_idx, + "original_url": original_url, + "s3_key": s3_key, + "s3_url": s3_url, + "file_size": len(image_data), + "content_type": content_type, + } + ) logger.debug(f"상품 {product_index}, 이미지 {img_idx} 업로드 완료") else: - failed_images.append({ - "index": img_idx, - "original_url": original_url, - "error": "S3 업로드 실패" - }) + failed_images.append( + { + "index": img_idx, + "original_url": original_url, + "error": "S3 업로드 실패", + } + ) except Exception as e: logger.error(f"상품 {product_index}, 이미지 {img_idx} 처리 오류: {e}") - failed_images.append({ - "index": img_idx, - "original_url": original_url, - "error": str(e) - }) + failed_images.append( + {"index": img_idx, "original_url": original_url, "error": str(e)} + ) # 이미지 간 간격 (서버 부하 방지) await asyncio.sleep(0.5) @@ -215,7 +245,9 @@ async def upload_single_product_images(self, upload_folder = "/".join(first_s3_key.split("/")[:-1]) # 파일명 제거 folder_s3_url = f"{self.base_url}/{upload_folder}" - logger.success(f"상품 {product_index} 업로드 완료: 성공 {len(uploaded_images)}개, 실패 {len(failed_images)}개") + logger.success( + f"상품 {product_index} 업로드 완료: 성공 {len(uploaded_images)}개, 실패 {len(failed_images)}개" + ) return { "product_index": product_index, @@ -227,5 +259,5 @@ async def upload_single_product_images(self, "uploaded_images": uploaded_images, "failed_images": failed_images, "success_count": len(uploaded_images), - "fail_count": len(failed_images) - } \ No newline at end of file + "fail_count": len(failed_images), + } From 17b65a26bfd5b086b19cd31514325a9f5323ae74 Mon Sep 17 00:00:00 2001 From: thkim7 Date: Mon, 22 Sep 2025 14:07:20 +0900 Subject: [PATCH 5/6] =?UTF-8?q?feat:=20S3=20=EC=97=85=EB=A1=9C=EB=93=9C=20?= =?UTF-8?q?=EB=A1=9C=EC=A7=81=201.=20=EC=8A=A4=ED=82=A4=EB=A7=88=EC=97=90?= =?UTF-8?q?=20=EC=9A=94=EC=B2=AD=20=EC=9D=91=EB=8B=B5=20=EB=8D=B0=EC=9D=B4?= =?UTF-8?q?=ED=84=B0=20=EC=B6=94=EA=B0=80=202.=20=EA=B8=B0=EC=A1=B4=20?= 
=?UTF-8?q?=EC=9D=B4=EB=AF=B8=EC=A7=80=EB=A7=8C=20=EC=97=85=EB=A1=9C?= =?UTF-8?q?=EB=93=9C=20->=20=EC=9D=B4=EB=AF=B8=EC=A7=80=20+=20=ED=81=AC?= =?UTF-8?q?=EB=A1=A4=EB=A7=81=20=EC=A0=84=EC=B2=B4=20=EB=8D=B0=EC=9D=B4?= =?UTF-8?q?=ED=84=B0=20json?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../app/model/schemas.py | 40 +--- .../app/service/s3_upload_service.py | 90 ++------- .../app/utils/s3_upload_util.py | 190 ++++++++++-------- 3 files changed, 122 insertions(+), 198 deletions(-) diff --git a/apps/pre-processing-service/app/model/schemas.py b/apps/pre-processing-service/app/model/schemas.py index cc182f84..d92f0b46 100644 --- a/apps/pre-processing-service/app/model/schemas.py +++ b/apps/pre-processing-service/app/model/schemas.py @@ -161,85 +161,54 @@ class ResponseSadaguCrawl(ResponseBase[SadaguCrawlData]): pass -# ============== 6단계: S3 이미지 업로드 ============== +# ============== S3 이미지 업로드 ============== class RequestS3Upload(RequestBase): + keyword: str = Field(..., title="검색 키워드", description="폴더명 생성용 키워드") # 추가 crawled_products: List[Dict] = Field( ..., title="크롤링된 상품 데이터", description="이전 단계에서 크롤링된 상품들의 데이터", ) base_folder: Optional[str] = Field( - "product-images", title="기본 폴더", description="S3 내 기본 저장 폴더 경로" + "product", title="기본 폴더", description="S3 내 기본 저장 폴더 경로" ) - # S3 업로드된 이미지 정보 class S3ImageInfo(BaseModel): index: int = Field(..., title="이미지 순번", description="상품 내 이미지 순번") original_url: str = Field( ..., title="원본 URL", description="크롤링된 원본 이미지 URL" ) - s3_key: str = Field(..., title="S3 키", description="S3 저장소 내 파일 키") s3_url: str = Field(..., title="S3 URL", description="S3에서 접근 가능한 URL") - file_size: int = Field( - ..., title="파일 크기", description="업로드된 파일 크기 (bytes)" - ) - content_type: str = Field(..., title="콘텐츠 타입", description="파일의 MIME 타입") - # 상품별 S3 업로드 결과 class ProductS3UploadResult(BaseModel): product_index: int = Field(..., title="상품 순번", description="크롤링 순번") product_title: str = Field(..., title="상품 제목", description="상품명") - product_url: str = Field(..., title="상품 URL", description="상품 페이지 URL") status: str = Field(..., title="업로드 상태", description="completed/skipped/error") - upload_folder: Optional[str] = Field( - None, title="업로드 폴더", description="S3 내 상품별 폴더 경로" - ) - folder_s3_url: Optional[str] = Field( - None, title="폴더 S3 URL", description="S3 폴더 접근 URL" - ) uploaded_images: List[S3ImageInfo] = Field( default_factory=list, title="업로드 성공 이미지" ) - failed_images: List[Dict] = Field(default_factory=list, title="업로드 실패 이미지") success_count: int = Field( ..., title="성공 개수", description="업로드 성공한 이미지 수" ) fail_count: int = Field( ..., title="실패 개수", description="업로드 실패한 이미지 수" ) - reason: Optional[str] = Field( - None, title="건너뜀 사유", description="업로드를 건너뛴 이유" - ) - error: Optional[str] = Field( - None, title="오류 메시지", description="업로드 중 발생한 오류" - ) - # S3 업로드 요약 정보 class S3UploadSummary(BaseModel): total_products: int = Field( ..., title="총 상품 수", description="처리 대상 상품 총 개수" ) - processed_products: int = Field( - ..., title="처리된 상품 수", description="실제 처리된 상품 수" - ) - skipped_products: int = Field( - ..., title="건너뛴 상품 수", description="크롤링 실패로 건너뛴 상품 수" - ) total_success_images: int = Field( ..., title="성공 이미지 수", description="업로드 성공한 이미지 총 개수" ) total_fail_images: int = Field( ..., title="실패 이미지 수", description="업로드 실패한 이미지 총 개수" ) - success_rate: str = Field( - ..., title="성공률", description="이미지 업로드 성공률 (성공/전체)" - ) - # 응답 데이터 모델 class S3UploadData(BaseModel): @@ -249,9 +218,6 @@ class 
S3UploadData(BaseModel): summary: S3UploadSummary = Field( ..., title="업로드 요약", description="전체 업로드 결과 요약" ) - base_folder: str = Field( - ..., title="기본 폴더", description="S3 업로드에 사용된 기본 폴더" - ) uploaded_at: str = Field( ..., title="업로드 완료 시간", description="S3 업로드 완료 시간" ) diff --git a/apps/pre-processing-service/app/service/s3_upload_service.py b/apps/pre-processing-service/app/service/s3_upload_service.py index 9d6d1bef..4a47bca6 100644 --- a/apps/pre-processing-service/app/service/s3_upload_service.py +++ b/apps/pre-processing-service/app/service/s3_upload_service.py @@ -10,24 +10,24 @@ class S3UploadService: - """6단계: 크롤링된 상품 이미지들을 S3에 업로드하는 서비스""" + """6단계: 크롤링된 상품 이미지들과 데이터를 S3에 업로드하는 서비스""" def __init__(self): self.s3_util = S3UploadUtil() async def upload_crawled_products_to_s3(self, request: RequestS3Upload) -> dict: """ - 크롤링된 상품들의 이미지를 S3에 업로드하는 비즈니스 로직 (6단계) + 크롤링된 상품들의 이미지와 데이터를 S3에 업로드하는 비즈니스 로직 (6단계) """ + keyword = request.keyword # 키워드 추가 crawled_products = request.crawled_products - base_folder = request.base_folder or "product-images" + base_folder = request.base_folder or "product" # 🔸 기본값 변경: product-images → product - logger.info(f"S3 업로드 서비스 시작: {len(crawled_products)}개 상품") + logger.info(f"S3 업로드 서비스 시작: keyword='{keyword}', {len(crawled_products)}개 상품") upload_results = [] total_success_images = 0 total_fail_images = 0 - processed_products = 0 try: # HTTP 세션을 사용한 이미지 다운로드 @@ -51,27 +51,25 @@ async def upload_crawled_products_to_s3(self, request: RequestS3Upload) -> dict: { "product_index": product_index, "product_title": "Unknown", - "product_url": product_info.get("url", ""), "status": "skipped", - "reason": "크롤링 실패", + "folder_s3_url": None, + "uploaded_images": [], "success_count": 0, "fail_count": 0, - "uploaded_images": [], - "failed_images": [], } ) continue try: - # 상품 이미지 업로드 (유틸리티 사용) + # 상품 이미지 + 데이터 업로드 (키워드 전달 추가!) 
+ # 🔸 전체 크롤링 데이터를 전달 (product_detail이 아닌 product_info 전체) upload_result = await self.s3_util.upload_single_product_images( - session, product_detail, product_index, base_folder + session, product_info, product_index, keyword, base_folder # product_detail → product_info ) upload_results.append(upload_result) total_success_images += upload_result["success_count"] total_fail_images += upload_result["fail_count"] - processed_products += 1 logger.success( f"상품 {product_index} S3 업로드 완료: 성공 {upload_result['success_count']}개, " @@ -84,13 +82,11 @@ async def upload_crawled_products_to_s3(self, request: RequestS3Upload) -> dict: { "product_index": product_index, "product_title": product_detail.get("title", "Unknown"), - "product_url": product_detail.get("url", ""), "status": "error", - "error": str(e), + "folder_s3_url": None, + "uploaded_images": [], "success_count": 0, "fail_count": 0, - "uploaded_images": [], - "failed_images": [], } ) @@ -99,77 +95,23 @@ async def upload_crawled_products_to_s3(self, request: RequestS3Upload) -> dict: await asyncio.sleep(1) logger.success( - f"S3 업로드 서비스 완료: 처리된 상품 {processed_products}개, " - f"총 성공 이미지 {total_success_images}개, 총 실패 이미지 {total_fail_images}개" + f"S3 업로드 서비스 완료: 총 성공 이미지 {total_success_images}개, 총 실패 이미지 {total_fail_images}개" ) - # 응답 데이터 구성 + # 간소화된 응답 데이터 구성 data = { "upload_results": upload_results, "summary": { "total_products": len(crawled_products), - "processed_products": processed_products, - "skipped_products": len(crawled_products) - processed_products, "total_success_images": total_success_images, "total_fail_images": total_fail_images, - "success_rate": ( - f"{total_success_images}/{total_success_images + total_fail_images}" - if (total_success_images + total_fail_images) > 0 - else "0/0" - ), }, - "base_folder": base_folder, "uploaded_at": time.strftime("%Y-%m-%d %H:%M:%S"), } - message = f"S3 업로드 완료: {total_success_images}개 이미지 업로드 성공" + message = f"S3 업로드 완료: {total_success_images}개 이미지 업로드 성공, 상품 데이터 JSON 파일 포함" return Response.ok(data, message) except Exception as e: logger.error(f"S3 업로드 서비스 전체 오류: {e}") - raise InvalidItemDataException() - - async def get_upload_status(self, upload_results: List[Dict]) -> Dict: - """ - 업로드 결과 상태 요약 (선택적 기능) - """ - try: - total_products = len(upload_results) - successful_products = len( - [r for r in upload_results if r.get("status") == "completed"] - ) - failed_products = len( - [r for r in upload_results if r.get("status") in ["error", "skipped"]] - ) - - total_images = sum( - r.get("success_count", 0) + r.get("fail_count", 0) - for r in upload_results - ) - successful_images = sum(r.get("success_count", 0) for r in upload_results) - failed_images = sum(r.get("fail_count", 0) for r in upload_results) - - status_summary = { - "products": { - "total": total_products, - "successful": successful_products, - "failed": failed_products, - "success_rate": f"{successful_products}/{total_products}", - }, - "images": { - "total": total_images, - "successful": successful_images, - "failed": failed_images, - "success_rate": ( - f"{successful_images}/{total_images}" - if total_images > 0 - else "0/0" - ), - }, - } - - return status_summary - - except Exception as e: - logger.error(f"업로드 상태 요약 오류: {e}") - return {} + raise InvalidItemDataException() \ No newline at end of file diff --git a/apps/pre-processing-service/app/utils/s3_upload_util.py b/apps/pre-processing-service/app/utils/s3_upload_util.py index 9286231d..86148325 100644 --- a/apps/pre-processing-service/app/utils/s3_upload_util.py +++ 
b/apps/pre-processing-service/app/utils/s3_upload_util.py @@ -1,10 +1,11 @@ import os +import json import boto3 import aiohttp import asyncio from datetime import datetime from urllib.parse import urlparse -from typing import List, Dict, Optional, Tuple +from typing import Dict, Optional from loguru import logger @@ -38,14 +39,14 @@ def __init__(self): ) async def download_image( - self, session: aiohttp.ClientSession, image_url: str + self, session: aiohttp.ClientSession, image_url: str ) -> Optional[bytes]: """이미지 URL에서 이미지 데이터 다운로드""" try: logger.debug(f"이미지 다운로드 시작: {image_url}") async with session.get( - image_url, timeout=aiohttp.ClientTimeout(total=30) + image_url, timeout=aiohttp.ClientTimeout(total=30) ) as response: if response.status == 200: image_data = await response.read() @@ -86,16 +87,16 @@ def get_content_type(self, file_extension: str) -> str: return content_types.get(file_extension, "image/jpeg") def upload_to_s3( - self, image_data: bytes, s3_key: str, content_type: str = "image/jpeg" + self, data: bytes, s3_key: str, content_type: str = "image/jpeg" ) -> bool: - """S3에 이미지 업로드""" + """S3에 데이터 업로드 (이미지 또는 JSON)""" try: logger.debug(f"S3 업로드 시작: key={s3_key}") self.s3_client.put_object( Bucket=self.bucket_name, Key=s3_key, - Body=image_data, + Body=data, ContentType=content_type, ) @@ -106,27 +107,51 @@ def upload_to_s3( logger.error(f"S3 업로드 오류: key={s3_key}, error={e}") return False - def generate_s3_key( - self, - base_folder: str, - product_index: int, - product_title: str, - image_index: int, - file_extension: str, - ) -> str: - """S3 키 생성""" - # 상품 제목에서 특수문자 제거 - safe_title = ( - product_title.replace("/", "-").replace("\\", "-").replace(" ", "_")[:30] + def upload_json_to_s3(self, json_data: Dict, s3_key: str) -> bool: + """JSON 데이터를 S3에 업로드""" + try: + json_str = json.dumps(json_data, ensure_ascii=False, indent=2) + json_bytes = json_str.encode('utf-8') + + return self.upload_to_s3(json_bytes, s3_key, "application/json") + + except Exception as e: + logger.error(f"JSON S3 업로드 오류: key={s3_key}, error={e}") + return False + + def generate_product_folder_name(self, product_index: int, keyword: str) -> str: + """상품별 폴더명 생성 (시간_키워드_번호)""" + # 키워드에서 특수문자 제거 + safe_keyword = ( + keyword.replace("/", "-") + .replace("\\", "-") + .replace(" ", "_") + .replace(":", "-") + .replace("*", "-") + .replace("?", "-") + .replace('"', "-") + .replace("<", "-") + .replace(">", "-") + .replace("|", "-")[:20] # 길이 제한 ) - # 타임스탬프 + 상품 정보로 폴더명 생성 - timestamp = datetime.now().strftime("%Y%m%d_%H%M%S") - folder_name = f"{timestamp}_product_{product_index}_{safe_title}" + # 날짜 형식: 20250922 + date_str = datetime.now().strftime("%Y%m%d") - # 최종 S3 키 - s3_key = f"{base_folder}/{folder_name}/image_{image_index:03d}{file_extension}" + # 폴더명: 20250922_키워드_1 + folder_name = f"{date_str}_{safe_keyword}_{product_index}" + return folder_name + + def generate_s3_key( + self, + base_folder: str, + folder_name: str, + file_name: str, + ) -> str: + """S3 키 생성""" + # 최종 S3 키: product/20250922_산리오_1/image_001.jpg 또는 product_data.json + s3_key = f"{base_folder}/{folder_name}/{file_name}" return s3_key def get_s3_url(self, s3_key: str) -> str: @@ -134,53 +159,71 @@ def get_s3_url(self, s3_key: str) -> str: return f"{self.base_url}/{s3_key}" async def upload_single_product_images( - self, - session: aiohttp.ClientSession, - product_data: Dict, - product_index: int, - base_folder: str = "product-images", + self, + session: aiohttp.ClientSession, + product_info: Dict, # 🔸 이름 변경: product_data → product_info 
(전체 크롤링 데이터) + product_index: int, + keyword: str, # 키워드 파라미터 추가 + base_folder: str = "product", # 🔸 기본 폴더 변경: product-images → product ) -> Dict: - """단일 상품의 모든 이미지를 S3에 업로드""" + """단일 상품의 모든 데이터(이미지 + JSON)를 S3에 업로드""" - product_title = product_data.get("title", "Unknown") - product_url = product_data.get("url", "") - product_images = product_data.get("product_images", []) + # 🔸 전체 크롤링 데이터에서 필요한 정보 추출 + product_detail = product_info.get("product_detail", {}) + product_title = product_detail.get("title", "Unknown") + product_images = product_detail.get("product_images", []) uploaded_images = [] - failed_images = [] + logger.info( + f"상품 {product_index} 업로드 시작: {len(product_images)}개 이미지, keyword='{keyword}'" + ) + + # 키워드 기반 폴더명 한 번만 생성 + folder_name = self.generate_product_folder_name(product_index, keyword) + + fail_count = 0 + folder_s3_url = f"{self.base_url}/{base_folder}/{folder_name}" + + # 🆕 1. 먼저 상품 데이터 JSON 파일 업로드 + try: + # 전체 크롤링 데이터를 JSON으로 저장 (S3 업로드 메타데이터 추가) + product_data_with_meta = { + **product_info, # 전체 크롤링 데이터 (index, url, product_detail, status, crawled_at 포함) + "s3_upload_keyword": keyword, # 추가 메타데이터 + "s3_uploaded_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S") + } + + json_s3_key = self.generate_s3_key(base_folder, folder_name, "product_data.json") + + if self.upload_json_to_s3(product_data_with_meta, json_s3_key): + logger.success(f"상품 {product_index} JSON 데이터 업로드 완료") + else: + logger.error(f"상품 {product_index} JSON 데이터 업로드 실패") + + except Exception as e: + logger.error(f"상품 {product_index} JSON 업로드 오류: {e}") + + # 2. 이미지 업로드 (기존 로직) if not product_images: logger.warning(f"상품 {product_index}: 업로드할 이미지가 없음") return { "product_index": product_index, "product_title": product_title, - "product_url": product_url, "status": "no_images", + "folder_s3_url": folder_s3_url, "uploaded_images": uploaded_images, - "failed_images": failed_images, "success_count": 0, "fail_count": 0, - "upload_folder": None, - "folder_s3_url": None, } - logger.info( - f"상품 {product_index} 이미지 업로드 시작: {len(product_images)}개 이미지" - ) - # 각 이미지 업로드 for img_idx, img_info in enumerate(product_images, 1): original_url = img_info.get("original_url", "") if not original_url: logger.warning(f"상품 {product_index}, 이미지 {img_idx}: URL이 없음") - failed_images.append( - { - "index": img_idx, - "original_url": original_url, - "error": "URL이 없음", - } - ) + fail_count += 1 continue try: @@ -188,20 +231,13 @@ async def upload_single_product_images( image_data = await self.download_image(session, original_url) if not image_data: - failed_images.append( - { - "index": img_idx, - "original_url": original_url, - "error": "다운로드 실패", - } - ) + fail_count += 1 continue - # S3 키 생성 + # S3 키 생성 (키워드 기반 폴더명 사용) file_extension = self.get_file_extension(original_url) - s3_key = self.generate_s3_key( - base_folder, product_index, product_title, img_idx, file_extension - ) + image_file_name = f"image_{img_idx:03d}{file_extension}" + s3_key = self.generate_s3_key(base_folder, folder_name, image_file_name) # S3 업로드 content_type = self.get_content_type(file_extension) @@ -212,52 +248,32 @@ async def upload_single_product_images( { "index": img_idx, "original_url": original_url, - "s3_key": s3_key, "s3_url": s3_url, - "file_size": len(image_data), - "content_type": content_type, } ) + logger.debug(f"상품 {product_index}, 이미지 {img_idx} 업로드 완료") else: - failed_images.append( - { - "index": img_idx, - "original_url": original_url, - "error": "S3 업로드 실패", - } - ) + fail_count += 1 except Exception as e: logger.error(f"상품 
{product_index}, 이미지 {img_idx} 처리 오류: {e}") - failed_images.append( - {"index": img_idx, "original_url": original_url, "error": str(e)} - ) + fail_count += 1 # 이미지 간 간격 (서버 부하 방지) await asyncio.sleep(0.5) - # 업로드 폴더 정보 계산 - upload_folder = None - folder_s3_url = None - if uploaded_images: - first_s3_key = uploaded_images[0]["s3_key"] - upload_folder = "/".join(first_s3_key.split("/")[:-1]) # 파일명 제거 - folder_s3_url = f"{self.base_url}/{upload_folder}" - logger.success( - f"상품 {product_index} 업로드 완료: 성공 {len(uploaded_images)}개, 실패 {len(failed_images)}개" + f"상품 {product_index} 업로드 완료: 성공 {len(uploaded_images)}개, 실패 {fail_count}개, folder='{folder_name}'" ) return { "product_index": product_index, "product_title": product_title, - "product_url": product_url, "status": "completed", - "upload_folder": upload_folder, - "folder_s3_url": folder_s3_url, + "folder_s3_url": folder_s3_url, # 🔸 폴더 전체를 가리킴 (이미지 + JSON 포함) + "json_s3_url": f"{folder_s3_url}/product_data.json", # 🆕 JSON 파일 직접 링크 "uploaded_images": uploaded_images, - "failed_images": failed_images, "success_count": len(uploaded_images), - "fail_count": len(failed_images), - } + "fail_count": fail_count, + } \ No newline at end of file From 67d06ea347860133444bd9fea956bf0a449b2c4e Mon Sep 17 00:00:00 2001 From: thkim7 Date: Mon, 22 Sep 2025 14:07:44 +0900 Subject: [PATCH 6/6] chore: poetry run black . --- .../app/model/schemas.py | 8 ++++- .../app/service/s3_upload_service.py | 16 ++++++--- .../app/utils/s3_upload_util.py | 36 ++++++++++--------- 3 files changed, 38 insertions(+), 22 deletions(-) diff --git a/apps/pre-processing-service/app/model/schemas.py b/apps/pre-processing-service/app/model/schemas.py index d92f0b46..ebf19478 100644 --- a/apps/pre-processing-service/app/model/schemas.py +++ b/apps/pre-processing-service/app/model/schemas.py @@ -165,7 +165,9 @@ class ResponseSadaguCrawl(ResponseBase[SadaguCrawlData]): class RequestS3Upload(RequestBase): - keyword: str = Field(..., title="검색 키워드", description="폴더명 생성용 키워드") # 추가 + keyword: str = Field( + ..., title="검색 키워드", description="폴더명 생성용 키워드" + ) # 추가 crawled_products: List[Dict] = Field( ..., title="크롤링된 상품 데이터", @@ -175,6 +177,7 @@ class RequestS3Upload(RequestBase): "product", title="기본 폴더", description="S3 내 기본 저장 폴더 경로" ) + # S3 업로드된 이미지 정보 class S3ImageInfo(BaseModel): index: int = Field(..., title="이미지 순번", description="상품 내 이미지 순번") @@ -183,6 +186,7 @@ class S3ImageInfo(BaseModel): ) s3_url: str = Field(..., title="S3 URL", description="S3에서 접근 가능한 URL") + # 상품별 S3 업로드 결과 class ProductS3UploadResult(BaseModel): product_index: int = Field(..., title="상품 순번", description="크롤링 순번") @@ -198,6 +202,7 @@ class ProductS3UploadResult(BaseModel): ..., title="실패 개수", description="업로드 실패한 이미지 수" ) + # S3 업로드 요약 정보 class S3UploadSummary(BaseModel): total_products: int = Field( @@ -210,6 +215,7 @@ class S3UploadSummary(BaseModel): ..., title="실패 이미지 수", description="업로드 실패한 이미지 총 개수" ) + # 응답 데이터 모델 class S3UploadData(BaseModel): upload_results: List[ProductS3UploadResult] = Field( diff --git a/apps/pre-processing-service/app/service/s3_upload_service.py b/apps/pre-processing-service/app/service/s3_upload_service.py index 4a47bca6..1c024a63 100644 --- a/apps/pre-processing-service/app/service/s3_upload_service.py +++ b/apps/pre-processing-service/app/service/s3_upload_service.py @@ -21,9 +21,13 @@ async def upload_crawled_products_to_s3(self, request: RequestS3Upload) -> dict: """ keyword = request.keyword # 키워드 추가 crawled_products = request.crawled_products - base_folder = 
request.base_folder or "product" # 🔸 기본값 변경: product-images → product + base_folder = ( + request.base_folder or "product" + ) # 🔸 기본값 변경: product-images → product - logger.info(f"S3 업로드 서비스 시작: keyword='{keyword}', {len(crawled_products)}개 상품") + logger.info( + f"S3 업로드 서비스 시작: keyword='{keyword}', {len(crawled_products)}개 상품" + ) upload_results = [] total_success_images = 0 @@ -64,7 +68,11 @@ async def upload_crawled_products_to_s3(self, request: RequestS3Upload) -> dict: # 상품 이미지 + 데이터 업로드 (키워드 전달 추가!) # 🔸 전체 크롤링 데이터를 전달 (product_detail이 아닌 product_info 전체) upload_result = await self.s3_util.upload_single_product_images( - session, product_info, product_index, keyword, base_folder # product_detail → product_info + session, + product_info, + product_index, + keyword, + base_folder, # product_detail → product_info ) upload_results.append(upload_result) @@ -114,4 +122,4 @@ async def upload_crawled_products_to_s3(self, request: RequestS3Upload) -> dict: except Exception as e: logger.error(f"S3 업로드 서비스 전체 오류: {e}") - raise InvalidItemDataException() \ No newline at end of file + raise InvalidItemDataException() diff --git a/apps/pre-processing-service/app/utils/s3_upload_util.py b/apps/pre-processing-service/app/utils/s3_upload_util.py index 86148325..0aaa5ace 100644 --- a/apps/pre-processing-service/app/utils/s3_upload_util.py +++ b/apps/pre-processing-service/app/utils/s3_upload_util.py @@ -39,14 +39,14 @@ def __init__(self): ) async def download_image( - self, session: aiohttp.ClientSession, image_url: str + self, session: aiohttp.ClientSession, image_url: str ) -> Optional[bytes]: """이미지 URL에서 이미지 데이터 다운로드""" try: logger.debug(f"이미지 다운로드 시작: {image_url}") async with session.get( - image_url, timeout=aiohttp.ClientTimeout(total=30) + image_url, timeout=aiohttp.ClientTimeout(total=30) ) as response: if response.status == 200: image_data = await response.read() @@ -87,7 +87,7 @@ def get_content_type(self, file_extension: str) -> str: return content_types.get(file_extension, "image/jpeg") def upload_to_s3( - self, data: bytes, s3_key: str, content_type: str = "image/jpeg" + self, data: bytes, s3_key: str, content_type: str = "image/jpeg" ) -> bool: """S3에 데이터 업로드 (이미지 또는 JSON)""" try: @@ -111,7 +111,7 @@ def upload_json_to_s3(self, json_data: Dict, s3_key: str) -> bool: """JSON 데이터를 S3에 업로드""" try: json_str = json.dumps(json_data, ensure_ascii=False, indent=2) - json_bytes = json_str.encode('utf-8') + json_bytes = json_str.encode("utf-8") return self.upload_to_s3(json_bytes, s3_key, "application/json") @@ -144,10 +144,10 @@ def generate_product_folder_name(self, product_index: int, keyword: str) -> str: return folder_name def generate_s3_key( - self, - base_folder: str, - folder_name: str, - file_name: str, + self, + base_folder: str, + folder_name: str, + file_name: str, ) -> str: """S3 키 생성""" # 최종 S3 키: product/20250922_산리오_1/image_001.jpg 또는 product_data.json @@ -159,12 +159,12 @@ def get_s3_url(self, s3_key: str) -> str: return f"{self.base_url}/{s3_key}" async def upload_single_product_images( - self, - session: aiohttp.ClientSession, - product_info: Dict, # 🔸 이름 변경: product_data → product_info (전체 크롤링 데이터) - product_index: int, - keyword: str, # 키워드 파라미터 추가 - base_folder: str = "product", # 🔸 기본 폴더 변경: product-images → product + self, + session: aiohttp.ClientSession, + product_info: Dict, # 🔸 이름 변경: product_data → product_info (전체 크롤링 데이터) + product_index: int, + keyword: str, # 키워드 파라미터 추가 + base_folder: str = "product", # 🔸 기본 폴더 변경: product-images → product ) -> Dict: """단일 상품의 모든 
데이터(이미지 + JSON)를 S3에 업로드""" @@ -191,10 +191,12 @@ async def upload_single_product_images( product_data_with_meta = { **product_info, # 전체 크롤링 데이터 (index, url, product_detail, status, crawled_at 포함) "s3_upload_keyword": keyword, # 추가 메타데이터 - "s3_uploaded_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S") + "s3_uploaded_at": datetime.now().strftime("%Y-%m-%d %H:%M:%S"), } - json_s3_key = self.generate_s3_key(base_folder, folder_name, "product_data.json") + json_s3_key = self.generate_s3_key( + base_folder, folder_name, "product_data.json" + ) if self.upload_json_to_s3(product_data_with_meta, json_s3_key): logger.success(f"상품 {product_index} JSON 데이터 업로드 완료") @@ -276,4 +278,4 @@ async def upload_single_product_images( "uploaded_images": uploaded_images, "success_count": len(uploaded_images), "fail_count": fail_count, - } \ No newline at end of file + }
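
A minimal client-side sketch of the request/response shapes this series settles on, for reference. Field names follow RequestSadaguCrawl and RequestS3Upload as defined in the diffs; the base URL, the crawl route path, the response envelope key ("data"), and the sample keyword/URLs are illustrative assumptions rather than values taken from the patches.

import httpx

BASE_URL = "http://localhost:8000"  # placeholder; the real host/router prefix is not shown in the patches

# Step 5 (crawl): RequestSadaguCrawl now takes a list of product URLs.
crawl_request = {
    "product_urls": [
        "https://example.com/item/1",  # placeholder URLs
        "https://example.com/item/2",
    ]
}
# "/crawl" is an assumed route path for the crawl endpoint shown in product.py.
crawl_response = httpx.post(f"{BASE_URL}/crawl", json=crawl_request, timeout=120).json()

# Assuming ResponseBase exposes its payload under "data":
# each crawled_products entry carries index / url / product_detail / status / crawled_at.
crawled_products = crawl_response["data"]["crawled_products"]

# Step 6 (S3 upload): RequestS3Upload takes the keyword plus the crawled products as-is.
s3_request = {
    "keyword": "산리오",  # drives the folder name: product/<YYYYMMDD>_<keyword>_<index>/
    "crawled_products": crawled_products,
    "base_folder": "product",  # default after PATCH 5/6
}
s3_response = httpx.post(f"{BASE_URL}/s3-upload", json=s3_request, timeout=300).json()
print(s3_response["data"]["summary"])  # total_products / total_success_images / total_fail_images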