FlowWeaver · thkim7 · Sep 22, 2025 · Sep 21, 2025 · Sep 21, 2025 · Sep 21, 2025
diff --git a/apps/pre-processing-service/app/api/endpoints/product.py b/apps/pre-processing-service/app/api/endpoints/product.py
@@ -6,6 +6,7 @@
     CustomException,
 )
 from ...service.crawl_service import CrawlService
+from ...service.s3_upload_service import S3UploadService
 from ...service.search_service import SearchService
 from ...service.match_service import MatchService
 from ...service.similarity_service import SimilarityService
@@ -60,11 +61,11 @@ async def match(request: RequestSadaguMatch):
 )
 async def similarity(request: RequestSadaguSimilarity):
     """
-    매칭된 상품들 중 키워드와의 유사도를 계산하여 최적의 상품을 선택합니다.
+    매칭된 상품들 중 키워드와의 유사도를 계산하여 상위 10개 상품을 선택합니다.
     """
     try:
         similarity_service = SimilarityService()
-        response_data = similarity_service.select_product_by_similarity(request)
+        response_data = similarity_service.select_top_products_by_similarity(request)
 
         if not response_data:
             raise CustomException(
@@ -99,3 +100,24 @@ async def crawl(body: RequestSadaguCrawl):
         raise HTTPException(status_code=e.status_code, detail=e.detail)
     except Exception as e:
         raise HTTPException(status_code=500, detail=str(e))
+
+
+@router.post("/s3-upload", response_model=ResponseS3Upload, summary="S3 이미지 업로드")
+async def s3_upload(request: RequestS3Upload):
+    """
+    Upload the images of already-crawled products to S3; called as a separate step after crawling.
+    """
+    try:
+        s3_upload_service = S3UploadService()
+        response_data = await s3_upload_service.upload_crawled_products_to_s3(request)
+
+        if not response_data:  # an empty/falsy service result is reported as an upload failure
+            raise CustomException(
+                500, "S3 이미지 업로드에 실패했습니다.", "S3_UPLOAD_FAILED"
+            )
+
+        return response_data
+    except (InvalidItemDataException, CustomException) as e:  # keep the domain error's own status/detail
+        raise HTTPException(status_code=e.status_code, detail=e.detail)
+    except Exception as e:
+        raise HTTPException(status_code=500, detail=str(e))
diff --git a/apps/pre-processing-service/app/model/schemas.py b/apps/pre-processing-service/app/model/schemas.py
@@ -110,8 +110,10 @@ class SadaguSimilarityData(BaseModel):
     keyword: str = Field(
         ..., title="분석 키워드", description="유사도 분석에 사용된 키워드"
     )
-    selected_product: Optional[Dict] = Field(
-        None, title="선택된 상품", description="유사도 분석 결과 선택된 상품"
+    top_products: List[Dict] = Field(
+        default_factory=list,
+        title="선택된 상품들",
+        description="유사도 분석 결과 선택된 상위 상품 목록",
     )
     reason: Optional[str] = Field(
         None, title="선택 이유", description="상품 선택 근거 및 점수 정보"
@@ -129,16 +131,23 @@ class ResponseSadaguSimilarity(ResponseBase[SadaguSimilarityData]):
 
 
 class RequestSadaguCrawl(RequestBase):
-    product_url: HttpUrl = Field(
+    product_urls: List[HttpUrl] = Field(  # list of product page URLs to crawl in one request
         ..., title="상품 URL", description="크롤링할 상품 페이지의 URL"
     )
 
 
 # 응답 데이터 모델
 class SadaguCrawlData(BaseModel):
-    product_url: str = Field(..., title="상품 URL", description="크롤링된 상품 URL")
-    product_detail: Optional[Dict] = Field(
-        None, title="상품 상세정보", description="크롤링된 상품의 상세 정보"
+    crawled_products: List[Dict] = Field(
+        ...,
+        title="크롤링된 상품들",
+        description="크롤링된 상품들의 상세 정보 목록 (URL 포함)",
+    )
+    success_count: int = Field(
+        ..., title="성공 개수", description="성공적으로 크롤링된 상품 개수"
+    )
+    fail_count: int = Field(
+        ..., title="실패 개수", description="크롤링에 실패한 상품 개수"
     )
     crawled_at: Optional[str] = Field(
         None, title="크롤링 시간", description="크롤링 완료 시간"
@@ -152,6 +161,81 @@ class ResponseSadaguCrawl(ResponseBase[SadaguCrawlData]):
     pass
 
 
+# ============== S3 이미지 업로드 ==============
+
+
+class RequestS3Upload(RequestBase):  # request body for the S3 image upload endpoint
+    keyword: str = Field(
+        ..., title="검색 키워드", description="폴더명 생성용 키워드"
+    )  # search keyword; per the description, used to build the folder name
+    crawled_products: List[Dict] = Field(
+        ...,
+        title="크롤링된 상품 데이터",
+        description="이전 단계에서 크롤링된 상품들의 데이터",
+    )  # product dicts produced by the earlier crawl step
+    base_folder: Optional[str] = Field(
+        "product", title="기본 폴더", description="S3 내 기본 저장 폴더 경로"
+    )  # base folder path inside S3; defaults to "product"
+
+
+# Information about a single image uploaded to S3
+class S3ImageInfo(BaseModel):
+    index: int = Field(..., title="이미지 순번", description="상품 내 이미지 순번")  # position of the image within its product
+    original_url: str = Field(
+        ..., title="원본 URL", description="크롤링된 원본 이미지 URL"
+    )  # source image URL from the crawl
+    s3_url: str = Field(..., title="S3 URL", description="S3에서 접근 가능한 URL")  # URL at which the image is reachable on S3
+
+
+# Per-product S3 upload result
+class ProductS3UploadResult(BaseModel):
+    product_index: int = Field(..., title="상품 순번", description="크롤링 순번")  # product's order number from the crawl
+    product_title: str = Field(..., title="상품 제목", description="상품명")  # product name
+    status: str = Field(..., title="업로드 상태", description="completed/skipped/error")  # plain str; the description lists completed/skipped/error (not validated here)
+    uploaded_images: List[S3ImageInfo] = Field(
+        default_factory=list, title="업로드 성공 이미지"
+    )  # successfully uploaded images; empty list by default
+    success_count: int = Field(
+        ..., title="성공 개수", description="업로드 성공한 이미지 수"
+    )  # number of images uploaded successfully
+    fail_count: int = Field(
+        ..., title="실패 개수", description="업로드 실패한 이미지 수"
+    )  # number of images that failed to upload
+
+
+# Summary of an S3 upload run
+class S3UploadSummary(BaseModel):
+    total_products: int = Field(
+        ..., title="총 상품 수", description="처리 대상 상품 총 개수"
+    )  # total number of products targeted for processing
+    total_success_images: int = Field(
+        ..., title="성공 이미지 수", description="업로드 성공한 이미지 총 개수"
+    )  # total number of images uploaded successfully
+    total_fail_images: int = Field(
+        ..., title="실패 이미지 수", description="업로드 실패한 이미지 총 개수"
+    )  # total number of images that failed to upload
+
+
+# Response data model
+class S3UploadData(BaseModel):
+    upload_results: List[ProductS3UploadResult] = Field(
+        ..., title="업로드 결과", description="각 상품의 S3 업로드 결과"
+    )  # one upload result entry per product
+    summary: S3UploadSummary = Field(
+        ..., title="업로드 요약", description="전체 업로드 결과 요약"
+    )  # aggregate counts for the whole upload
+    uploaded_at: str = Field(
+        ..., title="업로드 완료 시간", description="S3 업로드 완료 시간"
+    )  # upload completion time, carried as a string
+
+
+# Final response model
+class ResponseS3Upload(ResponseBase[S3UploadData]):
+    """Response for the S3 image upload API."""
+
+    pass
+
+
 # ============== 블로그 콘텐츠 생성 ==============
 
 

diff --git a/apps/pre-processing-service/app/service/crawl_service.py b/apps/pre-processing-service/app/service/crawl_service.py
@@ -1,4 +1,5 @@
 import time
+import asyncio
 from app.service.crawlers.detail_crawler import DetailCrawler
 from app.errors.CustomException import InvalidItemDataException
 from app.model.schemas import RequestSadaguCrawl
@@ -12,45 +13,133 @@ def __init__(self):
 
     async def crawl_product_detail(self, request: RequestSadaguCrawl) -> dict:
         """
-        선택된 상품의 상세 정보를 크롤링하는 비즈니스 로직입니다. (5단계)
-        상품 URL을 입력받아 상세 정보를 크롤링하여 딕셔너리로 반환합니다.
+        Business logic that crawls the detail pages of the selected products (step 5).
+        Crawls request.product_urls one at a time and returns Response.ok(...) with per-product results and counts; raises InvalidItemDataException on a batch-level error.
         """
-        crawler = DetailCrawler(use_selenium=True)
+        product_urls = [str(url) for url in request.product_urls]
+
+        logger.info(f"상품 상세 크롤링 서비스 시작: 총 {len(product_urls)}개 상품")
+
+        crawled_products = []
+        success_count = 0
+        fail_count = 0
 
         try:
-            logger.info(
-                f"상품 상세 크롤링 서비스 시작: product_url={request.product_url}"
+            # Crawl the products one at a time (for stability)
+            for i, product_url in enumerate(product_urls, 1):
+                logger.info(f"상품 {i}/{len(product_urls)} 크롤링 시작: {product_url}")
+
+                crawler = DetailCrawler(use_selenium=True)
+
+                try:
+                    # Run the detail crawl
+                    product_detail = await crawler.crawl_detail(product_url)
+
+                    if product_detail:
+                        product_title = (product_detail.get("title") or "Unknown")[:50]  # a None title must not turn a successful crawl into a failure
+                        logger.success(
+                            f"상품 {i} 크롤링 성공: title='{product_title}', price={product_detail.get('price', 0)}"
+                        )
+
+                        # Record the successful product
+                        crawled_products.append(
+                            {
+                                "index": i,
+                                "url": product_url,
+                                "product_detail": product_detail,
+                                "status": "success",
+                                "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"),
+                            }
+                        )
+                        success_count += 1
+                    else:
+                        logger.error(f"상품 {i} 크롤링 실패: 상세 정보 없음")
+                        crawled_products.append(
+                            {
+                                "index": i,
+                                "url": product_url,
+                                "product_detail": None,
+                                "status": "failed",
+                                "error": "상세 정보 없음",
+                                "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"),
+                            }
+                        )
+                        fail_count += 1
+
+                except Exception as e:
+                    logger.error(
+                        f"상품 {i} 크롤링 오류: url={product_url}, error='{e}'"
+                    )
+                    crawled_products.append(
+                        {
+                            "index": i,
+                            "url": product_url,
+                            "product_detail": None,
+                            "status": "failed",
+                            "error": str(e),
+                            "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"),
+                        }
+                    )
+                    fail_count += 1
+
+                finally:
+                    # Clean up this product's crawler
+                    await crawler.close()
+
+                # Pause between products (to limit server load)
+                if i < len(product_urls):
+                    await asyncio.sleep(1)
+
+            logger.success(
+                f"전체 크롤링 완료: 총 {len(product_urls)}개, 성공 {success_count}개, 실패 {fail_count}개"
             )
 
-            # 상세 정보 크롤링 실행
-            product_detail = await crawler.crawl_detail(
-                product_url=str(request.product_url)
+            # Build the response payload
+            data = {
+                "crawled_products": crawled_products,
+                "success_count": success_count,
+                "fail_count": fail_count,
+                "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"),
+            }
+
+            logger.info(
+                f"상품 상세 크롤링 서비스 완료: success_rate={success_count}/{len(product_urls)}"
             )
+            return Response.ok(data)
+
+        except Exception as e:
+            logger.error(f"배치 크롤링 서비스 오류: error='{e}'")
+            raise InvalidItemDataException() from e
+
+    # Original single-product crawl method, kept for backward compatibility
+    async def crawl_single_product_detail(self, product_url: str) -> dict:
+        """
+        Crawl a single product (kept for backward compatibility).
+        """
+        crawler = DetailCrawler(use_selenium=True)
+
+        try:
+            logger.info(f"단일 상품 크롤링 시작: {product_url}")
+
+            product_detail = await crawler.crawl_detail(product_url)
+
             if not product_detail:
-                logger.error(f"상품 상세 정보 크롤링 실패: url={request.product_url}")
+                logger.error(f"상품 상세 정보 크롤링 실패: url={product_url}")
                 raise InvalidItemDataException()
 
             product_title = product_detail.get("title", "Unknown")[:50]
-            logger.success(
-                f"크롤링 완료: title='{product_title}', price={product_detail.get('price', 0)}, options_count={len(product_detail.get('options', []))}"
-            )
+            logger.success(f"크롤링 완료: title='{product_title}'")
 
-            # 응답 데이터 구성
             data = {
-                "product_url": str(request.product_url),
+                "product_url": product_url,
                 "product_detail": product_detail,
                 "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"),
             }
 
-            logger.info(f"상품 상세 크롤링 서비스 완료: status=success")
             return Response.ok(data)
 
         except Exception as e:
-            logger.error(
-                f"크롤링 서비스 오류: product_url={request.product_url}, error='{e}'"
-            )
+            logger.error(f"단일 크롤링 오류: url={product_url}, error='{e}'")
             raise InvalidItemDataException()
         finally:
             await crawler.close()
-            logger.debug("크롤러 리소스 정리 완료")
diff --git a/apps/pre-processing-service/app/service/crawlers/search_crawler.py b/apps/pre-processing-service/app/service/crawlers/search_crawler.py
@@ -49,7 +49,7 @@ async def search_products_selenium(self, keyword: str) -> list[dict]:
             logger.info(
                 f"Selenium으로 발견한 상품 링크: {len(unique_products)}개 (중복 제거 전: {len(product_links)}개)"
             )
-            return unique_products[:20]
+            return unique_products[:40]
 
         except Exception as e:
             logger.error(f"Selenium 검색 오류: keyword='{keyword}', error='{e}'")
@@ -88,7 +88,7 @@ async def search_products_httpx(self, keyword: str) -> list[dict]:
                     product_links.append({"url": full_url, "title": title})
 
             logger.info(f"httpx로 발견한 상품 링크: {len(product_links)}개")
-            return product_links[:20]
+            return product_links[:40]
 
         except Exception as e:
             logger.error(f"httpx 검색 오류: keyword='{keyword}', error='{e}'")