Merged
Changes from all commits (21 commits)
d2c9336
Develop logic to use the workflow default_config (#190)
kakusiA Sep 23, 2025
b09558f
Write test code for the check-session and permissions APIs (#200)
bwnfo3 Sep 25, 2025
c993a53
Merge pull request #203 from Kernel180-BE12/main
can019 Sep 25, 2025
0fd54d6
feat: Integrate S3 with RDB
thkim7 Sep 25, 2025
9076e8a
chore: poetry run black . & spotlessApply
thkim7 Sep 25, 2025
e779b1b
Merge pull request #205 from Kernel180-BE12/feature/s3-rds
thkim7 Sep 25, 2025
ac8c784
refactor: Switch the crawl service from sequential to asynchronous crawling
thkim7 Sep 25, 2025
426b4cf
refactor: Fix task_run_id mismatch between the RDB and selection task execution
thkim7 Sep 25, 2025
f9845f3
chore: poetry run black .
thkim7 Sep 25, 2025
d561ab3
chore: spotlessApply
thkim7 Sep 25, 2025
51a257c
Merge pull request #208 from Kernel180-BE12/feature/refactor_service
thkim7 Sep 25, 2025
f3210d3
Write test code for manual workflow execution and retry logic (#209)
jihukimme Sep 25, 2025
773b61f
Workflow creation API (#211)
bwnfo3 Sep 25, 2025
f8b7026
Migrate timezones to Instant (UTC) (#210)
can019 Sep 25, 2025
9e4aa31
Fix Jackson timezone serialization failure (#212)
can019 Sep 26, 2025
a905f9d
Implement the ExecutionLog API and improve traceId consistency (#215)
can019 Sep 27, 2025
61150f9
Write user-related API tests and API documentation (#217)
can019 Sep 27, 2025
cfc6397
Speed up CI (Java) with Gradle caching (#218)
can019 Sep 27, 2025
bb23534
chore: Update build.gradle to version 0.1.0-SNAPSHOT
can019 Sep 27, 2025
b4a62ff
chore: Split out the document artifact step
can019 Sep 27, 2025
369e616
fix: Add working directory to the document-java step
can019 Sep 27, 2025
23 changes: 20 additions & 3 deletions .github/workflows/ci-java.yml
@@ -12,6 +12,7 @@ on:
      - release/**
    paths:
      - "apps/user-service/**"
      - ".github/workflows/ci-java.yml"

permissions:
  contents: read
@@ -32,12 +33,17 @@ jobs:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up JDK 21
        uses: actions/setup-java@v4
        with:
          java-version: '21'
          distribution: 'temurin'
          cache: 'gradle'

      - name: Setup Gradle
        uses: gradle/actions/setup-gradle@v3
        with:
          cache-read-only: ${{ github.event_name == 'pull_request' }}

      - name: Grant execute permission for Gradle wrapper
        run: chmod +x ./gradlew
@@ -59,12 +65,17 @@ jobs:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up JDK ${{ matrix.java-version }}
        uses: actions/setup-java@v4
        with:
          java-version: '${{ matrix.java-version }}'
          distribution: 'temurin'
          cache: 'gradle'

      - name: Setup Gradle
        uses: gradle/actions/setup-gradle@v3
        with:
          cache-read-only: ${{ github.event_name == 'pull_request' }}

      - name: Grant execute permission for Gradle wrapper
        run: chmod +x ./gradlew
@@ -78,9 +89,15 @@ jobs:
        run: |
          ./gradlew unitTest
          ./gradlew integrationTest
          ./gradlew javadoc
          if [ "${{ github.base_ref }}" = "main" ] || [[ "${{ github.ref }}" == refs/tags/* ]]; then
            ./gradlew e2eTest
          fi
        working-directory: apps/user-service

      - name: Generate document artifacts
        run: |
          ./gradlew javadoc
          if [ "${{ github.base_ref }}" = "main" ] || [[ "${{ github.ref }}" == refs/tags/* ]]; then
            ./gradlew openapi3
          fi
        working-directory: apps/user-service
24 changes: 24 additions & 0 deletions apps/pre-processing-service/app/api/endpoints/product.py
@@ -10,6 +10,7 @@
from ...service.search_service import SearchService
from ...service.match_service import MatchService
from ...service.similarity_service import SimilarityService
from ...service.product_selection_service import ProductSelectionService


# from ...service.similarity_service import SimilarityService
@@ -121,3 +122,26 @@ async def s3_upload(request: RequestS3Upload):
        raise HTTPException(status_code=e.status_code, detail=e.detail)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@router.post(
    "/select", response_model=ResponseProductSelect, summary="Select a product for content"
)
def select_product(request: RequestProductSelect):  # async removed
    """
    Selects the optimal product for content generation after the S3 upload completes.
    """
    try:
        selection_service = ProductSelectionService()
        response_data = selection_service.select_product_for_content(
            request
        )  # await removed

        if not response_data:
            raise CustomException(
                500, "Product selection failed.", "PRODUCT_SELECTION_FAILED"
            )

        return response_data
    except CustomException as e:
        # Preserve the CustomException status code, matching the s3_upload handler
        raise HTTPException(status_code=e.status_code, detail=e.detail)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
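
For reference, a minimal client-side sketch of calling the new endpoint. The host, port, route prefix, and payload values are hypothetical placeholders, and the "data" envelope is an assumption about ResponseBase:

# Hypothetical client call for the new selection endpoint.
import httpx

payload = {
    "task_run_id": 123,          # a run whose S3 upload already completed (placeholder ID)
    "selection_criteria": None,  # None falls back to the image-count-first default
}

resp = httpx.post("http://localhost:8000/select", json=payload)  # route prefix assumed
resp.raise_for_status()
body = resp.json()
print(body["data"]["selected_product"])  # "data" envelope assumed from ResponseBase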
39 changes: 32 additions & 7 deletions apps/pre-processing-service/app/model/schemas.py
@@ -161,10 +161,11 @@ class ResponseSadaguCrawl(ResponseBase[SadaguCrawlData]):
    pass


# ============== S3 image upload ==============
# ============== S3 upload ==============


class RequestS3Upload(RequestBase):
    task_run_id: int = Field(..., title="Task Run ID", description="Workflow run ID")
    keyword: str = Field(
        ..., title="Search keyword", description="Keyword used to generate the folder name"
    )  # added
@@ -227,12 +228,6 @@ class S3UploadData(BaseModel):
    uploaded_at: str = Field(
        ..., title="Upload completion time", description="Time the S3 upload completed"
    )
    # 🆕 Temporary: add only a single product for content generation (to be removed later)
    selected_product_for_content: Optional[Dict] = Field(
        None,
        title="Selected product for content generation",
        description="Temporary: the single product selected for blog content generation",
    )


# Final response model
@@ -242,6 +237,36 @@ class ResponseS3Upload(ResponseBase[S3UploadData]):
    pass


# ============== Product selection (newly added) ==============


class RequestProductSelect(RequestBase):
    task_run_id: int = Field(
        ..., title="Task Run ID", description="The task_run_id to select a product for"
    )
    selection_criteria: Optional[str] = Field(
        None, title="Selection criteria", description="Custom selection criteria (default: image count first)"
    )


# Response data model
class ProductSelectData(BaseModel):
    task_run_id: int = Field(..., title="Task Run ID")
    selected_product: Dict = Field(
        ..., title="Selected product", description="The product selected for content generation"
    )
    total_available_products: int = Field(
        ..., title="Total product count", description="Total number of products that were available to choose from"
    )


# Final response model
class ResponseProductSelect(ResponseBase[ProductSelectData]):
    """Product selection API response"""

    pass


# ============== Blog content generation ==============

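To make the new schemas concrete, here is a sketch of one request body and the matching response payload; every value is invented, and the "status"/"data" envelope fields are assumptions about ResponseBase:

# Hypothetical wire format for the product-selection schemas above.
request_body = {
    "task_run_id": 42,
    "selection_criteria": None,  # falls back to the image-count-first default
}

response_body = {
    "status": "success",  # assumed ResponseBase envelope field
    "data": {  # assumed envelope around ProductSelectData
        "task_run_id": 42,
        "selected_product": {"title": "Sample product", "price": 12900, "image_count": 7},
        "total_available_products": 5,
    },
}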
154 changes: 96 additions & 58 deletions apps/pre-processing-service/app/service/crawl_service.py
@@ -5,16 +5,21 @@
from app.model.schemas import RequestSadaguCrawl
from loguru import logger
from app.utils.response import Response
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"


class CrawlService:
    def __init__(self):
        pass

    async def crawl_product_detail(self, request: RequestSadaguCrawl) -> dict:
    async def crawl_product_detail(
        self, request: RequestSadaguCrawl, max_concurrent: int = 5
    ) -> dict:
        """
        Business logic that crawls the details of the selected products. (Step 5)
        Takes multiple product URLs and crawls their details sequentially, returning them as a dict.
        Takes multiple product URLs and crawls their details asynchronously, returning them as a dict.
        """
        product_urls = [str(url) for url in request.product_urls]

@@ -25,70 +30,44 @@ async def crawl_product_detail(self, request: RequestSadaguCrawl) -> dict:
        fail_count = 0

        try:
            # Crawl each product sequentially (for stability)
            for i, product_url in enumerate(product_urls, 1):
                logger.info(f"Starting crawl for product {i}/{len(product_urls)}: {product_url}")

                crawler = DetailCrawler(use_selenium=True)

                try:
                    # Run the detail crawl
                    product_detail = await crawler.crawl_detail(product_url)

                    if product_detail:
                        product_title = product_detail.get("title", "Unknown")[:50]
                        logger.success(
                            f"Product {i} crawled successfully: title='{product_title}', price={product_detail.get('price', 0)}"
                        )

                        # Append the successful product
                        crawled_products.append(
                            {
                                "index": i,
                                "url": product_url,
                                "product_detail": product_detail,
                                "status": "success",
                                "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"),
                            }
                        )
                        success_count += 1
                    else:
                        logger.error(f"Product {i} crawl failed: no detail data")
                        crawled_products.append(
                            {
                                "index": i,
                                "url": product_url,
                                "product_detail": None,
                                "status": "failed",
                                "error": "No detail data",
                                "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"),
                            }
                        )
                        fail_count += 1
            # Limit the number of concurrent crawls with a semaphore
            semaphore = asyncio.Semaphore(max_concurrent)

                except Exception as e:
                    logger.error(
                        f"Product {i} crawl error: url={product_url}, error='{e}'"
                    )
            # Launch all crawl tasks concurrently
            tasks = []
            for i, product_url in enumerate(product_urls, 1):
                task = self._crawl_single_with_semaphore(
                    semaphore, i, product_url, len(product_urls)
                )
                tasks.append(task)

            # Run all tasks at once and collect the results
            results = await asyncio.gather(*tasks, return_exceptions=True)

            # Organize the results
            for result in results:
                if isinstance(result, Exception):
                    logger.error(f"Crawl task error: {result}")
                    crawled_products.append(
                        {
                            "index": i,
                            "url": product_url,
                            "index": len(crawled_products) + 1,
                            "url": "unknown",
                            "product_detail": None,
                            "status": "failed",
                            "error": str(e),
                            "error": str(result),
                            "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"),
                        }
                    )
                    fail_count += 1
                else:
                    crawled_products.append(result)
                    if result["status"] == "success":
                        success_count += 1
                    else:
                        fail_count += 1

                finally:
                    # Clean up each crawler individually
                    await crawler.close()

                # Pause between products (to avoid overloading the server)
                if i < len(product_urls):
                    await asyncio.sleep(1)
            # Sort by index
            crawled_products.sort(key=lambda x: x["index"])

            logger.success(
                f"Crawling complete: {len(product_urls)} total, {success_count} succeeded, {fail_count} failed"
@@ -111,10 +90,69 @@ async def crawl_product_detail(self, request: RequestSadaguCrawl) -> dict:
logger.error(f"배치 크롤링 서비스 오류: error='{e}'")
raise InvalidItemDataException()

# 기존 단일 크롤링 메서드도 유지 (하위 호환성)
async def _crawl_single_with_semaphore(
self,
semaphore: asyncio.Semaphore,
index: int,
product_url: str,
total_count: int,
) -> dict:
"""
세마포어를 사용한 단일 상품 크롤링
"""
async with semaphore:
logger.info(f"상품 {index}/{total_count} 크롤링 시작: {product_url}")

crawler = DetailCrawler(use_selenium=True)

try:
# 상세 정보 크롤링 실행
product_detail = await crawler.crawl_detail(product_url)

if product_detail:
product_title = product_detail.get("title", "Unknown")[:50]
logger.success(
f"상품 {index} 크롤링 성공: title='{product_title}', price={product_detail.get('price', 0)}"
)

return {
"index": index,
"url": product_url,
"product_detail": product_detail,
"status": "success",
"crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"),
}
else:
logger.error(f"상품 {index} 크롤링 실패: 상세 정보 없음")
return {
"index": index,
"url": product_url,
"product_detail": None,
"status": "failed",
"error": "상세 정보 없음",
"crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"),
}

except Exception as e:
logger.error(
f"상품 {index} 크롤링 오류: url={product_url}, error='{e}'"
)
return {
"index": index,
"url": product_url,
"product_detail": None,
"status": "failed",
"error": str(e),
"crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"),
}

finally:
# 각 크롤러 개별 정리
await crawler.close()

async def crawl_single_product_detail(self, product_url: str) -> dict:
"""
단일 상품 크롤링 (하위 호환성용)
단일 상품 크롤링
"""
crawler = DetailCrawler(use_selenium=True)

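The heart of this refactor is the semaphore-plus-gather pattern: a semaphore caps how many crawls run at once, while asyncio.gather collects results in task order. A self-contained sketch of that pattern, with fake_crawl as a hypothetical stand-in for DetailCrawler.crawl_detail:

# Runnable sketch of bounded-concurrency crawling with asyncio.
import asyncio
import random


async def fake_crawl(url: str) -> dict:
    # Stand-in for DetailCrawler.crawl_detail: simulate network I/O.
    await asyncio.sleep(random.uniform(0.1, 0.3))
    return {"url": url, "status": "success"}


async def crawl_all(urls: list[str], max_concurrent: int = 5) -> list[dict]:
    semaphore = asyncio.Semaphore(max_concurrent)

    async def crawl_one(index: int, url: str) -> dict:
        async with semaphore:  # at most max_concurrent crawls in flight
            result = await fake_crawl(url)
            return {"index": index, **result}

    # gather preserves submission order, so results line up with the input URLs.
    results = await asyncio.gather(
        *(crawl_one(i, u) for i, u in enumerate(urls, 1)),
        return_exceptions=True,
    )
    return [r for r in results if not isinstance(r, Exception)]


if __name__ == "__main__":
    urls = [f"https://example.com/item/{i}" for i in range(10)]
    print(asyncio.run(crawl_all(urls)))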