Merged
Changes from all commits (21 commits)
d2c9336
Develop logic to use the workflow default_config (#190)
kakusiA Sep 23, 2025
b09558f
Write test code for the check-session and permissions APIs (#200)
bwnfo3 Sep 25, 2025
c993a53
Merge pull request #203 from Kernel180-BE12/main
can019 Sep 25, 2025
0fd54d6
feat: Integrate S3 with RDB
thkim7 Sep 25, 2025
9076e8a
chore: poetry run black . & spotlessApply
thkim7 Sep 25, 2025
e779b1b
Merge pull request #205 from Kernel180-BE12/feature/s3-rds
thkim7 Sep 25, 2025
ac8c784
refactor: Switch the crawl service from sequential to asynchronous crawling
thkim7 Sep 25, 2025
426b4cf
refactor: Fix task_run_id mismatch between the RDB and selection task execution
thkim7 Sep 25, 2025
f9845f3
chore: poetry run black .
thkim7 Sep 25, 2025
d561ab3
chore: spotlessApply
thkim7 Sep 25, 2025
51a257c
Merge pull request #208 from Kernel180-BE12/feature/refactor_service
thkim7 Sep 25, 2025
f3210d3
Write test code for manual workflow execution and retry logic (#209)
jihukimme Sep 25, 2025
773b61f
Workflow creation API (#211)
bwnfo3 Sep 25, 2025
f8b7026
Migrate timezones to Instant (UTC) (#210)
can019 Sep 25, 2025
9e4aa31
Fix Jackson timezone serialization failure (#212)
can019 Sep 26, 2025
a905f9d
Implement the ExecutionLog API and improve traceId consistency (#215)
can019 Sep 27, 2025
61150f9
Write user-related API tests and API documentation (#217)
can019 Sep 27, 2025
cfc6397
Speed up CI (Java) with Gradle caching (#218)
can019 Sep 27, 2025
bb23534
chore: Update build.gradle to version 0.1.0-SNAPSHOT
can019 Sep 27, 2025
b4a62ff
chore: Split out the document artifact step
can019 Sep 27, 2025
369e616
fix: Add working directory to the document-java step
can019 Sep 27, 2025
23 changes: 20 additions & 3 deletions .github/workflows/ci-java.yml
@@ -12,6 +12,7 @@ on:
      - release/**
    paths:
      - "apps/user-service/**"
      - ".github/workflows/ci-java.yml"

permissions:
  contents: read
@@ -32,12 +33,17 @@ jobs:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up JDK 21
        uses: actions/setup-java@v4
        with:
          java-version: '21'
          distribution: 'temurin'
          cache: 'gradle'

      - name: Setup Gradle
        uses: gradle/actions/setup-gradle@v3
        with:
          cache-read-only: ${{ github.event_name == 'pull_request' }}

      - name: Grant execute permission for Gradle wrapper
        run: chmod +x ./gradlew
@@ -59,12 +65,17 @@ jobs:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Set up JDK ${{ matrix.java-version }}
        uses: actions/setup-java@v4
        with:
          java-version: '${{ matrix.java-version }}'
          distribution: 'temurin'
          cache: 'gradle'

      - name: Setup Gradle
        uses: gradle/actions/setup-gradle@v3
        with:
          cache-read-only: ${{ github.event_name == 'pull_request' }}

      - name: Grant execute permission for Gradle wrapper
        run: chmod +x ./gradlew
@@ -78,9 +89,15 @@ jobs:
        run: |
          ./gradlew unitTest
          ./gradlew integrationTest
          ./gradlew javadoc
          if [ "${{ github.base_ref }}" = "main" ] || [[ "${{ github.ref }}" == refs/tags/* ]]; then
            ./gradlew e2eTest
          fi
        working-directory: apps/user-service

      - name: Generate document artifacts
        run: |
          ./gradlew javadoc
          if [ "${{ github.base_ref }}" = "main" ] || [[ "${{ github.ref }}" == refs/tags/* ]]; then
            ./gradlew openapi3
          fi
        working-directory: apps/user-service
24 changes: 24 additions & 0 deletions apps/pre-processing-service/app/api/endpoints/product.py
@@ -10,6 +10,7 @@
from ...service.search_service import SearchService
from ...service.match_service import MatchService
from ...service.similarity_service import SimilarityService
from ...service.product_selection_service import ProductSelectionService


# from ...service.similarity_service import SimilarityService
@@ -121,3 +122,26 @@ async def s3_upload(request: RequestS3Upload):
        raise HTTPException(status_code=e.status_code, detail=e.detail)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))


@router.post(
    "/select", response_model=ResponseProductSelect, summary="Select a product for content"
)
def select_product(request: RequestProductSelect):  # async removed
    """
    Selects the optimal product for content generation after the S3 upload completes.
    """
    try:
        selection_service = ProductSelectionService()
        response_data = selection_service.select_product_for_content(
            request
        )  # await removed

        if not response_data:
            raise CustomException(
                500, "Product selection failed.", "PRODUCT_SELECTION_FAILED"
            )

        return response_data
    except CustomException as e:
        # Preserve the CustomException status code, matching the s3_upload handler
        raise HTTPException(status_code=e.status_code, detail=e.detail)
    except Exception as e:
        raise HTTPException(status_code=500, detail=str(e))
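
For reference, a minimal client-side sketch of calling the new endpoint. The host, port, route prefix, and payload values are hypothetical placeholders, and the "data" envelope is an assumption about ResponseBase:

# Hypothetical client call for the new selection endpoint.
import httpx

payload = {
    "task_run_id": 123,          # a run whose S3 upload already completed (placeholder ID)
    "selection_criteria": None,  # None falls back to the image-count-first default
}

resp = httpx.post("http://localhost:8000/select", json=payload)  # route prefix assumed
resp.raise_for_status()
body = resp.json()
print(body["data"]["selected_product"])  # "data" envelope assumed from ResponseBase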
39 changes: 32 additions & 7 deletions apps/pre-processing-service/app/model/schemas.py
@@ -161,10 +161,11 @@ class ResponseSadaguCrawl(ResponseBase[SadaguCrawlData]):
    pass


# ============== S3 image upload ==============
# ============== S3 upload ==============


class RequestS3Upload(RequestBase):
    task_run_id: int = Field(..., title="Task Run ID", description="Workflow run ID")
    keyword: str = Field(
        ..., title="Search keyword", description="Keyword used to generate the folder name"
    )  # added
@@ -227,12 +228,6 @@ class S3UploadData(BaseModel):
    uploaded_at: str = Field(
        ..., title="Upload completion time", description="Time the S3 upload completed"
    )
    # 🆕 Temporary: add only a single product for content generation (to be removed later)
    selected_product_for_content: Optional[Dict] = Field(
        None,
        title="Selected product for content generation",
        description="Temporary: the single product selected for blog content generation",
    )


# Final response model
@@ -242,6 +237,36 @@ class ResponseS3Upload(ResponseBase[S3UploadData]):
    pass


# ============== Product selection (newly added) ==============


class RequestProductSelect(RequestBase):
    task_run_id: int = Field(
        ..., title="Task Run ID", description="The task_run_id to select a product for"
    )
    selection_criteria: Optional[str] = Field(
        None, title="Selection criteria", description="Custom selection criteria (default: image count first)"
    )


# Response data model
class ProductSelectData(BaseModel):
    task_run_id: int = Field(..., title="Task Run ID")
    selected_product: Dict = Field(
        ..., title="Selected product", description="The product selected for content generation"
    )
    total_available_products: int = Field(
        ..., title="Total product count", description="Total number of products that were available to choose from"
    )


# Final response model
class ResponseProductSelect(ResponseBase[ProductSelectData]):
    """Product selection API response"""

    pass


# ============== Blog content generation ==============

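To make the new schemas concrete, here is a sketch of one request body and the matching response payload; every value is invented, and the "status"/"data" envelope fields are assumptions about ResponseBase:

# Hypothetical wire format for the product-selection schemas above.
request_body = {
    "task_run_id": 42,
    "selection_criteria": None,  # falls back to the image-count-first default
}

response_body = {
    "status": "success",  # assumed ResponseBase envelope field
    "data": {  # assumed envelope around ProductSelectData
        "task_run_id": 42,
        "selected_product": {"title": "Sample product", "price": 12900, "image_count": 7},
        "total_available_products": 5,
    },
}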
154 changes: 96 additions & 58 deletions apps/pre-processing-service/app/service/crawl_service.py
@@ -5,16 +5,21 @@
from app.model.schemas import RequestSadaguCrawl
from loguru import logger
from app.utils.response import Response
import os

os.environ["TOKENIZERS_PARALLELISM"] = "false"


class CrawlService:
    def __init__(self):
        pass

    async def crawl_product_detail(self, request: RequestSadaguCrawl) -> dict:
    async def crawl_product_detail(
        self, request: RequestSadaguCrawl, max_concurrent: int = 5
    ) -> dict:
        """
        Business logic that crawls the details of the selected products. (Step 5)
        Takes multiple product URLs and crawls their details sequentially, returning them as a dict.
        Takes multiple product URLs and crawls their details asynchronously, returning them as a dict.
        """
        product_urls = [str(url) for url in request.product_urls]

@@ -25,70 +30,44 @@ async def crawl_product_detail(self, request: RequestSadaguCrawl) -> dict:
        fail_count = 0

        try:
            # Crawl each product sequentially (for stability)
            for i, product_url in enumerate(product_urls, 1):
                logger.info(f"Starting crawl for product {i}/{len(product_urls)}: {product_url}")

                crawler = DetailCrawler(use_selenium=True)

                try:
                    # Run the detail crawl
                    product_detail = await crawler.crawl_detail(product_url)

                    if product_detail:
                        product_title = product_detail.get("title", "Unknown")[:50]
                        logger.success(
                            f"Product {i} crawled successfully: title='{product_title}', price={product_detail.get('price', 0)}"
                        )

                        # Append the successful product
                        crawled_products.append(
                            {
                                "index": i,
                                "url": product_url,
                                "product_detail": product_detail,
                                "status": "success",
                                "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"),
                            }
                        )
                        success_count += 1
                    else:
                        logger.error(f"Product {i} crawl failed: no detail data")
                        crawled_products.append(
                            {
                                "index": i,
                                "url": product_url,
                                "product_detail": None,
                                "status": "failed",
                                "error": "No detail data",
                                "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"),
                            }
                        )
                        fail_count += 1
            # Limit the number of concurrent crawls with a semaphore
            semaphore = asyncio.Semaphore(max_concurrent)

                except Exception as e:
                    logger.error(
                        f"Product {i} crawl error: url={product_url}, error='{e}'"
                    )
            # Launch all crawl tasks concurrently
            tasks = []
            for i, product_url in enumerate(product_urls, 1):
                task = self._crawl_single_with_semaphore(
                    semaphore, i, product_url, len(product_urls)
                )
                tasks.append(task)

            # Run all tasks at once and collect the results
            results = await asyncio.gather(*tasks, return_exceptions=True)

            # Organize the results
            for result in results:
                if isinstance(result, Exception):
                    logger.error(f"Crawl task error: {result}")
                    crawled_products.append(
                        {
                            "index": i,
                            "url": product_url,
                            "index": len(crawled_products) + 1,
                            "url": "unknown",
                            "product_detail": None,
                            "status": "failed",
                            "error": str(e),
                            "error": str(result),
                            "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"),
                        }
                    )
                    fail_count += 1
                else:
                    crawled_products.append(result)
                    if result["status"] == "success":
                        success_count += 1
                    else:
                        fail_count += 1

                finally:
                    # Clean up each crawler individually
                    await crawler.close()

                # Pause between products (to avoid overloading the server)
                if i < len(product_urls):
                    await asyncio.sleep(1)
            # Sort by index
            crawled_products.sort(key=lambda x: x["index"])

            logger.success(
                f"Crawling complete: {len(product_urls)} total, {success_count} succeeded, {fail_count} failed"
@@ -111,10 +90,69 @@ async def crawl_product_detail(self, request: RequestSadaguCrawl) -> dict:
logger.error(f"배치 크롤링 서비스 오류: error='{e}'")
raise InvalidItemDataException()

# 기존 단일 크롤링 메서드도 유지 (하위 호환성)
async def _crawl_single_with_semaphore(
self,
semaphore: asyncio.Semaphore,
index: int,
product_url: str,
total_count: int,
) -> dict:
"""
세마포어를 사용한 단일 상품 크롤링
"""
async with semaphore:
logger.info(f"상품 {index}/{total_count} 크롤링 시작: {product_url}")

crawler = DetailCrawler(use_selenium=True)

try:
# 상세 정보 크롤링 실행
product_detail = await crawler.crawl_detail(product_url)

if product_detail:
product_title = product_detail.get("title", "Unknown")[:50]
logger.success(
f"상품 {index} 크롤링 성공: title='{product_title}', price={product_detail.get('price', 0)}"
)

return {
"index": index,
"url": product_url,
"product_detail": product_detail,
"status": "success",
"crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"),
}
else:
logger.error(f"상품 {index} 크롤링 실패: 상세 정보 없음")
return {
"index": index,
"url": product_url,
"product_detail": None,
"status": "failed",
"error": "상세 정보 없음",
"crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"),
}

except Exception as e:
logger.error(
f"상품 {index} 크롤링 오류: url={product_url}, error='{e}'"
)
return {
"index": index,
"url": product_url,
"product_detail": None,
"status": "failed",
"error": str(e),
"crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"),
}

finally:
# 각 크롤러 개별 정리
await crawler.close()

async def crawl_single_product_detail(self, product_url: str) -> dict:
"""
단일 상품 크롤링 (하위 호환성용)
단일 상품 크롤링
"""
crawler = DetailCrawler(use_selenium=True)

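The heart of this refactor is the semaphore-plus-gather pattern: a semaphore caps how many crawls run at once, while asyncio.gather collects results in task order. A self-contained sketch of that pattern, with fake_crawl as a hypothetical stand-in for DetailCrawler.crawl_detail:

# Runnable sketch of bounded-concurrency crawling with asyncio.
import asyncio
import random


async def fake_crawl(url: str) -> dict:
    # Stand-in for DetailCrawler.crawl_detail: simulate network I/O.
    await asyncio.sleep(random.uniform(0.1, 0.3))
    return {"url": url, "status": "success"}


async def crawl_all(urls: list[str], max_concurrent: int = 5) -> list[dict]:
    semaphore = asyncio.Semaphore(max_concurrent)

    async def crawl_one(index: int, url: str) -> dict:
        async with semaphore:  # at most max_concurrent crawls in flight
            result = await fake_crawl(url)
            return {"index": index, **result}

    # gather preserves submission order, so results line up with the input URLs.
    results = await asyncio.gather(
        *(crawl_one(i, u) for i, u in enumerate(urls, 1)),
        return_exceptions=True,
    )
    return [r for r in results if not isinstance(r, Exception)]


if __name__ == "__main__":
    urls = [f"https://example.com/item/{i}" for i in range(10)]
    print(asyncio.run(crawl_all(urls)))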