Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 2 additions & 0 deletions .github/workflows/ci-python.yml
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,8 @@ name: CI (Python/FastAPI)

on:
push:
branches:
- feature/onnx
tags:
- 'pre-processing-v*'
pull_request:
Expand Down
354 changes: 177 additions & 177 deletions apps/pre-processing-service/app/service/similarity_service.py
Original file line number Diff line number Diff line change
@@ -1,177 +1,177 @@
# NOTE(review): everything below down to the live imports is the legacy
# (pre-ONNX) implementation kept as a commented-out copy. Consider deleting
# it — version control already preserves the old code.
# from app.utils.similarity_analyzer import SimilarityAnalyzer
# from app.errors.CustomException import InvalidItemDataException
# from ..model.schemas import RequestSadaguSimilarity
# from loguru import logger
#
#
# class SimilarityService:
# def __init__(self):
# pass
#
# def select_product_by_similarity(self, request: RequestSadaguSimilarity) -> dict:
# """
# BERT ๊ธฐ๋ฐ˜ ์œ ์‚ฌ๋„ ๋ถ„์„ ํ›„ ์ƒํ’ˆ ์„ ํƒ - 4๋‹จ๊ณ„
# """
# keyword = request.keyword
# candidates = request.matched_products
# fallback_products = request.search_results or []
#
# logger.info(
# f"์œ ์‚ฌ๋„ ๋ถ„์„ ์„œ๋น„์Šค ์‹œ์ž‘: job_id={request.job_id}, keyword='{keyword}', matched_count={len(candidates) if candidates else 0}, fallback_count={len(fallback_products)}"
# )
#
# # ๋งค์นญ๋œ ์ƒํ’ˆ์ด ์—†์œผ๋ฉด ์ „์ฒด ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ๋กœ ํด๋ฐฑ
# if not candidates:
# if not fallback_products:
# logger.warning(
# f"๋งค์นญ๋œ ์ƒํ’ˆ๊ณผ ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ๊ฐ€ ๋ชจ๋‘ ์—†์Œ: keyword='{keyword}'"
# )
# return {
# "job_id": request.job_id,
# "schedule_id": request.schedule_id,
# "schedule_his_id": request.schedule_his_id,
# "keyword": keyword,
# "selected_product": None,
# "reason": "๋งค์นญ๋œ ์ƒํ’ˆ๊ณผ ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ๊ฐ€ ๋ชจ๋‘ ์—†์Œ",
# "status": "success",
# }
#
# logger.info("๋งค์นญ๋œ ์ƒํ’ˆ ์—†์Œ โ†’ ์ „์ฒด ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ์—์„œ ์œ ์‚ฌ๋„ ๋ถ„์„")
# candidates = fallback_products
# analysis_mode = "fallback_similarity_only"
# else:
# analysis_mode = "matched_products"
#
# try:
# analyzer = SimilarityAnalyzer()
#
# logger.info(
# f"ํ‚ค์›Œ๋“œ '{keyword}'์™€ {len(candidates)}๊ฐœ ์ƒํ’ˆ์˜ ์œ ์‚ฌ๋„ ๋ถ„์„ ์‹œ์ž‘... (๋ชจ๋“œ: {analysis_mode})"
# )
#
# # ํ•œ ๊ฐœ๋งŒ ์žˆ์œผ๋ฉด ๋ฐ”๋กœ ์„ ํƒ
# if len(candidates) == 1:
# selected_product = candidates[0]
#
# logger.info("๋‹จ์ผ ํ›„๋ณด ์ƒํ’ˆ - ์œ ์‚ฌ๋„ ๊ฒ€์ฆ ์ง„ํ–‰")
# # ์œ ์‚ฌ๋„ ๊ณ„์‚ฐ
# similarity = analyzer.calculate_similarity(
# keyword, selected_product["title"]
# )
#
# # ํด๋ฐฑ ๋ชจ๋“œ์—์„œ๋Š” ์ž„๊ณ„๊ฐ’ ๊ฒ€์ฆ
# if analysis_mode == "fallback_similarity_only":
# similarity_threshold = 0.3
# if similarity < similarity_threshold:
# logger.warning(
# f"๋‹จ์ผ ์ƒํ’ˆ ์œ ์‚ฌ๋„ ๋ฏธ๋‹ฌ: similarity={similarity:.4f} < threshold={similarity_threshold}"
# )
# return {
# "job_id": request.job_id,
# "schedule_id": request.schedule_id,
# "schedule_his_id": request.schedule_his_id,
# "keyword": keyword,
# "selected_product": None,
# "reason": f"๋‹จ์ผ ์ƒํ’ˆ ์œ ์‚ฌ๋„({similarity:.4f}) < ๊ธฐ์ค€({similarity_threshold})",
# "status": "success",
# }
#
# selected_product["similarity_info"] = {
# "similarity_score": float(similarity),
# "analysis_type": "single_candidate",
# "analysis_mode": analysis_mode,
# }
#
# logger.success(
# f"๋‹จ์ผ ์ƒํ’ˆ ์„ ํƒ ์™„๋ฃŒ: title='{selected_product['title'][:30]}', similarity={similarity:.4f}"
# )
#
# return {
# "job_id": request.job_id,
# "schedule_id": request.schedule_id,
# "schedule_his_id": request.schedule_his_id,
# "keyword": keyword,
# "selected_product": selected_product,
# "reason": f"๋‹จ์ผ ์ƒํ’ˆ - ์œ ์‚ฌ๋„: {similarity:.4f} ({analysis_mode})",
# "status": "success",
# }
#
# # ์—ฌ๋Ÿฌ ๊ฐœ๊ฐ€ ์žˆ์œผ๋ฉด ์œ ์‚ฌ๋„ ๋น„๊ต
# logger.info("์—ฌ๋Ÿฌ ์ƒํ’ˆ ์ค‘ ์ตœ๊ณ  ์œ ์‚ฌ๋„๋กœ ์„ ํƒ...")
#
# # ์ œ๋ชฉ๋งŒ ์ถ”์ถœํ•ด์„œ ๋ฐฐ์น˜ ๋ถ„์„
# titles = [product["title"] for product in candidates]
# similarity_results = analyzer.analyze_similarity_batch(keyword, titles)
#
# # ๊ฒฐ๊ณผ ์ถœ๋ ฅ
# logger.info("์œ ์‚ฌ๋„ ๋ถ„์„ ๊ฒฐ๊ณผ:")
# for i, result in enumerate(similarity_results[:5]): # ์ƒ์œ„ 5๊ฐœ๋งŒ ๋กœ๊ทธ
# logger.info(
# f" {i+1}์œ„: {result['title'][:40]} | ์œ ์‚ฌ๋„: {result['similarity']:.4f}"
# )
#
# # ์ตœ๊ณ  ์œ ์‚ฌ๋„ ์„ ํƒ
# best_result = similarity_results[0]
# selected_product = candidates[best_result["index"]].copy()
#
# # ํด๋ฐฑ ๋ชจ๋“œ์—์„œ๋Š” ์ž„๊ณ„๊ฐ’ ๊ฒ€์ฆ
# similarity_threshold = 0.3
# if (
# analysis_mode == "fallback_similarity_only"
# and best_result["similarity"] < similarity_threshold
# ):
# logger.warning(
# f"์ตœ๊ณ  ์œ ์‚ฌ๋„ ๋ฏธ๋‹ฌ: similarity={best_result['similarity']:.4f} < threshold={similarity_threshold}"
# )
# return {
# "job_id": request.job_id,
# "schedule_id": request.schedule_id,
# "schedule_his_id": request.schedule_his_id,
# "keyword": keyword,
# "selected_product": None,
# "reason": f"์ตœ๊ณ  ์œ ์‚ฌ๋„({best_result['similarity']:.4f}) < ๊ธฐ์ค€({similarity_threshold})",
# "status": "success",
# }
#
# # ์œ ์‚ฌ๋„ ์ •๋ณด ์ถ”๊ฐ€
# selected_product["similarity_info"] = {
# "similarity_score": best_result["similarity"],
# "analysis_type": "multi_candidate_bert",
# "analysis_mode": analysis_mode,
# "rank": 1,
# "total_candidates": len(candidates),
# }
#
# # ๋งค์นญ ๋ชจ๋“œ์—์„œ๋Š” ์ข…ํ•ฉ ์ ์ˆ˜๋„ ๊ณ„์‚ฐ
# if analysis_mode == "matched_products" and "match_info" in selected_product:
# match_score = selected_product["match_info"]["match_score"]
# similarity_score = best_result["similarity"]
# # ๊ฐ€์ค‘์น˜: ๋งค์นญ 40%, ์œ ์‚ฌ๋„ 60%
# final_score = match_score * 0.4 + similarity_score * 0.6
# selected_product["final_score"] = final_score
# reason = f"์ข…ํ•ฉ์ ์ˆ˜({final_score:.4f}) = ๋งค์นญ({match_score:.4f})*0.4 + ์œ ์‚ฌ๋„({similarity_score:.4f})*0.6"
# logger.info(
# f"์ข…ํ•ฉ ์ ์ˆ˜ ๊ณ„์‚ฐ: match_score={match_score:.4f}, similarity_score={similarity_score:.4f}, final_score={final_score:.4f}"
# )
# else:
# reason = f"์œ ์‚ฌ๋„({best_result['similarity']:.4f}) ๊ธฐ์ค€ ์„ ํƒ ({analysis_mode})"
#
# logger.success(
# f"์ƒํ’ˆ ์„ ํƒ ์™„๋ฃŒ: title='{selected_product['title'][:30]}', {reason}"
# )
#
# return {
# "job_id": request.job_id,
# "schedule_id": request.schedule_id,
# "schedule_his_id": request.schedule_his_id,
# "keyword": keyword,
# "selected_product": selected_product,
# "reason": reason,
# "status": "success",
# }
#
# except Exception as e:
# logger.error(
# f"์œ ์‚ฌ๋„ ๋ถ„์„ ์„œ๋น„์Šค ์˜ค๋ฅ˜: job_id={request.job_id}, keyword='{keyword}', error='{e}'"
# )
# raise InvalidItemDataException()
from app.utils.similarity_analyzer import SimilarityAnalyzerONNX
from app.errors.CustomException import InvalidItemDataException
from ..model.schemas import RequestSadaguSimilarity
from loguru import logger


class SimilarityService:
    """Select the most relevant product for a keyword using BERT-based
    similarity scoring (ONNX runtime).

    Operates in one of two modes:
      * ``"matched_products"``         - candidates were pre-matched upstream.
      * ``"fallback_similarity_only"`` - no matched products; the raw search
        results are analysed instead and a minimum similarity threshold is
        enforced before anything is selected.
    """

    # Minimum similarity accepted in fallback mode; below this no product is
    # selected. Hoisted here so the single- and multi-candidate paths cannot
    # drift apart (it was previously hard-coded in two places as 0.3).
    SIMILARITY_THRESHOLD = 0.3

    def __init__(self):
        pass

    @staticmethod
    def _build_response(
        request: RequestSadaguSimilarity, selected_product, reason: str
    ) -> dict:
        """Assemble the response envelope shared by every exit path."""
        return {
            "job_id": request.job_id,
            "schedule_id": request.schedule_id,
            "schedule_his_id": request.schedule_his_id,
            "keyword": request.keyword,
            "selected_product": selected_product,
            "reason": reason,
            "status": "success",
        }

    def select_product_by_similarity(self, request: RequestSadaguSimilarity) -> dict:
        """Select a product after BERT-based similarity analysis (pipeline stage 4).

        Args:
            request: carries the keyword, the pre-matched products and the raw
                search results used as a fallback.

        Returns:
            dict: response envelope; ``selected_product`` is ``None`` when no
            candidate exists or none clears the fallback threshold.

        Raises:
            InvalidItemDataException: on any unexpected analysis failure.
        """
        keyword = request.keyword
        candidates = request.matched_products
        fallback_products = request.search_results or []

        logger.info(
            f"์œ ์‚ฌ๋„ ๋ถ„์„ ์„œ๋น„์Šค ์‹œ์ž‘: job_id={request.job_id}, keyword='{keyword}', matched_count={len(candidates) if candidates else 0}, fallback_count={len(fallback_products)}"
        )

        # No matched products -> fall back to the raw search results.
        if not candidates:
            if not fallback_products:
                logger.warning(
                    f"๋งค์นญ๋œ ์ƒํ’ˆ๊ณผ ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ๊ฐ€ ๋ชจ๋‘ ์—†์Œ: keyword='{keyword}'"
                )
                return self._build_response(
                    request, None, "๋งค์นญ๋œ ์ƒํ’ˆ๊ณผ ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ๊ฐ€ ๋ชจ๋‘ ์—†์Œ"
                )

            logger.info("๋งค์นญ๋œ ์ƒํ’ˆ ์—†์Œ โ†’ ์ „์ฒด ๊ฒ€์ƒ‰ ๊ฒฐ๊ณผ์—์„œ ์œ ์‚ฌ๋„ ๋ถ„์„")
            candidates = fallback_products
            analysis_mode = "fallback_similarity_only"
        else:
            analysis_mode = "matched_products"

        try:
            analyzer = SimilarityAnalyzerONNX()

            logger.info(
                f"ํ‚ค์›Œ๋“œ '{keyword}'์™€ {len(candidates)}๊ฐœ ์ƒํ’ˆ์˜ ์œ ์‚ฌ๋„ ๋ถ„์„ ์‹œ์ž‘... (๋ชจ๋“œ: {analysis_mode})"
            )

            if len(candidates) == 1:
                return self._select_single(
                    request, analyzer, candidates[0], analysis_mode
                )
            return self._select_best(request, analyzer, candidates, analysis_mode)

        except Exception as e:
            logger.error(
                f"์œ ์‚ฌ๋„ ๋ถ„์„ ์„œ๋น„์Šค ์˜ค๋ฅ˜: job_id={request.job_id}, keyword='{keyword}', error='{e}'"
            )
            # Chain the original cause for debuggability; callers still catch
            # the same exception type.
            raise InvalidItemDataException() from e

    def _select_single(
        self,
        request: RequestSadaguSimilarity,
        analyzer,
        candidate: dict,
        analysis_mode: str,
    ) -> dict:
        """Score the lone candidate; in fallback mode reject it below threshold."""
        logger.info("๋‹จ์ผ ํ›„๋ณด ์ƒํ’ˆ - ์œ ์‚ฌ๋„ ๊ฒ€์ฆ ์ง„ํ–‰")

        # Copy so the caller's request data is not mutated (the multi-candidate
        # path already copied; the original single path did not — bug fix).
        selected_product = candidate.copy()
        similarity = analyzer.calculate_similarity(
            request.keyword, selected_product["title"]
        )

        # Threshold check applies only to fallback (unmatched) results.
        if (
            analysis_mode == "fallback_similarity_only"
            and similarity < self.SIMILARITY_THRESHOLD
        ):
            logger.warning(
                f"๋‹จ์ผ ์ƒํ’ˆ ์œ ์‚ฌ๋„ ๋ฏธ๋‹ฌ: similarity={similarity:.4f} < threshold={self.SIMILARITY_THRESHOLD}"
            )
            return self._build_response(
                request,
                None,
                f"๋‹จ์ผ ์ƒํ’ˆ ์œ ์‚ฌ๋„({similarity:.4f}) < ๊ธฐ์ค€({self.SIMILARITY_THRESHOLD})",
            )

        selected_product["similarity_info"] = {
            "similarity_score": float(similarity),
            "analysis_type": "single_candidate",
            "analysis_mode": analysis_mode,
        }

        logger.success(
            f"๋‹จ์ผ ์ƒํ’ˆ ์„ ํƒ ์™„๋ฃŒ: title='{selected_product['title'][:30]}', similarity={similarity:.4f}"
        )

        return self._build_response(
            request,
            selected_product,
            f"๋‹จ์ผ ์ƒํ’ˆ - ์œ ์‚ฌ๋„: {similarity:.4f} ({analysis_mode})",
        )

    def _select_best(
        self,
        request: RequestSadaguSimilarity,
        analyzer,
        candidates: list,
        analysis_mode: str,
    ) -> dict:
        """Batch-score all candidates and pick the highest-similarity product."""
        keyword = request.keyword

        logger.info("์—ฌ๋Ÿฌ ์ƒํ’ˆ ์ค‘ ์ตœ๊ณ  ์œ ์‚ฌ๋„๋กœ ์„ ํƒ...")

        # Batch analysis over titles only; results are assumed sorted by
        # similarity, best first, each carrying the candidate "index".
        titles = [product["title"] for product in candidates]
        similarity_results = analyzer.analyze_similarity_batch(keyword, titles)

        logger.info("์œ ์‚ฌ๋„ ๋ถ„์„ ๊ฒฐ๊ณผ:")
        for i, result in enumerate(similarity_results[:5]):  # log top 5 only
            logger.info(
                f" {i+1}์œ„: {result['title'][:40]} | ์œ ์‚ฌ๋„: {result['similarity']:.4f}"
            )

        best_result = similarity_results[0]
        # Copy so the caller's request data is not mutated.
        selected_product = candidates[best_result["index"]].copy()

        # Threshold check applies only to fallback (unmatched) results.
        if (
            analysis_mode == "fallback_similarity_only"
            and best_result["similarity"] < self.SIMILARITY_THRESHOLD
        ):
            logger.warning(
                f"์ตœ๊ณ  ์œ ์‚ฌ๋„ ๋ฏธ๋‹ฌ: similarity={best_result['similarity']:.4f} < threshold={self.SIMILARITY_THRESHOLD}"
            )
            return self._build_response(
                request,
                None,
                f"์ตœ๊ณ  ์œ ์‚ฌ๋„({best_result['similarity']:.4f}) < ๊ธฐ์ค€({self.SIMILARITY_THRESHOLD})",
            )

        selected_product["similarity_info"] = {
            "similarity_score": best_result["similarity"],
            "analysis_type": "multi_candidate_bert",
            "analysis_mode": analysis_mode,
            "rank": 1,
            "total_candidates": len(candidates),
        }

        # In matched mode, combine the upstream match score with similarity.
        if analysis_mode == "matched_products" and "match_info" in selected_product:
            match_score = selected_product["match_info"]["match_score"]
            similarity_score = best_result["similarity"]
            # Weights: matching 40%, similarity 60%.
            final_score = match_score * 0.4 + similarity_score * 0.6
            selected_product["final_score"] = final_score
            reason = f"์ข…ํ•ฉ์ ์ˆ˜({final_score:.4f}) = ๋งค์นญ({match_score:.4f})*0.4 + ์œ ์‚ฌ๋„({similarity_score:.4f})*0.6"
            logger.info(
                f"์ข…ํ•ฉ ์ ์ˆ˜ ๊ณ„์‚ฐ: match_score={match_score:.4f}, similarity_score={similarity_score:.4f}, final_score={final_score:.4f}"
            )
        else:
            reason = f"์œ ์‚ฌ๋„({best_result['similarity']:.4f}) ๊ธฐ์ค€ ์„ ํƒ ({analysis_mode})"

        logger.success(
            f"์ƒํ’ˆ ์„ ํƒ ์™„๋ฃŒ: title='{selected_product['title'][:30]}', {reason}"
        )

        return self._build_response(request, selected_product, reason)
Loading
Loading