Merged
2 changes: 1 addition & 1 deletion apps/pre-processing-service/app/service/crawl_service.py
@@ -24,7 +24,7 @@ async def crawl_product_detail(self, request: RequestSadaguCrawl) -> dict:
 
         # Run the detail-page crawl
         product_detail = await crawler.crawl_detail(
-            product_url=str(request.product_url), include_images=False
+            product_url=str(request.product_url)
         )
 
         if not product_detail:
169 changes: 62 additions & 107 deletions apps/pre-processing-service/app/service/crawlers/detail_crawler.py
@@ -8,19 +8,17 @@
 class DetailCrawler(SearchCrawler):
     """Detail crawling class extending SearchCrawler"""
 
-    async def crawl_detail(
-        self, product_url: str, include_images: bool = False
-    ) -> dict:
-        """Crawl product detail info"""
+    async def crawl_detail(self, product_url: str) -> dict:
+        """Crawl product detail info (images always included)"""
         try:
-            logger.info(
-                f"Starting product detail crawl: url='{product_url}', include_images={include_images}"
-            )
+            logger.info(f"Starting product detail crawl: url='{product_url}'")
 
-            if self.use_selenium:
-                soup = await self._get_soup_selenium(product_url)
-            else:
-                soup = await self._get_soup_httpx(product_url)
+            # Fetch the HTML
+            soup = (
+                await self._get_soup_selenium(product_url)
+                if self.use_selenium
+                else await self._get_soup_httpx(product_url)
+            )
 
             # Extract basic information
             title = self._extract_title(soup)
@@ -29,6 +27,15 @@ async def crawl_detail(
             options = self._extract_options(soup)
             material_info = self._extract_material_info(soup)
 
+            # Extract image info (always runs)
+            logger.info("Extracting image info...")
+            page_images = self._extract_images(soup)
+            option_images = [
+                opt["image_url"] for opt in options if opt.get("image_url")
+            ]
+            # Merge after removing duplicates
+            all_images = list(set(page_images + option_images))
+
             product_data = {
                 "url": product_url,
                 "title": title,
@@ -37,23 +44,13 @@
                 "options": options,
                 "material_info": material_info,
                 "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"),
+                "product_images": [{"original_url": url} for url in all_images],
             }
 
+            logger.info(f"Images extracted: {len(all_images)}")
             logger.info(
-                f"Basic product info extraction complete: title='{title[:50]}', price={price}, rating={rating}, options_count={len(options)}"
+                f"Product detail crawl complete: title='{title[:50]}', price={price}, rating={rating}, options_count={len(options)}"
             )
 
-            if include_images:
-                logger.info("Extracting image info...")
-                product_images = self._extract_images(soup)
-                product_data["product_images"] = [
-                    {"original_url": img_url} for img_url in product_images
-                ]
-                logger.info(f"Images extracted: {len(product_images)}")
-            else:
-                product_data["product_images"] = []
-
-            logger.info(f"Product detail crawl complete: url='{product_url}'")
             return product_data
 
         except Exception as e:
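Taken together with the service change in the first file, a minimal caller-side sketch of the new contract (the constructor arguments are an assumption; only `use_selenium` is visible in this diff):

# Hypothetical call site; crawl_detail no longer accepts include_images.
crawler = DetailCrawler(use_selenium=False)
detail = await crawler.crawl_detail(product_url="https://example.com/item/123")

# "product_images" is now always present in the result (possibly empty):
for img in detail["product_images"]:
    print(img["original_url"])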
@@ -89,54 +86,43 @@ async def _get_soup_httpx(self, product_url: str) -> BeautifulSoup:
             raise Exception(f"HTTP request failed: {e}")
 
     def _extract_title(self, soup: BeautifulSoup) -> str:
         """Extract the title"""
         title_element = soup.find("h1", {"id": "kakaotitle"})
         title = title_element.get_text(strip=True) if title_element else "No title"
         logger.debug(f"Title extracted: '{title[:50]}'")
         return title

     def _extract_price(self, soup: BeautifulSoup) -> int:
         """Extract the price"""
         price = 0
-        price_selectors = [
+        selectors = [
             "span.price.gsItemPriceKWR",
             ".pdt_price span.price",
             "span.price",
             ".price",
         ]
 
-        for selector in price_selectors:
-            price_element = soup.select_one(selector)
-            if price_element:
-                price_text = (
-                    price_element.get_text(strip=True)
-                    .replace(",", "")
-                    .replace("원", "")
-                )
-                price_match = re.search(r"(\d+)", price_text)
-                if price_match:
-                    price = int(price_match.group(1))
-                    logger.debug(f"Price extracted: {price} KRW (selector: {selector})")
+        for sel in selectors:
+            el = soup.select_one(sel)
+            if el:
+                text = el.get_text(strip=True).replace(",", "").replace("원", "")
+                match = re.search(r"(\d+)", text)
+                if match:
+                    price = int(match.group(1))
+                    logger.debug(f"Price extracted: {price} KRW (selector: {sel})")
                     break
 
         if price == 0:
             logger.debug("Price extraction failed - defaulting to 0")
 
         return price
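To illustrate what the slimmed-down parsing does: the selected element's text is stripped of commas and the won sign ("원"), and the first digit run wins. Sample strings below are invented:

import re

for raw in ["12,900원", "₩ 5,000", "가격: 1,234,000원"]:
    text = raw.replace(",", "").replace("원", "")
    match = re.search(r"(\d+)", text)
    print(int(match.group(1)) if match else 0)  # 12900, 5000, 1234000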

     def _extract_rating(self, soup: BeautifulSoup) -> float:
         """Extract the rating"""
         rating = 0.0
-        star_containers = [
+        containers = [
             soup.find("a", class_="start"),
             soup.find("div", class_=re.compile(r"star|rating")),
             soup.find("a", href="#reviews_wrap"),
         ]
 
-        for container in star_containers:
-            if container:
-                star_imgs = container.find_all("img")
-                for img in star_imgs:
+        for cont in containers:
+            if cont:
+                imgs = cont.find_all("img")
+                for img in imgs:
                     src = img.get("src", "")
                     if "icon_star.svg" in src:
                         rating += 1
@@ -145,88 +131,57 @@ def _extract_rating(self, soup: BeautifulSoup) -> float:
             if rating > 0:
                 logger.debug(f"Rating extracted: {rating}")
                 break
 
         if rating == 0.0:
             logger.debug("Rating extraction failed - defaulting to 0.0")
 
         return rating
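For context, the rating here is literally a count of filled-star icons in the first matching container; a toy demonstration on invented markup:

from bs4 import BeautifulSoup

html = (
    '<a class="start">'
    '<img src="/ico/icon_star.svg"><img src="/ico/icon_star.svg">'
    '<img src="/ico/icon_star.svg"><img src="/ico/icon_star_empty.svg">'
    '</a>'
)
soup = BeautifulSoup(html, "html.parser")
rating = sum(
    1
    for img in soup.find("a", class_="start").find_all("img")
    if "icon_star.svg" in img.get("src", "")
)
print(rating)  # 3 - the empty-star icon does not match the substring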

     def _extract_options(self, soup: BeautifulSoup) -> list[dict]:
         """Extract product options"""
         options = []
         sku_list = soup.find("ul", {"id": "skubox"})
 
         if sku_list:
-            option_items = sku_list.find_all("li", class_=re.compile(r"imgWrapper"))
-            logger.debug(f"Option items found: {len(option_items)}")
-
-            for item in option_items:
-                title_element = item.find("a", title=True)
-                if title_element:
-                    option_name = title_element.get("title", "").strip()
-
-                    # Extract stock info
+            items = sku_list.find_all("li", class_=re.compile(r"imgWrapper"))
+            for item in items:
+                title_el = item.find("a", title=True)
+                if title_el:
+                    name = title_el.get("title", "").strip()
                     stock = 0
-                    item_text = item.get_text()
-                    stock_match = re.search(r"재고\s*:\s*(\d+)", item_text)
+                    stock_match = re.search(r"재고\s*:\s*(\d+)", item.get_text())
                     if stock_match:
                         stock = int(stock_match.group(1))
 
-                    # Extract the image URL
-                    img_element = item.find("img", class_="colorSpec_hashPic")
-                    image_url = ""
-                    if img_element and img_element.get("src"):
-                        image_url = img_element["src"]
-
-                    if option_name:
+                    img_el = item.find("img", class_="colorSpec_hashPic")
+                    img_url = img_el["src"] if img_el and img_el.get("src") else ""
+                    if name:
                         options.append(
-                            {
-                                "name": option_name,
-                                "stock": stock,
-                                "image_url": image_url,
-                            }
+                            {"name": name, "stock": stock, "image_url": img_url}
                         )
-                        logger.debug(f"Option extracted: name='{option_name}', stock={stock}")
 
         logger.info(f"Extracted {len(options)} options in total")
         return options
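The stock figure still comes from a literal Korean "재고 : N" (stock) label in each option's text, which is why the Korean regex stays even though the logs were trimmed; a quick check with an invented option string:

import re

item_text = "블랙 / Free  재고 : 27"
match = re.search(r"재고\s*:\s*(\d+)", item_text)
print(int(match.group(1)) if match else 0)  # 27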

     def _extract_material_info(self, soup: BeautifulSoup) -> dict:
         """Extract material info"""
         material_info = {}
-        info_items = soup.find_all("div", class_="pro-info-item")
-
-        for item in info_items:
-            title_element = item.find("div", class_="pro-info-title")
-            info_element = item.find("div", class_="pro-info-info")
-
-            if title_element and info_element:
-                title = title_element.get_text(strip=True)
-                info = info_element.get_text(strip=True)
-                material_info[title] = info
-                logger.debug(f"Material info extracted: {title}='{info}'")
-
+        items = soup.find_all("div", class_="pro-info-item")
+        for item in items:
+            title_el = item.find("div", class_="pro-info-title")
+            info_el = item.find("div", class_="pro-info-info")
+            if title_el and info_el:
+                material_info[title_el.get_text(strip=True)] = info_el.get_text(
+                    strip=True
+                )
        logger.info(f"Extracted {len(material_info)} material info entries in total")
         return material_info

     def _extract_images(self, soup: BeautifulSoup) -> list[str]:
         """Extract product images"""
         images = []
         # img_translate_x pattern
         img_elements = soup.find_all("img", {"id": re.compile(r"img_translate_\d+")})
 
         for img in img_elements:
-            src = img.get("src", "")
-            if src:
-                if src.startswith("//"):
-                    src = "https:" + src
-                elif src.startswith("/"):
-                    src = self.base_url + src
-                elif src.startswith("http"):
-                    pass
-                else:
-                    continue
-                images.append(src)
-                logger.debug(f"Image URL extracted: {src}")
-
+            src = img.get("src") or img.get("data-src")
+            if not src:
+                continue
+            if src.startswith("//"):
+                src = "https:" + src
+            elif src.startswith("/"):
+                src = self.base_url + src
+            images.append(src)
         logger.info(f"Extracted {len(images)} image URLs in total")
         return images
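The rewrite also picks up lazy-loaded images via the data-src fallback and normalizes protocol-relative and root-relative URLs; note that, unlike the old branch, bare relative srcs are no longer filtered out and pass through unchanged. A standalone sketch of the normalization rules (the base_url value is invented):

base_url = "https://example-mall.com"  # stand-in for the crawler's real base_url

def normalize(src: str) -> str:
    if src.startswith("//"):   # protocol-relative -> add an explicit scheme
        return "https:" + src
    if src.startswith("/"):    # root-relative -> join with the site base
        return base_url + src
    return src                 # absolute (and other) URLs pass through unchanged

for s in ["//cdn.example.com/a.jpg", "/upload/b.jpg", "https://x.com/c.jpg"]:
    print(normalize(s))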