diff --git a/apps/pre-processing-service/app/service/crawl_service.py b/apps/pre-processing-service/app/service/crawl_service.py index df90ba01..af8f91bc 100644 --- a/apps/pre-processing-service/app/service/crawl_service.py +++ b/apps/pre-processing-service/app/service/crawl_service.py @@ -24,7 +24,7 @@ async def crawl_product_detail(self, request: RequestSadaguCrawl) -> dict: # 상세 정보 크롤링 실행 product_detail = await crawler.crawl_detail( - product_url=str(request.product_url), include_images=False + product_url=str(request.product_url) ) if not product_detail: diff --git a/apps/pre-processing-service/app/service/crawlers/detail_crawler.py b/apps/pre-processing-service/app/service/crawlers/detail_crawler.py index 885fd2f0..f01ed53a 100644 --- a/apps/pre-processing-service/app/service/crawlers/detail_crawler.py +++ b/apps/pre-processing-service/app/service/crawlers/detail_crawler.py @@ -8,19 +8,17 @@ class DetailCrawler(SearchCrawler): """SearchCrawler를 확장한 상세 크롤링 클래스""" - async def crawl_detail( - self, product_url: str, include_images: bool = False - ) -> dict: - """상품 상세 정보 크롤링""" + async def crawl_detail(self, product_url: str) -> dict: + """상품 상세 정보 크롤링 (이미지 항상 포함)""" try: - logger.info( - f"상품 상세 크롤링 시작: url='{product_url}', include_images={include_images}" - ) + logger.info(f"상품 상세 크롤링 시작: url='{product_url}'") - if self.use_selenium: - soup = await self._get_soup_selenium(product_url) - else: - soup = await self._get_soup_httpx(product_url) + # HTML 가져오기 + soup = ( + await self._get_soup_selenium(product_url) + if self.use_selenium + else await self._get_soup_httpx(product_url) + ) # 기본 정보 추출 title = self._extract_title(soup) @@ -29,6 +27,15 @@ async def crawl_detail( options = self._extract_options(soup) material_info = self._extract_material_info(soup) + # 이미지 정보 추출 (항상 실행) + logger.info("이미지 정보 추출 중...") + page_images = self._extract_images(soup) + option_images = [ + opt["image_url"] for opt in options if opt.get("image_url") + ] + # 중복 제거 후 합치기 + all_images = list(set(page_images + option_images)) + product_data = { "url": product_url, "title": title, @@ -37,23 +44,13 @@ async def crawl_detail( "options": options, "material_info": material_info, "crawled_at": time.strftime("%Y-%m-%d %H:%M:%S"), + "product_images": [{"original_url": url} for url in all_images], } + logger.info(f"추출된 이미지: {len(all_images)}개") logger.info( - f"기본 상품 정보 추출 완료: title='{title[:50]}', price={price}, rating={rating}, options_count={len(options)}" + f"상품 상세 크롤링 완료: title='{title[:50]}', price={price}, rating={rating}, options_count={len(options)}" ) - - if include_images: - logger.info("이미지 정보 추출 중...") - product_images = self._extract_images(soup) - product_data["product_images"] = [ - {"original_url": img_url} for img_url in product_images - ] - logger.info(f"추출된 이미지: {len(product_images)}개") - else: - product_data["product_images"] = [] - - logger.info(f"상품 상세 크롤링 완료: url='{product_url}'") return product_data except Exception as e: @@ -89,54 +86,43 @@ async def _get_soup_httpx(self, product_url: str) -> BeautifulSoup: raise Exception(f"HTTP 요청 실패: {e}") def _extract_title(self, soup: BeautifulSoup) -> str: - """제목 추출""" title_element = soup.find("h1", {"id": "kakaotitle"}) title = title_element.get_text(strip=True) if title_element else "제목 없음" logger.debug(f"제목 추출: '{title[:50]}'") return title def _extract_price(self, soup: BeautifulSoup) -> int: - """가격 추출""" price = 0 - price_selectors = [ + selectors = [ "span.price.gsItemPriceKWR", ".pdt_price span.price", "span.price", ".price", ] - - for 
selector in price_selectors: - price_element = soup.select_one(selector) - if price_element: - price_text = ( - price_element.get_text(strip=True) - .replace(",", "") - .replace("원", "") - ) - price_match = re.search(r"(\d+)", price_text) - if price_match: - price = int(price_match.group(1)) - logger.debug(f"가격 추출 성공: {price}원 (selector: {selector})") + for sel in selectors: + el = soup.select_one(sel) + if el: + text = el.get_text(strip=True).replace(",", "").replace("원", "") + match = re.search(r"(\d+)", text) + if match: + price = int(match.group(1)) + logger.debug(f"가격 추출 성공: {price}원 (selector: {sel})") break - if price == 0: logger.debug("가격 추출 실패 - 0원으로 설정") - return price def _extract_rating(self, soup: BeautifulSoup) -> float: - """평점 추출""" rating = 0.0 - star_containers = [ + containers = [ soup.find("a", class_="start"), soup.find("div", class_=re.compile(r"star|rating")), soup.find("a", href="#reviews_wrap"), ] - - for container in star_containers: - if container: - star_imgs = container.find_all("img") - for img in star_imgs: + for cont in containers: + if cont: + imgs = cont.find_all("img") + for img in imgs: src = img.get("src", "") if "icon_star.svg" in src: rating += 1 @@ -145,88 +131,57 @@ def _extract_rating(self, soup: BeautifulSoup) -> float: if rating > 0: logger.debug(f"평점 추출 성공: {rating}점") break - if rating == 0.0: logger.debug("평점 추출 실패 - 0.0점으로 설정") - return rating def _extract_options(self, soup: BeautifulSoup) -> list[dict]: - """상품 옵션 추출""" options = [] sku_list = soup.find("ul", {"id": "skubox"}) - if sku_list: - option_items = sku_list.find_all("li", class_=re.compile(r"imgWrapper")) - logger.debug(f"옵션 항목 발견: {len(option_items)}개") - - for item in option_items: - title_element = item.find("a", title=True) - if title_element: - option_name = title_element.get("title", "").strip() - - # 재고 정보 추출 + items = sku_list.find_all("li", class_=re.compile(r"imgWrapper")) + for item in items: + title_el = item.find("a", title=True) + if title_el: + name = title_el.get("title", "").strip() stock = 0 - item_text = item.get_text() - stock_match = re.search(r"재고\s*:\s*(\d+)", item_text) + stock_match = re.search(r"재고\s*:\s*(\d+)", item.get_text()) if stock_match: stock = int(stock_match.group(1)) - - # 이미지 URL 추출 - img_element = item.find("img", class_="colorSpec_hashPic") - image_url = "" - if img_element and img_element.get("src"): - image_url = img_element["src"] - - if option_name: + img_el = item.find("img", class_="colorSpec_hashPic") + img_url = img_el["src"] if img_el and img_el.get("src") else "" + if name: options.append( - { - "name": option_name, - "stock": stock, - "image_url": image_url, - } + {"name": name, "stock": stock, "image_url": img_url} ) - logger.debug(f"옵션 추출: name='{option_name}', stock={stock}") - logger.info(f"총 {len(options)}개 옵션 추출 완료") return options def _extract_material_info(self, soup: BeautifulSoup) -> dict: - """소재 정보 추출""" material_info = {} - info_items = soup.find_all("div", class_="pro-info-item") - - for item in info_items: - title_element = item.find("div", class_="pro-info-title") - info_element = item.find("div", class_="pro-info-info") - - if title_element and info_element: - title = title_element.get_text(strip=True) - info = info_element.get_text(strip=True) - material_info[title] = info - logger.debug(f"소재 정보 추출: {title}='{info}'") - + items = soup.find_all("div", class_="pro-info-item") + for item in items: + title_el = item.find("div", class_="pro-info-title") + info_el = item.find("div", class_="pro-info-info") + if 
title_el and info_el:
+                material_info[title_el.get_text(strip=True)] = info_el.get_text(
+                    strip=True
+                )
         logger.info(f"총 {len(material_info)}개 소재 정보 추출 완료")
         return material_info
 
     def _extract_images(self, soup: BeautifulSoup) -> list[str]:
-        """상품 이미지 추출"""
         images = []
+        # img_translate_x 패턴
         img_elements = soup.find_all("img", {"id": re.compile(r"img_translate_\d+")})
-
         for img in img_elements:
-            src = img.get("src", "")
-            if src:
-                if src.startswith("//"):
-                    src = "https:" + src
-                elif src.startswith("/"):
-                    src = self.base_url + src
-                elif src.startswith("http"):
-                    pass
-                else:
-                    continue
-                images.append(src)
-                logger.debug(f"이미지 URL 추출: {src}")
-
+            src = img.get("src") or img.get("data-src")
+            if not src or not src.startswith(("http", "//", "/")):
+                continue
+            if src.startswith("//"):
+                src = "https:" + src
+            elif src.startswith("/"):
+                src = self.base_url + src
+            images.append(src)
         logger.info(f"총 {len(images)}개 이미지 URL 추출 완료")
         return images
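
Reviewer note (outside the patch): with `include_images` removed, `product_images` is now always present in the result. Below is a minimal caller sketch for the new contract; `DetailCrawler()`'s no-argument construction and the URL are illustrative assumptions, and only `crawl_detail` and the return shape come from this diff.

    import asyncio

    from app.service.crawlers.detail_crawler import DetailCrawler


    async def main() -> None:
        # Assumed default construction; the constructor is not shown in this diff.
        crawler = DetailCrawler()
        detail = await crawler.crawl_detail(
            product_url="https://example.com/item/1"  # placeholder URL
        )
        # "product_images" is now always a list of {"original_url": ...} dicts,
        # possibly empty, so callers no longer branch on include_images.
        for image in detail["product_images"]:
            print(image["original_url"])


    asyncio.run(main())

One nit on the dedup in `crawl_detail`: `list(set(page_images + option_images))` leaves the order of `product_images` nondeterministic across runs. If downstream code treats the first image as the main one, `list(dict.fromkeys(page_images + option_images))` deduplicates while preserving first-seen order.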