Skip to content

How to scrape TEMU #689

@KalvinThien

Description

@KalvinThien

I'm trying to scrape the information from TEMU https://www.temu.com/.
Processing https://www.temu.com/vn-en/2--car--------universal--sun----pvc---accessories-----g-601099650626830.html ...
Extracted Data:
Title: No title found.

image

`import sys
import asyncio
import time
from PyQt6.QtWidgets import QApplication, QMainWindow, QVBoxLayout, QWidget, QPushButton, QTextEdit, QLineEdit, QLabel
from PyQt6.QtCore import QThread, pyqtSignal
from crawlee.playwright_crawler import PlaywrightCrawler, PlaywrightCrawlingContext

class CrawlerThread(QThread):
    """Crawl a single URL with Playwright on a worker thread.

    Progress, extracted data, and runtime statistics are reported through
    the three signals below so the caller can update its UI.
    """

    # Progress and error messages.
    log_signal = pyqtSignal(str)
    # Formatted extraction result for the crawled page.
    data_signal = pyqtSignal(str)
    # Summary statistics, emitted once after the crawl finishes.
    runtime_signal = pyqtSignal(str)

    # CSS selector of the element expected to hold the title.
    # NOTE(review): this looks like a generated class name and may change
    # between site builds - confirm it still matches the live page.
    TITLE_SELECTOR = '._2rn4tqXP'
    # Upper bound, in seconds, on how long to wait for the title element.
    TITLE_TIMEOUT_SECONDS = 15.0
    # Delay, in seconds, between two checks for the title element.
    TITLE_POLL_SECONDS = 0.25

    def __init__(self, url):
        """Store the URL to crawl and reset the statistics counters.

        Args:
            url: Address of the page to crawl.
        """
        super().__init__()
        self.url = url
        self.request_count = 0
        self.failed_requests = 0
        self.total_duration = 0

    async def _read_title(self, page):
        """Return the title text from *page*, or a placeholder if absent.

        The element is polled for up to ``TITLE_TIMEOUT_SECONDS`` instead of
        being checked once, because content rendered by client-side scripts
        may not exist yet when the handler starts.
        NOTE(review): if the site serves a bot-check page instead of the
        product page, the selector will still never match - verify by
        inspecting the page the crawler actually receives.
        """
        title_element = page.locator(self.TITLE_SELECTOR)
        deadline = time.monotonic() + self.TITLE_TIMEOUT_SECONDS
        while await title_element.count() == 0:
            if time.monotonic() >= deadline:
                return "No title found."
            await asyncio.sleep(self.TITLE_POLL_SECONDS)
        # Read the first match only, so several matching elements do not
        # turn a successful lookup into an error.
        return await title_element.first.inner_text()

    async def run_crawler(self):
        """Crawl ``self.url`` once and emit the data and statistics signals."""
        crawler = PlaywrightCrawler(max_requests_per_crawl=1)

        @crawler.router.default_handler
        async def request_handler(context: PlaywrightCrawlingContext):
            self.log_signal.emit(f"Processing {context.request.url} ...")
            start_time = time.perf_counter()

            try:
                title = await self._read_title(context.page)

                # Record how long this request took.
                request_duration = time.perf_counter() - start_time
                self.request_count += 1
                self.total_duration += request_duration

                self.data_signal.emit(f"Title: {title}\n")

            except Exception as e:
                # Handler boundary: count the failure and report it to the
                # UI rather than losing it inside the crawler.
                self.failed_requests += 1
                self.log_signal.emit(f"Error: {e}")

        await crawler.run([self.url])

        # Calculate and emit runtime statistics.
        average_duration = (
            self.total_duration / self.request_count
            if self.request_count > 0
            else 0
        )
        runtime_stats = (
            f"Requests Finished: {self.request_count}\n"
            f"Requests Failed: {self.failed_requests}\n"
            f"Average Request Duration: {average_duration:.2f} seconds\n"
            f"Total Runtime: {self.total_duration:.2f} seconds"
        )
        self.runtime_signal.emit(runtime_stats)

    def run(self):
        """Thread entry point: run the crawl on a fresh asyncio event loop."""
        try:
            asyncio.run(self.run_crawler())
        except Exception as e:
            # Nothing above this frame reports the error to the UI, so
            # surface it through the log signal.
            self.log_signal.emit(f"Error: {e}")

class MainWindow(QMainWindow):
    """Main window: a URL input, a start button, an output log, and stats."""

    def __init__(self):
        """Build the widgets, lay them out, and wire the start button.

        This must be named ``__init__`` (not ``init``) so that it runs when
        the window is constructed.
        """
        super().__init__()
        self.setWindowTitle("Web Data Crawler")
        self.setGeometry(100, 100, 800, 600)

        # Reference to the worker thread of the current or last crawl.
        self.crawler_thread = None

        # Widgets
        self.url_input = QLineEdit()
        self.url_input.setPlaceholderText("Enter URL here")
        self.start_button = QPushButton("Start Crawling")
        self.output_area = QTextEdit()
        self.output_area.setReadOnly(True)
        self.runtime_label = QLabel("Runtime Statistics:")

        # Layout
        layout = QVBoxLayout()
        layout.addWidget(self.url_input)
        layout.addWidget(self.start_button)
        layout.addWidget(self.output_area)
        layout.addWidget(self.runtime_label)
        container = QWidget()
        container.setLayout(layout)
        self.setCentralWidget(container)

        # Connections
        self.start_button.clicked.connect(self.start_crawling)

    def start_crawling(self):
        """Validate the entered URL and start a crawl on a worker thread."""
        url = self.url_input.text().strip()
        if not url:
            self.output_area.setText("Please enter a valid URL.")
            return

        self.output_area.clear()

        # Block further starts until this crawl ends: replacing
        # ``self.crawler_thread`` while it is still running would drop the
        # only reference to a live thread object.
        self.start_button.setEnabled(False)

        # Run the crawler in a separate thread
        self.crawler_thread = CrawlerThread(url)
        self.crawler_thread.log_signal.connect(self.update_output)
        self.crawler_thread.data_signal.connect(self.display_data)
        self.crawler_thread.runtime_signal.connect(self.display_runtime)
        self.crawler_thread.finished.connect(self._on_crawl_finished)
        self.crawler_thread.start()

    def _on_crawl_finished(self):
        """Re-enable the start button once the worker thread has ended."""
        self.start_button.setEnabled(True)

    def update_output(self, text):
        """Append a log line to the output area."""
        self.output_area.append(text)

    def display_data(self, data):
        """Append the extracted data to the output area under a heading."""
        self.output_area.append("Extracted Data:\n" + data)

    def display_runtime(self, runtime):
        """Show the runtime statistics in the label below the output area."""
        self.runtime_label.setText("Runtime Statistics:\n" + runtime)

# Script entry: create the Qt application, show the main window, and run the
# event loop until the window is closed.
app = QApplication(sys.argv)
window = MainWindow()
window.show()

# Hand the event loop's return value to the interpreter as the exit status.
exit_code = app.exec()
sys.exit(exit_code)

`

I've tried several approaches, but every attempt returns "No title found." Can someone help me?

Metadata

Metadata

Assignees

No one assigned

    Labels

    t-tooling: Issues with this label are in the ownership of the tooling team.

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions