Skip to content

Commit

Permalink
v3.13.0
Browse files Browse the repository at this point in the history
- Added --workers to adjust pool thread workers for Thread Pool Scraper
- Adjusted default pool thread workers to 5
- Use Firefox instead of Chrome
- Adjusted the wait time to 0.1 seconds for clicking the pop-up ad and the 'load more result' button while scrolling down
- Adjusted WebDriverWait poll frequency to 0
- Adjusted scroll-down length to 2,000 pixels
- Use logging instead of loguru
- Adjusted scripts to use the same logger for all scrapers
- Added driver wait for clicking 'load more result' button
- Fixed 'check_if_current_date_has_passed' function bug for Month End Scraper
- Added more tests
- Added a check for the case where a past year is entered for the Thread Pool and Month End scrapers
- Added a timezone parameter to the Thread Pool and Month End scrapers so that they check for past dates based on the given timezone, mostly to fix timezone problems when testing with GitHub Actions
- Added a timezone parameter to 'check_if_current_date_has_passed', mostly to fix timezone problems when testing with GitHub Actions
- Adjusted log message
- Added ElementClickInterceptedException handler when clicking pop-up ad and the load more result button
- Added NoSuchWindowException handler while scrolling down the browser window
- Added finally block to ensure that the driver is closed
- Handled the case when HTML content is None
- Add CSS selector list for clicking load more result button
- Use SQLite journal mode
  • Loading branch information
sakan811 committed Jun 16, 2024
1 parent ff59fd4 commit 9498b46
Show file tree
Hide file tree
Showing 3 changed files with 53 additions and 93 deletions.
135 changes: 45 additions & 90 deletions japan_avg_hotel_price_finder/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -195,60 +195,26 @@ def _click_load_more_result_button(self, wait: WebDriverWait, driver: WebDriver)
"""
logger.info("Click 'load more result' button.")

load_more_result_css_selector_list = [
'#bodyconstraint-inner > div:nth-child(8) > div > div.c1cce822c4 > '
'div.b3869ababc > div.b2c588d242 > div.c1b783d372.b99ea5ed8e > '
'div.fb4e9b097f > div.fa298e29e2.a1b24d26fa > button > span',
'#bodyconstraint-inner > div:nth-child(8) > div > div.c1cce822c4 > div.b3869ababc > div.b2c588d242 > '
'div.c1b783d372.b99ea5ed8e > div.fb4e9b097f > div.fa298e29e2.a1b24d26fa',
'#bodyconstraint-inner > div:nth-child(8) > div > div.c1cce822c4 > div.b3869ababc > div.b2c588d242 > '
'div.c1b783d372.b99ea5ed8e > div.fb4e9b097f > div.fa298e29e2.a1b24d26fa > button',
'#bodyconstraint-inner > div:nth-child(8) > div > div.c1cce822c4 > div.b3869ababc > div.b2c588d242 > '
'div.c1b783d372.b99ea5ed8e > div.fb4e9b097f > div.fa298e29e2.a1b24d26fa > button > span'
]

for load_more_result_css_selector in load_more_result_css_selector_list:
try:
load_more_button = wait.until(
EC.element_to_be_clickable((By.CSS_SELECTOR, load_more_result_css_selector)))
load_more_button.click()
except NoSuchElementException:
logger.error(f'NoSuchElementException: The \'load more result\' button not found. Keep scrolling.')
except TimeoutException:
logger.error(f'TimeoutException: The \'load more result\' button timed out.')
except ElementClickInterceptedException:
logger.warning("ElementClickInterceptedException: The load more result button is obscured. "
"Trying to handle the obstruction.")

self._hide_overlay_element(driver)

logger.info("Retry clicking the load more result button")
load_more_button = wait.until(
EC.element_to_be_clickable((By.CSS_SELECTOR, load_more_result_css_selector)))
load_more_button.click()
except Exception as e:
logger.error(e)
logger.error(f'Unexpected error occurred')
else:
logger.debug(f'Load more result button clicked successfully')
return 1

def _hide_overlay_element(self, driver) -> None:
"""
Hide the overlay element.
:param driver: Selenium WebDriver.
:return: None
"""
logger.info("Identifying the obstructing element(s)...")
for class_name in self.obstructing_classes:
try:
overlay = driver.find_element(By.CLASS_NAME, class_name)
if overlay.is_displayed():
logger.info(f"Found an obstructing overlay with class '{class_name}', attempting to hide it.")
driver.execute_script("arguments[0].style.display='none';", overlay)
logger.info(f"Obstructing overlay with class '{class_name}' hidden.")
except NoSuchElementException:
logger.info(f"No obstructing overlay found with class '{class_name}'.")
load_more_result_css_selector = ('#bodyconstraint-inner > div:nth-child(8) > div > div.c1cce822c4 > '
'div.b3869ababc > div.b2c588d242 > div.c1b783d372.b99ea5ed8e > '
'div.fb4e9b097f > div.fa298e29e2.a1b24d26fa > button > span')
try:
load_more_button = wait.until(
EC.element_to_be_clickable((By.CSS_SELECTOR, load_more_result_css_selector)))
load_more_button.click()
except NoSuchElementException:
logger.error(f'NoSuchElementException: The \'load more result\' button not found. Keep scrolling.')
except ElementClickInterceptedException as e:
logger.error(e)
logger.error("ElementClickInterceptedException: The pop-up ad is obscured.")
except TimeoutException:
logger.error(f'TimeoutException: The \'load more result\' button timed out.')
except Exception as e:
logger.error(e)
logger.error(f'Unexpected error occurred')
else:
logger.debug(f'Load more result button clicked successfully')
return 1

def _find_box_elements(self, soup) -> bs4.ResultSet:
"""
Expand Down Expand Up @@ -410,36 +376,26 @@ def _click_pop_up_ad(self, wait: WebDriverWait, driver: WebDriver) -> int | None
"""
logger.info("Clicking pop-up ad...")

ads_css_selector_list = [
'div.e93d17c51f:nth-child(1) > button:nth-child(1) > span:nth-child(1)',
'div.e93d17c51f:nth-child(1) > button:nth-child(1) > span:nth-child(1) > span:nth-child(1)',
'div.e93d17c51f:nth-child(1) > button:nth-child(1) > span:nth-child(1) > span:nth-child(1) > svg:nth-child(1)'
]
ads_css_selector = ('div.e93d17c51f:nth-child(1) > button:nth-child(1) > span:nth-child(1) > span:nth-child(1) '
'> svg:nth-child(1)')

for ads_css_selector in ads_css_selector_list:
try:
ads = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ads_css_selector)))
ads.click()
except NoSuchElementException:
logger.error(f'Pop-up ad not found')
except TimeoutException:
logger.error(f'Pop-up ad timed out')
logger.error(f'Moving on')
except ElementClickInterceptedException:
logger.warning("ElementClickInterceptedException: The pop-up ad is obscured. "
"Trying to handle the obstruction.")

self._hide_overlay_element(driver)

logger.info("Retry clicking the pop-up ad")
ads = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ads_css_selector)))
ads.click()
except Exception as e:
logger.error(e)
logger.error(f'Unexpected error occurred')
else:
logger.debug('Clicked the pop-up ads successfully')
return 1
try:
ads = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ads_css_selector)))
ads.click()
except NoSuchElementException:
logger.error(f'Pop-up ad not found')
except TimeoutException:
logger.error(f'Pop-up ad timed out')
logger.error(f'Moving on')
except ElementClickInterceptedException as e:
logger.error(e)
logger.error("ElementClickInterceptedException: The pop-up ad is obscured.")
except Exception as e:
logger.error(e)
logger.error(f'Unexpected error occurred')
else:
logger.debug('Clicked the pop-up ads successfully')
return 1

def _scroll_down_until_page_bottom(self, wait: WebDriverWait, driver: WebDriver) -> tuple[int, int] | None:
"""
Expand Down Expand Up @@ -471,14 +427,10 @@ def _scroll_down_until_page_bottom(self, wait: WebDriverWait, driver: WebDriver)
except NoSuchWindowException:
logger.error('No such window: The browsing context has been discarded.')

if new_height == 0:
logger.error('Failed to scroll down, refreshing the page...')
# If the new height is the same as the last height, then the bottom is reached
if current_height == new_height:
logger.info("Reached the bottom of the page.")
break
else:
# If the new height is the same as the last height, then the bottom is reached
if current_height == new_height:
logger.info("Reached the bottom of the page.")
break

# Click 'load more result' button if present
num_load_more_result_button_clicked = self._click_load_more_result_button(wait, driver)
Expand All @@ -490,6 +442,9 @@ def _scroll_down_until_page_bottom(self, wait: WebDriverWait, driver: WebDriver)
if num_pop_up_ad_clicked is not None:
click_pop_up_ad_clicked_list.append(num_pop_up_ad_clicked)

# Update current height
current_height = new_height

return sum(load_more_result_button_clicked_list), sum(click_pop_up_ad_clicked_list)


Expand Down
9 changes: 7 additions & 2 deletions tests/test_thread_scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,9 +32,14 @@ def test_thread_scraper() -> None:
)

thread_scrape = ThreadPoolScraper(hotel_stay)
df = thread_scrape.thread_scrape(timezone=city_timezone, max_workers=3)
df = thread_scrape.thread_scrape(timezone=city_timezone, max_workers=1)

assert not df.empty
# Check row
assert df.shape[0] == 2

# Check column
assert df.shape[1] == 7


def test_thread_scraper_past_month() -> None:
Expand Down Expand Up @@ -63,7 +68,7 @@ def test_thread_scraper_past_month() -> None:
)

thread_scrape = ThreadPoolScraper(hotel_stay)
df = thread_scrape.thread_scrape(timezone=city_timezone, max_workers=3)
df = thread_scrape.thread_scrape(timezone=city_timezone, max_workers=1)

assert df is None

Expand Down
2 changes: 1 addition & 1 deletion tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -154,7 +154,7 @@ def test_check_if_all_date_was_scraped() -> None:
)

thread_scrape = ThreadPoolScraper(hotel_stay)
thread_scrape.thread_scrape(to_sqlite=True, timezone=city_timezone, max_workers=3)
thread_scrape.thread_scrape(to_sqlite=True, timezone=city_timezone, max_workers=1)
check_db_if_all_date_was_scraped(sqlite_name)

with sqlite3.connect(sqlite_name) as conn:
Expand Down

0 comments on commit 9498b46

Please sign in to comment.