Skip to content

Commit

Permalink
v3.4.1
Browse files Browse the repository at this point in the history
- Added the --workers option to set the number of thread pool workers for the Thread Pool Scraper
- Use Firefox instead of Chrome
- Adjusted wait time for clicking pop-up ad while scrolling down to 1 second
- Adjusted wait time for clicking load more result button to 1 second
- Adjusted wait time for clicking pop-up ad to 2 seconds
- Adjusted scroll-down length to 2,000 pixels
- Use logging instead of loguru
- Adjusted scripts to use the same logger for all scrapers
- Added driver wait for clicking 'load more result' button
- Fixed 'check_if_current_date_has_passed' function bug for Month End Scraper
- Added more tests
- Added a check for the case where a past year is entered for the Thread Pool and Month End scrapers
- Added a timezone parameter to the Thread Pool and Month End scrapers so that they check whether a date has passed based on the given timezone, mainly to fix timezone problems when testing with GitHub Actions
- Added a timezone parameter to the 'check_if_current_date_has_passed' function, mainly to fix timezone problems when testing with GitHub Actions
- Adjusted log message
- Added ElementClickInterceptedException handler when clicking pop-up ad and the load more result button
- Added NoSuchWindowException handler while scrolling down the browser window
  • Loading branch information
sakan811 committed Jun 14, 2024
1 parent 399eae2 commit fae8610
Showing 1 changed file with 37 additions and 35 deletions.
72 changes: 37 additions & 35 deletions japan_avg_hotel_price_finder/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,8 @@
import pandas as pd
from pandas import DataFrame
from selenium import webdriver
from selenium.common import NoSuchElementException, TimeoutException, WebDriverException, ElementClickInterceptedException
from selenium.common import NoSuchElementException, TimeoutException, WebDriverException, \
ElementClickInterceptedException, NoSuchWindowException
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
Expand Down Expand Up @@ -204,20 +205,7 @@ def _click_load_more_result_button(self, driver: WebDriver) -> None:
logger.warning("ElementClickInterceptedException: The load more result button is obscured. "
"Trying to handle the obstruction.")

# List of possible obstructing class names
obstructing_classes = self.obstructing_classes

logger.info("Identifying the obstructing element(s)")

for class_name in obstructing_classes:
try:
overlay = driver.find_element(By.CLASS_NAME, class_name)
if overlay.is_displayed():
logger.info(f"Found an obstructing overlay with class '{class_name}', attempting to hide it.")
driver.execute_script("arguments[0].style.display='none';", overlay)
logger.info(f"Obstructing overlay with class '{class_name}' hidden.")
except NoSuchElementException:
logger.info(f"No obstructing overlay found with class '{class_name}'.")
self._hide_overlay_element(driver)

logger.info("Retry clicking the load more result button")
load_more_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, load_more_result_css_selector)))
Expand All @@ -229,6 +217,23 @@ def _click_load_more_result_button(self, driver: WebDriver) -> None:
self.load_more_result_clicked += 1
logger.debug(f'Load more result button clicked successfully')

def _hide_overlay_element(self, driver) -> None:
    """
    Hide visible overlays that may be blocking a click.

    For each class name in ``self.obstructing_classes``, look the element up
    and, if it is displayed, set its inline ``display`` style to ``none``
    through JavaScript. Class names with no matching element are only logged.
    :param driver: Selenium WebDriver.
    :return: None
    """
    logger.info("Identifying the obstructing element(s)...")
    hide_script = "arguments[0].style.display='none';"
    for class_name in self.obstructing_classes:
        try:
            obstruction = driver.find_element(By.CLASS_NAME, class_name)
            if not obstruction.is_displayed():
                # Present in the DOM but not visible, so there is nothing to hide.
                continue
            logger.info(f"Found an obstructing overlay with class '{class_name}', attempting to hide it.")
            driver.execute_script(hide_script, obstruction)
            logger.info(f"Obstructing overlay with class '{class_name}' hidden.")
        except NoSuchElementException:
            logger.info(f"No obstructing overlay found with class '{class_name}'.")

def _find_box_elements(self, soup) -> bs4.ResultSet:
"""
Find box elements from box class.
Expand Down Expand Up @@ -386,20 +391,7 @@ def _click_pop_up_ad(self, wait: WebDriverWait, driver: WebDriver) -> None:
logger.warning("ElementClickInterceptedException: The pop-up ad is obscured. "
"Trying to handle the obstruction.")

# List of possible obstructing class names
obstructing_classes = self.obstructing_classes

logger.info("Identifying the obstructing element(s)")

for class_name in obstructing_classes:
try:
overlay = driver.find_element(By.CLASS_NAME, class_name)
if overlay.is_displayed():
logger.info(f"Found an obstructing overlay with class '{class_name}', attempting to hide it.")
driver.execute_script("arguments[0].style.display='none';", overlay)
logger.info(f"Obstructing overlay with class '{class_name}' hidden.")
except NoSuchElementException:
logger.info(f"No obstructing overlay found with class '{class_name}'.")
self._hide_overlay_element(driver)

logger.info("Retry clicking the pop-up ad")
ads = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ads_css_selector)))
Expand All @@ -420,17 +412,27 @@ def _scroll_down_until_page_bottom(self, driver: WebDriver) -> None:
:return: None.
"""
logger.info("Scrolling down until the bottom of the page...")

current_height = 0
new_height = 0
while True:
current_height = driver.execute_script("return window.scrollY")
logger.debug(f'{current_height = }')
# Get current height
try:
current_height = driver.execute_script("return window.scrollY")
logger.debug(f'{current_height = }')
except NoSuchWindowException as e:
logger.error(e)
logger.error('No such window: The browsing context has been discarded.')

# Scroll down to the bottom
driver.execute_script("window.scrollBy(0, 2000);")

# Get current height
new_height = driver.execute_script("return window.scrollY")
logger.debug(f'{new_height = }')
try:
# Get current height
new_height = driver.execute_script("return window.scrollY")
logger.debug(f'{new_height = }')
except NoSuchWindowException as e:
logger.error(e)
logger.error('No such window: The browsing context has been discarded.')

# If the new height is the same as the last height, then the bottom is reached
if current_height == new_height:
Expand Down

0 comments on commit fae8610

Please sign in to comment.