Skip to content

Commit

Permalink
v3.4.0
Browse files Browse the repository at this point in the history
- Added --workers to adjust the number of thread pool workers for Thread Pool Scraper
- Use Firefox instead of Chrome
- Adjusted wait time for clicking pop-up ad while scrolling down to 1 second
- Adjusted wait time for clicking load more result button to 1 second
- Adjusted wait time for clicking pop-up ad to 2 seconds
- Adjusted scroll-down length to 2,000 pixels
- Use logging instead of loguru
- Adjusted scripts to use the same logger for all scrapers
- Added driver wait for clicking 'load more result' button
- Fixed 'check_if_current_date_has_passed' function bug for Month End Scraper
- Added more tests
- Added logic to check whether a past year is entered for the Thread Pool and Month End scrapers
- Added a timezone parameter to the Thread Pool and Month End scrapers so that they check for past dates based on the entered timezone, mostly to fix timezone problems when testing with GitHub Actions
- Added a timezone parameter to 'check_if_current_date_has_passed', mostly to fix timezone problems when testing with GitHub Actions
- Adjusted log message
- Added an ElementClickInterceptedException handler for clicking the pop-up ad and the 'load more result' button
  • Loading branch information
sakan811 committed Jun 13, 2024
1 parent 825e526 commit c2b8cc3
Show file tree
Hide file tree
Showing 4 changed files with 57 additions and 14 deletions.
57 changes: 47 additions & 10 deletions japan_avg_hotel_price_finder/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -21,7 +21,7 @@
import pandas as pd
from pandas import DataFrame
from selenium import webdriver
from selenium.common import NoSuchElementException, TimeoutException, WebDriverException
from selenium.common import NoSuchElementException, TimeoutException, WebDriverException, ElementClickInterceptedException
from selenium.webdriver.chrome.webdriver import WebDriver
from selenium.webdriver.common.by import By
from selenium.webdriver.support import expected_conditions as EC
Expand Down Expand Up @@ -191,16 +191,34 @@ def _click_load_more_result_button(self, driver: WebDriver) -> None:
'div.b3869ababc > div.b2c588d242 > div.c1b783d372.b99ea5ed8e > '
'div.fb4e9b097f > div.fa298e29e2.a1b24d26fa > button')

wait = WebDriverWait(driver, 1)
try:
wait = WebDriverWait(driver, 1)
load_more_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, load_more_result_css_selector)))
load_more_button.click()
except NoSuchElementException as e:
logger.error(e)
logger.error(f'{load_more_result_css_selector} not found. Keep scrolling.')
logger.error(f'The \'load more result\' button not found. Keep scrolling.')
except ElementClickInterceptedException as e:
logger.warning(e)
logger.warning("ElementClickInterceptedException: The button is obscured. Trying to handle the obstruction.")

logger.info("Identify the obstructing element")
try:
overlay = driver.find_element(By.CLASS_NAME, 'a3f7e233ba')
if overlay.is_displayed():
logger.info("Found an obstructing overlay, attempting to hide it.")
driver.execute_script("arguments[0].style.display='none';", overlay)
logger.info("Obstructing overlay hidden.")
except NoSuchElementException as e:
logger.warning(e)
logger.warning("No obstructing overlay found.")

logger.info("Retry clicking the button")
load_more_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, load_more_result_css_selector)))
load_more_button.click()
except Exception as e:
logger.error(e)
logger.error(f'{load_more_result_css_selector} failed due to {e}')
logger.error(f'Unexpected error occurred')
else:
self.load_more_result_clicked += 1
logger.debug(f'{load_more_result_css_selector} clicked successfully')
Expand Down Expand Up @@ -264,7 +282,7 @@ def _scrape(self, url: str) -> dict:

wait = WebDriverWait(driver, 2)

self._click_pop_up_ad(wait)
self._click_pop_up_ad(wait, driver)

self._scroll_down_until_page_bottom(driver)

Expand Down Expand Up @@ -336,10 +354,11 @@ def start_scraping_process(self, check_in: str, check_out: str, to_sqlite: bool
logger.info('Return data as DataFrame')
return df_filtered

def _click_pop_up_ad(self, wait: WebDriverWait) -> None:
def _click_pop_up_ad(self, wait: WebDriverWait, driver: WebDriver) -> None:
"""
Click pop-up ad.
:param wait: Selenium WebDriverWait object.
:param driver: Selenium WebDriver object.
:return: None.
"""
logger.info("Clicking pop-up ad...")
Expand All @@ -351,14 +370,32 @@ def _click_pop_up_ad(self, wait: WebDriverWait) -> None:
ads.click()
except NoSuchElementException as e:
logger.error(e)
logger.error(f'{ads_css_selector} not found')
logger.error(f'Pop-up ad not found')
except TimeoutException as e:
logger.error(e)
logger.error(f'{ads_css_selector} timed out')
logger.error(f'Pop-up ad timed out')
logger.error(f'Moving on')
except ElementClickInterceptedException as e:
logger.warning(e)
logger.warning("ElementClickInterceptedException: The pop-up ad is obscured. Trying to handle the obstruction.")

logger.info("Identify the obstructing element")
try:
overlay = driver.find_element(By.CLASS_NAME, 'a3f7e233ba')
if overlay.is_displayed():
logger.info("Found an obstructing overlay, attempting to hide it.")
driver.execute_script("arguments[0].style.display='none';", overlay)
logger.info("Obstructing overlay hidden.")
except NoSuchElementException as e:
logger.warning(e)
logger.warning("No obstructing overlay found.")

logger.info("Retry clicking the pop-up ad")
load_more_button = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ads_css_selector)))
load_more_button.click()
except Exception as e:
logger.error(e)
logger.error(f'{ads_css_selector} failed due to {e}')
logger.error(f'Unexpected error occurred')
else:
self.pop_up_clicked += 1
logger.debug('Clicked the pop-up ads successfully')
Expand Down Expand Up @@ -394,7 +431,7 @@ def _scroll_down_until_page_bottom(self, driver: WebDriver) -> None:

wait = WebDriverWait(driver, 1)
logger.info("Clicking pop-up ad in case it appears...")
self._click_pop_up_ad(wait)
self._click_pop_up_ad(wait, driver)


if __name__ == '__main__':
Expand Down
2 changes: 1 addition & 1 deletion japan_avg_hotel_price_finder/scrape_until_month_end.py
Original file line number Diff line number Diff line change
Expand Up @@ -84,7 +84,7 @@ def scrape_until_month_end(self, to_sqlite: bool = False, timezone=None) -> None
month = current_date.month
year = current_date.year

current_date_has_passed: bool = check_if_current_date_has_passed(year, month, start_day)
current_date_has_passed: bool = check_if_current_date_has_passed(year, month, start_day, timezone)
if current_date_has_passed:
logger.warning(f'The current day of the month to scrape was passed. '
f'Skip {year}-{month}-{start_day}.')
Expand Down
2 changes: 1 addition & 1 deletion japan_avg_hotel_price_finder/thread_scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -73,7 +73,7 @@ def scrape_each_date(day: int) -> None:
"""
logger.info('Scraping hotel data of the given date...')

current_date_has_passed: bool = check_if_current_date_has_passed(self.year, self.month, day)
current_date_has_passed: bool = check_if_current_date_has_passed(self.year, self.month, day, timezone)

current_date = datetime(self.year, self.month, day)
if current_date_has_passed:
Expand Down
10 changes: 8 additions & 2 deletions japan_avg_hotel_price_finder/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,15 +14,21 @@
logger = configure_logging_with_file('jp_hotel_data.log', 'jp_hotel_data')


def check_if_current_date_has_passed(year: int, month: int, day: int) -> bool:
def check_if_current_date_has_passed(year: int, month: int, day: int, timezone=None) -> bool:
"""
Check if the current date has passed the given day of the month.
:param year: The year of the date to check.
:param month: The month of the date to check.
:param day: The day of the month to check.
:param timezone: Set timezone.
Default is None.
:return: True if the current date has passed the given day, False otherwise.
"""
today_for_check = datetime.datetime.today().strftime('%Y-%m-%d')
if timezone is not None:
today = datetime.datetime.now(timezone)
else:
today = datetime.datetime.today()
today_for_check = today.strftime('%Y-%m-%d')
current_date_for_check = datetime.datetime(year, month, day).strftime('%Y-%m-%d')
if current_date_for_check < today_for_check:
return True
Expand Down

0 comments on commit c2b8cc3

Please sign in to comment.