Skip to content

Commit

Permalink
v3.12.2
Browse files Browse the repository at this point in the history
- Added --workers to adjust pool thread workers for Thread Pool Scraper
- Adjusted default pool thread workers to 5
- Use Firefox instead of Chrome
- Adjusted the wait time to 0.1 seconds for clicking the pop-up ad and the 'load more result' button while scrolling down
- Adjusted WebDriverWait poll frequency to 0
- Adjusted scroll-down length to 2,000 pixels
- Use logging instead of loguru
- Adjusted scripts to use the same logger for all scrapers
- Added driver wait for clicking 'load more result' button
- Fixed 'check_if_current_date_has_passed' function bug for Month End Scraper
- Added more tests
- Added a check for the case where a past year is entered for the Thread Pool and Month End scrapers
- Added a timezone parameter to the Thread Pool and Month End scrapers so that they check whether a date has passed based on the entered timezone, mostly to fix timezone problems when testing with GitHub Actions
- Added a timezone parameter to 'check_if_current_date_has_passed', mostly to fix timezone problems when testing with GitHub Actions
- Adjusted log message
- Added ElementClickInterceptedException handler when clicking pop-up ad and the load more result button
- Added NoSuchWindowException handler while scrolling down the browser window
- Added finally block to ensure that the driver is closed
- Handled the case when HTML content is None
- Add CSS selector list for clicking load more result button
- Use SQLite journal mode
  • Loading branch information
sakan811 committed Jun 16, 2024
1 parent 57b0450 commit e584ee8
Showing 1 changed file with 31 additions and 24 deletions.
55 changes: 31 additions & 24 deletions japan_avg_hotel_price_finder/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,7 @@
import datetime
import os
import re
import time

import bs4
import pandas as pd
Expand Down Expand Up @@ -412,30 +413,36 @@ def _click_pop_up_ad(self, wait: WebDriverWait, driver: WebDriver) -> int | None
"""
logger.info("Clicking pop-up ad...")

ads_css_selector = 'div.e93d17c51f:nth-child(1) > button:nth-child(1) > span:nth-child(1) > span:nth-child(1)'
try:
ads = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ads_css_selector)))
ads.click()
except NoSuchElementException:
logger.error(f'Pop-up ad not found')
except TimeoutException:
logger.error(f'Pop-up ad timed out')
logger.error(f'Moving on')
except ElementClickInterceptedException:
logger.warning("ElementClickInterceptedException: The pop-up ad is obscured. "
"Trying to handle the obstruction.")

self._hide_overlay_element(driver)

logger.info("Retry clicking the pop-up ad")
ads = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ads_css_selector)))
ads.click()
except Exception as e:
logger.error(e)
logger.error(f'Unexpected error occurred')
else:
logger.debug('Clicked the pop-up ads successfully')
return 1
ads_css_selector_list = [
'div.e93d17c51f:nth-child(1) > button:nth-child(1) > span:nth-child(1)',
'div.e93d17c51f:nth-child(1) > button:nth-child(1) > span:nth-child(1) > span:nth-child(1)',
'div.e93d17c51f:nth-child(1) > button:nth-child(1) > span:nth-child(1) > span:nth-child(1) > svg:nth-child(1)'
]

for ads_css_selector in ads_css_selector_list:
try:
ads = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ads_css_selector)))
ads.click()
except NoSuchElementException:
logger.error(f'Pop-up ad not found')
except TimeoutException:
logger.error(f'Pop-up ad timed out')
logger.error(f'Moving on')
except ElementClickInterceptedException:
logger.warning("ElementClickInterceptedException: The pop-up ad is obscured. "
"Trying to handle the obstruction.")

self._hide_overlay_element(driver)

logger.info("Retry clicking the pop-up ad")
ads = wait.until(EC.element_to_be_clickable((By.CSS_SELECTOR, ads_css_selector)))
ads.click()
except Exception as e:
logger.error(e)
logger.error(f'Unexpected error occurred')
else:
logger.debug('Clicked the pop-up ads successfully')
return 1

def _scroll_down_until_page_bottom(self, wait: WebDriverWait, driver: WebDriver) -> tuple[int, int] | None:
"""
Expand Down

0 comments on commit e584ee8

Please sign in to comment.