Skip to content

Commit

Permalink
v4.0.0
Browse files Browse the repository at this point in the history
- Added --workers to adjust pool thread workers for Thread Pool Scraper
- Adjusted default pool thread workers to 5
- Use Firefox instead of Chrome
- Adjusted wait time for clicking pop-up ad and load more result button while scrolling down to 0.1 seconds
- Adjusted WebDriverWait poll frequency to 0
- Adjusted scroll-down length to 2,000 pixels
- Use logging instead of loguru
- Adjusted scripts to use the same logger for all scrapers
- Added driver wait for clicking 'load more result' button
- Fixed 'check_if_current_date_has_passed' function bug for Month End Scraper
- Added more tests
- Added a check for the case where a past year is entered for the Thread Pool and Month End scrapers
- Added timezone parameter for the Thread Pool and Month End scrapers so that they check whether a date has passed based on the entered timezone, mostly to fix timezone problems when testing with GitHub Actions
- Added timezone parameter for 'check_if_current_date_has_passed', mostly to fix timezone problems when testing with GitHub Actions
- Adjusted log message
- Added ElementClickInterceptedException handler when clicking pop-up ad and the load more result button
- Added NoSuchWindowException handler while scrolling down the browser window
- Added finally block to ensure that the driver is closed
- Handled the case when HTML content is None
- Added CSS selector list for clicking the load more result button
- Adjusted save data process
- Removed Month End scraper
  • Loading branch information
sakan811 committed Jun 17, 2024
1 parent 8213b2e commit bcee4b6
Show file tree
Hide file tree
Showing 3 changed files with 11 additions and 9 deletions.
10 changes: 5 additions & 5 deletions japan_avg_hotel_price_finder/scrape.py
Original file line number Diff line number Diff line change
Expand Up @@ -337,12 +337,12 @@ def start_scraping_process(self, check_in: str, check_out: str) -> tuple[DataFra

if self.num_load_more_result_clicked_list < 1:
logger.warning("Load more result button is never clicked. "
"The CSS selector for the load more result button might have a problem."
"The CSS selector for the load more result button might have a problem. "
"Please update the CSS selector in '_click_load_more_result_button' function.")
if self.num_pop_up_clicked_list < 1:
logger.warning("Pop-up ad is never clicked. "
"The CSS selector for the pop-up ad might have a problem."
"Please update the CSS selector of the pop-up ad in '_click_pop_up_ad' function.")
if self.num_pop_up_clicked_list < 1:
logger.warning("Pop-up ad is never clicked. "
"The CSS selector for the pop-up ad might have a problem. "
"Please update the CSS selector of the pop-up ad in '_click_pop_up_ad' function.")

logger.info('Return scraped data as a Pandas DataFrame')
return df_filtered, city, check_in, check_out
Expand Down
4 changes: 2 additions & 2 deletions japan_avg_hotel_price_finder/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,12 +97,12 @@ def check_db_if_all_date_was_scraped(db: str) -> None:
scrape_missing_dates(db=db, missing_dates=missing_dates, to_sqlite=True)


def check_csv_if_all_date_was_scraped() -> None:
def check_csv_if_all_date_was_scraped(directory) -> None:
"""
Check inside the CSV files directory if all dates of each month were scraped today.
:param directory: Path to the CSV files directory.
:returns: None
"""
directory = 'scraped_hotel_data_csv'
logger.info(f"Checking CSV files in the {directory} directory if all date was scraped today...")
temp_db = 'temp_db.db'
try:
Expand Down
6 changes: 4 additions & 2 deletions tests/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -102,11 +102,13 @@ def test_check_if_all_date_was_scraped_csv() -> None:
start_day=start_day, month=month, year=year, nights=nights, sqlite_name=sqlite_name
)

directory = 'test_check_if_all_date_was_scraped_csv'

thread_scrape = ThreadPoolScraper(hotel_stay)
df, city, check_in, check_out = thread_scrape.thread_scrape(timezone=city_timezone, max_workers=5)
save_scraped_data(dataframe=df, city=city, check_in=check_in,
check_out=check_out, save_dir='test_check_if_all_date_was_scraped_csv')
check_csv_if_all_date_was_scraped()
check_out=check_out, save_dir=directory)
check_csv_if_all_date_was_scraped(directory)

with sqlite3.connect(sqlite_name) as conn:
directory = 'test_check_if_all_date_was_scraped_csv'
Expand Down

0 comments on commit bcee4b6

Please sign in to comment.