v4.1.0
- Added a --workers argument to adjust the number of thread pool workers for the Thread Pool Scraper
- Adjusted the default number of thread pool workers to 5
- Switched from Chrome to Firefox
- Adjusted the wait time for clicking the pop-up ad and the 'load more result' button while scrolling down to 0.1 seconds
- Adjusted the WebDriverWait poll frequency to 0
- Adjusted the scroll-down length to 2,000 pixels
- Switched from loguru to the standard logging module
- Adjusted the scripts so that all scrapers use the same logger
- Added a driver wait for clicking the 'load more result' button
- Fixed a 'check_if_current_date_has_passed' bug in the Month End Scraper
- Added more tests
- Added a check for cases where a past year is entered for the Thread Pool and Month End Scrapers
- Added a timezone parameter to the Thread Pool and Month End Scrapers so that they check for past dates in the given timezone, mainly to fix timezone problems when testing with GitHub Actions
- Added a timezone parameter to 'check_if_current_date_has_passed', also mainly to fix timezone problems when testing with GitHub Actions
- Adjusted log messages
- Added an ElementClickInterceptedException handler for clicking the pop-up ad and the 'load more result' button (see the sketch after this list)
- Added a NoSuchWindowException handler for scrolling down the browser window
- Added a finally block to ensure that the driver is closed
- Handled the case where the HTML content is None
- Added a list of CSS selectors for clicking the 'load more result' button
- Adjusted the data-saving process
- Removed the Month End Scraper
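
Several of the Selenium changes above (the switch to Firefox, the 0.1-second wait, the zero poll frequency, the click and window exception handlers, the 2,000-pixel scroll step, and the finally-block cleanup) fit together roughly as in the sketch below. This is a minimal illustration, not the repository's code: the selector list and URL are placeholders.

    from selenium import webdriver
    from selenium.common.exceptions import (ElementClickInterceptedException,
                                            NoSuchWindowException, TimeoutException)
    from selenium.webdriver.common.by import By
    from selenium.webdriver.support import expected_conditions as EC
    from selenium.webdriver.support.ui import WebDriverWait

    # Placeholder selectors; the commit adds a list like this for the
    # 'load more result' button.
    LOAD_MORE_CSS_SELECTORS = ['button.load-more', 'a.show-more-results']

    driver = webdriver.Firefox()  # switched from Chrome to Firefox
    try:
        driver.get('https://example.com/search')  # placeholder URL
        # timeout=0.1 and poll_frequency=0 make the wait re-check the
        # condition immediately instead of sleeping 0.5 s between polls.
        wait = WebDriverWait(driver, timeout=0.1, poll_frequency=0)
        for selector in LOAD_MORE_CSS_SELECTORS:
            try:
                button = wait.until(
                    EC.element_to_be_clickable((By.CSS_SELECTOR, selector)))
                button.click()
                break
            except (ElementClickInterceptedException, TimeoutException):
                continue  # a pop-up ad covered the button, or it never appeared
        try:
            # scroll down in 2,000-pixel steps
            driver.execute_script('window.scrollBy(0, 2000);')
        except NoSuchWindowException:
            pass  # the browser window disappeared mid-scroll
    finally:
        driver.quit()  # always release the browser, even after an error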
sakan811 committed Jun 17, 2024
1 parent bcee4b6 commit 745b615
Showing 3 changed files with 20 additions and 16 deletions.
japan_avg_hotel_price_finder/utils.py: 16 changes (10 additions, 6 deletions)
@@ -86,21 +86,23 @@ def find_missing_dates_in_db(sqlite_db: str) -> list:
     return missing_dates


-def check_db_if_all_date_was_scraped(db: str) -> None:
+def check_in_db_if_all_date_was_scraped(db: str, to_sqlite: bool = False) -> None:
     """
     Check inside the SQLite database if all dates of each month were scraped today.
     :param db: Path to the SQLite database.
+    :param to_sqlite: If True, load the data to the SQLite database, else save to CSV.
     :returns: None
     """
     logger.info(f"Checking in the SQLite database '{db}' if any date was not scraped today...")
     missing_dates = find_missing_dates_in_db(db)
-    scrape_missing_dates(db=db, missing_dates=missing_dates, to_sqlite=True)
+    scrape_missing_dates(db=db, missing_dates=missing_dates, to_sqlite=to_sqlite)


-def check_csv_if_all_date_was_scraped(directory) -> None:
+def check_in_csv_dir_if_all_date_was_scraped(directory: str = 'scraped_hotel_data_csv') -> None:
     """
     Check inside the CSV files directory if all dates of each month were scraped today.
     :param directory: Path to the CSV files directory.
+        Default is 'scraped_hotel_data_csv' folder.
     :returns: None
     """
     logger.info(f"Checking CSV files in the {directory} directory if all date was scraped today...")
@@ -115,15 +117,15 @@ def check_csv_if_all_date_was_scraped(directory) -> None:
             with sqlite3.connect(temp_db) as con:
                 df.to_sql('HotelPrice', con, if_exists='replace', index=False)

-                missing_dates = find_missing_dates_in_db(temp_db)
-                scrape_missing_dates(missing_dates=missing_dates)
+                check_in_db_if_all_date_was_scraped(temp_db)
         else:
             logger.warning("No CSV files were found")
     except FileNotFoundError as e:
         logger.error(e)
         logger.error(f"{directory} folder not found.")
     except Exception as e:
-        logger.error(f"An unexpected error occurred: {e}")
+        logger.error(e)
+        logger.error(f"An unexpected error occurred")

     if os.path.exists(temp_db):
         try:
@@ -308,6 +310,8 @@ def save_scraped_data(
         save_dir='scraped_hotel_data_csv') -> None:
     """
     Save scraped data to CSV or SQLite database.
+    The CSV files directory is created automatically if it doesn't exist.
+    The default CSV files directory name depends on the default value of the 'save_dir' parameter.
     :param dataframe: Pandas DataFrame.
     :param details_dataclass: Details dataclass object.
         Only needed if saving to SQLite database.
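
Read together, the two renamed helpers now share one code path: the CSV checker loads every CSV file into a temporary SQLite database and delegates to the DB checker. A hedged usage sketch follows; the database filename is a placeholder, only the two function signatures come from the diff above.

    from japan_avg_hotel_price_finder.utils import (
        check_in_csv_dir_if_all_date_was_scraped,
        check_in_db_if_all_date_was_scraped,
    )

    # CSV workflow: builds a temporary SQLite database from the CSV files,
    # then reuses the DB checker above to re-scrape any missing dates.
    check_in_csv_dir_if_all_date_was_scraped()  # defaults to 'scraped_hotel_data_csv'

    # SQLite workflow: to_sqlite=True keeps the re-scraped rows in the database
    # instead of writing them out as CSV ('hotel.db' is a placeholder name).
    check_in_db_if_all_date_was_scraped('hotel.db', to_sqlite=True)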
main.py: 12 changes (6 additions, 6 deletions)
@@ -17,8 +17,8 @@
 from japan_avg_hotel_price_finder.configure_logging import configure_logging_with_file
 from japan_avg_hotel_price_finder.scrape import BasicScraper
 from japan_avg_hotel_price_finder.thread_scrape import ThreadPoolScraper
-from japan_avg_hotel_price_finder.utils import check_csv_if_all_date_was_scraped, check_db_if_all_date_was_scraped, \
-    save_scraped_data
+from japan_avg_hotel_price_finder.utils import check_in_db_if_all_date_was_scraped, \
+    save_scraped_data, check_in_csv_dir_if_all_date_was_scraped
 from set_details import Details

 logger = configure_logging_with_file('jp_hotel_data.log', 'jp_hotel_data')
@@ -53,23 +53,23 @@
             data_tuple = thread_scrape.thread_scrape(max_workers=workers)
             df = data_tuple[0]
             save_scraped_data(dataframe=df, details_dataclass=details, to_sqlite=to_sqlite)
-            check_db_if_all_date_was_scraped(details.sqlite_name)
+            check_in_db_if_all_date_was_scraped(details.sqlite_name, to_sqlite=to_sqlite)
         else:
             df, city, check_in, check_out = thread_scrape.thread_scrape(max_workers=workers)
             save_scraped_data(dataframe=df, city=city, check_in=check_in,
                               check_out=check_out)
-            check_csv_if_all_date_was_scraped()
+            check_in_csv_dir_if_all_date_was_scraped()
     else:
         if to_sqlite:
             data_tuple = thread_scrape.thread_scrape()
             df = data_tuple[0]
             save_scraped_data(dataframe=df, details_dataclass=details, to_sqlite=to_sqlite)
-            check_db_if_all_date_was_scraped(details.sqlite_name)
+            check_in_db_if_all_date_was_scraped(details.sqlite_name, to_sqlite=to_sqlite)
         else:
             df, city, check_in, check_out = thread_scrape.thread_scrape()
             save_scraped_data(dataframe=df, city=city, check_in=check_in,
                               check_out=check_out)
-            check_csv_if_all_date_was_scraped()
+            check_in_csv_dir_if_all_date_was_scraped()
 elif args.scraper:
     logger.info('Using basic scraper')
     check_in = details.check_in
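
The net effect of the main.py hunk: each storage backend is re-checked with its matching helper, and the to_sqlite flag is forwarded instead of being hard-coded to True. A condensed sketch of the branch, not verbatim code; variable names are taken from the hunk above.

    # SQLite path: save to the database, then re-check it with the same flag.
    if to_sqlite:
        save_scraped_data(dataframe=df, details_dataclass=details, to_sqlite=to_sqlite)
        check_in_db_if_all_date_was_scraped(details.sqlite_name, to_sqlite=to_sqlite)
    # CSV path: save CSV files, then re-check the default CSV folder.
    else:
        save_scraped_data(dataframe=df, city=city, check_in=check_in, check_out=check_out)
        check_in_csv_dir_if_all_date_was_scraped()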
tests/test_utils.py: 8 changes (4 additions, 4 deletions)
@@ -7,8 +7,8 @@

 from japan_avg_hotel_price_finder.thread_scrape import ThreadPoolScraper
 from japan_avg_hotel_price_finder.utils import check_if_current_date_has_passed, find_missing_dates, find_csv_files, \
-    convert_csv_to_df, get_count_of_date_by_mth_asof_today_query, check_csv_if_all_date_was_scraped, \
-    check_db_if_all_date_was_scraped, save_scraped_data
+    convert_csv_to_df, get_count_of_date_by_mth_asof_today_query, \
+    check_in_db_if_all_date_was_scraped, save_scraped_data, check_in_csv_dir_if_all_date_was_scraped
 from set_details import Details

@@ -108,7 +108,7 @@ def test_check_if_all_date_was_scraped_csv() -> None:
     df, city, check_in, check_out = thread_scrape.thread_scrape(timezone=city_timezone, max_workers=5)
     save_scraped_data(dataframe=df, city=city, check_in=check_in,
                       check_out=check_out, save_dir=directory)
-    check_csv_if_all_date_was_scraped(directory)
+    check_in_csv_dir_if_all_date_was_scraped(directory)

     with sqlite3.connect(sqlite_name) as conn:
         directory = 'test_check_if_all_date_was_scraped_csv'
@@ -160,7 +160,7 @@ def test_check_if_all_date_was_scraped() -> None:
     data_tuple = thread_scrape.thread_scrape(timezone=city_timezone, max_workers=5)
     df = data_tuple[0]
     save_scraped_data(dataframe=df, details_dataclass=hotel_stay, to_sqlite=True)
-    check_db_if_all_date_was_scraped(hotel_stay.sqlite_name)
+    check_in_db_if_all_date_was_scraped(hotel_stay.sqlite_name)

     with sqlite3.connect(sqlite_name) as conn:
         query = get_count_of_date_by_mth_asof_today_query()
