v4.3.0

- Added --workers to adjust pool thread workers for Thread Pool Scraper - Adjusted default pool thread workers to 5 - Use Firefox instead of Chrome - Adjusted wait time for clicking pop-up ad and load more result button while scrolling down to 0.1 seconds - Adjust WebDriver Wait poll frequency to 0 - Adjusted scroll-down length to 2,000 pixels - Use logging instead of loguru - Adjusted scripts to use the same logger for all scrapers - Added driver wait for clicking 'load more result' button - Fixed 'check_if_current_date_has_passed' function bug for Month End Scraper - Added more tests - Added logic to check in case the past year is entered for Thread Pool and Month End scraper - Added timezone parameter for Thread Pool and Month End scrapers so that they check the past date based on the entered timezone, mostly for fixing timezone problems when testing using GitHub Action - Added timezone parameter for 'check_if_current_date_has_passed' mostly for fixing timezone problems when testing using GitHub Action - Adjusted log message - Added ElementClickInterceptedException handler when clicking pop-up ad and the load more result button - Added NoSuchWindowException handler while scrolling down the browser window - Added finally block to ensure that the driver is closed - Handle case when HTML content is None. - Add CSS selector list for clicking load more result button - Adjusted save data process - Removed Month End scraper - Added check_missing_dates.py
sakan811 · Jun 17, 2024 · 8b608b1 · 8b608b1
1 parent f43eefc
commit 8b608b1
Show file tree

Hide file tree

Showing 5 changed files with 60 additions and 100 deletions.
diff --git a/README.md b/README.md
@@ -84,6 +84,22 @@ This script can also be used to scrape data from other cities.
   as using --month will make the scraper starts from the day specified in 'start_day' variable 
   in [set_details.py](set_details.py) 
 
+### To check for missing dates in the database or in the CSV files directory
+To ensure that all dates of the month were scraped when using the Thread Pool scraper, functions in
+[check_missing_dates.py](check_missing_dates.py) will check in the given SQLite database or CSV files directory.
+- To check in the database, use the following command line as an example:
+  ```  
+  python check_missing_dates.py --check_db=hotel_data.db
+  ``` 
+  - ```--check_db``` should be followed by the path to the database, without any quotes.
+- To check in the CSV files directory, use the following command line as an example:
+  ```  
+  python check_missing_dates.py --check_csv=scraped_hotel_data_csv
+  ``` 
+  - ```--check_csv``` should be followed by the path to the CSV files directory, without any quotes.
+- If there are missing dates, a Basic Scraper will automatically start to scrape those dates.
+- Only the data that was scraped today is checked for missing dates.
+
 ### Dataclass
 [set_details.py](set_details.py)
 - Dataclass that stores booking details, date, and length of stay.
@@ -105,6 +121,9 @@ This script can also be used to scrape data from other cities.
 [utils.py](japan_avg_hotel_price_finder%2Futils.py)
 - Contain utility functions.
 
+### [check_missing_dates.py](check_missing_dates.py)
+- Check the missing dates in the database or in the CSV files directory.
+
 ### Automated Hotel Scraper
 - Scrape Osaka hotel data daily using GitHub action for all 12 months.
   - Use the ThreadPool scraper from [main.py](main.py) with --month for each month.

diff --git a/check_missing_dates.py b/check_missing_dates.py
@@ -0,0 +1,23 @@
+import argparse
+
+from japan_avg_hotel_price_finder.configure_logging import configure_logging_with_file
+from japan_avg_hotel_price_finder.utils import check_in_db_if_all_date_was_scraped, \
+    check_in_csv_dir_if_all_date_was_scraped
+
+logger = configure_logging_with_file('jp_hotel_data.log', 'jp_hotel_data')
+
+parser = argparse.ArgumentParser(description='Parser that control which kind of missing dates checkers to use.')
+parser.add_argument('--check_db', type=str, default=False, help='Check missing dates in database')
+parser.add_argument('--check_csv', type=str, default=False, help='Check missing dates in CSV file directory')
+
+args = parser.parse_args()
+
+if args.check_db:
+    db = args.check_db
+    check_in_db_if_all_date_was_scraped(db=db, to_sqlite=True)
+elif args.check_csv:
+    directory = args.check_csv
+    directory = str(directory)
+    check_in_csv_dir_if_all_date_was_scraped(directory)
+else:
+    logger.warning('Please use --check_db or --check_csv')
diff --git a/japan_avg_hotel_price_finder/utils.py b/japan_avg_hotel_price_finder/utils.py
@@ -90,8 +90,9 @@ def check_in_db_if_all_date_was_scraped(db: str, to_sqlite: bool = False) -> Non
     """
     Check inside the SQLite database if all dates of each month were scraped today.
     :param db: Path to the SQLite database.
-    :param to_sqlite: If True, load the data to the SQLite database, else save to CSV.
-    :returns: None
+    :param to_sqlite: If True, after scraping the missing dates, load the scraped data to the SQLite database,
+                    else save to CSV.
+    :returns: None.
     """
     logger.info(f"Checking in the SQLite database '{db}' if any date was not scraped today...")
     missing_dates = find_missing_dates_in_db(db)

diff --git a/main.py b/main.py
@@ -17,8 +17,7 @@
 from japan_avg_hotel_price_finder.configure_logging import configure_logging_with_file
 from japan_avg_hotel_price_finder.scrape import BasicScraper
 from japan_avg_hotel_price_finder.thread_scrape import ThreadPoolScraper
-from japan_avg_hotel_price_finder.utils import check_in_db_if_all_date_was_scraped, \
-    save_scraped_data, check_in_csv_dir_if_all_date_was_scraped
+from japan_avg_hotel_price_finder.utils import save_scraped_data
 from set_details import Details
 
 logger = configure_logging_with_file('jp_hotel_data.log', 'jp_hotel_data')
@@ -53,21 +52,17 @@
             data_tuple = thread_scrape.thread_scrape(max_workers=workers)
             df = data_tuple[0]
             save_scraped_data(dataframe=df, details_dataclass=details, to_sqlite=to_sqlite)
-            check_in_db_if_all_date_was_scraped(details.sqlite_name, to_sqlite=to_sqlite)
         else:
             df, city, month_number, year = thread_scrape.thread_scrape(max_workers=workers)
             save_scraped_data(dataframe=df, city=city, month=month_number, year=year)
-            check_in_csv_dir_if_all_date_was_scraped()
     else:
         if to_sqlite:
             data_tuple = thread_scrape.thread_scrape()
             df = data_tuple[0]
             save_scraped_data(dataframe=df, details_dataclass=details, to_sqlite=to_sqlite)
-            check_in_db_if_all_date_was_scraped(details.sqlite_name, to_sqlite=to_sqlite)
         else:
             df, city, month_number, year = thread_scrape.thread_scrape()
             save_scraped_data(dataframe=df, city=city, month=month_number, year=year)
-            check_in_csv_dir_if_all_date_was_scraped()
 elif args.scraper:
     logger.info('Using basic scraper')
     check_in = details.check_in
@@ -87,3 +82,5 @@
 
 
 
+
+
diff --git a/tests/test_utils.py b/tests/test_utils.py
@@ -3,13 +3,10 @@
 from calendar import monthrange
 
 import pytest
-import pytz
 
-from japan_avg_hotel_price_finder.thread_scrape import ThreadPoolScraper
 from japan_avg_hotel_price_finder.utils import check_if_current_date_has_passed, find_missing_dates, find_csv_files, \
     convert_csv_to_df, get_count_of_date_by_mth_asof_today_query, \
-    check_in_db_if_all_date_was_scraped, save_scraped_data, check_in_csv_dir_if_all_date_was_scraped
-from set_details import Details
+    scrape_missing_dates
 
 
 def test_check_if_current_date_has_passed():
@@ -69,106 +66,29 @@ def test_convert_csv_to_df():
     df = convert_csv_to_df(csv_files)
     assert df is None
 
-@pytest.mark.skip
-def test_check_if_all_date_was_scraped_csv() -> None:
-    city = 'Osaka'
-    group_adults = 1
-    num_rooms = 1
-    group_children = 0
-    selected_currency = 'USD'
 
-    # Define the timezone
-    city_timezone = pytz.timezone('Asia/Singapore')
-
-    # Get the current date in the specified timezone
-    today = datetime.datetime.now(city_timezone).date()
-
-    start_day = 5
-
-    if today.month == 12:
-        month = 1
-        year = today.year + 1
-    else:
-        month = today.month + 1
-        year = today.year
-
-    nights = 1
-
-    sqlite_name = 'test_check_if_all_date_was_scraped_csv.db'
-
-    hotel_stay = Details(
-        city=city, group_adults=group_adults, num_rooms=num_rooms,
-        group_children=group_children, selected_currency=selected_currency,
-        start_day=start_day, month=month, year=year, nights=nights, sqlite_name=sqlite_name
-    )
-
-    directory = 'test_check_if_all_date_was_scraped_csv'
-
-    thread_scrape = ThreadPoolScraper(hotel_stay)
-    df, city, month, year = thread_scrape.thread_scrape(timezone=city_timezone, max_workers=2)
-    save_scraped_data(dataframe=df, city=city, month=month,
-                      year=year, save_dir=directory)
-    check_in_csv_dir_if_all_date_was_scraped(directory)
-
-    with sqlite3.connect(sqlite_name) as conn:
-        directory = 'test_check_if_all_date_was_scraped_csv'
-        csv_files: list = find_csv_files(directory)
-        if csv_files:
-            df = convert_csv_to_df(csv_files)
-            df.to_sql('HotelPrice', conn, if_exists='replace', index=False)
-
-        query = get_count_of_date_by_mth_asof_today_query()
-        result = conn.execute(query).fetchall()
-        days_in_month = monthrange(year, month)[1]
-        for row in result:
-            assert row[1] == days_in_month
-
-
-@pytest.mark.skip
-def test_check_if_all_date_was_scraped() -> None:
-    city = 'Osaka'
-    group_adults = 1
-    num_rooms = 1
-    group_children = 0
-    selected_currency = 'USD'
-
-    # Define the timezone
-    city_timezone = pytz.timezone('Asia/Singapore')
-
-    # Get the current date in the specified timezone
-    today = datetime.datetime.now(city_timezone).date()
-
-    start_day = 15
+def test_scrape_missing_dates() -> None:
+    db = 'test_scrape_missing_dates.db'
 
+    today = datetime.datetime.today()
     if today.month == 12:
         month = 1
         year = today.year + 1
     else:
         month = today.month + 1
         year = today.year
 
-    nights = 1
-
-    sqlite_name = 'test_check_if_all_date_was_scraped.db'
-
-    hotel_stay = Details(
-        city=city, group_adults=group_adults, num_rooms=num_rooms,
-        group_children=group_children, selected_currency=selected_currency,
-        start_day=start_day, month=month, year=year, nights=nights, sqlite_name=sqlite_name
-    )
-
-    thread_scrape = ThreadPoolScraper(hotel_stay)
-    data_tuple = thread_scrape.thread_scrape(timezone=city_timezone, max_workers=2)
-    df = data_tuple[0]
-    save_scraped_data(dataframe=df, details_dataclass=hotel_stay, to_sqlite=True)
-    check_in_db_if_all_date_was_scraped(hotel_stay.sqlite_name)
+    first_missing_date = f'{year}-{month}-01'
+    second_missing_date = f'{year}-{month}-11'
+    third_missing_date = f'{year}-{month}-20'
+    missing_dates = [first_missing_date, second_missing_date, third_missing_date]
+    scrape_missing_dates(db=db, missing_dates=missing_dates, to_sqlite=True)
 
-    with sqlite3.connect(sqlite_name) as conn:
+    with sqlite3.connect(db) as con:
         query = get_count_of_date_by_mth_asof_today_query()
-        result = conn.execute(query).fetchall()
-        days_in_month = monthrange(year, month)[1]
+        result = con.execute(query).fetchall()
         for row in result:
-            assert row[1] == days_in_month
+            assert row[1] == 3
 
 
 if __name__ == '__main__':