From f8c6a7553f29eefe9b2da787aaaa9f72605e235b Mon Sep 17 00:00:00 2001
From: Sakan
Date: Sun, 2 Jun 2024 01:38:55 +0700
Subject: [PATCH] v2.5.0

- Added to_sqlite flag argument
- Added logic to prevent scraping days that have already passed in
  scrape_until_month_end.py and thread_scrape.py
- Added utils.py
- Deleted automated_scraper.py
---
 .github/workflows/scrape.yml                  |  24 +--
 README.md                                     |   9 +-
 automated_scraper.py                          | 199 ------------------
 .../scrape_until_month_end.py                 |   2 +-
 japan_avg_hotel_price_finder/thread_scrape.py |   4 +-
 japan_avg_hotel_price_finder/utils.py         |   1 -
 main.py                                       |   4 +-
 7 files changed, 25 insertions(+), 218 deletions(-)
 delete mode 100644 automated_scraper.py

diff --git a/.github/workflows/scrape.yml b/.github/workflows/scrape.yml
index bde1da1..7f5f696 100644
--- a/.github/workflows/scrape.yml
+++ b/.github/workflows/scrape.yml
@@ -23,40 +23,40 @@ jobs:
         run: pip install -r requirements.txt
 
       - name: Run Scraper For January
-        run: python automated_scraper.py --month=1
+        run: python main.py --thread_pool=True --month=1
 
       - name: Run Scraper For February
-        run: python automated_scraper.py --month=2
+        run: python main.py --thread_pool=True --month=2
 
       - name: Run Scraper For March
-        run: python automated_scraper.py --month=3
+        run: python main.py --thread_pool=True --month=3
 
       - name: Run Scraper For April
-        run: python automated_scraper.py --month=4
+        run: python main.py --thread_pool=True --month=4
 
       - name: Run Scraper For May
-        run: python automated_scraper.py --month=5
+        run: python main.py --thread_pool=True --month=5
 
       - name: Run Scraper For June
-        run: python automated_scraper.py --month=6
+        run: python main.py --thread_pool=True --month=6
 
       - name: Run Scraper For July
-        run: python automated_scraper.py --month=7
+        run: python main.py --thread_pool=True --month=7
 
       - name: Run Scraper For August
-        run: python automated_scraper.py --month=8
+        run: python main.py --thread_pool=True --month=8
 
       - name: Run Scraper For September
-        run: python automated_scraper.py --month=9
+        run: python main.py --thread_pool=True --month=9
 
       - name: Run Scraper For October
-        run: python automated_scraper.py --month=10
+        run: python main.py --thread_pool=True --month=10
 
       - name: Run Scraper For November
-        run: python automated_scraper.py --month=11
+        run: python main.py --thread_pool=True --month=11
 
       - name: Run Scraper For December
-        run: python automated_scraper.py --month=12
+        run: python main.py --thread_pool=True --month=12
 
       - id: 'auth'
         uses: 'google-github-actions/auth@v2'
diff --git a/README.md b/README.md
index 0db31a8..e7ccb96 100644
--- a/README.md
+++ b/README.md
@@ -75,7 +75,12 @@ This script can also be used to scrape data from other cities.
   ```
   python main.py --to_sqlite=True
   ```
-
+- Month to scrape can be specified using ```--month=(month number as int)``` for the Thread Pool and Month End Scrapers.
+  - For example, to scrape data for June of the current year using the Thread Pool Scraper, run the following command:
+    ```
+    python main.py --thread_pool=True --month=6
+    ```
+
 ### Dataclass
 [set_details.py](set_details.py)
 - Dataclass that stores booking details, date, and length of stay.
@@ -105,4 +110,4 @@ This script can also be used to scrape data from other cities.
 [automated_scraper.py](automated_scraper.py)
 - Scrape Osaka hotel data daily using GitHub action for all 12 months.
 - Save to CSV for each month.
-- Save CSV to Google Cloud Storage
+- Save CSV to Google Cloud Storage.
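Both changes above converge on the same entry point: `python main.py --thread_pool=True --month=N`. For a local dry run of the twelve monthly invocations the workflow performs, a minimal sketch (it assumes main.py sits in the working directory, as in the workflow; the loop and error handling are illustrative and not part of this patch):

```
import subprocess

# Mirror the workflow: invoke the thread-pool scraper once per month (1-12).
for month in range(1, 13):
    result = subprocess.run(
        ['python', 'main.py', '--thread_pool=True', f'--month={month}'],
        capture_output=True, text=True,
    )
    if result.returncode != 0:
        print(f'Month {month} failed:\n{result.stderr}')
```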
diff --git a/automated_scraper.py b/automated_scraper.py
deleted file mode 100644
index 6dddfdf..0000000
--- a/automated_scraper.py
+++ /dev/null
@@ -1,199 +0,0 @@
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import calendar
-from concurrent.futures import ThreadPoolExecutor
-from datetime import datetime, timedelta
-
-import pandas as pd
-from loguru import logger
-from pandas import DataFrame
-
-from japan_avg_hotel_price_finder.scrape import transform_data
-from set_details import Details
-from japan_avg_hotel_price_finder.thread_scrape import ThreadPoolScraper
-
-logger.add('osaka_hotel_weekly_scraper.log',
-           format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {thread} | {name} | {module} | {function} | {line} | {message}",
-           mode='w')
-
-
-class AutomatedThreadPoolScraper(ThreadPoolScraper):
-    def __init__(self, details: Details):
-        """
-        Scrape hotel data from the start day to the end of the same month using Thread Pool executor.
-        :param details: Details data class object.
-        """
-        super().__init__(details)
-
-    def thread_scrape(self) -> pd.DataFrame:
-        """
-        Scrape hotel data from the start day to the end of the same month using Thread Pool executor.
-        :return: Pandas dataframe containing hotel data.
-        """
-        logger.info('Scraping hotel data using Pool Thread executor...')
-
-        start_day = self.details.start_day
-
-        # Determine the last day of the given month
-        last_day: int = calendar.monthrange(self.details.year, self.details.month)[1]
-
-        # Define a list to store the result DataFrame from each thread
-        results = []
-
-        # Define a function to perform scraping for each date
-        def scrape_each_date(day) -> None:
-            """
-            Scrape hotel data of the given date.
-            :param day: Day of the month.
-            :return: None
-            """
-            logger.info('Scraping each date...')
-
-            current_date = datetime(self.details.year, self.details.month, day)
-            check_in = current_date.strftime('%Y-%m-%d')
-            check_out = (current_date + timedelta(days=self.details.nights)).strftime('%Y-%m-%d')
-
-            df = self.start_weekly_scraping_process(check_in, check_out)
-
-            # Append the result to the 'results' list
-            results.append(df)
-
-        # Create a thread pool with a maximum of 5 threads
-        with ThreadPoolExecutor(max_workers=5) as executor:
-            # Submit tasks for each date within the specified range
-            futures = [executor.submit(scrape_each_date, day) for day in range(start_day, last_day + 1)]
-
-            # Wait for all tasks to complete
-            for future in futures:
-                future.result()
-
-        # Concatenate all DataFrames in the 'results' list into a single DataFrame
-        df = pd.concat(results, ignore_index=True)
-
-        return df
-
-    def start_weekly_scraping_process(
-            self,
-            check_in: str,
-            check_out: str) -> pd.DataFrame:
-        """
-        Main function to start the web scraping process.
-        :param check_in: Check-in date.
-        :param check_out: Check-out date.
-        :return: None.
-        Return a Pandas DataFrame for testing purpose only.
-        """
-        logger.info("Starting web-scraping...")
-
-        city = self.details.city
-        group_adults = self.details.group_adults
-        group_children = self.details.group_children
-        num_rooms = self.details.num_rooms
-        selected_currency = self.details.selected_currency
-
-        url = (f'https://www.booking.com/searchresults.en-gb.html?ss={city}&checkin'
-               f'={check_in}&checkout={check_out}&group_adults={group_adults}'
-               f'&no_rooms={num_rooms}&group_children={group_children}'
-               f'&selected_currency={selected_currency}&nflt=ht_id%3D204')
-
-        dataframe = self._scrape(url)
-
-        df_filtered = None
-        # Create a DataFrame from the collected data
-        try:
-            df = pd.DataFrame(dataframe)
-            df['City'] = city
-
-            # Hotel data of the given date
-            df['Date'] = check_in
-
-            # Date which the data was collected
-            df['AsOf'] = datetime.now()
-
-            df_filtered = transform_data(df)
-        except ValueError as e:
-            logger.error(e)
-            logger.error(f'Error when creating a DataFrame for {check_in} to {check_out} data')
-        finally:
-            return df_filtered
-
-
-def automated_scraper_main(month: int, details: Details) -> None | DataFrame:
-    """
-    Automated scraper main function.
-    :param month: Month to start scraping.
-    :param details: HotelStay dataclass object.
-    :return: None
-    Return a Pandas DataFrame for testing purpose only.
-    """
-    details.month = month
-
-    # Initialize an empty DataFrame to collect all data
-    all_data = pd.DataFrame()
-
-    today = datetime.today()
-
-    # Can only scrape data from the current date onward
-    if month < today.month:
-        logger.info(
-            f'{calendar.month_name[month]} has already passed. The current month is {calendar.month_name[today.month]}'
-        )
-        all_data.to_csv(f'osaka_month_{month}_daily_hotel_data.csv', index=False)
-    else:
-        # Can only scrape data from the today onward
-        if month == today.month:
-            details.start_day = today.day
-
-        logger.info(f'Scraping data for {calendar.month_name[month]}...')
-
-        # Initialize and run the scraper
-        automated_scraper = AutomatedThreadPoolScraper(details)
-        df = automated_scraper.thread_scrape()
-
-        # Append the data to the all_data DataFrame
-        all_data = pd.concat([all_data, df], ignore_index=True)
-
-        # Save the collected data to a CSV file
-        all_data.to_csv(f'osaka_month_{month}_daily_hotel_data.csv', index=False)
-
-    return all_data
-
-
-if __name__ == '__main__':
-    # Define booking parameters for the hotel search.
-    city = 'Osaka'
-    group_adults = 1
-    num_rooms = 1
-    group_children = 0
-    selected_currency = 'USD'
-
-    today = datetime.today()
-    start_day: int = 1
-    year: int = today.year
-
-    details = Details(
-        city=city, group_adults=group_adults, num_rooms=num_rooms, group_children=group_children,
-        selected_currency=selected_currency, start_day=start_day, year=year
-    )
-
-    # Initialize argument parser
-    parser = argparse.ArgumentParser(description='Specify the month for data scraping.')
-    parser.add_argument('--month', type=int, help='Month to scrape data for (1-12)', required=True)
-    args = parser.parse_args()
-
-    # Extract the month value from the command line argument
-    month = args.month
-
-    automated_scraper_main(month, details)
diff --git a/japan_avg_hotel_price_finder/scrape_until_month_end.py b/japan_avg_hotel_price_finder/scrape_until_month_end.py
index af71477..cbd9e83 100644
--- a/japan_avg_hotel_price_finder/scrape_until_month_end.py
+++ b/japan_avg_hotel_price_finder/scrape_until_month_end.py
@@ -62,7 +62,7 @@ def scrape_until_month_end(self, to_sqlite: bool = False) -> None | pd.DataFrame
         while current_date <= end_date:
             current_date_has_passed: bool = check_if_current_date_has_passed(self.year, self.month, self.start_day)
             if current_date_has_passed:
-                logger.warning(f'The current day of the month to scrape was passed. Skip this day.')
+                logger.warning(f'The current day of the month to scrape has passed. Skipping {self.year}-{self.month}-{self.start_day}.')
             else:
                 check_in = current_date.strftime('%Y-%m-%d')
                 check_out = (current_date + timedelta(days=self.nights)).strftime('%Y-%m-%d')
diff --git a/japan_avg_hotel_price_finder/thread_scrape.py b/japan_avg_hotel_price_finder/thread_scrape.py
index 5491f57..8f8a971 100644
--- a/japan_avg_hotel_price_finder/thread_scrape.py
+++ b/japan_avg_hotel_price_finder/thread_scrape.py
@@ -48,7 +48,7 @@ def thread_scrape(self, to_sqlite: bool = False) -> None | pd.DataFrame:
         results = []
 
         # Define a function to perform scraping for each date
-        def scrape_each_date(day: int):
+        def scrape_each_date(day: int) -> None:
             """
             Scrape hotel data of the given date.
             :param day: Day of the month.
@@ -60,7 +60,7 @@ def scrape_each_date(day: int):
             current_date = datetime(self.year, self.month, day)
 
             if current_date_has_passed:
-                logger.warning(f'The current day of the month to scrape was passed. Skip this day.')
+                logger.warning(f'The current day of the month to scrape has passed. Skipping {self.year}-{self.month}-{day}.')
             else:
                 check_in: str = current_date.strftime('%Y-%m-%d')
                 check_out: str = (current_date + timedelta(days=self.nights)).strftime('%Y-%m-%d')
diff --git a/japan_avg_hotel_price_finder/utils.py b/japan_avg_hotel_price_finder/utils.py
index 8e8c896..5c5fafb 100644
--- a/japan_avg_hotel_price_finder/utils.py
+++ b/japan_avg_hotel_price_finder/utils.py
@@ -14,7 +14,6 @@ def check_if_current_date_has_passed(year, month, day):
     today_for_check = datetime.today().strftime('%Y-%m-%d')
     current_date_for_check = datetime(year, month, day).strftime('%Y-%m-%d')
     if current_date_for_check < today_for_check:
-        logger.warning(f'The current day of the month to scrape was passed. Skip {year}-{month}-{day}.')
         return True
     else:
         return False
diff --git a/main.py b/main.py
index 51461b5..f9e4c07 100644
--- a/main.py
+++ b/main.py
@@ -31,9 +31,11 @@
 parser.add_argument('--month_end', type=bool, default=False, help='Scrape until month end')
 parser.add_argument('--scraper', type=bool, default=True, help='Use basic scraper')
 parser.add_argument('--to_sqlite', type=bool, default=False, help='Save scraped data to SQLite')
+parser.add_argument('--month', type=int, help='Month to scrape data for (1-12)', required=True)
 args = parser.parse_args()
 
-details = Details()
+month = args.month
+details = Details(month=month)
 
 if args.thread_pool:
     logger.info('Using thread pool scraper')
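The skip logic that both scraper diffs rely on reduces to the string comparison in utils.py above. A minimal standalone sketch of the same check (the function body mirrors the patch, minus the loguru call; the dates in the __main__ demo are illustrative):

```
from datetime import datetime


def check_if_current_date_has_passed(year: int, month: int, day: int) -> bool:
    # '%Y-%m-%d' zero-pads month and day, so ISO date strings compare
    # correctly even as plain strings.
    today_for_check = datetime.today().strftime('%Y-%m-%d')
    current_date_for_check = datetime(year, month, day).strftime('%Y-%m-%d')
    return current_date_for_check < today_for_check


if __name__ == '__main__':
    # When scraping mid-month, days before today are skipped rather than scraped.
    for day in (1, 15, 28):
        if check_if_current_date_has_passed(2024, 6, day):
            print(f'2024-06-{day:02d} has passed; skipping')
        else:
            print(f'2024-06-{day:02d} is still scrapeable')
```

Note that the comparison is lexicographic; it is safe here only because strftime('%Y-%m-%d') zero-pads, keeping string order identical to chronological order.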