From f8c6a7553f29eefe9b2da787aaaa9f72605e235b Mon Sep 17 00:00:00 2001
From: Sakan
Date: Sun, 2 Jun 2024 01:38:55 +0700
Subject: [PATCH] v2.5.0

- Added to_sqlite flag argument
- Added logic to prevent scraping days that have already passed in
  scrape_until_month_end.py and thread_scrape.py
- Added utils.py
- Deleted automated_scraper.py
---
 .github/workflows/scrape.yml                  |  24 +--
 README.md                                     |   9 +-
 automated_scraper.py                          | 199 ------------------
 .../scrape_until_month_end.py                 |   2 +-
 japan_avg_hotel_price_finder/thread_scrape.py |   4 +-
 japan_avg_hotel_price_finder/utils.py         |   1 -
 main.py                                       |   4 +-
 7 files changed, 25 insertions(+), 218 deletions(-)
 delete mode 100644 automated_scraper.py

diff --git a/.github/workflows/scrape.yml b/.github/workflows/scrape.yml
index bde1da1..7f5f696 100644
--- a/.github/workflows/scrape.yml
+++ b/.github/workflows/scrape.yml
@@ -23,40 +23,40 @@ jobs:
         run: pip install -r requirements.txt
 
       - name: Run Scraper For January
-        run: python automated_scraper.py --month=1
+        run: python main.py --thread_pool=True --month=1
 
       - name: Run Scraper For February
-        run: python automated_scraper.py --month=2
+        run: python main.py --thread_pool=True --month=2
 
       - name: Run Scraper For March
-        run: python automated_scraper.py --month=3
+        run: python main.py --thread_pool=True --month=3
 
       - name: Run Scraper For April
-        run: python automated_scraper.py --month=4
+        run: python main.py --thread_pool=True --month=4
 
       - name: Run Scraper For May
-        run: python automated_scraper.py --month=5
+        run: python main.py --thread_pool=True --month=5
 
       - name: Run Scraper For June
-        run: python automated_scraper.py --month=6
+        run: python main.py --thread_pool=True --month=6
 
       - name: Run Scraper For July
-        run: python automated_scraper.py --month=7
+        run: python main.py --thread_pool=True --month=7
 
       - name: Run Scraper For August
-        run: python automated_scraper.py --month=8
+        run: python main.py --thread_pool=True --month=8
 
       - name: Run Scraper For September
-        run: python automated_scraper.py --month=9
+        run: python main.py --thread_pool=True --month=9
 
       - name: Run Scraper For October
-        run: python automated_scraper.py --month=10
+        run: python main.py --thread_pool=True --month=10
 
       - name: Run Scraper For November
-        run: python automated_scraper.py --month=11
+        run: python main.py --thread_pool=True --month=11
 
       - name: Run Scraper For December
-        run: python automated_scraper.py --month=12
+        run: python main.py --thread_pool=True --month=12
 
       - id: 'auth'
         uses: 'google-github-actions/auth@v2'
diff --git a/README.md b/README.md
index 0db31a8..e7ccb96 100644
--- a/README.md
+++ b/README.md
@@ -75,7 +75,12 @@ This script can also be used to scrape data from other cities.
   ```
   python main.py --to_sqlite=True
   ```
-
+- Month to scrape can be specified using ```--month=(month number as int)``` for the Thread Pool and Month End Scrapers.
+  - For example, to scrape data for June of the current year using the Thread Pool Scraper, run the following command:
+    ```
+    python main.py --thread_pool=True --month=6
+    ```
+
 ### Dataclass
 [set_details.py](set_details.py)
 - Dataclass that stores booking details, date, and length of stay.
@@ -105,4 +110,4 @@ This script can also be used to scrape data from other cities.
 [automated_scraper.py](automated_scraper.py)
 - Scrape Osaka hotel data daily using GitHub action for all 12 months.
 - Save to CSV for each month.
-- Save CSV to Google Cloud Storage
+- Save CSV to Google Cloud Storage.
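Both changes above converge on the same entry point: `python main.py --thread_pool=True --month=N`. For a local dry run of the twelve monthly invocations the workflow performs, a minimal sketch (it assumes main.py sits in the working directory, as in the workflow; the loop and error handling are illustrative and not part of this patch):

```
import subprocess

# Mirror the workflow: invoke the thread-pool scraper once per month (1-12).
for month in range(1, 13):
    result = subprocess.run(
        ['python', 'main.py', '--thread_pool=True', f'--month={month}'],
        capture_output=True, text=True,
    )
    if result.returncode != 0:
        print(f'Month {month} failed:\n{result.stderr}')
```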
diff --git a/automated_scraper.py b/automated_scraper.py
deleted file mode 100644
index 6dddfdf..0000000
--- a/automated_scraper.py
+++ /dev/null
@@ -1,199 +0,0 @@
-#
-# Licensed under the Apache License, Version 2.0 (the "License");
-# you may not use this file except in compliance with the License.
-# You may obtain a copy of the License at
-#
-#     http://www.apache.org/licenses/LICENSE-2.0
-#
-# Unless required by applicable law or agreed to in writing, software
-# distributed under the License is distributed on an "AS IS" BASIS,
-# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-# See the License for the specific language governing permissions and
-# limitations under the License.
-
-import argparse
-import calendar
-from concurrent.futures import ThreadPoolExecutor
-from datetime import datetime, timedelta
-
-import pandas as pd
-from loguru import logger
-from pandas import DataFrame
-
-from japan_avg_hotel_price_finder.scrape import transform_data
-from set_details import Details
-from japan_avg_hotel_price_finder.thread_scrape import ThreadPoolScraper
-
-logger.add('osaka_hotel_weekly_scraper.log',
-           format="{time:YYYY-MM-DD at HH:mm:ss} | {level} | {thread} | {name} | {module} | {function} | {line} | {message}",
-           mode='w')
-
-
-class AutomatedThreadPoolScraper(ThreadPoolScraper):
-    def __init__(self, details: Details):
-        """
-        Scrape hotel data from the start day to the end of the same month using Thread Pool executor.
-        :param details: Details data class object.
-        """
-        super().__init__(details)
-
-    def thread_scrape(self) -> pd.DataFrame:
-        """
-        Scrape hotel data from the start day to the end of the same month using Thread Pool executor.
-        :return: Pandas dataframe containing hotel data.
-        """
-        logger.info('Scraping hotel data using Pool Thread executor...')
-
-        start_day = self.details.start_day
-
-        # Determine the last day of the given month
-        last_day: int = calendar.monthrange(self.details.year, self.details.month)[1]
-
-        # Define a list to store the result DataFrame from each thread
-        results = []
-
-        # Define a function to perform scraping for each date
-        def scrape_each_date(day) -> None:
-            """
-            Scrape hotel data of the given date.
-            :param day: Day of the month.
-            :return: None
-            """
-            logger.info('Scraping each date...')
-
-            current_date = datetime(self.details.year, self.details.month, day)
-            check_in = current_date.strftime('%Y-%m-%d')
-            check_out = (current_date + timedelta(days=self.details.nights)).strftime('%Y-%m-%d')
-
-            df = self.start_weekly_scraping_process(check_in, check_out)
-
-            # Append the result to the 'results' list
-            results.append(df)
-
-        # Create a thread pool with a maximum of 5 threads
-        with ThreadPoolExecutor(max_workers=5) as executor:
-            # Submit tasks for each date within the specified range
-            futures = [executor.submit(scrape_each_date, day) for day in range(start_day, last_day + 1)]
-
-            # Wait for all tasks to complete
-            for future in futures:
-                future.result()
-
-        # Concatenate all DataFrames in the 'results' list into a single DataFrame
-        df = pd.concat(results, ignore_index=True)
-
-        return df
-
-    def start_weekly_scraping_process(
-            self,
-            check_in: str,
-            check_out: str) -> pd.DataFrame:
-        """
-        Main function to start the web scraping process.
-        :param check_in: Check-in date.
-        :param check_out: Check-out date.
-        :return: None.
-        Return a Pandas DataFrame for testing purpose only.
-        """
-        logger.info("Starting web-scraping...")
-
-        city = self.details.city
-        group_adults = self.details.group_adults
-        group_children = self.details.group_children
-        num_rooms = self.details.num_rooms
-        selected_currency = self.details.selected_currency
-
-        url = (f'https://www.booking.com/searchresults.en-gb.html?ss={city}&checkin'
-               f'={check_in}&checkout={check_out}&group_adults={group_adults}'
-               f'&no_rooms={num_rooms}&group_children={group_children}'
-               f'&selected_currency={selected_currency}&nflt=ht_id%3D204')
-
-        dataframe = self._scrape(url)
-
-        df_filtered = None
-        # Create a DataFrame from the collected data
-        try:
-            df = pd.DataFrame(dataframe)
-            df['City'] = city
-
-            # Hotel data of the given date
-            df['Date'] = check_in
-
-            # Date which the data was collected
-            df['AsOf'] = datetime.now()
-
-            df_filtered = transform_data(df)
-        except ValueError as e:
-            logger.error(e)
-            logger.error(f'Error when creating a DataFrame for {check_in} to {check_out} data')
-        finally:
-            return df_filtered
-
-
-def automated_scraper_main(month: int, details: Details) -> None | DataFrame:
-    """
-    Automated scraper main function.
-    :param month: Month to start scraping.
-    :param details: HotelStay dataclass object.
-    :return: None
-    Return a Pandas DataFrame for testing purpose only.
-    """
-    details.month = month
-
-    # Initialize an empty DataFrame to collect all data
-    all_data = pd.DataFrame()
-
-    today = datetime.today()
-
-    # Can only scrape data from the current date onward
-    if month < today.month:
-        logger.info(
-            f'{calendar.month_name[month]} has already passed. The current month is {calendar.month_name[today.month]}'
-        )
-        all_data.to_csv(f'osaka_month_{month}_daily_hotel_data.csv', index=False)
-    else:
-        # Can only scrape data from the today onward
-        if month == today.month:
-            details.start_day = today.day
-
-        logger.info(f'Scraping data for {calendar.month_name[month]}...')
-
-        # Initialize and run the scraper
-        automated_scraper = AutomatedThreadPoolScraper(details)
-        df = automated_scraper.thread_scrape()
-
-        # Append the data to the all_data DataFrame
-        all_data = pd.concat([all_data, df], ignore_index=True)
-
-        # Save the collected data to a CSV file
-        all_data.to_csv(f'osaka_month_{month}_daily_hotel_data.csv', index=False)
-
-    return all_data
-
-
-if __name__ == '__main__':
-    # Define booking parameters for the hotel search.
-    city = 'Osaka'
-    group_adults = 1
-    num_rooms = 1
-    group_children = 0
-    selected_currency = 'USD'
-
-    today = datetime.today()
-    start_day: int = 1
-    year: int = today.year
-
-    details = Details(
-        city=city, group_adults=group_adults, num_rooms=num_rooms, group_children=group_children,
-        selected_currency=selected_currency, start_day=start_day, year=year
-    )
-
-    # Initialize argument parser
-    parser = argparse.ArgumentParser(description='Specify the month for data scraping.')
-    parser.add_argument('--month', type=int, help='Month to scrape data for (1-12)', required=True)
-    args = parser.parse_args()
-
-    # Extract the month value from the command line argument
-    month = args.month
-
-    automated_scraper_main(month, details)
diff --git a/japan_avg_hotel_price_finder/scrape_until_month_end.py b/japan_avg_hotel_price_finder/scrape_until_month_end.py
index af71477..cbd9e83 100644
--- a/japan_avg_hotel_price_finder/scrape_until_month_end.py
+++ b/japan_avg_hotel_price_finder/scrape_until_month_end.py
@@ -62,7 +62,7 @@ def scrape_until_month_end(self, to_sqlite: bool = False) -> None | pd.DataFrame
         while current_date <= end_date:
             current_date_has_passed: bool = check_if_current_date_has_passed(self.year, self.month, self.start_day)
             if current_date_has_passed:
-                logger.warning(f'The current day of the month to scrape was passed. Skip this day.')
+                logger.warning(f'The current day of the month to scrape has passed. Skipping {self.year}-{self.month}-{self.start_day}.')
             else:
                 check_in = current_date.strftime('%Y-%m-%d')
                 check_out = (current_date + timedelta(days=self.nights)).strftime('%Y-%m-%d')
diff --git a/japan_avg_hotel_price_finder/thread_scrape.py b/japan_avg_hotel_price_finder/thread_scrape.py
index 5491f57..8f8a971 100644
--- a/japan_avg_hotel_price_finder/thread_scrape.py
+++ b/japan_avg_hotel_price_finder/thread_scrape.py
@@ -48,7 +48,7 @@ def thread_scrape(self, to_sqlite: bool = False) -> None | pd.DataFrame:
         results = []
 
         # Define a function to perform scraping for each date
-        def scrape_each_date(day: int):
+        def scrape_each_date(day: int) -> None:
             """
             Scrape hotel data of the given date.
             :param day: Day of the month.
@@ -60,7 +60,7 @@ def scrape_each_date(day: int):
             current_date = datetime(self.year, self.month, day)
 
             if current_date_has_passed:
-                logger.warning(f'The current day of the month to scrape was passed. Skip this day.')
+                logger.warning(f'The current day of the month to scrape has passed. Skipping {self.year}-{self.month}-{day}.')
             else:
                 check_in: str = current_date.strftime('%Y-%m-%d')
                 check_out: str = (current_date + timedelta(days=self.nights)).strftime('%Y-%m-%d')
diff --git a/japan_avg_hotel_price_finder/utils.py b/japan_avg_hotel_price_finder/utils.py
index 8e8c896..5c5fafb 100644
--- a/japan_avg_hotel_price_finder/utils.py
+++ b/japan_avg_hotel_price_finder/utils.py
@@ -14,7 +14,6 @@ def check_if_current_date_has_passed(year, month, day):
     today_for_check = datetime.today().strftime('%Y-%m-%d')
     current_date_for_check = datetime(year, month, day).strftime('%Y-%m-%d')
     if current_date_for_check < today_for_check:
-        logger.warning(f'The current day of the month to scrape was passed. Skip {year}-{month}-{day}.')
         return True
     else:
         return False
diff --git a/main.py b/main.py
index 51461b5..f9e4c07 100644
--- a/main.py
+++ b/main.py
@@ -31,9 +31,11 @@
 parser.add_argument('--month_end', type=bool, default=False, help='Scrape until month end')
 parser.add_argument('--scraper', type=bool, default=True, help='Use basic scraper')
 parser.add_argument('--to_sqlite', type=bool, default=False, help='Save scraped data to SQLite')
+parser.add_argument('--month', type=int, help='Month to scrape data for (1-12)', required=True)
 args = parser.parse_args()
 
-details = Details()
+month = args.month
+details = Details(month=month)
 
 if args.thread_pool:
     logger.info('Using thread pool scraper')
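The skip logic that both scraper diffs rely on reduces to the string comparison in utils.py above. A minimal standalone sketch of the same check (the function body mirrors the patch, minus the loguru call; the dates in the __main__ demo are illustrative):

```
from datetime import datetime


def check_if_current_date_has_passed(year: int, month: int, day: int) -> bool:
    # '%Y-%m-%d' zero-pads month and day, so ISO date strings compare
    # correctly even as plain strings.
    today_for_check = datetime.today().strftime('%Y-%m-%d')
    current_date_for_check = datetime(year, month, day).strftime('%Y-%m-%d')
    return current_date_for_check < today_for_check


if __name__ == '__main__':
    # When scraping mid-month, days before today are skipped rather than scraped.
    for day in (1, 15, 28):
        if check_if_current_date_has_passed(2024, 6, day):
            print(f'2024-06-{day:02d} has passed; skipping')
        else:
            print(f'2024-06-{day:02d} is still scrapeable')
```

Note that the comparison is lexicographic; it is safe here only because strftime('%Y-%m-%d') zero-pads, keeping string order identical to chronological order.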