Commit c061458 (tag v6.0.5)
- Adjusted loggers
sakan811 committed Aug 1, 2024
1 parent 09b8d8e commit c061458
Showing 10 changed files with 103 additions and 82 deletions.
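In short, the commit splits logging into two streams: user-facing progress and error messages now go through a shared main_logger defined in configure_logging.py, while each module keeps its own file-backed logger (renamed script_logger) for debug-level detail. A minimal sketch of the resulting pattern, assuming an illustrative module name and messages (not code from the repository):

# Illustrative only: what a module in this repository looks like after this commit.
from japan_avg_hotel_price_finder.configure_logging import configure_logging_with_file, main_logger

# Per-module logger, kept for verbose debug output written to its own log file.
script_logger = configure_logging_with_file(
    log_dir='logs', log_file='example_module.log', logger_name='example_module')


def do_work() -> None:
    main_logger.info('Starting work...')        # user-facing progress goes to the shared logger
    script_logger.debug('Internal state: ...')  # low-level detail stays in the module's own log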
8 changes: 4 additions & 4 deletions automated_scraper.py
@@ -4,10 +4,10 @@
import os
from dataclasses import dataclass

from japan_avg_hotel_price_finder.configure_logging import configure_logging_with_file
from japan_avg_hotel_price_finder.configure_logging import configure_logging_with_file, main_logger
from japan_avg_hotel_price_finder.whole_mth_graphql_scraper import WholeMonthGraphQLScraper

logger = configure_logging_with_file(log_dir='logs', log_file='automated_scraper.log', logger_name='automated_scraper')
script_logger = configure_logging_with_file(log_dir='logs', log_file='automated_scraper.log', logger_name='automated_scraper')

# Initialize argument parser
parser = argparse.ArgumentParser(description='Parser that control which kind of scraper to use.')
@@ -26,7 +26,7 @@ async def main(self):
try:
os.makedirs(path, exist_ok=True)
except OSError as e:
logger.error(f"Error creating directory '{path}': {e}")
main_logger.error(f"Error creating directory '{path}': {e}")

csv_file_name = f'{self.city}_hotel_data_{month_name}_{self.year}.csv'
csv_file_path = os.path.join(path, csv_file_name)
@@ -38,7 +38,7 @@ async def main(self):
scraper = AutomatedScraper()

if args.month:
logger.info(f'Setting month to scrape to {args.month} for {scraper.__class__.__name__}...')
main_logger.info(f'Setting month to scrape to {args.month} for {scraper.__class__.__name__}...')
scraper = AutomatedScraper(month=args.month)

asyncio.run(scraper.main())
3 changes: 3 additions & 0 deletions japan_avg_hotel_price_finder/configure_logging.py
@@ -56,3 +56,6 @@ def configure_logging_with_file(log_dir: str, log_file: str, logger_name: str =
logger.addHandler(stream_handler)

return logger


main_logger = configure_logging_with_file(log_dir='logs', log_file='main.log', logger_name='main', level="INFO")
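Only the tail of configure_logging_with_file (attaching the stream handler and returning the logger) is visible in this hunk. For context, a plausible reconstruction of the whole helper, assuming a standard file-plus-console setup; the body, format string, and default level below are assumptions rather than the repository's code, and only the final main_logger line is what the commit actually adds:

import logging
import os


def configure_logging_with_file(log_dir: str, log_file: str, logger_name: str = None,
                                level: str = 'DEBUG') -> logging.Logger:
    """Assumed sketch: create a logger that writes to log_dir/log_file and to the console."""
    os.makedirs(log_dir, exist_ok=True)

    logger = logging.getLogger(logger_name)
    logger.setLevel(getattr(logging, level))

    formatter = logging.Formatter('%(asctime)s | %(name)s | %(levelname)s | %(message)s')

    file_handler = logging.FileHandler(os.path.join(log_dir, log_file))
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)
    logger.addHandler(stream_handler)

    return logger


# The line added by this commit: a shared INFO-level logger importable by every module.
main_logger = configure_logging_with_file(log_dir='logs', log_file='main.log', logger_name='main', level='INFO')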
42 changes: 24 additions & 18 deletions japan_avg_hotel_price_finder/graphql_scraper.py
@@ -4,15 +4,15 @@
import aiohttp
import pandas as pd

from japan_avg_hotel_price_finder.configure_logging import configure_logging_with_file
from japan_avg_hotel_price_finder.configure_logging import configure_logging_with_file, main_logger
from japan_avg_hotel_price_finder.graphql_scraper_func.graphql_data_extractor import extract_hotel_data
from japan_avg_hotel_price_finder.graphql_scraper_func.graphql_data_transformer import transform_data_in_df
from japan_avg_hotel_price_finder.graphql_scraper_func.graphql_request_func import get_header, fetch_hotel_data
from japan_avg_hotel_price_finder.graphql_scraper_func.graphql_utils_func import concat_df_list, check_city_data, \
check_currency_data, check_hotel_filter_data
from set_details import Details

logger = configure_logging_with_file(log_dir='logs', log_file='graphql_scraper.log', logger_name='graphql_scraper')
script_logger = configure_logging_with_file(log_dir='logs', log_file='graphql_scraper.log', logger_name='graphql_scraper')


@dataclass
@@ -25,35 +25,40 @@ async def scrape_graphql(self) -> pd.DataFrame:
Scrape hotel data from GraphQL endpoint using async.
:return: DataFrame containing hotel data from GraphQL endpoint
"""
logger.info("Start scraping data from GraphQL endpoint...")
logger.info(
main_logger.info("Start scraping data from GraphQL endpoint...")

script_logger.debug(
f"City: {self.city} | Check-in: {self.check_in} | Check-out: {self.check_out} | Currency: {self.selected_currency}")
logger.info(f"Adults: {self.group_adults} | Children: {self.group_children} | Rooms: {self.num_rooms}")
logger.info(f"Only hotel properties: {self.scrape_only_hotel}")
script_logger.debug(f"Adults: {self.group_adults} | Children: {self.group_children} | Rooms: {self.num_rooms}")
script_logger.debug(f"Only hotel properties: {self.scrape_only_hotel}")

if self.city and self.check_in and self.check_out and self.selected_currency:
url = f'https://www.booking.com/dml/graphql?selected_currency={self.selected_currency}'
headers = get_header()
graphql_query = self.get_graphql_query()

# get a response with Async
async with aiohttp.ClientSession() as session:
async with session.post(url, headers=headers, json=graphql_query) as response:
if response.status == 200:
data = await response.json()
total_page_num, hotel_data_dict = await self.check_info(data)
else:
logger.error(f"Error: {response.status}")
main_logger.error(f"Error: {response.status}")
return pd.DataFrame()

logger.debug(f"Total page number: {total_page_num}")
script_logger.debug(f"Total page number: {total_page_num}")

if total_page_num:
df_list = []
logger.info("Scraping data from GraphQL endpoint...")
main_logger.info("Scraping data from GraphQL endpoint...")

# fetch hotel data with Async
async with aiohttp.ClientSession() as session:
tasks = []
for offset in range(0, total_page_num, 100):
script_logger.debug(f'Fetch data from page-offset: {offset}')

graphql_query = self.get_graphql_query(page_offset=offset)
tasks.append(fetch_hotel_data(session, url, headers, graphql_query))

@@ -67,13 +72,13 @@ async def scrape_graphql(self) -> pd.DataFrame:
df = concat_df_list(df_list)
return transform_data_in_df(self.check_in, self.city, df)
else:
logger.warning("No hotel data was found. Return an empty DataFrame.")
main_logger.warning("No hotel data was found. Return an empty DataFrame.")
return pd.DataFrame()
else:
logger.warning("Total page number not found. Return an empty DataFrame.")
main_logger.warning("Total page number not found. Return an empty DataFrame.")
return pd.DataFrame()
else:
logger.warning("Error: city, check_in, check_out and selected_currency are required")
main_logger.warning("Error: city, check_in, check_out and selected_currency are required")
return pd.DataFrame()

def get_graphql_query(self, page_offset: int = 0) -> dict:
@@ -82,7 +87,7 @@ def get_graphql_query(self, page_offset: int = 0) -> dict:
:param page_offset: The offset for pagination, default is 0.
:return: Graphql query as a dictionary.
"""
logger.debug("Getting graphql query...")
script_logger.debug("Getting graphql query...")
if self.scrape_only_hotel:
selected_filter = {"selectedFilters": "ht_id=204"}
else:
@@ -463,11 +468,12 @@ async def check_info(self, data: dict) -> tuple:
:param data: Data from GraphQL response.
:return: Total page number and hotel data as a dictionary.
"""
main_logger.info('Checking whether entered data matches the data from GraphQL response...')
try:
total_page_num = data['data']['searchQueries']['search']['pagination']['nbResultsTotal']
except TypeError:
logger.error("TypeError: Total page number not found.")
logger.error("Return 0 as total page number")
main_logger.error("TypeError: Total page number not found.")
main_logger.error("Return 0 as total page number")
total_page_num = 0

if total_page_num:
@@ -490,11 +496,11 @@

for key, value in data_mapping.items():
entered_value = getattr(self, key, None)
logger.debug(f'Entered Value {key}: {entered_value}')
logger.debug(f'Response Value {key}: {value}')
script_logger.debug(f'Entered Value {key}: {entered_value}')
script_logger.debug(f'Response Value {key}: {value}')
if entered_value != value:
error_message = f"Error {key.replace('_', ' ').title()} not match: {entered_value} != {value}"
logger.error(error_message)
main_logger.error(error_message)
raise SystemExit(error_message)
else:
data_mapping = {
japan_avg_hotel_price_finder/graphql_scraper_func/graphql_data_transformer.py
@@ -2,9 +2,10 @@

import pandas as pd

from japan_avg_hotel_price_finder.configure_logging import configure_logging_with_file
from japan_avg_hotel_price_finder.configure_logging import configure_logging_with_file, main_logger

logger = configure_logging_with_file(log_dir='logs', log_file='graphql_data_transformer.log', logger_name='graphql_data_transformer')
script_logger = configure_logging_with_file(log_dir='logs', log_file='graphql_data_transformer.log',
logger_name='graphql_data_transformer')


def transform_data_in_df(check_in, city, dataframe) -> pd.DataFrame:
@@ -16,30 +17,30 @@ def transform_data_in_df(check_in, city, dataframe) -> pd.DataFrame:
:return: Pandas DataFrame.
"""
if not dataframe.empty:
logger.info("Add City column to DataFrame")
main_logger.info("Add City column to DataFrame")
dataframe['City'] = city
logger.info("Add Date column to DataFrame")
main_logger.info("Add Date column to DataFrame")
dataframe['Date'] = check_in
logger.info("Add AsOf column to DataFrame")
main_logger.info("Add AsOf column to DataFrame")
dataframe['AsOf'] = datetime.datetime.now()

logger.info("Remove duplicate rows from the DataFrame based on 'Hotel' column")
main_logger.info("Remove duplicate rows from the DataFrame based on 'Hotel' column")
df_filtered = dataframe.drop_duplicates(subset='Hotel').copy()

logger.info("Convert columns to numeric values")
main_logger.info("Convert columns to numeric values")
df_filtered.loc[:, 'Price'] = pd.to_numeric(df_filtered['Price'], errors='coerce')
df_filtered.loc[:, 'Review'] = pd.to_numeric(df_filtered['Review'], errors='coerce')

# Drop rows where any of the 'Hotel', 'Review', 'Price' columns are None or NaN
logger.info("Dropping rows where 'Hotel', 'Review', or 'Price' columns are None or NaN")
main_logger.info("Dropping rows where 'Hotel', 'Review', or 'Price' columns are None or NaN")
df_filtered = df_filtered.dropna(subset=['Hotel', 'Review', 'Price'])

logger.info("Dropping rows where 'Review', or 'Price' columns are 0")
main_logger.info("Dropping rows where 'Review', or 'Price' columns are 0")
df_filtered = df_filtered[(df_filtered['Price'] != 0) & (df_filtered['Review'] != 0)]

logger.info("Calculate the Price/Review ratio")
main_logger.info("Calculate the Price/Review ratio")
df_filtered.loc[:, 'Price/Review'] = df_filtered['Price'] / df_filtered['Review']
return df_filtered
else:
logger.warning("Dataframe is empty. No data was scraped.")
main_logger.warning("Dataframe is empty. No data was scraped.")
return dataframe
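To see what this transformation does end to end, a quick usage sketch; the sample rows and city below are invented for illustration, and only the column names and the import path follow the code in this commit:

import pandas as pd

from japan_avg_hotel_price_finder.graphql_scraper_func.graphql_data_transformer import transform_data_in_df

# Invented sample: a duplicated hotel, a zero-review entry, and a row with a missing price.
sample = pd.DataFrame({
    'Hotel': ['Hotel A', 'Hotel A', 'Hotel B', 'Hotel C'],
    'Price': [12000, 12000, 9500, None],
    'Review': [8.4, 8.4, 0, 7.9],
})

# One 'Hotel A' row survives; 'Hotel B' (Review == 0) and 'Hotel C' (missing Price) are dropped,
# and the City, Date, AsOf and Price/Review columns are added.
result = transform_data_in_df('2024-08-01', 'Osaka', sample)
print(result)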
japan_avg_hotel_price_finder/graphql_scraper_func/graphql_request_func.py
@@ -3,9 +3,9 @@
from aiohttp import ClientSession
from dotenv import load_dotenv

from japan_avg_hotel_price_finder.configure_logging import configure_logging_with_file
from japan_avg_hotel_price_finder.configure_logging import configure_logging_with_file, main_logger

logger = configure_logging_with_file(log_dir='logs', log_file='graphql_request_func.log', logger_name='graphql_request_func')
script_logger = configure_logging_with_file(log_dir='logs', log_file='graphql_request_func.log', logger_name='graphql_request_func')


# Load environment variables from .env file
@@ -17,7 +17,7 @@ def get_header() -> dict:
Return header.
:return: Header as a dictionary.
"""
logger.info("Getting header...")
main_logger.info("Getting header...")
return {
"User-Agent": os.getenv("USER_AGENT"),
}
@@ -38,11 +38,11 @@ async def fetch_hotel_data(session: ClientSession, url: str, headers: dict, grap
try:
return data['data']['searchQueries']['search']['results']
except (ValueError, KeyError) as e:
logger.error(f"Error extracting hotel data: {e}")
main_logger.error(f"Error extracting hotel data: {e}")
return []
except Exception as e:
logger.error(f"Unexpected error: {e}")
main_logger.error(f"Unexpected error: {e}")
return []
else:
logger.error(f"Error: {response.status}")
main_logger.error(f"Error: {response.status}")
return []
japan_avg_hotel_price_finder/graphql_scraper_func/graphql_utils_func.py
@@ -1,9 +1,9 @@
import pandas as pd

from japan_avg_hotel_price_finder.configure_logging import configure_logging_with_file
from japan_avg_hotel_price_finder.configure_logging import configure_logging_with_file, main_logger

logger = configure_logging_with_file(log_dir='logs', log_file='graphql_utils_func.log',
logger_name='graphql_utils_func')
script_logger = configure_logging_with_file(log_dir='logs', log_file='graphql_utils_func.log',
logger_name='graphql_utils_func')


def concat_df_list(df_list: list[pd.DataFrame]) -> pd.DataFrame:
@@ -12,12 +12,12 @@ def concat_df_list(df_list: list[pd.DataFrame]) -> pd.DataFrame:
:param df_list: A list of Pandas Dataframes.
:return: Pandas DataFrame.
"""
logger.info("Concatenate a list of Pandas Dataframes")
main_logger.info("Concatenate a list of Pandas Dataframes")
if df_list:
df_main = pd.concat(df_list)
return df_main
else:
logger.warning("No data was scraped.")
main_logger.warning("No data was scraped.")
return pd.DataFrame()


@@ -27,7 +27,7 @@ def check_currency_data(data) -> str:
:param data: GraphQL response as JSON.
:return: City name.
"""
logger.info("Checking currency data from the GraphQL response...")
main_logger.info("Checking currency data from the GraphQL response...")
selected_currency_data = None
try:
for result in data['data']['searchQueries']['search']['results']:
@@ -37,9 +37,9 @@ def check_currency_data(data) -> str:
selected_currency_data = block['finalPrice']['currency']
break
except KeyError:
logger.error('KeyError: Currency data not found')
main_logger.error('KeyError: Currency data not found')
except IndexError:
logger.error('IndexError: Currency data not found')
main_logger.error('IndexError: Currency data not found')
return selected_currency_data


@@ -49,7 +49,7 @@ def check_city_data(data) -> str:
:param data: GraphQL response as JSON.
:return: City name.
"""
logger.info("Checking city data from the GraphQL response...")
main_logger.info("Checking city data from the GraphQL response...")
city_data = None
try:
for breadcrumb in data['data']['searchQueries']['search']['breadcrumbs']:
@@ -58,9 +58,9 @@ def check_city_data(data) -> str:
city_data = breadcrumb['name']
break
except KeyError:
logger.error('KeyError: City not found')
main_logger.error('KeyError: City not found')
except IndexError:
logger.error('IndexError: City not found')
main_logger.error('IndexError: City not found')
return city_data


@@ -70,19 +70,20 @@ def check_hotel_filter_data(data) -> bool:
:param data: GraphQL response as JSON.
:return: Hotel filter indicator.
"""
logger.info("Checking hotel filter data from the GraphQL response...")
main_logger.info("Checking hotel filter data from the GraphQL response...")

try:
for option in data['data']['searchQueries']['search']['appliedFilterOptions']:
logger.debug(f'Filter options: {option}')
script_logger.debug(f'Filter options: {option}')

if 'urlId' in option:
if option['urlId'] == "ht_id=204":
return True
except KeyError:
logger.error('KeyError: hotel_filter not found')
main_logger.error('KeyError: hotel_filter not found')
return False
except IndexError:
logger.error('IndexError: hotel_filter not found')
main_logger.error('IndexError: hotel_filter not found')
return False

return False
