Commit c061458 (tag v6.0.5)
- Adjusted loggers
sakan811 committed Aug 1, 2024
1 parent 09b8d8e commit c061458
Showing 10 changed files with 103 additions and 82 deletions.
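In short, the commit splits logging into two streams: user-facing progress and error messages now go through a shared main_logger defined in configure_logging.py, while each module keeps its own file-backed logger (renamed script_logger) for debug-level detail. A minimal sketch of the resulting pattern, assuming an illustrative module name and messages (not code from the repository):

# Illustrative only: what a module in this repository looks like after this commit.
from japan_avg_hotel_price_finder.configure_logging import configure_logging_with_file, main_logger

# Per-module logger, kept for verbose debug output written to its own log file.
script_logger = configure_logging_with_file(
    log_dir='logs', log_file='example_module.log', logger_name='example_module')


def do_work() -> None:
    main_logger.info('Starting work...')        # user-facing progress goes to the shared logger
    script_logger.debug('Internal state: ...')  # low-level detail stays in the module's own log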
8 changes: 4 additions & 4 deletions automated_scraper.py
@@ -4,10 +4,10 @@
import os
from dataclasses import dataclass

from japan_avg_hotel_price_finder.configure_logging import configure_logging_with_file
from japan_avg_hotel_price_finder.configure_logging import configure_logging_with_file, main_logger
from japan_avg_hotel_price_finder.whole_mth_graphql_scraper import WholeMonthGraphQLScraper

logger = configure_logging_with_file(log_dir='logs', log_file='automated_scraper.log', logger_name='automated_scraper')
script_logger = configure_logging_with_file(log_dir='logs', log_file='automated_scraper.log', logger_name='automated_scraper')

# Initialize argument parser
parser = argparse.ArgumentParser(description='Parser that control which kind of scraper to use.')
@@ -26,7 +26,7 @@ async def main(self):
try:
os.makedirs(path, exist_ok=True)
except OSError as e:
logger.error(f"Error creating directory '{path}': {e}")
main_logger.error(f"Error creating directory '{path}': {e}")

csv_file_name = f'{self.city}_hotel_data_{month_name}_{self.year}.csv'
csv_file_path = os.path.join(path, csv_file_name)
@@ -38,7 +38,7 @@ async def main(self):
scraper = AutomatedScraper()

if args.month:
logger.info(f'Setting month to scrape to {args.month} for {scraper.__class__.__name__}...')
main_logger.info(f'Setting month to scrape to {args.month} for {scraper.__class__.__name__}...')
scraper = AutomatedScraper(month=args.month)

asyncio.run(scraper.main())
3 changes: 3 additions & 0 deletions japan_avg_hotel_price_finder/configure_logging.py
@@ -56,3 +56,6 @@ def configure_logging_with_file(log_dir: str, log_file: str, logger_name: str =
logger.addHandler(stream_handler)

return logger


main_logger = configure_logging_with_file(log_dir='logs', log_file='main.log', logger_name='main', level="INFO")
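Only the tail of configure_logging_with_file (attaching the stream handler and returning the logger) is visible in this hunk. For context, a plausible reconstruction of the whole helper, assuming a standard file-plus-console setup; the body, format string, and default level below are assumptions rather than the repository's code, and only the final main_logger line is what the commit actually adds:

import logging
import os


def configure_logging_with_file(log_dir: str, log_file: str, logger_name: str = None,
                                level: str = 'DEBUG') -> logging.Logger:
    """Assumed sketch: create a logger that writes to log_dir/log_file and to the console."""
    os.makedirs(log_dir, exist_ok=True)

    logger = logging.getLogger(logger_name)
    logger.setLevel(getattr(logging, level))

    formatter = logging.Formatter('%(asctime)s | %(name)s | %(levelname)s | %(message)s')

    file_handler = logging.FileHandler(os.path.join(log_dir, log_file))
    file_handler.setFormatter(formatter)
    logger.addHandler(file_handler)

    stream_handler = logging.StreamHandler()
    stream_handler.setFormatter(formatter)
    logger.addHandler(stream_handler)

    return logger


# The line added by this commit: a shared INFO-level logger importable by every module.
main_logger = configure_logging_with_file(log_dir='logs', log_file='main.log', logger_name='main', level='INFO')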
42 changes: 24 additions & 18 deletions japan_avg_hotel_price_finder/graphql_scraper.py
@@ -4,15 +4,15 @@
import aiohttp
import pandas as pd

from japan_avg_hotel_price_finder.configure_logging import configure_logging_with_file
from japan_avg_hotel_price_finder.configure_logging import configure_logging_with_file, main_logger
from japan_avg_hotel_price_finder.graphql_scraper_func.graphql_data_extractor import extract_hotel_data
from japan_avg_hotel_price_finder.graphql_scraper_func.graphql_data_transformer import transform_data_in_df
from japan_avg_hotel_price_finder.graphql_scraper_func.graphql_request_func import get_header, fetch_hotel_data
from japan_avg_hotel_price_finder.graphql_scraper_func.graphql_utils_func import concat_df_list, check_city_data, \
check_currency_data, check_hotel_filter_data
from set_details import Details

logger = configure_logging_with_file(log_dir='logs', log_file='graphql_scraper.log', logger_name='graphql_scraper')
script_logger = configure_logging_with_file(log_dir='logs', log_file='graphql_scraper.log', logger_name='graphql_scraper')


@dataclass
@@ -25,35 +25,40 @@ async def scrape_graphql(self) -> pd.DataFrame:
Scrape hotel data from GraphQL endpoint using async.
:return: DataFrame containing hotel data from GraphQL endpoint
"""
logger.info("Start scraping data from GraphQL endpoint...")
logger.info(
main_logger.info("Start scraping data from GraphQL endpoint...")

script_logger.debug(
f"City: {self.city} | Check-in: {self.check_in} | Check-out: {self.check_out} | Currency: {self.selected_currency}")
logger.info(f"Adults: {self.group_adults} | Children: {self.group_children} | Rooms: {self.num_rooms}")
logger.info(f"Only hotel properties: {self.scrape_only_hotel}")
script_logger.debug(f"Adults: {self.group_adults} | Children: {self.group_children} | Rooms: {self.num_rooms}")
script_logger.debug(f"Only hotel properties: {self.scrape_only_hotel}")

if self.city and self.check_in and self.check_out and self.selected_currency:
url = f'https://www.booking.com/dml/graphql?selected_currency={self.selected_currency}'
headers = get_header()
graphql_query = self.get_graphql_query()

# get a response with Async
async with aiohttp.ClientSession() as session:
async with session.post(url, headers=headers, json=graphql_query) as response:
if response.status == 200:
data = await response.json()
total_page_num, hotel_data_dict = await self.check_info(data)
else:
logger.error(f"Error: {response.status}")
main_logger.error(f"Error: {response.status}")
return pd.DataFrame()

logger.debug(f"Total page number: {total_page_num}")
script_logger.debug(f"Total page number: {total_page_num}")

if total_page_num:
df_list = []
logger.info("Scraping data from GraphQL endpoint...")
main_logger.info("Scraping data from GraphQL endpoint...")

# fetch hotel data with Async
async with aiohttp.ClientSession() as session:
tasks = []
for offset in range(0, total_page_num, 100):
script_logger.debug(f'Fetch data from page-offset: {offset}')

graphql_query = self.get_graphql_query(page_offset=offset)
tasks.append(fetch_hotel_data(session, url, headers, graphql_query))

@@ -67,13 +72,13 @@ async def scrape_graphql(self) -> pd.DataFrame:
df = concat_df_list(df_list)
return transform_data_in_df(self.check_in, self.city, df)
else:
logger.warning("No hotel data was found. Return an empty DataFrame.")
main_logger.warning("No hotel data was found. Return an empty DataFrame.")
return pd.DataFrame()
else:
logger.warning("Total page number not found. Return an empty DataFrame.")
main_logger.warning("Total page number not found. Return an empty DataFrame.")
return pd.DataFrame()
else:
logger.warning("Error: city, check_in, check_out and selected_currency are required")
main_logger.warning("Error: city, check_in, check_out and selected_currency are required")
return pd.DataFrame()

def get_graphql_query(self, page_offset: int = 0) -> dict:
@@ -82,7 +87,7 @@ def get_graphql_query(self, page_offset: int = 0) -> dict:
:param page_offset: The offset for pagination, default is 0.
:return: Graphql query as a dictionary.
"""
logger.debug("Getting graphql query...")
script_logger.debug("Getting graphql query...")
if self.scrape_only_hotel:
selected_filter = {"selectedFilters": "ht_id=204"}
else:
@@ -463,11 +468,12 @@ async def check_info(self, data: dict) -> tuple:
:param data: Data from GraphQL response.
:return: Total page number and hotel data as a dictionary.
"""
main_logger.info('Checking whether entered data matches the data from GraphQL response...')
try:
total_page_num = data['data']['searchQueries']['search']['pagination']['nbResultsTotal']
except TypeError:
logger.error("TypeError: Total page number not found.")
logger.error("Return 0 as total page number")
main_logger.error("TypeError: Total page number not found.")
main_logger.error("Return 0 as total page number")
total_page_num = 0

if total_page_num:
@@ -490,11 +496,11 @@

for key, value in data_mapping.items():
entered_value = getattr(self, key, None)
logger.debug(f'Entered Value {key}: {entered_value}')
logger.debug(f'Response Value {key}: {value}')
script_logger.debug(f'Entered Value {key}: {entered_value}')
script_logger.debug(f'Response Value {key}: {value}')
if entered_value != value:
error_message = f"Error {key.replace('_', ' ').title()} not match: {entered_value} != {value}"
logger.error(error_message)
main_logger.error(error_message)
raise SystemExit(error_message)
else:
data_mapping = {
japan_avg_hotel_price_finder/graphql_scraper_func/graphql_data_transformer.py
@@ -2,9 +2,10 @@

import pandas as pd

from japan_avg_hotel_price_finder.configure_logging import configure_logging_with_file
from japan_avg_hotel_price_finder.configure_logging import configure_logging_with_file, main_logger

logger = configure_logging_with_file(log_dir='logs', log_file='graphql_data_transformer.log', logger_name='graphql_data_transformer')
script_logger = configure_logging_with_file(log_dir='logs', log_file='graphql_data_transformer.log',
logger_name='graphql_data_transformer')


def transform_data_in_df(check_in, city, dataframe) -> pd.DataFrame:
@@ -16,30 +17,30 @@ def transform_data_in_df(check_in, city, dataframe) -> pd.DataFrame:
:return: Pandas DataFrame.
"""
if not dataframe.empty:
logger.info("Add City column to DataFrame")
main_logger.info("Add City column to DataFrame")
dataframe['City'] = city
logger.info("Add Date column to DataFrame")
main_logger.info("Add Date column to DataFrame")
dataframe['Date'] = check_in
logger.info("Add AsOf column to DataFrame")
main_logger.info("Add AsOf column to DataFrame")
dataframe['AsOf'] = datetime.datetime.now()

logger.info("Remove duplicate rows from the DataFrame based on 'Hotel' column")
main_logger.info("Remove duplicate rows from the DataFrame based on 'Hotel' column")
df_filtered = dataframe.drop_duplicates(subset='Hotel').copy()

logger.info("Convert columns to numeric values")
main_logger.info("Convert columns to numeric values")
df_filtered.loc[:, 'Price'] = pd.to_numeric(df_filtered['Price'], errors='coerce')
df_filtered.loc[:, 'Review'] = pd.to_numeric(df_filtered['Review'], errors='coerce')

# Drop rows where any of the 'Hotel', 'Review', 'Price' columns are None or NaN
logger.info("Dropping rows where 'Hotel', 'Review', or 'Price' columns are None or NaN")
main_logger.info("Dropping rows where 'Hotel', 'Review', or 'Price' columns are None or NaN")
df_filtered = df_filtered.dropna(subset=['Hotel', 'Review', 'Price'])

logger.info("Dropping rows where 'Review', or 'Price' columns are 0")
main_logger.info("Dropping rows where 'Review', or 'Price' columns are 0")
df_filtered = df_filtered[(df_filtered['Price'] != 0) & (df_filtered['Review'] != 0)]

logger.info("Calculate the Price/Review ratio")
main_logger.info("Calculate the Price/Review ratio")
df_filtered.loc[:, 'Price/Review'] = df_filtered['Price'] / df_filtered['Review']
return df_filtered
else:
logger.warning("Dataframe is empty. No data was scraped.")
main_logger.warning("Dataframe is empty. No data was scraped.")
return dataframe
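To see what this transformation does end to end, a quick usage sketch; the sample rows and city below are invented for illustration, and only the column names and the import path follow the code in this commit:

import pandas as pd

from japan_avg_hotel_price_finder.graphql_scraper_func.graphql_data_transformer import transform_data_in_df

# Invented sample: a duplicated hotel, a zero-review entry, and a row with a missing price.
sample = pd.DataFrame({
    'Hotel': ['Hotel A', 'Hotel A', 'Hotel B', 'Hotel C'],
    'Price': [12000, 12000, 9500, None],
    'Review': [8.4, 8.4, 0, 7.9],
})

# One 'Hotel A' row survives; 'Hotel B' (Review == 0) and 'Hotel C' (missing Price) are dropped,
# and the City, Date, AsOf and Price/Review columns are added.
result = transform_data_in_df('2024-08-01', 'Osaka', sample)
print(result)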
japan_avg_hotel_price_finder/graphql_scraper_func/graphql_request_func.py
@@ -3,9 +3,9 @@
from aiohttp import ClientSession
from dotenv import load_dotenv

from japan_avg_hotel_price_finder.configure_logging import configure_logging_with_file
from japan_avg_hotel_price_finder.configure_logging import configure_logging_with_file, main_logger

logger = configure_logging_with_file(log_dir='logs', log_file='graphql_request_func.log', logger_name='graphql_request_func')
script_logger = configure_logging_with_file(log_dir='logs', log_file='graphql_request_func.log', logger_name='graphql_request_func')


# Load environment variables from .env file
@@ -17,7 +17,7 @@ def get_header() -> dict:
Return header.
:return: Header as a dictionary.
"""
logger.info("Getting header...")
main_logger.info("Getting header...")
return {
"User-Agent": os.getenv("USER_AGENT"),
}
@@ -38,11 +38,11 @@ async def fetch_hotel_data(session: ClientSession, url: str, headers: dict, grap
try:
return data['data']['searchQueries']['search']['results']
except (ValueError, KeyError) as e:
logger.error(f"Error extracting hotel data: {e}")
main_logger.error(f"Error extracting hotel data: {e}")
return []
except Exception as e:
logger.error(f"Unexpected error: {e}")
main_logger.error(f"Unexpected error: {e}")
return []
else:
logger.error(f"Error: {response.status}")
main_logger.error(f"Error: {response.status}")
return []
japan_avg_hotel_price_finder/graphql_scraper_func/graphql_utils_func.py
@@ -1,9 +1,9 @@
import pandas as pd

from japan_avg_hotel_price_finder.configure_logging import configure_logging_with_file
from japan_avg_hotel_price_finder.configure_logging import configure_logging_with_file, main_logger

logger = configure_logging_with_file(log_dir='logs', log_file='graphql_utils_func.log',
logger_name='graphql_utils_func')
script_logger = configure_logging_with_file(log_dir='logs', log_file='graphql_utils_func.log',
logger_name='graphql_utils_func')


def concat_df_list(df_list: list[pd.DataFrame]) -> pd.DataFrame:
@@ -12,12 +12,12 @@ def concat_df_list(df_list: list[pd.DataFrame]) -> pd.DataFrame:
:param df_list: A list of Pandas Dataframes.
:return: Pandas DataFrame.
"""
logger.info("Concatenate a list of Pandas Dataframes")
main_logger.info("Concatenate a list of Pandas Dataframes")
if df_list:
df_main = pd.concat(df_list)
return df_main
else:
logger.warning("No data was scraped.")
main_logger.warning("No data was scraped.")
return pd.DataFrame()


@@ -27,7 +27,7 @@ def check_currency_data(data) -> str:
:param data: GraphQL response as JSON.
:return: City name.
"""
logger.info("Checking currency data from the GraphQL response...")
main_logger.info("Checking currency data from the GraphQL response...")
selected_currency_data = None
try:
for result in data['data']['searchQueries']['search']['results']:
@@ -37,9 +37,9 @@ def check_currency_data(data) -> str:
selected_currency_data = block['finalPrice']['currency']
break
except KeyError:
logger.error('KeyError: Currency data not found')
main_logger.error('KeyError: Currency data not found')
except IndexError:
logger.error('IndexError: Currency data not found')
main_logger.error('IndexError: Currency data not found')
return selected_currency_data


@@ -49,7 +49,7 @@ def check_city_data(data) -> str:
:param data: GraphQL response as JSON.
:return: City name.
"""
logger.info("Checking city data from the GraphQL response...")
main_logger.info("Checking city data from the GraphQL response...")
city_data = None
try:
for breadcrumb in data['data']['searchQueries']['search']['breadcrumbs']:
@@ -58,9 +58,9 @@ def check_city_data(data) -> str:
city_data = breadcrumb['name']
break
except KeyError:
logger.error('KeyError: City not found')
main_logger.error('KeyError: City not found')
except IndexError:
logger.error('IndexError: City not found')
main_logger.error('IndexError: City not found')
return city_data


@@ -70,19 +70,20 @@ def check_hotel_filter_data(data) -> bool:
:param data: GraphQL response as JSON.
:return: Hotel filter indicator.
"""
logger.info("Checking hotel filter data from the GraphQL response...")
main_logger.info("Checking hotel filter data from the GraphQL response...")

try:
for option in data['data']['searchQueries']['search']['appliedFilterOptions']:
logger.debug(f'Filter options: {option}')
script_logger.debug(f'Filter options: {option}')

if 'urlId' in option:
if option['urlId'] == "ht_id=204":
return True
except KeyError:
logger.error('KeyError: hotel_filter not found')
main_logger.error('KeyError: hotel_filter not found')
return False
except IndexError:
logger.error('IndexError: hotel_filter not found')
main_logger.error('IndexError: hotel_filter not found')
return False

return False
