Skip to content

Commit

Permalink
v4.4.1
Browse files Browse the repository at this point in the history
- Added IndexError handler when finding currency data from GraphQL response JSON
  • Loading branch information
sakan811 committed Jun 17, 2024
1 parent a23d136 commit 5a0a08a
Showing 1 changed file with 26 additions and 22 deletions.
48 changes: 26 additions & 22 deletions japan_avg_hotel_price_finder/graphql_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,7 +18,7 @@ def get_header() -> dict:
return {
"Content-Type": "application/json",
"Accept": "*/*",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64; rv:91.0) Gecko/20100101 Firefox/91.0",
"User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/125.0.0.0 Safari/537.36",
"X-Booking-Csrf-Token": "eyJhbGciOiJIUzUxMiJ9.eyJpc3MiOiJjb250ZXh0LWVucmljaG1lbnQtYXBpIiwic3ViIjoiY3NyZi10b2tlbiIsImlhdCI6MTcxODYyNTQ2NywiZXhwIjoxNzE4NzExODY3fQ.CAooc42_J_78rrJe-e2kgLUyRvd_JBzuo2G8MP2V6veKwH5-TfikWdjTK2yBOfEe_Xl_IxDMFZOl-Q3qgvWp5A",
"X-Booking-Context-Action-Name": "searchresults_irene",
"X-Booking-Context-Aid": "304142"
Expand Down Expand Up @@ -520,27 +520,31 @@ def transform_data_in_df(check_in, city, dataframe) -> pd.DataFrame:
:param dataframe: Pandas DataFrame to be transformed.
:return: Pandas DataFrame.
"""
logger.info("Add City column to DataFrame")
dataframe['City'] = city
logger.info("Add Date column to DataFrame")
dataframe['Date'] = check_in
logger.info("Add AsOf column to DataFrame")
dataframe['AsOf'] = datetime.datetime.now()

logger.info("Remove duplicate rows from the DataFrame based on 'Hotel' column")
df_filtered = dataframe.drop_duplicates(subset='Hotel').copy()

# Drop rows where any of the 'Hotel', 'Review', 'Price' columns are None or NaN
logger.info("Dropping rows where 'Hotel', 'Review', or 'Price' columns are None or NaN")
df_filtered = df_filtered.dropna(subset=['Hotel', 'Review', 'Price'])

logger.info("Convert columns to numeric values")
df_filtered.loc[:, 'Price'] = pd.to_numeric(df_filtered['Price'], errors='coerce')
df_filtered.loc[:, 'Review'] = pd.to_numeric(df_filtered['Review'], errors='coerce')

logger.info("Calculate the Price/Review ratio")
df_filtered.loc[:, 'Price/Review'] = df_filtered['Price'] / df_filtered['Review']
return df_filtered
if not dataframe.empty:
logger.info("Add City column to DataFrame")
dataframe['City'] = city
logger.info("Add Date column to DataFrame")
dataframe['Date'] = check_in
logger.info("Add AsOf column to DataFrame")
dataframe['AsOf'] = datetime.datetime.now()

logger.info("Remove duplicate rows from the DataFrame based on 'Hotel' column")
df_filtered = dataframe.drop_duplicates(subset='Hotel').copy()

# Drop rows where any of the 'Hotel', 'Review', 'Price' columns are None or NaN
logger.info("Dropping rows where 'Hotel', 'Review', or 'Price' columns are None or NaN")
df_filtered = df_filtered.dropna(subset=['Hotel', 'Review', 'Price'])

logger.info("Convert columns to numeric values")
df_filtered.loc[:, 'Price'] = pd.to_numeric(df_filtered['Price'], errors='coerce')
df_filtered.loc[:, 'Review'] = pd.to_numeric(df_filtered['Review'], errors='coerce')

logger.info("Calculate the Price/Review ratio")
df_filtered.loc[:, 'Price/Review'] = df_filtered['Price'] / df_filtered['Review']
return df_filtered
else:
logger.warning("Dataframe is empty. No data was scraped.")
return dataframe


def extract_hotel_data(df_list: list, hotel_data_list: list) -> None:
Expand Down

0 comments on commit 5a0a08a

Please sign in to comment.