From 3339190ea3627148f9fc299e050cf463dbfec64f Mon Sep 17 00:00:00 2001 From: Sakan Nirattisaykul <94357278+sakan811@users.noreply.github.com> Date: Sun, 22 Dec 2024 22:30:57 +0700 Subject: [PATCH] Switch to median calculation for hotel price tables. Replaced average with median when calculating prices grouped by review score and day of the week. Added support for PostgreSQL (using SQL percentile_cont) and SQLite (manual median calculation). This ensures a more robust representation of price distribution. --- .../sql/save_to_db.py | 106 +++++++++++++----- 1 file changed, 78 insertions(+), 28 deletions(-) diff --git a/japan_avg_hotel_price_finder/sql/save_to_db.py b/japan_avg_hotel_price_finder/sql/save_to_db.py index f67fcf2..ff5237d 100644 --- a/japan_avg_hotel_price_finder/sql/save_to_db.py +++ b/japan_avg_hotel_price_finder/sql/save_to_db.py @@ -134,7 +134,8 @@ def create_avg_hotel_room_price_by_date_table(session: Session) -> None: def create_avg_room_price_by_review_table(session: Session) -> None: """ - Create AverageHotelRoomPriceByReview table using SQLAlchemy ORM. + Create AverageHotelRoomPriceByReview table using the median (instead of average). + Supports PostgreSQL and SQLite. :param session: SQLAlchemy session :return: None """ @@ -143,16 +144,45 @@ def create_avg_room_price_by_review_table(session: Session) -> None: # Clear existing data session.query(AverageHotelRoomPriceByReview).delete() - # Calculate average prices by review, rounding review to nearest integer - avg_prices = session.query( - func.round(HotelPrice.Review), - func.avg(HotelPrice.Price).label('AveragePrice') - ).group_by(func.round(HotelPrice.Review)).all() + # Detect database dialect + dialect = session.bind.dialect + + if isinstance(dialect, postgresql.dialect): + # PostgreSQL-specific median calculation using percentile_cont + median_subquery = session.query( + func.round(HotelPrice.Review).label("Review"), + func.percentile_cont(0.5).within_group(HotelPrice.Price).label("MedianPrice") + ).group_by(func.round(HotelPrice.Review)).subquery() + + median_data = session.query( + median_subquery.c.Review, + median_subquery.c.MedianPrice + ).all() + + elif isinstance(dialect, sqlite.dialect): + # SQLite: Calculate median manually using Python + grouped_data = session.query( + func.round(HotelPrice.Review).label("Review"), + HotelPrice.Price + ).order_by(func.round(HotelPrice.Review), HotelPrice.Price).all() + + # Organize data into groups by rounded Review + from collections import defaultdict + grouped_prices = defaultdict(list) + for review, price in grouped_data: + grouped_prices[review].append(price) + + # Calculate the median for each group + median_data = [ + (review, np.median(prices)) for review, prices in grouped_prices.items() + ] + else: + raise NotImplementedError("Median calculation is only implemented for PostgreSQL and SQLite.") # Create new records new_records = [ - AverageHotelRoomPriceByReview(Review=review, AveragePrice=avg_price) - for review, avg_price in avg_prices + AverageHotelRoomPriceByReview(Review=review, AveragePrice=median_price) + for review, median_price in median_data ] # Bulk insert new records @@ -162,7 +192,8 @@ def create_avg_room_price_by_review_table(session: Session) -> None: def create_avg_hotel_price_by_dow_table(session: Session) -> None: """ - Create AverageHotelRoomPriceByDayOfWeek table using SQLAlchemy ORM. + Create AverageHotelRoomPriceByDayOfWeek table using the median (instead of average). + Supports PostgreSQL and SQLite. :param session: SQLAlchemy session :return: None """ @@ -177,32 +208,51 @@ def create_avg_hotel_price_by_dow_table(session: Session) -> None: if isinstance(dialect, postgresql.dialect): # PostgreSQL specific date extraction dow_func = extract('dow', func.to_date(HotelPrice.Date, 'YYYY-MM-DD')) + + # Median calculation using percentile_cont + median_subquery = session.query( + dow_func.label("day_of_week"), + func.percentile_cont(0.5).within_group(HotelPrice.Price).label("MedianPrice") + ).group_by(dow_func).subquery() + + median_data = session.query( + median_subquery.c.day_of_week, + median_subquery.c.MedianPrice + ).all() + elif isinstance(dialect, sqlite.dialect): - # SQLite specific date extraction + # SQLite-specific date extraction dow_func = func.cast(func.strftime('%w', func.date(HotelPrice.Date)), Integer) + + # Retrieve grouped data + grouped_data = session.query( + dow_func.label("day_of_week"), + HotelPrice.Price + ).order_by(dow_func, HotelPrice.Price).all() + + # Organize data into Python groups by day_of_week + from collections import defaultdict + grouped_prices = defaultdict(list) + for dow, price in grouped_data: + grouped_prices[dow].append(price) + + # Calculate the median for each day_of_week + median_data = [ + (dow, np.median(prices)) for dow, prices in grouped_prices.items() + ] else: - raise NotImplementedError(f"Unsupported dialect: {dialect}") + raise NotImplementedError(f"Median calculation is only implemented for PostgreSQL and SQLite.") - # Calculate average prices by day of week - day_of_week_case = case( - (dow_func == 0, 'Sunday'), - (dow_func == 1, 'Monday'), - (dow_func == 2, 'Tuesday'), - (dow_func == 3, 'Wednesday'), - (dow_func == 4, 'Thursday'), - (dow_func == 5, 'Friday'), - (dow_func == 6, 'Saturday'), - ).label('day_of_week') - - avg_prices = session.query( - day_of_week_case, - func.avg(HotelPrice.Price).label('avg_price') - ).group_by(day_of_week_case).all() + # Map numeric days to readable names + dow_mapping = { + 0: 'Sunday', 1: 'Monday', 2: 'Tuesday', 3: 'Wednesday', + 4: 'Thursday', 5: 'Friday', 6: 'Saturday' + } # Create new records new_records = [ - AverageHotelRoomPriceByDayOfWeek(DayOfWeek=day_of_week, AveragePrice=avg_price) - for day_of_week, avg_price in avg_prices + AverageHotelRoomPriceByDayOfWeek(DayOfWeek=dow_mapping[dow], AveragePrice=median_price) + for dow, median_price in median_data ] # Bulk insert new records