Skip to content

Commit

Permalink
Switch to median calculation for hotel price tables.
Browse files Browse the repository at this point in the history
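Replaced the average with the median when calculating prices grouped by review score and by day of the week. Added support for PostgreSQL (using the SQL percentile_cont ordered-set aggregate with WITHIN GROUP) and SQLite (the rows are fetched and the median is computed in Python with np.median, since this code path does not rely on a SQLite median function). Any other database dialect raises NotImplementedError. Because the median is less sensitive to outlier prices than the mean, this gives a more robust representation of the price distribution.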
  • Loading branch information
sakan811 committed Dec 22, 2024
1 parent 2a7722d commit 3339190
Showing 1 changed file with 78 additions and 28 deletions.
106 changes: 78 additions & 28 deletions japan_avg_hotel_price_finder/sql/save_to_db.py
Original file line number Diff line number Diff line change
Expand Up @@ -134,7 +134,8 @@ def create_avg_hotel_room_price_by_date_table(session: Session) -> None:

def create_avg_room_price_by_review_table(session: Session) -> None:
"""
Create AverageHotelRoomPriceByReview table using SQLAlchemy ORM.
Create AverageHotelRoomPriceByReview table using the median (instead of average).
Supports PostgreSQL and SQLite.
:param session: SQLAlchemy session
:return: None
"""
Expand All @@ -143,16 +144,45 @@ def create_avg_room_price_by_review_table(session: Session) -> None:
# Clear existing data
session.query(AverageHotelRoomPriceByReview).delete()

# Calculate average prices by review, rounding review to nearest integer
avg_prices = session.query(
func.round(HotelPrice.Review),
func.avg(HotelPrice.Price).label('AveragePrice')
).group_by(func.round(HotelPrice.Review)).all()
# Detect database dialect
dialect = session.bind.dialect

if isinstance(dialect, postgresql.dialect):
# PostgreSQL-specific median calculation using percentile_cont
median_subquery = session.query(
func.round(HotelPrice.Review).label("Review"),
func.percentile_cont(0.5).within_group(HotelPrice.Price).label("MedianPrice")
).group_by(func.round(HotelPrice.Review)).subquery()

median_data = session.query(
median_subquery.c.Review,
median_subquery.c.MedianPrice
).all()

elif isinstance(dialect, sqlite.dialect):
# SQLite: Calculate median manually using Python
grouped_data = session.query(
func.round(HotelPrice.Review).label("Review"),
HotelPrice.Price
).order_by(func.round(HotelPrice.Review), HotelPrice.Price).all()

# Organize data into groups by rounded Review
from collections import defaultdict
grouped_prices = defaultdict(list)
for review, price in grouped_data:
grouped_prices[review].append(price)

# Calculate the median for each group
median_data = [
(review, np.median(prices)) for review, prices in grouped_prices.items()
]
else:
raise NotImplementedError("Median calculation is only implemented for PostgreSQL and SQLite.")

# Create new records
new_records = [
AverageHotelRoomPriceByReview(Review=review, AveragePrice=avg_price)
for review, avg_price in avg_prices
AverageHotelRoomPriceByReview(Review=review, AveragePrice=median_price)
for review, median_price in median_data
]

# Bulk insert new records
Expand All @@ -162,7 +192,8 @@ def create_avg_room_price_by_review_table(session: Session) -> None:

def create_avg_hotel_price_by_dow_table(session: Session) -> None:
"""
Create AverageHotelRoomPriceByDayOfWeek table using SQLAlchemy ORM.
Create AverageHotelRoomPriceByDayOfWeek table using the median (instead of average).
Supports PostgreSQL and SQLite.
:param session: SQLAlchemy session
:return: None
"""
Expand All @@ -177,32 +208,51 @@ def create_avg_hotel_price_by_dow_table(session: Session) -> None:
if isinstance(dialect, postgresql.dialect):
# PostgreSQL specific date extraction
dow_func = extract('dow', func.to_date(HotelPrice.Date, 'YYYY-MM-DD'))

# Median calculation using percentile_cont
median_subquery = session.query(
dow_func.label("day_of_week"),
func.percentile_cont(0.5).within_group(HotelPrice.Price).label("MedianPrice")
).group_by(dow_func).subquery()

median_data = session.query(
median_subquery.c.day_of_week,
median_subquery.c.MedianPrice
).all()

elif isinstance(dialect, sqlite.dialect):
# SQLite specific date extraction
# SQLite-specific date extraction
dow_func = func.cast(func.strftime('%w', func.date(HotelPrice.Date)), Integer)

# Retrieve grouped data
grouped_data = session.query(
dow_func.label("day_of_week"),
HotelPrice.Price
).order_by(dow_func, HotelPrice.Price).all()

# Organize data into Python groups by day_of_week
from collections import defaultdict
grouped_prices = defaultdict(list)
for dow, price in grouped_data:
grouped_prices[dow].append(price)

# Calculate the median for each day_of_week
median_data = [
(dow, np.median(prices)) for dow, prices in grouped_prices.items()
]
else:
raise NotImplementedError(f"Unsupported dialect: {dialect}")
raise NotImplementedError(f"Median calculation is only implemented for PostgreSQL and SQLite.")

# Calculate average prices by day of week
day_of_week_case = case(
(dow_func == 0, 'Sunday'),
(dow_func == 1, 'Monday'),
(dow_func == 2, 'Tuesday'),
(dow_func == 3, 'Wednesday'),
(dow_func == 4, 'Thursday'),
(dow_func == 5, 'Friday'),
(dow_func == 6, 'Saturday'),
).label('day_of_week')

avg_prices = session.query(
day_of_week_case,
func.avg(HotelPrice.Price).label('avg_price')
).group_by(day_of_week_case).all()
# Map numeric days to readable names
dow_mapping = {
0: 'Sunday', 1: 'Monday', 2: 'Tuesday', 3: 'Wednesday',
4: 'Thursday', 5: 'Friday', 6: 'Saturday'
}

# Create new records
new_records = [
AverageHotelRoomPriceByDayOfWeek(DayOfWeek=day_of_week, AveragePrice=avg_price)
for day_of_week, avg_price in avg_prices
AverageHotelRoomPriceByDayOfWeek(DayOfWeek=dow_mapping[dow], AveragePrice=median_price)
for dow, median_price in median_data
]

# Bulk insert new records
Expand Down

0 comments on commit 3339190

Please sign in to comment.