Skip to content

Commit

Permalink
Merge pull request #60 from sakan811/patch
Browse files Browse the repository at this point in the history
Replaced SQLite with Postgres
  • Loading branch information
sakan811 authored Nov 1, 2024
2 parents 81f25c8 + e45ef98 commit a362315
Show file tree
Hide file tree
Showing 25 changed files with 615 additions and 449 deletions.
7 changes: 7 additions & 0 deletions .env.example
Original file line number Diff line number Diff line change
Expand Up @@ -9,3 +9,10 @@ X_BOOKING_TOPIC=

# User Agent
USER_AGENT=

# Database
POSTGRES_USER=postgres
POSTGRES_PASSWORD=postgres
POSTGRES_HOST=localhost
POSTGRES_PORT=5500
POSTGRES_DB=postgres
26 changes: 14 additions & 12 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,6 +67,13 @@ Built on top of [Find the Hotel's Average Room Price in Osaka](#find-the-hotels-
- Create a virtual environment and activate it.
- Install all dependencies listed in [requirements.txt](requirements.txt)
- Rename a `.env.example` to `.env`

### Setup a Database
- Download [Docker Desktop](https://www.docker.com/products/docker-desktop)
- Ensure that Docker Desktop is running.
- Run: `export POSTGRES_DATA_PATH='<your_container_volume_path>'` to set the container volume
to the directory path of your choice.
- Run: `docker compose up -d`

### Find your **User Agent**:
- Go to https://www.whatismybrowser.com/detect/what-is-my-user-agent/
Expand Down Expand Up @@ -130,41 +137,36 @@ Built on top of [Find the Hotel's Average Room Price in Osaka](#find-the-hotels-
### General Guidelines for Using the Scraper
- To scrape only hotel properties, use `--scrape_only_hotel` argument.
- The database tables are created automatically in Postgres if they don't exist.
- Ensure that Docker Desktop and the Postgres container are running.
### To scrape using Whole-Month GraphQL Scraper:
- Example usage, with only required arguments for Whole-Month GraphQL Scraper:
```bash
python main.py --whole_mth --year=2024 --month=12 --city=Osaka \
--sqlite_name=avg_japan_hotel_price_test.db
python main.py --whole_mth --year=2024 --month=12 --city=Osaka
```
- Scrapes data from the given day of the month to the end of the same month.
- Default **start day** is 1.
- **Start day** can be set with `--start_day` argument.
- Data is saved to **Postgres**.

### To scrape using Basic GraphQL Scraper:
- Example usage, with only required arguments for Basic GraphQL Scraper:
```bash
python main.py --city=Osaka --check_in=2024-12-25 --check_out=2024-12-26 --scraper \
--sqlite_name=avg_japan_hotel_price_test.db
python main.py --city=Osaka --check_in=2024-12-25 --check_out=2024-12-26 --scraper
```
- Data is saved to **Postgres**.

### To scrape using Japan GraphQL Scraper:
- Example usage, with only required arguments for Japan GraphQL Scraper:
```bash
python main.py --japan_hotel --sqlite_name=japan_hotel_data_test.db
python main.py --japan_hotel
```
- Data is saved to **Postgres**.
- Prefecture to scrape can be specified with `--prefecture` argument, for example:
- ```bash
python main.py --japan_hotel --prefecture Tokyo --sqlite_name=japan_hotel_data_test.db
python main.py --japan_hotel --prefecture Tokyo
```
- If `--prefecture` argument is not specified, all prefectures will be scraped.
- Multiple prefectures can be specified.
- ```bash
python main.py --japan_hotel --prefecture Tokyo Osaka --sqlite_name=japan_hotel_data_test.db
python main.py --japan_hotel --prefecture Tokyo Osaka
```
- You can use the prefecture name on Booking.com as a reference.

Expand Down
46 changes: 26 additions & 20 deletions check_missing_dates.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,11 +2,13 @@
import asyncio
import calendar
import datetime
import os
from calendar import monthrange
from dataclasses import dataclass, field
from typing import Any

from sqlalchemy import create_engine, func
from dotenv import load_dotenv
from sqlalchemy import create_engine, func, Engine
from sqlalchemy.orm import sessionmaker

from japan_avg_hotel_price_finder.booking_details import BookingDetails
Expand All @@ -16,6 +18,14 @@
from japan_avg_hotel_price_finder.sql.db_model import HotelPrice
from japan_avg_hotel_price_finder.sql.save_to_db import save_scraped_data

load_dotenv(dotenv_path='.env')

postgres_host = os.getenv('POSTGRES_HOST')
postgres_port = os.getenv('POSTGRES_PORT')
postgres_user = os.getenv('POSTGRES_USER')
postgres_password = os.getenv('POSTGRES_PASSWORD')
postgres_db = os.getenv('POSTGRES_DB')


def find_missing_dates(dates_in_db: set[str],
days_in_month: int,
Expand Down Expand Up @@ -81,12 +91,14 @@ def filter_past_date(dates_in_db_date_obj: list[datetime.date], today: datetime.

async def scrape_missing_dates(missing_dates_list: list[str] = None,
booking_details_class: 'BookingDetails' = None,
country: str = 'Japan') -> None:
country: str = 'Japan',
engine: Engine = None) -> None:
"""
Scrape missing dates with BasicScraper and load them into a database.
:param missing_dates_list: Missing dates.
:param booking_details_class: Dataclass of booking details as parameters, default is None.
:param country: Country where the hotels are located, default is Japan.
:param engine: SQLAlchemy engine.
:return: None
"""
main_logger.info("Scraping missing dates...")
Expand All @@ -106,15 +118,14 @@ async def scrape_missing_dates(missing_dates_list: list[str] = None,
num_rooms = booking_details_class.num_rooms
selected_currency = booking_details_class.selected_currency
scrape_only_hotel = booking_details_class.scrape_only_hotel
sqlite_name = booking_details_class.sqlite_name

scraper = BasicGraphQLScraper(check_in=check_in, check_out=check_out, city=city, group_adults=group_adults,
group_children=group_children, num_rooms=num_rooms,
selected_currency=selected_currency,
scrape_only_hotel=scrape_only_hotel, sqlite_name=sqlite_name, country=country)
scrape_only_hotel=scrape_only_hotel, country=country)
df = await scraper.scrape_graphql()

save_scraped_data(dataframe=df, db=scraper.sqlite_name)
save_scraped_data(dataframe=df, engine=engine)
else:
main_logger.warning("Missing dates is None. No missing dates to scrape.")

Expand All @@ -126,18 +137,15 @@ class MissingDateChecker:
It only checks the data scraped today, UTC Time.
Attributes:
sqlite_name (str): Path to SQLite database.
city (str): City where the hotels are located.
"""
sqlite_name: str
city: str

# sqlalchemy
engine: Any = field(init=False)
engine: Any = field(init=True)
Session: Any = field(init=False)

def __post_init__(self):
self.engine = create_engine(f'sqlite:///{self.sqlite_name}')
self.Session = sessionmaker(bind=self.engine)

def find_missing_dates_in_db(self, year: int) -> list[str]:
Expand All @@ -146,7 +154,7 @@ def find_missing_dates_in_db(self, year: int) -> list[str]:
:param year: Year of the dates to check whether they are missing.
:return: List of missing dates.
"""
main_logger.info(f"Checking if all dates were scraped in {self.sqlite_name}...")
main_logger.info(f"Checking if all dates were scraped in a database...")
missing_date_list: list[str] = []

session = self.Session()
Expand All @@ -167,8 +175,7 @@ def find_missing_dates_in_db(self, year: int) -> list[str]:

if not count_of_date_by_mth_as_of_today:
today = datetime.datetime.now(datetime.timezone.utc).date()
main_logger.warning(f"No scraped data for today, {today}, UTC time for city {self.city} in"
f" {self.sqlite_name}.")
main_logger.warning(f"No scraped data for today, {today}, UTC time for city {self.city} in a database")
return missing_date_list

today = datetime.datetime.today()
Expand Down Expand Up @@ -255,8 +262,6 @@ def parse_arguments() -> argparse.Namespace:
:return: argparse.Namespace
"""
parser = argparse.ArgumentParser(description='Parser which controls Missing Date Checker.')
parser.add_argument('--sqlite_name', type=str, default='avg_japan_hotel_price_test.db',
help='SQLite database path, default is "avg_japan_hotel_price_test.db"')
parser.add_argument('--city', type=str, help='City where the hotels are located', required=True)
parser.add_argument('--group_adults', type=int, default=1, help='Number of Adults, default is 1')
parser.add_argument('--num_rooms', type=int, default=1, help='Number of Rooms, default is 1')
Expand All @@ -274,11 +279,12 @@ def parse_arguments() -> argparse.Namespace:
args = parse_arguments()

booking_details = BookingDetails(city=args.city, group_adults=args.group_adults,
num_rooms=args.num_rooms, group_children=args.group_children,
selected_currency=args.selected_currency,
scrape_only_hotel=args.scrape_only_hotel, sqlite_name=args.sqlite_name)
num_rooms=args.num_rooms, group_children=args.group_children,
selected_currency=args.selected_currency,
scrape_only_hotel=args.scrape_only_hotel)

db_path: str = args.sqlite_name
missing_date_checker = MissingDateChecker(sqlite_name=db_path, city=args.city)
postgres_url = f"postgresql://{postgres_user}:{postgres_password}@{postgres_host}:{postgres_port}/{postgres_db}"
engine = create_engine(postgres_url)
missing_date_checker = MissingDateChecker(engine=engine, city=args.city)
missing_dates: list[str] = missing_date_checker.find_missing_dates_in_db(year=args.year)
asyncio.run(scrape_missing_dates(missing_dates, booking_details_class=booking_details))
asyncio.run(scrape_missing_dates(missing_dates, booking_details_class=booking_details, engine=engine))
10 changes: 6 additions & 4 deletions docker-compose.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@ services:
container_name: jp_scraper
hostname: jp_scraper
environment:
POSTGRES_USER: postgres
POSTGRES_PASSWORD: postgres
POSTGRES_DB: postgres
- POSTGRES_USER=postgres
- POSTGRES_PASSWORD=postgres
- POSTGRES_HOST=localhost
- POSTGRES_PORT=5500
- POSTGRES_DB=postgres
volumes:
- ./postgres_data:/var/lib/postgresql/data
- ${POSTGRES_DATA_PATH}:/var/lib/postgresql/data
ports:
- "5500:5432"
networks:
Expand Down
4 changes: 0 additions & 4 deletions docs/SCRAPER_ARGS.md
Original file line number Diff line number Diff line change
Expand Up @@ -53,10 +53,6 @@
- **Type**: `bool`
- **Description**: If set to `True`, the scraper will only target hotel properties.

### `--sqlite_name`
- **Type**: `str`
- **Description**: The name of the SQLite database file to use. Only used for Basic and Whole-Month Scraper.

### `--year`
- **Type**: `int`
- **Description**: Specifies the year to scrape. This argument is required for Whole-Month Scraper.
Expand Down
4 changes: 1 addition & 3 deletions japan_avg_hotel_price_finder/booking_details.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,6 @@ class BookingDetails(BaseModel):
- group_children (int): Number of children.
- selected_currency (str): Room price currency.
- scrape_only_hotel (bool): Whether to scrape only hotel.
- sqlite_name (str): Path to SQLite database.
"""
city: str = ''
country: str = ''
Expand All @@ -25,5 +24,4 @@ class BookingDetails(BaseModel):
num_rooms: int = Field(1, gt=0)
group_children: int = Field(0, ge=0)
selected_currency: str = ''
scrape_only_hotel: bool = True
sqlite_name: str = ''
scrape_only_hotel: bool = True
4 changes: 0 additions & 4 deletions japan_avg_hotel_price_finder/graphql_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -38,11 +38,7 @@ class BasicGraphQLScraper(BaseModel):
group_children (str): Number of children, default is 0.
selected_currency (str): Currency of the room price, default is USD.
scrape_only_hotel (bool): Whether to scrape only the hotel property data, default is True
sqlite_name (str): Name of SQLite database to store the scraped data.
"""
# Set SQLite database name
sqlite_name: str

# Set booking details.
city: str
country: str
Expand Down
26 changes: 15 additions & 11 deletions japan_avg_hotel_price_finder/japan_hotel_scraper.py
Original file line number Diff line number Diff line change
Expand Up @@ -2,8 +2,8 @@
from typing import Any

import pandas as pd
from pydantic import Field
from sqlalchemy import create_engine
from pydantic import Field, ConfigDict
from sqlalchemy import Engine
from sqlalchemy.orm import sessionmaker

from japan_avg_hotel_price_finder.configure_logging import main_logger
Expand Down Expand Up @@ -32,8 +32,11 @@ class JapanScraper(WholeMonthGraphQLScraper):
region (str): The current region being scraped.
start_month (int): Month to start scraping (1-12).
end_month (int): Last month to scrape (1-12).
sqlite_name (str): Path and name of SQLite database to store the scraped data.
engine (Engine): SQLAlchemy engine.
"""
engine: Engine

model_config = ConfigDict(arbitrary_types_allowed=True)

japan_regions: dict[str, list[str]] = {
"Hokkaido": ["Hokkaido"],
Expand Down Expand Up @@ -114,27 +117,28 @@ async def _scrape_whole_year(self) -> None:
df = await self.scrape_whole_month()
if not df.empty:
df['Region'] = self.region
self._load_to_sqlite(df)
self._load_to_database(df)
else:
main_logger.warning(f"No data found for {self.city} for {calendar.month_name[self.month]} {self.year}")

def _load_to_sqlite(self, prefecture_hotel_data: pd.DataFrame) -> None:
def _load_to_database(self, prefecture_hotel_data: pd.DataFrame) -> None:
"""
Load hotel data of all Japan Prefectures to SQLite using SQLAlchemy ORM
Load hotel data of all Japan Prefectures to a database using SQLAlchemy ORM
:param prefecture_hotel_data: DataFrame with the whole-year hotel data of the given prefecture.
:return: None
"""
main_logger.info(f"Loading hotel data to SQLite {self.sqlite_name}...")
main_logger.info(f"Loading hotel data to database...")

# Rename 'City' column to 'Prefecture'
prefecture_hotel_data = prefecture_hotel_data.rename(columns={'City': 'Prefecture'})

# Rename Price/Review column
prefecture_hotel_data.rename(columns={'Price/Review': 'PriceReview'}, inplace=True)

engine = create_engine(f'sqlite:///{self.sqlite_name}')
Base.metadata.tables['JapanHotels'].create(engine, checkfirst=True)
Session = sessionmaker(bind=engine)
# Create all tables
Base.metadata.create_all(self.engine)

Session = sessionmaker(bind=self.engine)
session = Session()

try:
Expand All @@ -148,7 +152,7 @@ def _load_to_sqlite(self, prefecture_hotel_data: pd.DataFrame) -> None:
session.bulk_save_objects(hotel_prices)

session.commit()
main_logger.info(f"Hotel data for {self.city} loaded to SQLite successfully.")
main_logger.info(f"Hotel data for {self.city} loaded to a database successfully.")
except Exception as e:
session.rollback()
main_logger.error(f"An error occurred while saving data: {str(e)}")
Expand Down
11 changes: 0 additions & 11 deletions japan_avg_hotel_price_finder/main_argparse.py
Original file line number Diff line number Diff line change
Expand Up @@ -32,16 +32,6 @@ def add_booking_details_arguments(parser: argparse.ArgumentParser) -> None:
parser.add_argument('--scrape_only_hotel', action='store_true', help='Whether to scrape only hotel properties')


def add_database_arguments(parser: argparse.ArgumentParser) -> None:
"""
Add database-related arguments to the parser.
:param parser: argparse.ArgumentParser
:return: None
"""
db_group = parser.add_mutually_exclusive_group(required=True)
db_group.add_argument('--sqlite_name', type=str, help='SQLite database path')


def add_date_arguments(parser: argparse.ArgumentParser) -> None:
"""
Add date and length of stay arguments to the parser.
Expand Down Expand Up @@ -101,7 +91,6 @@ def parse_arguments() -> argparse.Namespace:
parser = argparse.ArgumentParser(description='Parser that controls which kind of scraper to use.')
add_scraper_arguments(parser)
add_booking_details_arguments(parser)
add_database_arguments(parser)
add_date_arguments(parser)
add_japan_arguments(parser)
args = parser.parse_args()
Expand Down
Loading

0 comments on commit a362315

Please sign in to comment.