Skip to content

Commit

Permalink
v5.4.0
Browse files Browse the repository at this point in the history
- Use async when doing requests
- Added more tests
  • Loading branch information
sakan811 committed Jul 8, 2024
1 parent 934e1cb commit decd5aa
Show file tree
Hide file tree
Showing 4 changed files with 35 additions and 24 deletions.
37 changes: 22 additions & 15 deletions check_missing_dates.py
Original file line number Diff line number Diff line change
@@ -1,23 +1,30 @@
import argparse
import asyncio

from japan_avg_hotel_price_finder.configure_logging import configure_logging_with_file
from japan_avg_hotel_price_finder.utils import check_in_db_if_all_date_was_scraped, \
check_in_csv_dir_if_all_date_was_scraped

logger = configure_logging_with_file('jp_hotel_data.log', 'jp_hotel_data')

parser = argparse.ArgumentParser(description='Parser that control which kind of missing dates checkers to use.')
parser.add_argument('--check_db', type=str, default=False, help='Check missing dates in database')
parser.add_argument('--check_csv', type=str, default=False, help='Check missing dates in CSV file directory')

args = parser.parse_args()

if args.check_db:
db = args.check_db
check_in_db_if_all_date_was_scraped(db=db, to_sqlite=True)
elif args.check_csv:
directory = args.check_csv
directory = str(directory)
check_in_csv_dir_if_all_date_was_scraped(directory)
else:
logger.warning('Please use --check_db or --check_csv')

async def check_missing_dates_main(argv=None):
    """
    Parse the command-line options and run the selected missing-dates checker.

    When both --check_db and --check_csv are supplied, --check_db takes
    precedence. When neither is supplied, a warning is logged and nothing runs.

    :param argv: Optional list of argument strings to parse.
                When None (the default), argparse reads sys.argv[1:].
    :return: None
    """
    parser = argparse.ArgumentParser(description='Parser that control which kind of missing dates checkers to use.')
    # Both options carry a path string, so "not given" is None rather than False.
    parser.add_argument('--check_db', type=str, default=None, help='Check missing dates in database')
    parser.add_argument('--check_csv', type=str, default=None, help='Check missing dates in CSV file directory')

    args = parser.parse_args(argv)

    if args.check_db:
        await check_in_db_if_all_date_was_scraped(db=args.check_db, to_sqlite=True)
    elif args.check_csv:
        # argparse already converted the value to str; no extra cast is needed.
        await check_in_csv_dir_if_all_date_was_scraped(args.check_csv)
    else:
        logger.warning('Please use --check_db or --check_csv')


if __name__ == '__main__':
    asyncio.run(check_missing_dates_main())
2 changes: 2 additions & 0 deletions japan_avg_hotel_price_finder/migrate_to_sqlite.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,6 +54,8 @@ def migrate_data_to_sqlite(df_filtered: pd.DataFrame, details: Details) -> None:
# Save the DataFrame to a table named 'HotelPrice'
df_filtered.to_sql('HotelPrice', con=con, if_exists='append', index=False, dtype=hotel_price_dtype)

con.commit()

logger.info(f'Data has been saved to {db}')

create_average_room_price_by_date_view(db)
Expand Down
15 changes: 8 additions & 7 deletions japan_avg_hotel_price_finder/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -93,7 +93,7 @@ def find_missing_dates_in_db(sqlite_db: str) -> list:
return missing_dates


def check_in_db_if_all_date_was_scraped(db: str, to_sqlite: bool = False) -> None:
async def check_in_db_if_all_date_was_scraped(db: str, to_sqlite: bool = False) -> None:
"""
Check inside the SQLite database if all dates of each month were scraped today.
:param db: Path to the SQLite database.
Expand All @@ -103,10 +103,10 @@ def check_in_db_if_all_date_was_scraped(db: str, to_sqlite: bool = False) -> Non
"""
logger.info(f"Checking in the SQLite database '{db}' if any date was not scraped today...")
missing_dates = find_missing_dates_in_db(db)
scrape_missing_dates(db=db, missing_dates=missing_dates, to_sqlite=to_sqlite)
await scrape_missing_dates(db=db, missing_dates=missing_dates, to_sqlite=to_sqlite)


def check_in_csv_dir_if_all_date_was_scraped(directory: str = 'scraped_hotel_data_csv') -> None:
async def check_in_csv_dir_if_all_date_was_scraped(directory: str = 'scraped_hotel_data_csv') -> None:
"""
Check inside the CSV files directory if all dates of each month were scraped today.
:param directory: Path to the CSV files directory.
Expand All @@ -125,7 +125,7 @@ def check_in_csv_dir_if_all_date_was_scraped(directory: str = 'scraped_hotel_dat
with sqlite3.connect(temp_db) as con:
df.to_sql('HotelPrice', con, if_exists='replace', index=False)

check_in_db_if_all_date_was_scraped(temp_db)
await check_in_db_if_all_date_was_scraped(temp_db)
else:
logger.warning("No CSV files were found")
except FileNotFoundError as e:
Expand All @@ -145,7 +145,7 @@ def check_in_csv_dir_if_all_date_was_scraped(directory: str = 'scraped_hotel_dat
logger.info("Truncate the HotelPrice table in the temporary database.")
with sqlite3.connect(temp_db) as con:
con.execute("DELETE FROM HotelPrice")
logger.warning("Please delete the temporary database manually after the web-scraping process finishes.")
logger.warning("Please delete the temporary database manually.")


def get_count_of_date_by_mth_asof_today_query():
Expand Down Expand Up @@ -259,7 +259,7 @@ def find_dates_of_the_month_in_db(db: str, days_in_month, month, year) -> tuple:
return dates_in_db, end_date, start_date


def scrape_missing_dates(db: str = None, missing_dates: list = None, to_sqlite: bool = False):
async def scrape_missing_dates(db: str = None, missing_dates: list = None, to_sqlite: bool = False):
"""
Scrape missing dates with BasicScraper.
:param db: SQLite database path.
Expand All @@ -268,9 +268,10 @@ def scrape_missing_dates(db: str = None, missing_dates: list = None, to_sqlite:
Set to False as default.
:return: None
"""
logger.info("Scraping missing dates...")
if missing_dates:
for date in missing_dates:
scrape_with_basic_scraper(db, date, to_sqlite)
await scrape_with_basic_scraper(db, date, to_sqlite)
else:
logger.warning(f"Missing dates is None. No missing dates to scrape.")

Expand Down
5 changes: 3 additions & 2 deletions tests/test_utils/test_utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,8 @@
scrape_missing_dates


def test_scrape_missing_dates() -> None:
@pytest.mark.asyncio
async def test_scrape_missing_dates() -> None:
db = 'test_scrape_missing_dates.db'

today = datetime.datetime.today()
Expand All @@ -26,7 +27,7 @@ def test_scrape_missing_dates() -> None:
second_missing_date = f'{year}-{month_str}-11'
third_missing_date = f'{year}-{month_str}-20'
missing_dates = [first_missing_date, second_missing_date, third_missing_date]
scrape_missing_dates(db=db, missing_dates=missing_dates, to_sqlite=True)
await scrape_missing_dates(db=db, missing_dates=missing_dates, to_sqlite=True)

with sqlite3.connect(db) as con:
query = get_count_of_date_by_mth_asof_today_query()
Expand Down

0 comments on commit decd5aa

Please sign in to comment.