From 56ca951954fde8c73f4516aef7721fe85843360d Mon Sep 17 00:00:00 2001 From: Sakan Date: Tue, 18 Jun 2024 22:14:15 +0700 Subject: [PATCH] v4.6.0 - Added more tests - Added ValueError handler for check_if_current_date_has_passed function - Adjusted 'find_missing_dates' function --- japan_avg_hotel_price_finder/utils.py | 51 ++- tests/test_graphql/test_check_info.py | 233 +++++++++++ tests/test_graphql/test_concat_df.py | 92 +++++ tests/test_graphql/test_extract_data.py | 123 ++++++ tests/test_graphql/test_graphql_scraper.py | 23 ++ tests/test_graphql/test_transform_data.py | 104 +++++ tests/test_graphql_scraper.py | 383 ------------------ tests/test_utils.py | 95 ----- .../test_check_current_date_has_passed.py | 72 ++++ tests/test_utils/test_convert_csv_to_df.py | 15 + tests/test_utils/test_find_csv_files.py | 14 + ...ka_hotel_data_2024-09-29_to_2024-09-30.csv | 0 tests/test_utils/test_find_missing_dates.py | 80 ++++ tests/test_utils/test_utils.py | 35 ++ 14 files changed, 823 insertions(+), 497 deletions(-) create mode 100644 tests/test_graphql/test_check_info.py create mode 100644 tests/test_graphql/test_concat_df.py create mode 100644 tests/test_graphql/test_extract_data.py create mode 100644 tests/test_graphql/test_graphql_scraper.py create mode 100644 tests/test_graphql/test_transform_data.py delete mode 100644 tests/test_graphql_scraper.py delete mode 100644 tests/test_utils.py create mode 100644 tests/test_utils/test_check_current_date_has_passed.py create mode 100644 tests/test_utils/test_convert_csv_to_df.py create mode 100644 tests/test_utils/test_find_csv_files.py rename tests/{ => test_utils}/test_find_csv_files/Osaka_hotel_data_2024-09-29_to_2024-09-30.csv (100%) create mode 100644 tests/test_utils/test_find_missing_dates.py create mode 100644 tests/test_utils/test_utils.py diff --git a/japan_avg_hotel_price_finder/utils.py b/japan_avg_hotel_price_finder/utils.py index 6e7494e..f82ed61 100644 --- a/japan_avg_hotel_price_finder/utils.py +++ 
b/japan_avg_hotel_price_finder/utils.py @@ -29,10 +29,15 @@ def check_if_current_date_has_passed(year: int, month: int, day: int, timezone=N else: today = datetime.datetime.today() today_for_check = today.strftime('%Y-%m-%d') - current_date_for_check = datetime.datetime(year, month, day).strftime('%Y-%m-%d') - if current_date_for_check < today_for_check: - return True - else: + + try: + current_date_for_check = datetime.datetime(year, month, day).strftime('%Y-%m-%d') + if current_date_for_check < today_for_check: + return True + else: + return False + except ValueError: + logger.error("Invalid date. Returning False") return False @@ -67,7 +72,7 @@ def find_missing_dates_in_db(sqlite_db: str) -> list: dates_in_db, end_date, start_date = find_dates_of_the_month_in_db(sqlite_db, days_in_month, month, year) - missing_dates = find_missing_dates(dates_in_db, days_in_month, today, month, year) + missing_dates = find_missing_dates(dates_in_db, days_in_month, month, year) logger.warning(f"Missing dates in {start_date} to {end_date}: {missing_dates}") else: date_obj = datetime.datetime.strptime(row[0], '%Y-%m') @@ -80,7 +85,7 @@ def find_missing_dates_in_db(sqlite_db: str) -> list: dates_in_db, end_date, start_date = find_dates_of_the_month_in_db(sqlite_db, days_in_month, month, year) - missing_dates = find_missing_dates(dates_in_db, days_in_month, today, month, year) + missing_dates = find_missing_dates(dates_in_db, days_in_month, month, year) logger.warning(f"Missing dates in {start_date} to {end_date}: {missing_dates}") return missing_dates @@ -183,32 +188,40 @@ def scrape_with_basic_scraper(db: str, date, to_sqlite: bool = False): def find_missing_dates( dates_in_db: set[str], days_in_month: int, - today: datetime, month: int, - year: int) -> list[str]: + year: int, + timezone=None) -> list[str]: """ Find missing dates of the given month. + Only check date from today onward. :param dates_in_db: Dates of that month in the database of the current AsOf Date. 
Date format: '%Y-%m-%d'. :param days_in_month: Total days in the given month. - :param today: Today's date as a Datetime object. :param month: Month. :param year: Year. + :param timezone: Timezone, default is None, mostly for testing purpose. :returns: Missing Dates as a list. """ logger.info(f"Find missing date of {calendar.month_name[month]} {year}.") + if timezone: + today = datetime.datetime.now(timezone) + else: + today = datetime.datetime.today() + + dates_in_db_date_obj = [datetime.datetime.strptime(date, '%Y-%m-%d').date() for date in dates_in_db] + filtered_dates = [date for date in dates_in_db_date_obj if date >= today.date()] + + today_date_obj = today.date() missing_dates = [] for day in range(1, days_in_month + 1): - date_str = datetime.datetime(year, month, day).strftime('%Y-%m-%d') - if date_str not in dates_in_db: - if month == today.month: - # Handle the case when the month to scrape is the current month. - if day < today.day: - logger.warning(f"This day has passed. Skip {date_str}") - else: - missing_dates.append(date_str) - else: - missing_dates.append(date_str) + date_to_check = datetime.datetime(year, month, day) + date_to_check_str = date_to_check.strftime('%Y-%m-%d') + date_to_check_date_obj = date_to_check.date() + if date_to_check_date_obj < today_date_obj: + logger.warning(f"{date_to_check_str} has passed. 
Skip this date.") + else: + if date_to_check_date_obj not in filtered_dates: + missing_dates.append(date_to_check_str) return missing_dates diff --git a/tests/test_graphql/test_check_info.py b/tests/test_graphql/test_check_info.py new file mode 100644 index 0000000..53f783d --- /dev/null +++ b/tests/test_graphql/test_check_info.py @@ -0,0 +1,233 @@ +from unittest.mock import Mock + +import pytest + +from japan_avg_hotel_price_finder.graphql_scraper import check_info + + +def test_returns_correct_total_page_number_and_data_mapping(): + # Given + response_mock = Mock() + response_mock.status_code = 200 + response_mock.json.return_value = { + 'data': { + 'searchQueries': { + 'search': { + 'pagination': {'nbResultsTotal': 1}, + 'breadcrumbs': [{}, {}, {'name': 'Test City'}], + 'flexibleDatesConfig': { + 'dateRangeCalendar': { + 'checkin': ['2023-01-01'], + 'checkout': ['2023-01-02'] + } + }, + 'searchMeta': { + 'nbAdults': 2, + 'nbChildren': 1, + 'nbRooms': 1 + }, + 'results': [{ + 'blocks': [{ + 'finalPrice': {'currency': 'USD'} + }] + }] + } + } + } + } + entered_city = "Test City" + entered_check_in = "2023-01-01" + entered_check_out = "2023-01-02" + entered_selected_currency = "USD" + entered_num_adult = 2 + entered_num_children = 1 + entered_num_room = 1 + + # When + result = check_info( + response_mock, entered_city, entered_check_in, entered_check_out, + entered_selected_currency, entered_num_adult, entered_num_children, + entered_num_room + ) + + # Then + assert result == (1, { + "city": "Test City", + "check_in": "2023-01-01", + "check_out": "2023-01-02", + "num_adult": 2, + "num_children": 1, + "num_room": 1, + "selected_currency": "USD" + }) + + +def test_handles_response_with_missing_or_null_fields_gracefully(): + # Given + response_mock = Mock() + response_mock.status_code = 200 + response_mock.json.return_value = { + 'data': { + 'searchQueries': { + 'search': { + 'pagination': {'nbResultsTotal': 1}, + 'breadcrumbs': [{}, {}, {'name': None}], + 
'flexibleDatesConfig': { + 'dateRangeCalendar': { + 'checkin': [None], + 'checkout': [None] + } + }, + 'searchMeta': { + 'nbAdults': None, + 'nbChildren': None, + 'nbRooms': None + }, + 'results': [{ + 'blocks': [{ + 'finalPrice': {'currency': None} + }] + }] + } + } + } + } + entered_city = "Test City" + entered_check_in = "2023-01-01" + entered_check_out = "2023-01-02" + entered_selected_currency = "USD" + entered_num_adult = 2 + entered_num_children = 1 + entered_num_room = 1 + + # When + error_message = '' + try: + check_info( + response_mock, entered_city, entered_check_in, entered_check_out, + entered_selected_currency, entered_num_adult, entered_num_children, + entered_num_room + ) + except SystemExit as e: + error_message = str(e) + + # Then + assert error_message == "Error City not match: Test City != None" + + +def test_data_mapping_dictionary_keys(): + # Given + response_mock = Mock() + response_mock.status_code = 200 + response_mock.json.return_value = { + 'data': { + 'searchQueries': { + 'search': { + 'pagination': {'nbResultsTotal': 1}, + 'breadcrumbs': [{}, {}, {'name': 'Test City'}], + 'flexibleDatesConfig': { + 'dateRangeCalendar': { + 'checkin': ['2023-01-01'], + 'checkout': ['2023-01-02'] + } + }, + 'searchMeta': { + 'nbAdults': 2, + 'nbChildren': 1, + 'nbRooms': 1 + }, + 'results': [{ + 'blocks': [{ + 'finalPrice': {'currency': 'USD'} + }] + }] + } + } + } + } + entered_city = "Test City" + entered_check_in = "2023-01-01" + entered_check_out = "2023-01-02" + entered_selected_currency = "USD" + entered_num_adult = 2 + entered_num_children = 1 + entered_num_room = 1 + + # When + result = check_info( + response_mock, entered_city, entered_check_in, entered_check_out, + entered_selected_currency, entered_num_adult, entered_num_children, + entered_num_room + ) + + # Then + assert result == (1, { + "city": "Test City", + "check_in": "2023-01-01", + "check_out": "2023-01-02", + "num_adult": 2, + "num_children": 1, + "num_room": 1, + 
"selected_currency": "USD" + }) + + +def test_data_mapping_extraction(): + # Given + response_mock = Mock() + response_mock.status_code = 200 + response_mock.json.return_value = { + 'data': { + 'searchQueries': { + 'search': { + 'pagination': {'nbResultsTotal': 1}, + 'breadcrumbs': [{}, {}, {'name': 'Test City'}], + 'flexibleDatesConfig': { + 'dateRangeCalendar': { + 'checkin': ['2023-01-01'], + 'checkout': ['2023-01-02'] + } + }, + 'searchMeta': { + 'nbAdults': 2, + 'nbChildren': 1, + 'nbRooms': 1 + }, + 'results': [{ + 'blocks': [{ + 'finalPrice': {'currency': 'USD'} + }] + }] + } + } + } + } + entered_city = "Test City" + entered_check_in = "2023-01-01" + entered_check_out = "2023-01-02" + entered_selected_currency = "USD" + entered_num_adult = 2 + entered_num_children = 1 + entered_num_room = 1 + + # When + result = check_info( + response_mock, entered_city, entered_check_in, entered_check_out, + entered_selected_currency, entered_num_adult, entered_num_children, + entered_num_room + ) + + # Then + assert result == (1, { + "city": "Test City", + "check_in": "2023-01-01", + "check_out": "2023-01-02", + "num_adult": 2, + "num_children": 1, + "num_room": 1, + "selected_currency": "USD" + }) + + +if __name__ == '__main__': + pytest.main() \ No newline at end of file diff --git a/tests/test_graphql/test_concat_df.py b/tests/test_graphql/test_concat_df.py new file mode 100644 index 0000000..f2c710d --- /dev/null +++ b/tests/test_graphql/test_concat_df.py @@ -0,0 +1,92 @@ +import pandas as pd +import pytest + +from japan_avg_hotel_price_finder.graphql_scraper import concat_df_list + + +def test_concatenate_multiple_non_empty_dataframes(): + # Given + df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) + df2 = pd.DataFrame({'A': [5, 6], 'B': [7, 8]}) + df_list = [df1, df2] + + # When + result = concat_df_list(df_list) + + # Then + assert not result.empty + assert result.equals(pd.concat([df1, df2])) + + +def test_concatenate_empty_list(): + # Given + df_list = [] + + # When 
+ result = concat_df_list(df_list) + + # Then + assert len(result) == 0 + + +def test_concatenate_dataframes_with_missing_values(): + # Given + df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) + df2 = pd.DataFrame({'A': [5, 6], 'B': [7, None]}) + df_list = [df1, df2] + + # When + result = concat_df_list(df_list) + + # Then + assert not result.empty + assert result.equals(pd.concat([df1, df2])) + + +def test_concatenate_mixed_empty_non_empty_dataframes(): + # Given + df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) + df2 = pd.DataFrame({'A': [5, 6], 'B': [7, 8]}) + df3 = pd.DataFrame() # Empty DataFrame + df_list = [df1, df2, df3] + + # When + result = concat_df_list(df_list) + + # Then + assert not result.empty + assert result.equals(pd.concat([df1, df2])) + + +def test_concatenate_dataframes_with_different_index_types(): + # Given + df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) + df2 = pd.DataFrame({'A': [5, 6], 'B': [7, 8]}) + df1.index = ['a', 'b'] + df2.index = ['c', 'd'] + df_list = [df1, df2] + + # When + result = concat_df_list(df_list) + + # Then + assert not result.empty + assert result.equals(pd.concat([df1, df2])) + + +def test_concatenate_dataframes_with_duplicate_indices(): + # Given + df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}, index=[0, 1]) + df2 = pd.DataFrame({'A': [5, 6], 'B': [7, 8]}, index=[1, 2]) + df_list = [df1, df2] + + # When + result = concat_df_list(df_list) + + # Then + assert not result.empty + assert result.equals(pd.concat([df1, df2])) + + +if __name__ == '__main__': + pytest.main() diff --git a/tests/test_graphql/test_extract_data.py b/tests/test_graphql/test_extract_data.py new file mode 100644 index 0000000..d8f12c3 --- /dev/null +++ b/tests/test_graphql/test_extract_data.py @@ -0,0 +1,123 @@ +import numpy as np +import pandas as pd + +from japan_avg_hotel_price_finder.graphql_scraper import extract_hotel_data + + +def test_extract_hotel_data_multiple_appends(): + # Sample hotel data + hotel_data_list_1 = [ + { + "displayName": 
{"text": "Hotel A"}, + "basicPropertyData": {"reviewScore": {"score": 4.5}}, + "blocks": [{"finalPrice": {"amount": 150}}] + } + ] + hotel_data_list_2 = [ + { + "displayName": {"text": "Hotel B"}, + "basicPropertyData": {"reviewScore": {"score": 4.0}}, + "blocks": [{"finalPrice": {"amount": 200}}] + } + ] + + df_list = [] + + # Call the function twice with different data + extract_hotel_data(df_list, hotel_data_list_1) + extract_hotel_data(df_list, hotel_data_list_2) + + # Assertions + assert len(df_list) == 2 + df1 = df_list[0] + df2 = df_list[1] + + assert df1.shape == (1, 3) + assert df1['Hotel'].tolist() == ['Hotel A'] + assert df1['Review'].tolist() == [4.5] + assert df1['Price'].tolist() == [150] + + assert df2.shape == (1, 3) + assert df2['Hotel'].tolist() == ['Hotel B'] + assert df2['Review'].tolist() == [4.0] + assert df2['Price'].tolist() == [200] + + +def test_extract_hotel_data_missing_values(): + # Sample hotel data with missing values + hotel_data_list = [ + { + "displayName": None, + "basicPropertyData": {"reviewScore": {"score": 4.5}}, + "blocks": [{"finalPrice": {"amount": 150}}] + }, + { + "displayName": {"text": "Hotel B"}, + "basicPropertyData": None, + "blocks": None + } + ] + + df_list = [] + + # Call the function + extract_hotel_data(df_list, hotel_data_list) + + # Assertions + assert len(df_list) == 2 + df = pd.concat(df_list, ignore_index=True) + assert df.shape == (2, 3) + assert df['Hotel'].tolist() == [None, 'Hotel B'] + + # Convert columns to numeric, coercing errors to NaN + df['Review'] = pd.to_numeric(df['Review'], errors='coerce') + df['Price'] = pd.to_numeric(df['Price'], errors='coerce') + + # Check for NaN values + assert df['Review'][0] == 4.5 + assert np.isnan(df['Review'][1]) + assert df['Price'][0] == 150 + assert np.isnan(df['Price'][1]) + + +def test_extract_hotel_data_empty_list(): + # Empty hotel data list + hotel_data_list = [] + + df_list = [] + + # Call the function + extract_hotel_data(df_list, hotel_data_list) + + # 
Assertions + assert len(df_list) == 0 + assert df_list == [] + + +def test_extract_hotel_data_basic(): + # Sample hotel data + hotel_data_list = [ + { + "displayName": {"text": "Hotel A"}, + "basicPropertyData": {"reviewScore": {"score": 4.5}}, + "blocks": [{"finalPrice": {"amount": 150}}] + }, + { + "displayName": {"text": "Hotel B"}, + "basicPropertyData": {"reviewScore": {"score": 4.0}}, + "blocks": [{"finalPrice": {"amount": 200}}] + } + ] + + df_list = [] + + # Call the function + extract_hotel_data(df_list, hotel_data_list) + + # Assertions + assert len(df_list) == 2 + df = pd.concat(df_list, ignore_index=True) + assert df.shape == (2, 3) + assert df['Hotel'].tolist() == ['Hotel A', 'Hotel B'] + assert df['Review'].tolist() == [4.5, 4.0] + assert df['Price'].tolist() == [150, 200] diff --git a/tests/test_graphql/test_graphql_scraper.py b/tests/test_graphql/test_graphql_scraper.py new file mode 100644 index 0000000..7879ec1 --- /dev/null +++ b/tests/test_graphql/test_graphql_scraper.py @@ -0,0 +1,23 @@ +import datetime + +import pytest +import pytz + +from japan_avg_hotel_price_finder.graphql_scraper import scrape_graphql + + +def test_graphql_scraper(): + timezone = pytz.timezone('Asia/Tokyo') + today = datetime.datetime.now(timezone) + check_in = today.strftime('%Y-%m-%d') + tomorrow = today + datetime.timedelta(days=1) + check_out = tomorrow.strftime('%Y-%m-%d') + df = scrape_graphql(city='Osaka', check_in=check_in, check_out=check_out, selected_currency='USD') + + assert not df.empty + # Check column + assert df.shape[1] == 7 + + +if __name__ == '__main__': + pytest.main() diff --git a/tests/test_graphql/test_transform_data.py b/tests/test_graphql/test_transform_data.py new file mode 100644 index 0000000..554385d --- /dev/null +++ b/tests/test_graphql/test_transform_data.py @@ -0,0 +1,104 @@ +import pandas as pd + +from japan_avg_hotel_price_finder.graphql_scraper import transform_data_in_df + + +def test_transform_data_in_df_basic(): + # Create a sample 
DataFrame + data = { + 'Hotel': ['Hotel A', 'Hotel B', 'Hotel C', 'Hotel A'], + 'Review': [4.5, 3.0, 4.0, 4.5], + 'Price': [150, 200, 250, 150] + } + df = pd.DataFrame(data) + + # Define parameters + check_in = '2024-06-17' + city = 'Tokyo' + + # Transform data + result_df = transform_data_in_df(check_in, city, df) + + # Assertions + assert 'City' in result_df.columns + assert 'Date' in result_df.columns + assert 'AsOf' in result_df.columns + assert 'Price/Review' in result_df.columns + assert result_df['City'].iloc[0] == city + assert result_df['Date'].iloc[0] == check_in + assert (result_df['AsOf'].notna()).all() + assert len(result_df) == 3 # Since one duplicate 'Hotel A' should be removed + + +def test_transform_data_in_df_dropna(): + # Create a sample DataFrame with None values + data = { + 'Hotel': ['Hotel A', 'Hotel B', 'Hotel C', None], + 'Review': [4.5, None, 4.0, 3.5], + 'Price': [150, 200, None, 175] + } + df = pd.DataFrame(data) + + # Define parameters + check_in = '2024-06-17' + city = 'Tokyo' + + # Transform data + result_df = transform_data_in_df(check_in, city, df) + + # Assertions + assert len(result_df) == 1 # Only 'Hotel A' + + +def test_drop_rows_with_zero_price_or_review(): + # Given + data = { + 'Hotel': ['Hotel A', 'Hotel B', 'Hotel C'], + 'Review': [4.0, 0, 5.0], + 'Price': [200, 0, 250] + } + df = pd.DataFrame(data) + + # When + result_df = transform_data_in_df('2024-06-17', 'Tokyo', df) + + # Then + assert len(result_df) == 2 # Only 'Hotel A' and 'Hotel C' should remain + assert 'Hotel B' not in result_df['Hotel'].values # 'Hotel B' with 0 price should be dropped + + +def test_transform_data_in_df_calculation(): + # Create a sample DataFrame + data = { + 'Hotel': ['Hotel A', 'Hotel B'], + 'Review': [4.0, 5.0], + 'Price': [200, 250] + } + df = pd.DataFrame(data) + + # Define parameters + check_in = '2024-06-17' + city = 'Tokyo' + + # Transform data + result_df = transform_data_in_df(check_in, city, df) + + # Assertions + assert 
'Price/Review' in result_df.columns + assert result_df['Price/Review'].iloc[0] == 200 / 4.0 + assert result_df['Price/Review'].iloc[1] == 250 / 5.0 + + +def test_transform_data_in_df_empty(): + # Create an empty DataFrame + df = pd.DataFrame(columns=['Hotel', 'Review', 'Price']) + + # Define parameters + check_in = '2024-06-17' + city = 'Tokyo' + + # Transform data + result_df = transform_data_in_df(check_in, city, df) + + # Assertions + assert result_df.empty diff --git a/tests/test_graphql_scraper.py b/tests/test_graphql_scraper.py deleted file mode 100644 index 749d875..0000000 --- a/tests/test_graphql_scraper.py +++ /dev/null @@ -1,383 +0,0 @@ -import datetime -from unittest.mock import Mock - -import numpy as np -import pandas as pd -import pytest -import pytz -import requests - -from japan_avg_hotel_price_finder.graphql_scraper import scrape_graphql, transform_data_in_df, extract_hotel_data, \ - check_info, concat_df_list - - -def test_graphql_scraper(): - timezone = pytz.timezone('Asia/Tokyo') - today = datetime.datetime.now(timezone) - check_in = today.strftime('%Y-%m-%d') - tomorrow = today + datetime.timedelta(days=1) - check_out = tomorrow.strftime('%Y-%m-%d') - df = scrape_graphql(city='Osaka', check_in=check_in, check_out=check_out, selected_currency='USD') - - assert not df.empty - # Check column - assert df.shape[1] == 7 - - -def test_transform_data_in_df_basic(): - # Create a sample DataFrame - data = { - 'Hotel': ['Hotel A', 'Hotel B', 'Hotel C', 'Hotel A'], - 'Review': [4.5, 3.0, 4.0, 4.5], - 'Price': [150, 200, 250, 150] - } - df = pd.DataFrame(data) - - # Define parameters - check_in = '2024-06-17' - city = 'Tokyo' - - # Transform data - result_df = transform_data_in_df(check_in, city, df) - - # Assertions - assert 'City' in result_df.columns - assert 'Date' in result_df.columns - assert 'AsOf' in result_df.columns - assert 'Price/Review' in result_df.columns - assert result_df['City'].iloc[0] == city - assert result_df['Date'].iloc[0] == 
check_in - assert (result_df['AsOf'].notna()).all() - assert len(result_df) == 3 # Since one duplicate 'Hotel A' should be removed - - -def test_transform_data_in_df_dropna(): - # Create a sample DataFrame with None values - data = { - 'Hotel': ['Hotel A', 'Hotel B', 'Hotel C', None], - 'Review': [4.5, None, 4.0, 3.5], - 'Price': [150, 200, None, 175] - } - df = pd.DataFrame(data) - - # Define parameters - check_in = '2024-06-17' - city = 'Tokyo' - - # Transform data - result_df = transform_data_in_df(check_in, city, df) - - # Assertions - assert len(result_df) == 1 # Only 'Hotel A' - - -def test_drop_rows_with_zero_price_or_review(): - # Given - data = { - 'Hotel': ['Hotel A', 'Hotel B', 'Hotel C'], - 'Review': [4.0, 0, 5.0], - 'Price': [200, 0, 250] - } - df = pd.DataFrame(data) - - # When - result_df = transform_data_in_df('2024-06-17', 'Tokyo', df) - - # Then - assert len(result_df) == 2 # Only 'Hotel A' and 'Hotel C' should remain - assert 'Hotel B' not in result_df['Hotel'].values # 'Hotel B' with 0 price should be dropped - - -def test_transform_data_in_df_calculation(): - # Create a sample DataFrame - data = { - 'Hotel': ['Hotel A', 'Hotel B'], - 'Review': [4.0, 5.0], - 'Price': [200, 250] - } - df = pd.DataFrame(data) - - # Define parameters - check_in = '2024-06-17' - city = 'Tokyo' - - # Transform data - result_df = transform_data_in_df(check_in, city, df) - - # Assertions - assert 'Price/Review' in result_df.columns - assert result_df['Price/Review'].iloc[0] == 200 / 4.0 - assert result_df['Price/Review'].iloc[1] == 250 / 5.0 - - -def test_transform_data_in_df_empty(): - # Create an empty DataFrame - df = pd.DataFrame(columns=['Hotel', 'Review', 'Price']) - - # Define parameters - check_in = '2024-06-17' - city = 'Tokyo' - - # Transform data - result_df = transform_data_in_df(check_in, city, df) - - # Assertions - assert result_df.empty - - -def test_extract_hotel_data_basic(): - # Sample hotel data - hotel_data_list = [ - { - "displayName": {"text": 
"Hotel A"}, - "basicPropertyData": {"reviewScore": {"score": 4.5}}, - "blocks": [{"finalPrice": {"amount": 150}}] - }, - { - "displayName": {"text": "Hotel B"}, - "basicPropertyData": {"reviewScore": {"score": 4.0}}, - "blocks": [{"finalPrice": {"amount": 200}}] - } - ] - - df_list = [] - - # Call the function - extract_hotel_data(df_list, hotel_data_list) - - # Assertions - assert len(df_list) == 2 - df = pd.concat(df_list, ignore_index=True) - assert df.shape == (2, 3) - assert df['Hotel'].tolist() == ['Hotel A', 'Hotel B'] - assert df['Review'].tolist() == [4.5, 4.0] - assert df['Price'].tolist() == [150, 200] - - -def test_extract_hotel_data_missing_values(): - # Sample hotel data with missing values - hotel_data_list = [ - { - "displayName": None, - "basicPropertyData": {"reviewScore": {"score": 4.5}}, - "blocks": [{"finalPrice": {"amount": 150}}] - }, - { - "displayName": {"text": "Hotel B"}, - "basicPropertyData": None, - "blocks": None - } - ] - - df_list = [] - - # Call the function - extract_hotel_data(df_list, hotel_data_list) - - # Assertions - assert len(df_list) == 2 - df = pd.concat(df_list, ignore_index=True) - assert df.shape == (2, 3) - assert df['Hotel'].tolist() == [None, 'Hotel B'] - - # Convert columns to numeric, coercing errors to NaN - df['Review'] = pd.to_numeric(df['Review'], errors='coerce') - df['Price'] = pd.to_numeric(df['Price'], errors='coerce') - - # Check for NaN values - assert df['Review'][0] == 4.5 - assert np.isnan(df['Review'][1]) - assert df['Price'][0] == 150 - assert np.isnan(df['Price'][1]) - - -def test_extract_hotel_data_empty_list(): - # Empty hotel data list - hotel_data_list = [] - - df_list = [] - - # Call the function - extract_hotel_data(df_list, hotel_data_list) - - # Assertions - assert len(df_list) == 0 - assert df_list == [] - - -def test_extract_hotel_data_multiple_appends(): - # Sample hotel data - hotel_data_list_1 = [ - { - "displayName": {"text": "Hotel A"}, - "basicPropertyData": {"reviewScore": 
{"score": 4.5}}, - "blocks": [{"finalPrice": {"amount": 150}}] - } - ] - hotel_data_list_2 = [ - { - "displayName": {"text": "Hotel B"}, - "basicPropertyData": {"reviewScore": {"score": 4.0}}, - "blocks": [{"finalPrice": {"amount": 200}}] - } - ] - - df_list = [] - - # Call the function twice with different data - extract_hotel_data(df_list, hotel_data_list_1) - extract_hotel_data(df_list, hotel_data_list_2) - - # Assertions - assert len(df_list) == 2 - df1 = df_list[0] - df2 = df_list[1] - - assert df1.shape == (1, 3) - assert df1['Hotel'].tolist() == ['Hotel A'] - assert df1['Review'].tolist() == [4.5] - assert df1['Price'].tolist() == [150] - - assert df2.shape == (1, 3) - assert df2['Hotel'].tolist() == ['Hotel B'] - assert df2['Review'].tolist() == [4.0] - assert df2['Price'].tolist() == [200] - - -def test_returns_correct_total_page_number_and_data_mapping(): - # Given - response_mock = Mock() - response_mock.status_code = 200 - response_mock.json.return_value = { - 'data': { - 'searchQueries': { - 'search': { - 'pagination': {'nbResultsTotal': 1}, - 'breadcrumbs': [{}, {}, {'name': 'Test City'}], - 'flexibleDatesConfig': { - 'dateRangeCalendar': { - 'checkin': ['2023-01-01'], - 'checkout': ['2023-01-02'] - } - }, - 'searchMeta': { - 'nbAdults': 2, - 'nbChildren': 1, - 'nbRooms': 1 - }, - 'results': [{ - 'blocks': [{ - 'finalPrice': {'currency': 'USD'} - }] - }] - } - } - } - } - entered_city = "Test City" - entered_check_in = "2023-01-01" - entered_check_out = "2023-01-02" - entered_selected_currency = "USD" - entered_num_adult = 2 - entered_num_children = 1 - entered_num_room = 1 - - # When - result = check_info( - response_mock, entered_city, entered_check_in, entered_check_out, - entered_selected_currency, entered_num_adult, entered_num_children, - entered_num_room - ) - - # Then - assert result == (1, { - "city": "Test City", - "check_in": "2023-01-01", - "check_out": "2023-01-02", - "num_adult": 2, - "num_children": 1, - "num_room": 1, - 
"selected_currency": "USD" - }) - - -def test_handles_response_with_missing_or_null_fields_gracefully(): - # Given - response_mock = Mock() - response_mock.status_code = 200 - response_mock.json.return_value = { - 'data': { - 'searchQueries': { - 'search': { - 'pagination': {'nbResultsTotal': 1}, - 'breadcrumbs': [{}, {}, {'name': None}], - 'flexibleDatesConfig': { - 'dateRangeCalendar': { - 'checkin': [None], - 'checkout': [None] - } - }, - 'searchMeta': { - 'nbAdults': None, - 'nbChildren': None, - 'nbRooms': None - }, - 'results': [{ - 'blocks': [{ - 'finalPrice': {'currency': None} - }] - }] - } - } - } - } - entered_city = "Test City" - entered_check_in = "2023-01-01" - entered_check_out = "2023-01-02" - entered_selected_currency = "USD" - entered_num_adult = 2 - entered_num_children = 1 - entered_num_room = 1 - - # When - error_message = '' - try: - check_info( - response_mock, entered_city, entered_check_in, entered_check_out, - entered_selected_currency, entered_num_adult, entered_num_children, - entered_num_room - ) - except SystemExit as e: - error_message = str(e) - - # Then - assert error_message == "Error City not match: Test City != None" - - -def test_concatenate_multiple_non_empty_dataframes(): - # Given - df1 = pd.DataFrame({'A': [1, 2], 'B': [3, 4]}) - df2 = pd.DataFrame({'A': [5, 6], 'B': [7, 8]}) - df_list = [df1, df2] - - # When - result = concat_df_list(df_list) - - # Then - assert not result.empty - assert result.equals(pd.concat([df1, df2])) - - -def test_concatenate_empty_list(): - # Given - df_list = [] - - # When - result = concat_df_list(df_list) - - # Then - assert len(result) == 0 - - -if __name__ == '__main__': - pytest.main() diff --git a/tests/test_utils.py b/tests/test_utils.py deleted file mode 100644 index ee4a31c..0000000 --- a/tests/test_utils.py +++ /dev/null @@ -1,95 +0,0 @@ -import datetime -import sqlite3 -from calendar import monthrange - -import pytest - -from japan_avg_hotel_price_finder.utils import 
check_if_current_date_has_passed, find_missing_dates, find_csv_files, \ - convert_csv_to_df, get_count_of_date_by_mth_asof_today_query, \ - scrape_missing_dates - - -def test_check_if_current_date_has_passed(): - result = check_if_current_date_has_passed(2022, 5, 1) - assert result is True - - today = datetime.datetime.today() - day = today.day - month = today.month - year = today.year - result = check_if_current_date_has_passed(year, month, day) - assert result is False - - -def test_find_missing_dates(): - today = datetime.datetime.today() - month = today.month + 1 - year = today.year - days_in_month = monthrange(year, month)[1] - - first_day_of_month = datetime.datetime(year, month, 1).strftime('%Y-%m-%d') - third_day_of_month = datetime.datetime(year, month, 3).strftime('%Y-%m-%d') - fifth_day_of_month = datetime.datetime(year, month, 5).strftime('%Y-%m-%d') - - dates_in_db = {first_day_of_month, third_day_of_month, fifth_day_of_month} - - result = find_missing_dates(dates_in_db, days_in_month, today, month, year) - - expected_missing_dates = [] - for day in range(1, days_in_month + 1): - date_str = datetime.datetime(year, month, day).strftime('%Y-%m-%d') - if date_str not in dates_in_db: - expected_missing_dates.append(date_str) - - assert result == expected_missing_dates - - -def test_find_csv_files(): - directory = 'tests/test_find_csv_files' - csv_files = find_csv_files(directory) - print(csv_files) - assert len(csv_files) > 0 - - directory_2 = 'tests/test_find_csv_files_2' - csv_files = find_csv_files(directory_2) - assert len(csv_files) == 0 - - -def test_convert_csv_to_df(): - directory = 'tests/test_find_csv_files' - csv_files = find_csv_files(directory) - df = convert_csv_to_df(csv_files) - assert not df.empty - - directory_2 = 'tests/test_find_csv_files_2' - csv_files = find_csv_files(directory_2) - df = convert_csv_to_df(csv_files) - assert df is None - - -def test_scrape_missing_dates() -> None: - db = 'test_scrape_missing_dates.db' - - today = 
datetime.datetime.today()
-    if today.month == 12:
-        month = 1
-        year = today.year + 1
-    else:
-        month = today.month + 1
-        year = today.year
-
-    first_missing_date = f'{year}-{month}-01'
-    second_missing_date = f'{year}-{month}-11'
-    third_missing_date = f'{year}-{month}-20'
-    missing_dates = [first_missing_date, second_missing_date, third_missing_date]
-    scrape_missing_dates(db=db, missing_dates=missing_dates, to_sqlite=True)
-
-    with sqlite3.connect(db) as con:
-        query = get_count_of_date_by_mth_asof_today_query()
-        result = con.execute(query).fetchall()
-        for row in result:
-            assert row[1] == 3
-
-
-if __name__ == '__main__':
-    pytest.main()
diff --git a/tests/test_utils/test_check_current_date_has_passed.py b/tests/test_utils/test_check_current_date_has_passed.py
new file mode 100644
index 0000000..f82983b
--- /dev/null
+++ b/tests/test_utils/test_check_current_date_has_passed.py
@@ -0,0 +1,72 @@
+import datetime
+
+import pytest
+
+from japan_avg_hotel_price_finder.utils import check_if_current_date_has_passed
+
+
+def test_check_if_current_date_has_passed():
+    result = check_if_current_date_has_passed(2022, 5, 1)
+    assert result is True
+
+    today = datetime.datetime.today()
+    day = today.day
+    month = today.month
+    year = today.year
+    result = check_if_current_date_has_passed(year, month, day)
+    assert result is False
+
+
+def test_handles_leap_years_correctly():
+    # Given
+    year, month, day = 2020, 2, 29
+
+    # When
+    result = check_if_current_date_has_passed(year, month, day)
+
+    # Then
+    assert result is True
+
+
+def test_returns_false_when_given_date_is_today():
+    # Given
+    today = datetime.datetime.today()
+    year = today.year
+    month = today.month
+    day = today.day
+
+    # When
+    result = check_if_current_date_has_passed(year, month, day)
+
+    # Then
+    assert result is False
+
+
+def test_handles_end_of_month_dates_correctly():
+    # Given
+    year = 2022
+    month = 1
+    day = 31
+
+    # When
+    result = check_if_current_date_has_passed(year, month, day)
+
+    # Then
+    assert result is True
+
+
+def test_handles_invalid_dates():
+    # Given an invalid date like 30th February
+    year = 2022
+    month = 2
+    day = 30
+
+    # When checking if the current date has passed
+    result = check_if_current_date_has_passed(year, month, day)
+
+    # Then the result should be False as it's an invalid date
+    assert result is False
+
+
+if __name__ == '__main__':
+    pytest.main()
diff --git a/tests/test_utils/test_convert_csv_to_df.py b/tests/test_utils/test_convert_csv_to_df.py
new file mode 100644
index 0000000..3f398e5
--- /dev/null
+++ b/tests/test_utils/test_convert_csv_to_df.py
@@ -0,0 +1,15 @@
+from japan_avg_hotel_price_finder.utils import find_csv_files, convert_csv_to_df
+
+
+def test_convert_csv_to_df():
+    directory = 'tests/test_utils/test_find_csv_files'
+    csv_files = find_csv_files(directory)
+    df = convert_csv_to_df(csv_files)
+    assert not df.empty
+
+
+def test_convert_csv_to_df_empty():
+    directory_2 = 'tests/test_utils/test_find_csv_files_2'
+    csv_files = find_csv_files(directory_2)
+    df = convert_csv_to_df(csv_files)
+    assert df is None
diff --git a/tests/test_utils/test_find_csv_files.py b/tests/test_utils/test_find_csv_files.py
new file mode 100644
index 0000000..9facdd4
--- /dev/null
+++ b/tests/test_utils/test_find_csv_files.py
@@ -0,0 +1,14 @@
+from japan_avg_hotel_price_finder.utils import find_csv_files
+
+
+def test_find_csv_files():
+    directory = 'tests/test_utils/test_find_csv_files'
+    csv_files = find_csv_files(directory)
+    print(csv_files)
+    assert len(csv_files) > 0
+
+
+def test_find_csv_files_empty():
+    directory_2 = 'tests/test_utils/test_find_csv_files_2'
+    csv_files = find_csv_files(directory_2)
+    assert len(csv_files) == 0
diff --git a/tests/test_find_csv_files/Osaka_hotel_data_2024-09-29_to_2024-09-30.csv b/tests/test_utils/test_find_csv_files/Osaka_hotel_data_2024-09-29_to_2024-09-30.csv
similarity index 100%
rename from tests/test_find_csv_files/Osaka_hotel_data_2024-09-29_to_2024-09-30.csv
rename to tests/test_utils/test_find_csv_files/Osaka_hotel_data_2024-09-29_to_2024-09-30.csv
diff --git a/tests/test_utils/test_find_missing_dates.py b/tests/test_utils/test_find_missing_dates.py
new file mode 100644
index 0000000..9ae3ec3
--- /dev/null
+++ b/tests/test_utils/test_find_missing_dates.py
@@ -0,0 +1,80 @@
+import calendar
+import datetime
+from calendar import monthrange
+
+from japan_avg_hotel_price_finder.utils import find_missing_dates
+
+
+def test_find_missing_dates():
+    today = datetime.datetime.today()
+    month = today.month % 12 + 1  # next month; December wraps to January instead of an invalid month 13
+    year = today.year + today.month // 12  # the year only advances on that December wrap
+    days_in_month = monthrange(year, month)[1]
+
+    first_day_of_month = datetime.datetime(year, month, 1).strftime('%Y-%m-%d')
+    third_day_of_month = datetime.datetime(year, month, 3).strftime('%Y-%m-%d')
+    fifth_day_of_month = datetime.datetime(year, month, 5).strftime('%Y-%m-%d')
+
+    dates_in_db = {first_day_of_month, third_day_of_month, fifth_day_of_month}
+
+    result = find_missing_dates(dates_in_db, days_in_month, month, year)
+
+    expected_missing_dates = []
+    for day in range(1, days_in_month + 1):
+        date_str = datetime.datetime(year, month, day).strftime('%Y-%m-%d')
+        if date_str not in dates_in_db:
+            expected_missing_dates.append(date_str)
+
+    # Every date of the month that is not already in the database should be reported as missing
+    assert result == expected_missing_dates
+
+
+def test_handles_empty_set_of_dates():
+    # Given
+    dates_in_db = set()
+    days_in_month = 30
+    today = datetime.datetime.today()
+    month = 9
+    year = today.year + 1
+
+    # When
+    missing_dates = find_missing_dates(dates_in_db, days_in_month, month, year)
+    expected_missing_dates = [datetime.datetime(year, month, day).strftime('%Y-%m-%d') for day in range(1, days_in_month + 1)]
+    # Every date of the month that is not already in the database should be reported as missing
+    assert missing_dates == expected_missing_dates
+
+
+def test_handles_leap_year_feb_missing_dates():
+    # Given
+    today = datetime.datetime.today()
+    month = 2
+    year = today.year + 1
+    dates_in_db = {f'{year}-02-01', f'{year}-02-02', f'{year}-02-03', f'{year}-02-04', f'{year}-02-06'}
+
+    # Determine the number of days in February for the given year
+    if calendar.isleap(year):
+        days_in_month = 29
+    else:
+        days_in_month = 28
+
+    # When
+    missing_dates = find_missing_dates(dates_in_db, days_in_month, month, year)
+
+    expected_missing_dates = []
+    for day in range(1, days_in_month + 1):
+        date_str = datetime.datetime(year, month, day).strftime('%Y-%m-%d')
+        if date_str not in dates_in_db:
+            expected_missing_dates.append(date_str)
+
+    # Every date of the month that is not already in the database should be reported as missing
+    assert missing_dates == expected_missing_dates
+
+
+def test_past_dates_in_db():
+    dates_in_db = {'2020-03-01', '2020-03-02', '2020-03-03', '2020-03-04'}
+    days_in_month = 31
+    month = 3
+    year = 2020
+    missing_dates = find_missing_dates(dates_in_db, days_in_month, month, year)
+
+    assert missing_dates == []
diff --git a/tests/test_utils/test_utils.py b/tests/test_utils/test_utils.py
new file mode 100644
index 0000000..dd262a1
--- /dev/null
+++ b/tests/test_utils/test_utils.py
@@ -0,0 +1,35 @@
+import datetime
+import sqlite3
+
+import pytest
+
+from japan_avg_hotel_price_finder.utils import get_count_of_date_by_mth_asof_today_query, \
+    scrape_missing_dates
+
+
+def test_scrape_missing_dates() -> None:
+    db = 'test_scrape_missing_dates.db'
+
+    today = datetime.datetime.today()
+    if today.month == 12:
+        month = 1
+        year = today.year + 1
+    else:
+        month = today.month + 1
+        year = today.year
+
+    first_missing_date = f'{year}-{month:02d}-01'  # zero-pad the month to match the '%Y-%m-%d' dates used elsewhere
+    second_missing_date = f'{year}-{month:02d}-11'
+    third_missing_date = f'{year}-{month:02d}-20'
+    missing_dates = [first_missing_date, second_missing_date, third_missing_date]
+    scrape_missing_dates(db=db, missing_dates=missing_dates, to_sqlite=True)
+
+    with sqlite3.connect(db) as con:
+        query = get_count_of_date_by_mth_asof_today_query()
+        result = con.execute(query).fetchall()
+        for row in result:
+            assert row[1] == 3
+
+
+if __name__ == '__main__':
+    pytest.main()