From 0a6dcc49f6ffb9c33599b8e2fc49d1a93028b2fe Mon Sep 17 00:00:00 2001 From: Sakan Date: Sun, 1 Dec 2024 13:37:04 +0700 Subject: [PATCH 1/6] fix: adjusted dotenv path --- main.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/main.py b/main.py index 6b46d1d..88bbc54 100644 --- a/main.py +++ b/main.py @@ -12,7 +12,7 @@ from japan_avg_hotel_price_finder.sql.save_to_db import save_scraped_data from japan_avg_hotel_price_finder.whole_mth_graphql_scraper import WholeMonthGraphQLScraper -load_dotenv() +load_dotenv(dotenv_path='.env', override=True) def validate_required_args(arguments: argparse.Namespace, required_args: list[str]) -> bool: From efa9f67e65e8fab42b89a18314e0444c8afbd7cf Mon Sep 17 00:00:00 2001 From: Sakan Date: Sun, 1 Dec 2024 13:43:32 +0700 Subject: [PATCH 2/6] fix: adjusted auth header getter --- get_auth_headers.py | 83 +++++++++++++++++++++++++++++++++++++++------ 1 file changed, 73 insertions(+), 10 deletions(-) diff --git a/get_auth_headers.py b/get_auth_headers.py index 0d44a1e..15a8c99 100644 --- a/get_auth_headers.py +++ b/get_auth_headers.py @@ -1,10 +1,30 @@ -from playwright.sync_api import sync_playwright import re +from playwright import sync_api +from playwright.sync_api import sync_playwright + +# Global flag to track if we've intercepted a request +intercepted = False + +ENV_FILENAME = '.env.local' + -def extract_x_headers(): +def extract_x_headers() -> None: + """ + Extract X-headers from Booking.com using Playwright. + + This function launches a Chromium browser, navigates to Booking.com, + performs a search for "Tokyo", and intercepts network requests to + extract X-headers used in GraphQL requests. + + The function uses the global 'intercepted' flag to track if a request + has been intercepted, and the 'handle_request' function to process + intercepted requests. + + :return: None + """ with sync_playwright() as p: - browser = p.chromium.launch(headless=True) + browser = p.chromium.launch(headless=False) page = browser.new_page() # Enable request interception @@ -23,15 +43,58 @@ def extract_x_headers(): browser.close() -def handle_request(request): - if re.match(r"https://www\.booking\.com/dml/graphql.*", request.url): - print(f"GraphQL Request URL: {request.url}") +def handle_request(request: sync_api.Request) -> None: + """ + Handle intercepted requests from Booking.com to extract X-headers. + + This function is called for each intercepted request. It checks if the request + is a GraphQL request to Booking.com and hasn't been intercepted before. If so, + it extracts the relevant headers and updates the environment variables. + + :param request: The intercepted request object. + :return: None + """ + global intercepted + if not intercepted and re.match(r"https://www\.booking\.com/dml/graphql.*", request.url): headers = request.headers + env_vars = {} for key, value in headers.items(): - if key.startswith('x-'): - print(f"{key}: {value}") - print("--------------------") + if key.startswith('x-') or key == 'user-agent': + env_key = key.upper().replace('-', '_') + env_vars[env_key] = value + + update_env_example(env_vars) + intercepted = True # Set the flag to True after intercepting + + +def update_env_example(env_vars: dict[str, str]) -> None: + """ + Update the environment variables file with new X-headers. + + This function reads the '.env.example' file, updates the values of existing keys + with new values from env_vars, and writes the result to a new file (ENV_FILENAME). + + :param env_vars: A dictionary of environment variables to update. + :return: None + """ + # Read from .env.example + with open('.env.example', 'r') as f: + lines = f.readlines() + + updated_lines = [] + for line in lines: + key = line.split('=')[0].strip() + if key in env_vars: + updated_lines.append(f"{key}={env_vars[key]}\n") + else: + updated_lines.append(line) + + # Write to .env instead of .env.example + with open(ENV_FILENAME, 'w') as f: + f.writelines(updated_lines) + + print(f"Headers updated in {ENV_FILENAME} file") if __name__ == "__main__": - extract_x_headers() + extract_x_headers() \ No newline at end of file From 2587a6b1e232347a3186e6fc5a85b6085f1ee638 Mon Sep 17 00:00:00 2001 From: Sakan Date: Sun, 1 Dec 2024 13:44:02 +0700 Subject: [PATCH 3/6] docs: adjusted README.md --- README.md | 11 +---------- 1 file changed, 1 insertion(+), 10 deletions(-) diff --git a/README.md b/README.md index 67013e0..410c391 100644 --- a/README.md +++ b/README.md @@ -67,7 +67,6 @@ Built on top of [Find the Hotel's Average Room Price in Osaka](#find-the-hotels- - Create a virtual environment and activate it. - Install all dependencies listed in [requirements.txt](requirements.txt) - Run `playwright install` -- Rename a `.env.example` to `.env` ### Setup a Database - Download [Docker Desktop](https://www.docker.com/products/docker-desktop) @@ -75,18 +74,10 @@ Built on top of [Find the Hotel's Average Room Price in Osaka](#find-the-hotels- - Run: `export POSTGRES_DATA_PATH=''` to set the container volume to the directory path of your choice. - Run: `docker compose up -d` - -### Find your **User Agent**: - - Go to https://www.whatismybrowser.com/detect/what-is-my-user-agent/ - - Enter your User Agent into your **.env** file: - - User-Agent ➡ USER_AGENT ### Find the Necessary Headers - Run: `python get_auth_headers.py` - - It will print out the authentication headers from each request, which start with `X-`. - - It will print out multiple ones, but just choose the ones you need. -- Copy and paste the headers into your **.env** file: - - X_BOOKING_CONTEXT_ACTION_NAME, X_BOOKING_CONTEXT_AID, X_BOOKING_CSRF_TOKEN, X_BOOKING_ET_SERIALIZED_STATE, X_BOOKING_PAGEVIEW_ID, X_BOOKING_SITE_TYPE_ID, X_BOOKING_TOPIC + - It will write the headers to an `.env` file. ### General Guidelines for Using the Scraper - To scrape only hotel properties, use `--scrape_only_hotel` argument. From aa97abda504af93afec92cd18a21a65b8c179771 Mon Sep 17 00:00:00 2001 From: Sakan Date: Sun, 1 Dec 2024 13:44:29 +0700 Subject: [PATCH 4/6] test: added tests for auth header getter --- .../test_extract_x_headers.py | 98 +++++++++++++++++++ .../test_handle_request.py | 78 +++++++++++++++ .../test_update_env_example.py | 77 +++++++++++++++ 3 files changed, 253 insertions(+) create mode 100644 tests/test_get_auth_headers/test_extract_x_headers.py create mode 100644 tests/test_get_auth_headers/test_handle_request.py create mode 100644 tests/test_get_auth_headers/test_update_env_example.py diff --git a/tests/test_get_auth_headers/test_extract_x_headers.py b/tests/test_get_auth_headers/test_extract_x_headers.py new file mode 100644 index 0000000..3160960 --- /dev/null +++ b/tests/test_get_auth_headers/test_extract_x_headers.py @@ -0,0 +1,98 @@ +from unittest.mock import patch, Mock, mock_open + +import pytest +from playwright.sync_api import Page, Browser, BrowserContext + +from get_auth_headers import extract_x_headers, handle_request, update_env_example + + +@pytest.fixture +def mock_playwright(): + with patch('get_auth_headers.sync_playwright') as mock_playwright: + mock_browser = Mock(spec=Browser) + mock_page = Mock(spec=Page) + mock_context = Mock(spec=BrowserContext) + + mock_playwright.return_value.__enter__.return_value.chromium.launch.return_value = mock_browser + mock_browser.new_page.return_value = mock_page + + yield mock_playwright, mock_browser, mock_page + + +def test_extract_x_headers_navigation(mock_playwright): + _, _, mock_page = mock_playwright + + extract_x_headers() + + # Check if the function navigates to Booking.com + mock_page.goto.assert_called_once_with("https://www.booking.com") + + # Check if the function fills in the search input and presses Enter + mock_page.fill.assert_called_once_with('input[name="ss"]', "Tokyo") + mock_page.press.assert_called_once_with('input[name="ss"]', "Enter") + + +def test_extract_x_headers_request_interception(mock_playwright): + _, _, mock_page = mock_playwright + + extract_x_headers() + + # Check if request interception is set up + mock_page.on.assert_called_once_with("request", handle_request) + + +@patch('get_auth_headers.update_env_example') +def test_handle_request_graphql(mock_update_env): + mock_request = Mock() + mock_request.url = "https://www.booking.com/dml/graphql?query=somequery" + mock_request.headers = { + 'x-booking-context-action-name': 'searchresults', + 'user-agent': 'Mozilla/5.0', + 'content-type': 'application/json' + } + + handle_request(mock_request) + + expected_env_vars = { + 'X_BOOKING_CONTEXT_ACTION_NAME': 'searchresults', + 'USER_AGENT': 'Mozilla/5.0' + } + mock_update_env.assert_called_once_with(expected_env_vars) + + +def test_handle_request_non_graphql(): + mock_request = Mock() + mock_request.url = "https://www.booking.com/some-other-page" + + with patch('get_auth_headers.update_env_example') as mock_update_env: + handle_request(mock_request) + mock_update_env.assert_not_called() + + +@patch('builtins.open', new_callable=mock_open, read_data="X_BOOKING_CONTEXT_ACTION_NAME=\nUSER_AGENT=\n") +@patch('get_auth_headers.ENV_FILENAME', '.env.test') +def test_update_env_example(mock_file): + env_vars = { + 'X_BOOKING_CONTEXT_ACTION_NAME': 'searchresults', + 'USER_AGENT': 'Mozilla/5.0' + } + + update_env_example(env_vars) + + # Check that open was called twice (once for reading, once for writing) + assert mock_file.call_count == 2 + + # Check the read operation + mock_file.assert_any_call('.env.example', 'r') + + # Check the write operation + mock_file.assert_any_call('.env.test', 'w') + + # Check the content written + handle = mock_file() + handle.writelines.assert_called_once_with(['X_BOOKING_CONTEXT_ACTION_NAME=searchresults\n', 'USER_AGENT=Mozilla/5.0\n']) + + # Check that print was called with the correct message + with patch('builtins.print') as mock_print: + update_env_example(env_vars) + mock_print.assert_called_once_with("Headers updated in .env.test file") \ No newline at end of file diff --git a/tests/test_get_auth_headers/test_handle_request.py b/tests/test_get_auth_headers/test_handle_request.py new file mode 100644 index 0000000..7710754 --- /dev/null +++ b/tests/test_get_auth_headers/test_handle_request.py @@ -0,0 +1,78 @@ +from unittest.mock import Mock, patch + +import pytest +from playwright.sync_api import Request + +from get_auth_headers import handle_request + + +@pytest.fixture +def mock_request(): + request = Mock(spec=Request) + request.url = "https://www.booking.com/dml/graphql?query=somequery" + request.headers = { + 'x-booking-context-action-name': 'searchresults', + 'user-agent': 'Mozilla/5.0', + 'content-type': 'application/json' + } + return request + +def test_handle_request_graphql(mock_request): + with patch('get_auth_headers.update_env_example') as mock_update_env: + handle_request(mock_request) + + expected_env_vars = { + 'X_BOOKING_CONTEXT_ACTION_NAME': 'searchresults', + 'USER_AGENT': 'Mozilla/5.0' + } + mock_update_env.assert_called_once_with(expected_env_vars) + +def test_handle_request_non_graphql(): + non_graphql_request = Mock(spec=Request) + non_graphql_request.url = "https://www.booking.com/some-other-page" + + with patch('get_auth_headers.update_env_example') as mock_update_env: + handle_request(non_graphql_request) + mock_update_env.assert_not_called() + +def test_handle_request_intercept_once(): + with patch('get_auth_headers.update_env_example') as mock_update_env: + request1 = Mock(spec=Request) + request1.url = "https://www.booking.com/dml/graphql?query=somequery" + request1.headers = {'x-test': 'value1'} + + request2 = Mock(spec=Request) + request2.url = "https://www.booking.com/dml/graphql?query=anotherquery" + request2.headers = {'x-test': 'value2'} + + handle_request(request1) + handle_request(request2) + + mock_update_env.assert_called_once() + +def test_handle_request_extracts_correct_headers(): + request = Mock(spec=Request) + request.url = "https://www.booking.com/dml/graphql?query=somequery" + request.headers = { + 'x-test1': 'value1', + 'x-test2': 'value2', + 'user-agent': 'TestAgent', + 'content-type': 'application/json' + } + + with patch('get_auth_headers.update_env_example') as mock_update_env: + handle_request(request) + + expected_env_vars = { + 'X_TEST1': 'value1', + 'X_TEST2': 'value2', + 'USER_AGENT': 'TestAgent' + } + mock_update_env.assert_called_once_with(expected_env_vars) + +@pytest.fixture(autouse=True) +def reset_intercepted(): + import get_auth_headers + get_auth_headers.intercepted = False + yield + get_auth_headers.intercepted = False \ No newline at end of file diff --git a/tests/test_get_auth_headers/test_update_env_example.py b/tests/test_get_auth_headers/test_update_env_example.py new file mode 100644 index 0000000..100534a --- /dev/null +++ b/tests/test_get_auth_headers/test_update_env_example.py @@ -0,0 +1,77 @@ +from unittest.mock import patch, mock_open + +import pytest + +from get_auth_headers import update_env_example + + +@pytest.fixture +def mock_env_file(): + return "X_BOOKING_CONTEXT_ACTION_NAME=\nUSER_AGENT=\n" + + +@pytest.mark.parametrize('env_filename', ['.env.test']) +def test_update_env_example(mock_env_file, env_filename): + with patch('builtins.open', mock_open(read_data=mock_env_file)) as mock_file, \ + patch('get_auth_headers.ENV_FILENAME', env_filename): + env_vars = { + 'X_BOOKING_CONTEXT_ACTION_NAME': 'searchresults', + 'USER_AGENT': 'Mozilla/5.0' + } + + update_env_example(env_vars) + + # Check that open was called twice (once for reading, once for writing) + assert mock_file.call_count == 2 + + # Check the read operation + mock_file.assert_any_call('.env.example', 'r') + + # Check the write operation + mock_file.assert_any_call(env_filename, 'w') + + # Check the content written + handle = mock_file() + handle.writelines.assert_called_once_with([ + 'X_BOOKING_CONTEXT_ACTION_NAME=searchresults\n', + 'USER_AGENT=Mozilla/5.0\n' + ]) + + +def test_update_env_example_partial_update(): + mock_file_content = "X_HEADER=old_value\nOTHER_HEADER=keep_this\n" + with patch('builtins.open', mock_open(read_data=mock_file_content)) as mock_file, \ + patch('get_auth_headers.ENV_FILENAME', '.env.test'): + env_vars = { + 'X_HEADER': 'new_value' + } + + update_env_example(env_vars) + + handle = mock_file() + handle.writelines.assert_called_once_with([ + 'X_HEADER=new_value\n', + 'OTHER_HEADER=keep_this\n' + ]) + + +def test_update_env_example_empty_file(): + with patch('builtins.open', mock_open(read_data="")) as mock_file, \ + patch('get_auth_headers.ENV_FILENAME', '.env.test'): + env_vars = { + 'NEW_HEADER': 'new_value' + } + + update_env_example(env_vars) + + handle = mock_file() + handle.writelines.assert_called_once_with([]) + + +def test_update_env_example_print_message(capsys): + with patch('builtins.open', mock_open(read_data="")), \ + patch('get_auth_headers.ENV_FILENAME', '.env.test'): + update_env_example({}) + + captured = capsys.readouterr() + assert captured.out == "Headers updated in .env.test file\n" From 29cb961232622245ecca145c90f98c07c73171d0 Mon Sep 17 00:00:00 2001 From: Sakan Date: Sun, 1 Dec 2024 13:53:54 +0700 Subject: [PATCH 5/6] test: adjusted tests --- .../test_find_missing_dates.py | 14 +++++++------- 1 file changed, 7 insertions(+), 7 deletions(-) diff --git a/tests/test_missing_date_checker/test_find_missing_dates.py b/tests/test_missing_date_checker/test_find_missing_dates.py index 4cca238..d1a60b8 100644 --- a/tests/test_missing_date_checker/test_find_missing_dates.py +++ b/tests/test_missing_date_checker/test_find_missing_dates.py @@ -7,13 +7,14 @@ def test_find_missing_dates(): today = datetime.datetime.today() - month = today.month + 1 - year = today.year + next_month = today.replace(day=1) + datetime.timedelta(days=32) + month = next_month.month + year = next_month.year days_in_month = monthrange(year, month)[1] - first_day_of_month = datetime.datetime(year, month, 1).strftime('%Y-%m-%d') - third_day_of_month = datetime.datetime(year, month, 3).strftime('%Y-%m-%d') - fifth_day_of_month = datetime.datetime(year, month, 5).strftime('%Y-%m-%d') + first_day_of_month = next_month.replace(day=1).strftime('%Y-%m-%d') + third_day_of_month = next_month.replace(day=3).strftime('%Y-%m-%d') + fifth_day_of_month = next_month.replace(day=5).strftime('%Y-%m-%d') dates_in_db = {first_day_of_month, third_day_of_month, fifth_day_of_month} @@ -21,14 +22,13 @@ def test_find_missing_dates(): expected_missing_dates = [] for day in range(1, days_in_month + 1): - date_str = datetime.datetime(year, month, day).strftime('%Y-%m-%d') + date_str = next_month.replace(day=day).strftime('%Y-%m-%d') if date_str not in dates_in_db: expected_missing_dates.append(date_str) # The missing dates should be all dates of the given month that are not the dates of the given month in the database assert result == expected_missing_dates - def test_find_missing_dates_of_different_months(): result = [] expected_result = [] From 80e5740919df3424c204d24c3fcb6810f147164e Mon Sep 17 00:00:00 2001 From: Sakan Date: Sun, 1 Dec 2024 14:04:34 +0700 Subject: [PATCH 6/6] test: adjusted tests for Japan scraper --- .../test_japan_scraper/test_japan_scraper.py | 24 ++++++++++++------- 1 file changed, 15 insertions(+), 9 deletions(-) diff --git a/tests/test_japan_scraper/test_japan_scraper.py b/tests/test_japan_scraper/test_japan_scraper.py index 3f904a8..adff5c7 100644 --- a/tests/test_japan_scraper/test_japan_scraper.py +++ b/tests/test_japan_scraper/test_japan_scraper.py @@ -31,7 +31,7 @@ async def test_japan_scraper(tmp_path): Column('scrape_only_hotel', Boolean), Column('country', String), Column('city', String) - ) + ) metadata.create_all(engine) scraper = JapanScraper( @@ -47,19 +47,23 @@ async def test_japan_scraper(tmp_path): scrape_only_hotel=True ) scraper.japan_regions = {"Hokkaido": ["Hokkaido"]} - current_month = datetime.datetime.now().month + current_date = datetime.datetime.now().date() + current_month = current_date.month + current_year = current_date.year scraper.start_month = current_month scraper.end_month = current_month - # Create sample data + # Create sample data with dynamic dates sample_data = pd.DataFrame({ 'Region': ['Hokkaido', 'Hokkaido'], 'Prefecture': ['Hokkaido', 'Hokkaido'], 'hotel_name': ['Hotel A', 'Hotel B'], 'price': [100, 200], - 'date': [datetime.date(2023, current_month, 1), datetime.date(2023, current_month, 2)], - 'check_in': ['2023-11-01', '2023-11-02'], - 'check_out': ['2023-11-02', '2023-11-03'], + 'date': [current_date, current_date + datetime.timedelta(days=1)], + 'check_in': [current_date.strftime('%Y-%m-%d'), + (current_date + datetime.timedelta(days=1)).strftime('%Y-%m-%d')], + 'check_out': [(current_date + datetime.timedelta(days=1)).strftime('%Y-%m-%d'), + (current_date + datetime.timedelta(days=2)).strftime('%Y-%m-%d')], 'group_adults': [1, 1], 'num_rooms': [1, 1], 'group_children': [0, 0], @@ -104,9 +108,11 @@ async def mock_scrape_whole_year(): assert row.Prefecture == 'Hokkaido', f"Row {i}: Prefecture mismatch" assert row.hotel_name == f'Hotel {"A" if i == 0 else "B"}', f"Row {i}: hotel_name mismatch" assert row.price == (100 if i == 0 else 200), f"Row {i}: price mismatch" - assert row.date == f'2023-11-0{i + 1}', f"Row {i}: date mismatch" - assert row.check_in == f'2023-11-0{i + 1}', f"Row {i}: check_in mismatch" - assert row.check_out == f'2023-11-0{i + 2}', f"Row {i}: check_out mismatch" + expected_date = (current_date + datetime.timedelta(days=i)).strftime('%Y-%m-%d') + assert row.date == expected_date, f"Row {i}: date mismatch. Expected {expected_date}, got {row.date}" + assert row.check_in == expected_date, f"Row {i}: check_in mismatch. Expected {expected_date}, got {row.check_in}" + expected_checkout = (current_date + datetime.timedelta(days=i + 1)).strftime('%Y-%m-%d') + assert row.check_out == expected_checkout, f"Row {i}: check_out mismatch. Expected {expected_checkout}, got {row.check_out}" assert row.group_adults == 1, f"Row {i}: group_adults mismatch" assert row.num_rooms == 1, f"Row {i}: num_rooms mismatch" assert row.group_children == 0, f"Row {i}: group_children mismatch"