Skip to content

Commit

Permalink
Merge pull request #78 from sakan811/patch
Browse files Browse the repository at this point in the history
Adjust authorization header getter to write the headers to a .env file
  • Loading branch information
sakan811 authored Dec 1, 2024
2 parents 01c18f9 + 80e5740 commit 830341e
Show file tree
Hide file tree
Showing 8 changed files with 350 additions and 37 deletions.
11 changes: 1 addition & 10 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -67,26 +67,17 @@ Built on top of [Find the Hotel's Average Room Price in Osaka](#find-the-hotels-
- Create a virtual environment and activate it.
- Install all dependencies listed in [requirements.txt](requirements.txt)
- Run `playwright install`
- Rename `.env.example` to `.env`

### Setup a Database
- Download [Docker Desktop](https://www.docker.com/products/docker-desktop)
- Ensure that Docker Desktop is running.
- Run: `export POSTGRES_DATA_PATH='<your_container_volume_path>'` to set the container volume
to the directory path of your choice.
- Run: `docker compose up -d`

### Find your **User Agent**:
- Go to https://www.whatismybrowser.com/detect/what-is-my-user-agent/
- Enter your User Agent into your **.env** file:
- User-Agent ➡ USER_AGENT

### Find the Necessary Headers
- Run: `python get_auth_headers.py`
- It will print out the authentication headers from each request, which start with `X-`.
- It will print out multiple ones, but just choose the ones you need.
- Copy and paste the headers into your **.env** file:
- X_BOOKING_CONTEXT_ACTION_NAME, X_BOOKING_CONTEXT_AID, X_BOOKING_CSRF_TOKEN, X_BOOKING_ET_SERIALIZED_STATE, X_BOOKING_PAGEVIEW_ID, X_BOOKING_SITE_TYPE_ID, X_BOOKING_TOPIC
- It will write the headers to a `.env` file.

### General Guidelines for Using the Scraper
- To scrape only hotel properties, use `--scrape_only_hotel` argument.
Expand Down
83 changes: 73 additions & 10 deletions get_auth_headers.py
Original file line number Diff line number Diff line change
@@ -1,10 +1,30 @@
from playwright.sync_api import sync_playwright
import re

from playwright import sync_api
from playwright.sync_api import sync_playwright

# Global flag to track if we've intercepted a request.
# handle_request() sets it to True after the first matching GraphQL request,
# so the headers are only written once per run.
intercepted = False

# File that update_env_example() writes the captured headers to.
# NOTE(review): main.py calls load_dotenv(dotenv_path='.env') and the README says
# the headers are written to `.env`, but this points at '.env.local' — confirm
# which file is intended.
ENV_FILENAME = '.env.local'


def extract_x_headers():
def extract_x_headers() -> None:
"""
Extract X-headers from Booking.com using Playwright.
This function launches a Chromium browser, navigates to Booking.com,
performs a search for "Tokyo", and intercepts network requests to
extract X-headers used in GraphQL requests.
The function uses the global 'intercepted' flag to track if a request
has been intercepted, and the 'handle_request' function to process
intercepted requests.
:return: None
"""
with sync_playwright() as p:
browser = p.chromium.launch(headless=True)
browser = p.chromium.launch(headless=False)
page = browser.new_page()

# Enable request interception
Expand All @@ -23,15 +43,58 @@ def extract_x_headers():
browser.close()


def handle_request(request: sync_api.Request) -> None:
    """
    Handle intercepted requests from Booking.com to extract X-headers.

    This function is called for each intercepted request. Only the first request
    whose URL matches the Booking.com GraphQL endpoint is processed: its headers
    starting with 'x-' plus the 'user-agent' header are converted to environment
    variable names (upper-cased, '-' replaced by '_') and passed to
    update_env_example(). Later requests are ignored via the global
    'intercepted' flag.

    :param request: The intercepted request object.
    :return: None
    """
    global intercepted

    # Ignore everything once the headers have been captured, and anything that
    # is not a GraphQL call.
    if intercepted or not re.match(r"https://www\.booking\.com/dml/graphql.*", request.url):
        return

    env_vars = {
        key.upper().replace('-', '_'): value
        for key, value in request.headers.items()
        if key.startswith('x-') or key == 'user-agent'
    }

    update_env_example(env_vars)
    # Set the flag only after the file was written, so a failed write does not
    # block a later request from trying again.
    intercepted = True


def update_env_example(env_vars: dict[str, str]) -> None:
    """
    Fill in the '.env.example' template and save the result as ENV_FILENAME.

    Every template line whose key (the text before the first '=', stripped of
    surrounding whitespace) is present in env_vars is rewritten as 'KEY=value'.
    All other lines are copied unchanged. Keys in env_vars that the template
    does not contain are not written.

    :param env_vars: A dictionary of environment variables to update.
    :return: None
    """
    with open('.env.example', 'r') as template:
        template_lines = template.readlines()

    def render(line: str) -> str:
        # Replace the value only for keys we captured; keep the line otherwise.
        name = line.partition('=')[0].strip()
        if name in env_vars:
            return f"{name}={env_vars[name]}\n"
        return line

    rendered_lines = [render(line) for line in template_lines]

    # The template itself is never modified; the output goes to ENV_FILENAME.
    with open(ENV_FILENAME, 'w') as target:
        target.writelines(rendered_lines)

    print(f"Headers updated in {ENV_FILENAME} file")


# Script entry point: run the header extraction once.
if __name__ == "__main__":
    extract_x_headers()
2 changes: 1 addition & 1 deletion main.py
Original file line number Diff line number Diff line change
Expand Up @@ -12,7 +12,7 @@
from japan_avg_hotel_price_finder.sql.save_to_db import save_scraped_data
from japan_avg_hotel_price_finder.whole_mth_graphql_scraper import WholeMonthGraphQLScraper

load_dotenv()
load_dotenv(dotenv_path='.env', override=True)


def validate_required_args(arguments: argparse.Namespace, required_args: list[str]) -> bool:
Expand Down
98 changes: 98 additions & 0 deletions tests/test_get_auth_headers/test_extract_x_headers.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,98 @@
from unittest.mock import patch, Mock, mock_open

import pytest
from playwright.sync_api import Page, Browser, BrowserContext

from get_auth_headers import extract_x_headers, handle_request, update_env_example


@pytest.fixture
def mock_playwright():
    """
    Patch get_auth_headers.sync_playwright with a mocked browser and page.

    The patched context manager's chromium.launch() returns a Browser mock whose
    new_page() returns a Page mock, so no real browser is started.

    :return: Yields a (sync_playwright mock, browser mock, page mock) tuple.
    """
    with patch('get_auth_headers.sync_playwright') as sync_playwright_mock:
        browser = Mock(spec=Browser)
        page = Mock(spec=Page)

        playwright = sync_playwright_mock.return_value.__enter__.return_value
        playwright.chromium.launch.return_value = browser
        browser.new_page.return_value = page

        yield sync_playwright_mock, browser, page


def test_extract_x_headers_navigation(mock_playwright):
    """Check that the page opens Booking.com, searches for "Tokyo" and presses Enter."""
    page = mock_playwright[2]

    extract_x_headers()

    # Navigation to the Booking.com landing page
    page.goto.assert_called_once_with("https://www.booking.com")

    # Search input is filled and submitted
    page.fill.assert_called_once_with('input[name="ss"]', "Tokyo")
    page.press.assert_called_once_with('input[name="ss"]', "Enter")


def test_extract_x_headers_request_interception(mock_playwright):
    """Check that handle_request is registered as the page's "request" listener."""
    page = mock_playwright[2]

    extract_x_headers()

    page.on.assert_called_once_with("request", handle_request)


@patch('get_auth_headers.intercepted', False)
@patch('get_auth_headers.update_env_example')
def test_handle_request_graphql(mock_update_env):
    """
    Check that a GraphQL request forwards its x-* and user-agent headers.

    The module-level 'intercepted' flag is patched to False for the duration of
    the test and restored afterwards, so the outcome does not depend on which
    tests ran before and the flag does not leak into later tests.
    """
    mock_request = Mock()
    mock_request.url = "https://www.booking.com/dml/graphql?query=somequery"
    mock_request.headers = {
        'x-booking-context-action-name': 'searchresults',
        'user-agent': 'Mozilla/5.0',
        'content-type': 'application/json'
    }

    handle_request(mock_request)

    # 'content-type' is neither an x-header nor the user agent, so it is dropped.
    expected_env_vars = {
        'X_BOOKING_CONTEXT_ACTION_NAME': 'searchresults',
        'USER_AGENT': 'Mozilla/5.0'
    }
    mock_update_env.assert_called_once_with(expected_env_vars)


def test_handle_request_non_graphql():
    """Check that a request to a non-GraphQL URL does not trigger an env update."""
    other_request = Mock()
    other_request.url = "https://www.booking.com/some-other-page"

    with patch('get_auth_headers.update_env_example') as mock_update_env:
        handle_request(other_request)

    mock_update_env.assert_not_called()


@patch('builtins.open', new_callable=mock_open, read_data="X_BOOKING_CONTEXT_ACTION_NAME=\nUSER_AGENT=\n")
@patch('get_auth_headers.ENV_FILENAME', '.env.test')
def test_update_env_example(mock_file):
    """Check the files opened, the lines written and the message printed."""
    new_values = {
        'X_BOOKING_CONTEXT_ACTION_NAME': 'searchresults',
        'USER_AGENT': 'Mozilla/5.0'
    }

    update_env_example(new_values)

    # One open() for reading the template, one for writing the result
    assert mock_file.call_count == 2
    mock_file.assert_any_call('.env.example', 'r')
    mock_file.assert_any_call('.env.test', 'w')

    # Both template keys receive their new values
    file_handle = mock_file()
    file_handle.writelines.assert_called_once_with([
        'X_BOOKING_CONTEXT_ACTION_NAME=searchresults\n',
        'USER_AGENT=Mozilla/5.0\n',
    ])

    # A second run with print() patched confirms the status message
    with patch('builtins.print') as mock_print:
        update_env_example(new_values)
    mock_print.assert_called_once_with("Headers updated in .env.test file")
78 changes: 78 additions & 0 deletions tests/test_get_auth_headers/test_handle_request.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,78 @@
from unittest.mock import Mock, patch

import pytest
from playwright.sync_api import Request

from get_auth_headers import handle_request


@pytest.fixture
def mock_request():
    """Return a Request mock for a GraphQL URL with an x-header, a user agent and a content type."""
    graphql_request = Mock(spec=Request)
    graphql_request.url = "https://www.booking.com/dml/graphql?query=somequery"
    graphql_request.headers = dict([
        ('x-booking-context-action-name', 'searchresults'),
        ('user-agent', 'Mozilla/5.0'),
        ('content-type', 'application/json'),
    ])
    return graphql_request

def test_handle_request_graphql(mock_request):
    """Check that a GraphQL request passes its x-* and user-agent headers on."""
    with patch('get_auth_headers.update_env_example') as mock_update_env:
        handle_request(mock_request)

    mock_update_env.assert_called_once_with({
        'X_BOOKING_CONTEXT_ACTION_NAME': 'searchresults',
        'USER_AGENT': 'Mozilla/5.0'
    })

def test_handle_request_non_graphql():
    """Check that a non-GraphQL URL never reaches update_env_example."""
    other_request = Mock(spec=Request)
    other_request.url = "https://www.booking.com/some-other-page"

    with patch('get_auth_headers.update_env_example') as mock_update_env:
        handle_request(other_request)

    mock_update_env.assert_not_called()

def test_handle_request_intercept_once():
    """Check that only the first of two GraphQL requests triggers an env update."""
    request_specs = [
        ("https://www.booking.com/dml/graphql?query=somequery", {'x-test': 'value1'}),
        ("https://www.booking.com/dml/graphql?query=anotherquery", {'x-test': 'value2'}),
    ]

    graphql_requests = []
    for url, headers in request_specs:
        graphql_request = Mock(spec=Request)
        graphql_request.url = url
        graphql_request.headers = headers
        graphql_requests.append(graphql_request)

    with patch('get_auth_headers.update_env_example') as mock_update_env:
        for graphql_request in graphql_requests:
            handle_request(graphql_request)

    mock_update_env.assert_called_once()

def test_handle_request_extracts_correct_headers():
    """Check that x-* and user-agent headers are kept and other headers are dropped."""
    graphql_request = Mock(spec=Request)
    graphql_request.url = "https://www.booking.com/dml/graphql?query=somequery"
    graphql_request.headers = {
        'x-test1': 'value1',
        'x-test2': 'value2',
        'user-agent': 'TestAgent',
        'content-type': 'application/json'
    }

    with patch('get_auth_headers.update_env_example') as mock_update_env:
        handle_request(graphql_request)

    # Names are upper-cased with '-' turned into '_'; 'content-type' is excluded.
    mock_update_env.assert_called_once_with({
        'X_TEST1': 'value1',
        'X_TEST2': 'value2',
        'USER_AGENT': 'TestAgent'
    })

@pytest.fixture(autouse=True)
def reset_intercepted():
    """Set get_auth_headers.intercepted to False before and after every test in this module."""
    import get_auth_headers as auth_module

    setattr(auth_module, 'intercepted', False)
    yield
    setattr(auth_module, 'intercepted', False)
77 changes: 77 additions & 0 deletions tests/test_get_auth_headers/test_update_env_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
from unittest.mock import patch, mock_open

import pytest

from get_auth_headers import update_env_example


@pytest.fixture
def mock_env_file():
    """Return template content with two keys that have empty values."""
    template_content = "X_BOOKING_CONTEXT_ACTION_NAME=\nUSER_AGENT=\n"
    return template_content


@pytest.mark.parametrize('env_filename', ['.env.test'])
def test_update_env_example(mock_env_file, env_filename):
    """Check the files opened and the lines written when every template key is updated."""
    opener = mock_open(read_data=mock_env_file)
    new_values = {
        'X_BOOKING_CONTEXT_ACTION_NAME': 'searchresults',
        'USER_AGENT': 'Mozilla/5.0'
    }

    with patch('builtins.open', opener):
        with patch('get_auth_headers.ENV_FILENAME', env_filename):
            update_env_example(new_values)

    # One open() for reading the template, one for writing the result
    assert opener.call_count == 2
    opener.assert_any_call('.env.example', 'r')
    opener.assert_any_call(env_filename, 'w')

    # Both keys receive their new values
    file_handle = opener()
    file_handle.writelines.assert_called_once_with([
        'X_BOOKING_CONTEXT_ACTION_NAME=searchresults\n',
        'USER_AGENT=Mozilla/5.0\n'
    ])


def test_update_env_example_partial_update():
    """Check that only keys present in env_vars change and other lines are kept."""
    opener = mock_open(read_data="X_HEADER=old_value\nOTHER_HEADER=keep_this\n")

    with patch('builtins.open', opener):
        with patch('get_auth_headers.ENV_FILENAME', '.env.test'):
            update_env_example({'X_HEADER': 'new_value'})

    file_handle = opener()
    file_handle.writelines.assert_called_once_with([
        'X_HEADER=new_value\n',
        'OTHER_HEADER=keep_this\n'
    ])


def test_update_env_example_empty_file():
    """Check that an empty template yields no output lines, even with new keys supplied."""
    opener = mock_open(read_data="")

    with patch('builtins.open', opener):
        with patch('get_auth_headers.ENV_FILENAME', '.env.test'):
            update_env_example({'NEW_HEADER': 'new_value'})

    file_handle = opener()
    file_handle.writelines.assert_called_once_with([])


def test_update_env_example_print_message(capsys):
    """Check the status message printed to stdout after the file is written."""
    with patch('builtins.open', mock_open(read_data="")):
        with patch('get_auth_headers.ENV_FILENAME', '.env.test'):
            update_env_example({})

    stdout_text = capsys.readouterr().out
    assert stdout_text == "Headers updated in .env.test file\n"
Loading

0 comments on commit 830341e

Please sign in to comment.