Skip to content

Commit

Permalink
timeout 20 sec
Browse files Browse the repository at this point in the history
token and retry added

token and retry added

token and retry added

token and retry added

token and retry added

cleanup

fix for push

improve logging
  • Loading branch information
dlubom committed Jul 24, 2024
1 parent 6b9e137 commit 9cf1722
Show file tree
Hide file tree
Showing 4 changed files with 56 additions and 48 deletions.
Binary file modified iata_code_fetcher/__pycache__/__init__.cpython-310.pyc
Binary file not shown.
76 changes: 48 additions & 28 deletions iata_code_fetcher/fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,11 +6,13 @@
import json
from string import ascii_uppercase
from itertools import product
from typing import Generator, Union, List, Tuple, Dict
from typing import Generator, List, Dict
from enum import Enum
import logging
from time import sleep
from bs4 import BeautifulSoup
import requests
from requests.exceptions import RequestException

# Constants
BASE_URL: str = (
Expand All @@ -22,6 +24,9 @@
AIRPORT_FILE: str = "airport_data_full.jsonl"
# Frequency of processing status updates
REPORT_FREQUENCY: int = 100 # report every 100 codes
MAX_RETRIES: int = 3
RETRY_DELAY: int = 5 # seconds
TIMEOUT: int = 20 # seconds

# Configure Logging
logging.basicConfig(level=logging.INFO, format="%(asctime)s - %(levelname)s - %(message)s")
def generate_codes(length: int) -> Generator[str, None, None]:
    """Yield every uppercase code of the given length.

    Produces the full Cartesian product of A-Z in lexicographic order,
    26 ** length codes in total (e.g. "AA" .. "ZZ" for length 2).

    :param length: Number of letters per code.
    """
    for combo in product(ascii_uppercase, repeat=length):
        yield "".join(combo)


def fetch_and_process_data(code: str, code_type: CodeType) -> List[Dict[str, str]]:
    """
    Fetch and process data from the IATA site based on the code and type.

    Retries up to MAX_RETRIES times on network errors, sleeping RETRY_DELAY
    seconds between attempts.

    :param code: The IATA code.
    :param code_type: The type of the code (CodeType.CARRIER or CodeType.AIRPORT).
    :return: List of row dictionaries parsed from the result table.
    :raises RequestException: if every attempt fails with a network error.
    :raises ValueError: if the response contains no data table.
    """
    url = BASE_URL.format(
        block=CARRIER_BLOCK if code_type == CodeType.CARRIER else AIRPORT_BLOCK,
        type="airline" if code_type == CodeType.CARRIER else "airport",
        code=code,
    )

    for attempt in range(MAX_RETRIES):
        try:
            response = requests.get(url, timeout=TIMEOUT)
            response.raise_for_status()
            soup = BeautifulSoup(response.text, "html.parser")
            table = soup.find("table", {"class": "datatable"})

            if not table:
                # Raised (not retried): callers treat this as "no data for
                # this code", distinct from a network failure.
                raise ValueError("No table found or error in response")

            # NOTE(review): header labels are read from <td> (not <th>) cells —
            # presumably the site renders headers that way; confirm if the
            # page markup ever changes.
            headers = [cell.text.strip() for cell in table.find_all("td")]
            rows = []
            for row in table.find("tbody").find_all("tr"):
                cols = [col.text.strip() for col in row.find_all("td")]
                rows.append(dict(zip(headers, cols)))

            return rows

        except RequestException as e:
            if attempt < MAX_RETRIES - 1:
                # Fix: the original format string wrapped the %d specifiers in
                # literal braces ("{%d}"), so the log read "Retrying in {5}
                # seconds... (Attempt {1}/{3})". Plain %d logs the numbers.
                logging.warning(
                    "Request failed for %s. Retrying in %d seconds... (Attempt %d/%d)",
                    code,
                    RETRY_DELAY,
                    attempt + 1,
                    MAX_RETRIES,
                )
                sleep(RETRY_DELAY)
            else:
                raise RequestException(
                    f"Request failed after {MAX_RETRIES} attempts: {str(e)}"
                ) from e

    return []  # unreachable (loop either returns or raises); keeps type checkers happy


def process_and_save_data(code_type: CodeType) -> None:
Expand All @@ -89,15 +107,17 @@ def process_and_save_data(code_type: CodeType) -> None:
:param code_type: The type of the code (CodeType.CARRIER or CodeType.AIRPORT).
"""
processed: int = 0
file_path = CARRIER_FILE if code_type == CodeType.CARRIER else AIRPORT_FILE
for code in generate_codes(2 if code_type == CodeType.CARRIER else 3):
result, file_path = fetch_and_process_data(code, code_type)
if isinstance(result, list) and result:
try:
result = fetch_and_process_data(code, code_type)
with open(file_path, "a", encoding="UTF-8") as file:
for item in result:
file.write(json.dumps(item) + "\n")
else:
if result != "No table found or error in response":
logging.error("Error for %s: %s", code, result)
except RequestException as e:
logging.error("For %s: %s", code, e)
except ValueError as e:
logging.warn("For %s, %s", code, e)

processed += 1
if processed % REPORT_FREQUENCY == 0:
Expand Down
Binary file modified tests/__pycache__/__init__.cpython-310.pyc
Binary file not shown.
28 changes: 8 additions & 20 deletions tests/test_fetcher.py
Original file line number Diff line number Diff line change
Expand Up @@ -77,7 +77,7 @@ def test_fetch_and_process_data_carrier(mock_get, carrier_response_mock):
mock_get.return_value.status_code = 200
mock_get.return_value.text = carrier_response_mock

data, file_path = fetch_and_process_data("AA", CodeType.CARRIER)
data = fetch_and_process_data("AA", CodeType.CARRIER)

expected_data = [
{
Expand All @@ -93,7 +93,6 @@ def test_fetch_and_process_data_carrier(mock_get, carrier_response_mock):
]

assert data == expected_data
assert file_path == "carrier_data_full.jsonl"


@patch("requests.get")
Expand All @@ -104,7 +103,7 @@ def test_fetch_and_process_data_airport(mock_get, airport_response_mock):
mock_get.return_value.status_code = 200
mock_get.return_value.text = airport_response_mock

data, file_path = fetch_and_process_data("AAA", CodeType.AIRPORT)
data = fetch_and_process_data("AAA", CodeType.AIRPORT)

expected_data = [
{
Expand All @@ -115,7 +114,6 @@ def test_fetch_and_process_data_airport(mock_get, airport_response_mock):
]

assert data == expected_data
assert file_path == "airport_data_full.jsonl"


@patch("requests.get")
Expand All @@ -125,10 +123,8 @@ def test_fetch_and_process_data_error(mock_get):
"""
mock_get.side_effect = RequestException("Network error")

data, file_path = fetch_and_process_data("AA", CodeType.CARRIER)

assert data == "Request failed: Network error"
assert file_path == "carrier_data_full.jsonl"
with pytest.raises(RequestException):
_ = fetch_and_process_data("AA", CodeType.CARRIER)


@patch("builtins.open", new_callable=mock_open)
Expand Down Expand Up @@ -176,14 +172,10 @@ def test_generate_codes_for_two_letter_codes():
"""
length = 2 # Test with two-letter codes
codes = list(generate_codes(length))
expected_number_of_codes = (
26**2
) # There are 26 letters, so 26^2 two-letter combinations
expected_number_of_codes = 26**2 # There are 26 letters, so 26^2 two-letter combinations

# Check if all codes have the correct length
assert all(
len(code) == length for code in codes
), "All codes must have the specified length of 2"
assert all(len(code) == length for code in codes), "All codes must have the specified length of 2"

# Check the total number of generated codes
assert (
Expand All @@ -201,14 +193,10 @@ def test_generate_codes_for_three_letter_codes():
"""
length = 3 # Test with three-letter codes
codes = list(generate_codes(length))
expected_number_of_codes = (
26**3
) # There are 26 letters, so 26^3 three-letter combinations
expected_number_of_codes = 26**3 # There are 26 letters, so 26^3 three-letter combinations

# Check if all codes have the correct length
assert all(
len(code) == length for code in codes
), "All codes must have the specified length of 3"
assert all(len(code) == length for code in codes), "All codes must have the specified length of 3"

# Check the total number of generated codes
assert (
Expand Down

0 comments on commit 9cf1722

Please sign in to comment.