From ad36df1531d06bae47d3e618eeb8b9f517ccf92f Mon Sep 17 00:00:00 2001 From: ldemesla <56355146+ldemesla@users.noreply.github.com> Date: Wed, 28 Jan 2026 12:41:19 +0700 Subject: [PATCH] Fix: Prevent URL malformation causing 'getaddrinfo ENOTFOUND http' errors MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixed a critical bug in extract_host_port() that caused malformed URLs like "http://http://localhost:3000" when the target already contained a protocol scheme. ## Problem The extract_host_port() function used naive string splitting on ':' which incorrectly parsed URLs containing protocol schemes: - Input: "http://localhost:3000" - Old output: ("http://localhost", 3000) - URL construction: f"http://{host}:{port}/login" - Result: "http://http://localhost:3000/login" ❌ - Error: getaddrinfo ENOTFOUND http This manifested on macOS (and potentially Linux) as DNS resolution errors when the agent attempted HTTP requests. ## Solution Rewrote extract_host_port() to use urllib.parse.urlparse which properly handles URL parsing: - Input: "http://localhost:3000" - New output: ("localhost", 3000) - URL construction: f"http://{host}:{port}/login" - Result: "http://localhost:3000/login" ✅ ## Changes - Fixed extract_host_port() in http_parser.py to use urlparse - Added comprehensive unit tests (15 tests, all passing) - Verified fix with Juice Shop server on localhost:3000 ## Testing All tests pass including critical test for protocol duplication prevention. 
def extract_host_port(target_host: str) -> Tuple[str, int]:
    """Split a target string into a bare hostname and a numeric port.

    The target may be a bare host (``"example.com"``), a ``host:port`` pair,
    or a full URL with scheme, path and query.  The returned host never
    contains a scheme, so callers can safely build
    ``f"http://{host}:{port}/..."`` without producing
    ``http://http://...``.

    Args:
        target_host: Host, ``host:port`` pair, or URL.  Surrounding
            whitespace is ignored and the scheme is matched
            case-insensitively.

    Returns:
        A ``(host, port)`` tuple.  ``host`` is the lower-cased hostname
        (IPv6 literals are returned without brackets), or ``"localhost"``
        when the target has no host part.  ``port`` is the explicit port
        when it is a valid number; otherwise 443 for ``https``/``wss``
        targets and 80 for everything else.

    Raises:
        ValueError: Propagated from ``urlsplit`` when the authority part is
            malformed (for example unbalanced IPv6 brackets).
    """
    # Imported locally so the function does not depend on a module-level
    # import being present elsewhere in the file.
    from urllib.parse import urlsplit

    candidate = target_host.strip()

    # Only treat the text before "://" as a scheme when it looks like one.
    # This keeps "host:3000/redirect?u=http://x" from being mistaken for a
    # URL with a scheme, while still accepting "HTTP://", "ws://", etc.
    prefix, separator, _ = candidate.partition("://")
    has_scheme = (
        bool(separator)
        and prefix[:1].isalpha()
        and all(ch.isalnum() or ch in "+-." for ch in prefix)
    )
    if not has_scheme:
        # urlsplit only recognises the authority after "//", so give
        # scheme-less input a default scheme purely for parsing.
        candidate = f"http://{candidate}"

    parsed = urlsplit(candidate)
    host = parsed.hostname or 'localhost'

    # Accessing .port raises ValueError for non-numeric or out-of-range
    # ports; treat those like a missing port instead of crashing.
    try:
        port = parsed.port
    except ValueError:
        port = None

    if port is None:
        # urlsplit lower-cases the scheme, so a plain comparison is enough.
        port = 443 if parsed.scheme in ('https', 'wss') else 80

    return host, port
"""
Unit tests for extract_host_port functionality.
"""
import pytest
from deadend_agent.tools.browser_automation.http_parser import extract_host_port


class TestExtractHostPort:
    """Checks that extract_host_port yields a scheme-free host and a port."""

    @staticmethod
    def _expect(target, expected_host, expected_port):
        """Parse *target* and compare both halves of the result."""
        actual_host, actual_port = extract_host_port(target)
        assert actual_host == expected_host
        assert actual_port == expected_port

    def test_http_url_with_port(self):
        """An http:// URL carrying a port yields that host and port."""
        self._expect("http://localhost:3000", "localhost", 3000)

    def test_https_url_with_port(self):
        """An https:// URL carrying a port yields that host and port."""
        self._expect("https://localhost:3000", "localhost", 3000)

    def test_host_with_port_no_protocol(self):
        """A scheme-less host:port pair is split correctly."""
        self._expect("localhost:3000", "localhost", 3000)

    def test_http_url_with_custom_port(self):
        """A non-default port on an http:// URL is preserved."""
        self._expect("http://example.com:8080", "example.com", 8080)

    def test_https_url_with_standard_port(self):
        """An explicit 443 on an https:// URL is returned as given."""
        self._expect("https://example.com:443", "example.com", 443)

    def test_http_url_no_port(self):
        """An http:// URL with no port falls back to 80."""
        self._expect("http://example.com", "example.com", 80)

    def test_https_url_no_port(self):
        """An https:// URL with no port falls back to 443."""
        self._expect("https://example.com", "example.com", 443)

    def test_bare_hostname(self):
        """A hostname with neither scheme nor port falls back to 80."""
        self._expect("example.com", "example.com", 80)

    def test_bare_localhost(self):
        """Plain "localhost" falls back to port 80."""
        self._expect("localhost", "localhost", 80)

    def test_ip_address_with_port(self):
        """A scheme-less IPv4 address with a port is split correctly."""
        self._expect("127.0.0.1:8000", "127.0.0.1", 8000)

    def test_http_ip_address_with_port(self):
        """An http:// URL with an IPv4 address and port is split correctly."""
        self._expect("http://127.0.0.1:8000", "127.0.0.1", 8000)

    def test_url_with_path_ignored(self):
        """A trailing path does not leak into the host or port."""
        self._expect("http://example.com:8080/api/v1", "example.com", 8080)

    def test_url_with_query_ignored(self):
        """A query string does not leak into the host or port."""
        self._expect("http://example.com:8080?param=value", "example.com", 8080)

    def test_url_reconstruction_no_duplicate_protocol(self):
        """
        Rebuilding a URL from the parsed parts must not repeat the scheme.

        Guards against the original bug, where an input that already had a
        scheme produced http://http://localhost:3000.
        """
        parsed_host, parsed_port = extract_host_port("http://localhost:3000")
        # Mirrors how pw_requester.py assembles the request URL.
        rebuilt = f"http://{parsed_host}:{parsed_port}/path"

        assert rebuilt == "http://localhost:3000/path"
        assert "http://http://" not in rebuilt

    def test_https_url_reconstruction_no_duplicate_protocol(self):
        """Rebuilding an https URL from the parsed parts must not repeat the scheme."""
        parsed_host, parsed_port = extract_host_port("https://example.com:443")
        rebuilt = f"https://{parsed_host}:{parsed_port}/api"

        assert rebuilt == "https://example.com:443/api"
        assert "https://https://" not in rebuilt