Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

ci: Use a local httpbin instance for tests #166

Closed
wants to merge 4 commits into from
Closed
Show file tree
Hide file tree
Changes from 2 commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
7 changes: 7 additions & 0 deletions .github/workflows/_unit_tests.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -12,6 +12,13 @@ jobs:
python-version: ["3.9", "3.10", "3.11", "3.12"]
runs-on: ${{ matrix.os }}

services:
httpbin:
image: kennethreitz/httpbin
ports:
  # Must be a YAML sequence; map container port 80 to host port 8080
  # (binding host port 80 on the runner would require root).
  - 8080:80
env:
  # Must be job-level env so the test steps see it. On a VM runner the
  # service hostname "httpbin" does not resolve — services are reached via
  # localhost and the mapped port. No trailing slash: tests append "/path".
  HTTPBIN_URL: http://localhost:8080

steps:
# We need to check out the head commit in case of PRs, and the default ref otherwise (during release).
- name: Checkout repository
Expand Down
40 changes: 20 additions & 20 deletions tests/unit/browsers/test_browser_pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,52 +6,52 @@
from crawlee.browsers.playwright_browser_plugin import PlaywrightBrowserPlugin


async def test_new_page_single_plugin() -> None:
async def test_new_page_single_plugin(httpbin: str) -> None:
plugin = PlaywrightBrowserPlugin(browser_type='chromium')

async with BrowserPool([plugin]) as browser_pool:
assert browser_pool.plugins == [plugin]

page_1 = await browser_pool.new_page()
await page_1.page.goto('https://httpbin.org/get')
await page_1.page.goto(f'{httpbin}/get')
assert page_1.browser_type == 'chromium'
assert page_1.page.url == 'https://httpbin.org/get'
assert page_1.page.url == f'{httpbin}/get'
assert '<html' in await page_1.page.content() # there is some HTML content

page_2 = await browser_pool.new_page()
await page_2.page.goto('https://httpbin.org/status/200')
await page_2.page.goto(f'{httpbin}/status/200')
assert page_2.browser_type == 'chromium'
assert page_2.page.url == 'https://httpbin.org/status/200'
assert page_2.page.url == f'{httpbin}/status/200'
assert '<html' in await page_1.page.content() # there is some HTML content


async def test_new_page_multiple_plugins() -> None:
async def test_new_page_multiple_plugins(httpbin: str) -> None:
plugin_chromium = PlaywrightBrowserPlugin(browser_type='chromium')
plugin_firefox = PlaywrightBrowserPlugin(browser_type='firefox')

async with BrowserPool([plugin_chromium, plugin_firefox]) as browser_pool:
assert browser_pool.plugins == [plugin_chromium, plugin_firefox]

page_1 = await browser_pool.new_page()
await page_1.page.goto('https://httpbin.org/get')
await page_1.page.goto(f'{httpbin}/get')
assert page_1.browser_type == 'chromium'
assert page_1.page.url == 'https://httpbin.org/get'
assert page_1.page.url == f'{httpbin}/get'
assert '<html' in await page_1.page.content() # there is some HTML content

page_2 = await browser_pool.new_page()
await page_2.page.goto('https://httpbin.org/headers')
await page_2.page.goto(f'{httpbin}/headers')
assert page_2.browser_type == 'firefox'
assert page_2.page.url == 'https://httpbin.org/headers'
assert page_2.page.url == f'{httpbin}/headers'
assert '<html' in await page_2.page.content() # there is some HTML content

page_3 = await browser_pool.new_page()
await page_3.page.goto('https://httpbin.org/user-agent')
await page_3.page.goto(f'{httpbin}/user-agent')
assert page_3.browser_type == 'chromium'
assert page_3.page.url == 'https://httpbin.org/user-agent'
assert page_3.page.url == f'{httpbin}/user-agent'
assert '<html' in await page_3.page.content() # there is some HTML content


async def test_new_page_with_each_plugin() -> None:
async def test_new_page_with_each_plugin(httpbin: str) -> None:
plugin_chromium = PlaywrightBrowserPlugin(browser_type='chromium')
plugin_firefox = PlaywrightBrowserPlugin(browser_type='firefox')

Expand All @@ -63,22 +63,22 @@ async def test_new_page_with_each_plugin() -> None:
assert pages[0].browser_type == 'chromium'
assert pages[1].browser_type == 'firefox'

await pages[0].page.goto('https://httpbin.org/get')
assert pages[0].page.url == 'https://httpbin.org/get'
await pages[0].page.goto(f'{httpbin}/get')
assert pages[0].page.url == f'{httpbin}/get'
assert '<html' in await pages[0].page.content() # there is some HTML content

await pages[1].page.goto('https://httpbin.org/headers')
assert pages[1].page.url == 'https://httpbin.org/headers'
await pages[1].page.goto(f'{httpbin}/headers')
assert pages[1].page.url == f'{httpbin}/headers'
assert '<html' in await pages[1].page.content()


async def test_resource_management() -> None:
async def test_resource_management(httpbin: str) -> None:
playwright_plugin = PlaywrightBrowserPlugin(browser_type='chromium')

async with BrowserPool([playwright_plugin]) as browser_pool:
page = await browser_pool.new_page()
await page.page.goto('https://httpbin.org/get')
assert page.page.url == 'https://httpbin.org/get'
await page.page.goto(f'{httpbin}/get')
assert page.page.url == f'{httpbin}/get'
assert '<html' in await page.page.content() # there is some HTML content

# The page should be closed
Expand Down
16 changes: 8 additions & 8 deletions tests/unit/browsers/test_playwright_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,12 @@
from crawlee.browsers.playwright_browser_plugin import PlaywrightBrowserPlugin


async def test_new_page() -> None:
async def test_new_page(httpbin: str) -> None:
async with PlaywrightBrowserPlugin() as plugin:
# Get a new page with default options
page_1 = await plugin.new_page()
await page_1.goto('https://httpbin.org/get')
assert page_1.url == 'https://httpbin.org/get'
await page_1.goto(f'{httpbin}/get')
assert page_1.url == f'{httpbin}/get'
assert '<html' in await page_1.content() # there is some HTML content

page_options = {
Expand All @@ -22,20 +22,20 @@ async def test_new_page() -> None:
async with PlaywrightBrowserPlugin(page_options=page_options) as plugin:
# Get a new page with custom options
page_2 = await plugin.new_page()
await page_2.goto('https://httpbin.org/user-agent')
assert page_2.url == 'https://httpbin.org/user-agent'
await page_2.goto(f'{httpbin}/user-agent')
assert page_2.url == f'{httpbin}/user-agent'
assert '<html' in await page_2.content() # there is some HTML content


async def test_resource_management() -> None:
async def test_resource_management(httpbin: str) -> None:
async with PlaywrightBrowserPlugin() as plugin:
assert plugin.browser is not None
# Browser should be connected
assert plugin.browser.is_connected() is True

page = await plugin.new_page()
await page.goto('https://httpbin.org/get')
assert page.url == 'https://httpbin.org/get'
await page.goto(f'{httpbin}/get')
assert page.url == f'{httpbin}/get'
assert '<html' in await page.content() # there is some HTML content

# The page should be closed
Expand Down
5 changes: 5 additions & 0 deletions tests/unit/conftest.py
Original file line number Diff line number Diff line change
Expand Up @@ -51,3 +51,8 @@ def memory_storage_client(tmp_path: Path) -> MemoryStorageClient:
crawlee_local_storage_dir=str(tmp_path), # type: ignore
)
return MemoryStorageClient(cfg)


@pytest.fixture()
def httpbin() -> str:
    """Return the base URL of the httpbin instance used by the tests.

    Reads ``HTTPBIN_URL`` (set in CI to point at a local httpbin service
    container) and falls back to the public ``https://httpbin.org``.

    Any trailing slash is stripped so call sites can safely build URLs as
    ``f'{httpbin}/get'`` without producing a double slash (the CI workflow
    exports the variable with a trailing slash, which would otherwise yield
    ``http://.../​/get`` and break exact-URL assertions in the tests).
    """
    return os.environ.get('HTTPBIN_URL', 'https://httpbin.org').rstrip('/')
8 changes: 4 additions & 4 deletions tests/unit/http_crawler/test_http_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -107,17 +107,17 @@ async def test_handles_server_error(
assert server['500_endpoint'].called


async def test_stores_cookies() -> None:
async def test_stores_cookies(httpbin: str) -> None:
visit = Mock()
track_session_usage = Mock()

session_pool = SessionPool(max_pool_size=1)
crawler = HttpCrawler(
request_provider=RequestList(
[
'https://httpbin.org/cookies/set?a=1',
'https://httpbin.org/cookies/set?b=2',
'https://httpbin.org/cookies/set?c=3',
f'{httpbin}/cookies/set?a=1',
f'{httpbin}/cookies/set?b=2',
f'{httpbin}/cookies/set?c=3',
]
),
session_pool=session_pool,
Expand Down
8 changes: 4 additions & 4 deletions tests/unit/httpx_client/test_httpx_client.py
Original file line number Diff line number Diff line change
Expand Up @@ -39,9 +39,9 @@ async def proxy(proxy_info: ProxyInfo) -> AsyncGenerator[ProxyInfo, None]:
yield proxy_info


async def test_proxy(proxy: ProxyInfo) -> None:
async def test_proxy(proxy: ProxyInfo, httpbin: str) -> None:
client = HttpxClient()
request = Request(url='https://httpbin.org/status/222', unique_key='42', id='42', user_data={})
request = Request(url=f'{httpbin}/status/222', unique_key='42', id='42', user_data={})

async with Statistics() as statistics:
result = await client.crawl(request, None, proxy, statistics)
Expand All @@ -65,9 +65,9 @@ async def disabled_proxy(proxy_info: ProxyInfo) -> AsyncGenerator[ProxyInfo, Non
yield proxy_info


async def test_proxy_disabled(disabled_proxy: ProxyInfo) -> None:
async def test_proxy_disabled(disabled_proxy: ProxyInfo, httpbin: str) -> None:
client = HttpxClient()
request = Request(url='https://httpbin.org/status/222', unique_key='42', id='42', user_data={})
request = Request(url=f'{httpbin}/status/222', unique_key='42', id='42', user_data={})

with pytest.raises(ProxyError):
async with Statistics() as statistics:
Expand Down
6 changes: 3 additions & 3 deletions tests/unit/playwright_crawler/test_playwright_crawler.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,8 +9,8 @@
from crawlee.playwright_crawler import PlaywrightCrawlingContext


async def test_basic_request() -> None:
request_provider = RequestList(['https://httpbin.org/'])
async def test_basic_request(httpbin: str) -> None:
request_provider = RequestList([f'{httpbin}/'])
crawler = PlaywrightCrawler(request_provider=request_provider)
result: dict = {}

Expand All @@ -24,6 +24,6 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None:

await crawler.run()

assert result.get('request_url') == result.get('page_url') == 'https://httpbin.org/'
assert result.get('request_url') == result.get('page_url') == f'{httpbin}/'
assert 'httpbin' in result.get('page_title', '')
assert '<html' in result.get('page_content', '') # there is some HTML content
Loading