From 7b4fdf56002863a507b49d8d1fefa386368c5ba4 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Tue, 16 Sep 2025 13:48:08 +0000 Subject: [PATCH 1/3] save RequestQueueState in default KVS --- .../storage_clients/_file_system/_request_queue_client.py | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/src/crawlee/storage_clients/_file_system/_request_queue_client.py b/src/crawlee/storage_clients/_file_system/_request_queue_client.py index a02773c1b7..c1cb79f764 100644 --- a/src/crawlee/storage_clients/_file_system/_request_queue_client.py +++ b/src/crawlee/storage_clients/_file_system/_request_queue_client.py @@ -115,9 +115,8 @@ def __init__( self._state = RecoverableState[RequestQueueState]( default_state=RequestQueueState(), - persist_state_key='request_queue_state', + persist_state_key=f'request_queue_{self._metadata.id}_state', persistence_enabled=True, - persist_state_kvs_name=f'__RQ_STATE_{self._metadata.id}', logger=logger, ) """Recoverable state to maintain request ordering, in-progress status, and handled status.""" From e337433f593d8ee5a20c688981e2037ecbfa7dcd Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Tue, 16 Sep 2025 14:03:55 +0000 Subject: [PATCH 2/3] update same tests --- tests/unit/crawlers/_http/test_http_crawler.py | 8 ++++++-- .../unit/crawlers/_playwright/test_playwright_crawler.py | 3 +++ 2 files changed, 9 insertions(+), 2 deletions(-) diff --git a/tests/unit/crawlers/_http/test_http_crawler.py b/tests/unit/crawlers/_http/test_http_crawler.py index 15cc132b63..5d24bda2f0 100644 --- a/tests/unit/crawlers/_http/test_http_crawler.py +++ b/tests/unit/crawlers/_http/test_http_crawler.py @@ -565,11 +565,15 @@ async def request_handler(context: HttpCrawlingContext) -> None: kvs = await crawler.get_key_value_store() kvs_content = {} async for key_info in kvs.iterate_keys(): + # Skip any non-error snapshot keys, e.g. _state. + if 'ERROR_SNAPSHOT' not in key_info.key: + continue kvs_content[key_info.key] = await kvs.get_value(key_info.key) # One error, three time retried. + content_key = next(iter(kvs_content)) assert crawler.statistics.error_tracker.total == 4 assert crawler.statistics.error_tracker.unique_error_count == 1 assert len(kvs_content) == 1 - assert key_info.key.endswith('.html') - assert kvs_content[key_info.key] == HELLO_WORLD.decode('utf8') + assert content_key.endswith('.html') + assert kvs_content[content_key] == HELLO_WORLD.decode('utf8') diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py index 2f52cac163..fe6617bc0b 100644 --- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py +++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py @@ -631,6 +631,9 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: kvs_content = {} async for key_info in kvs.iterate_keys(): + # Skip any non-error snapshot keys, e.g. _state. + if 'ERROR_SNAPSHOT' not in key_info.key: + continue kvs_content[key_info.key] = await kvs.get_value(key_info.key) assert set(key_info.key).issubset(ErrorSnapshotter.ALLOWED_CHARACTERS) From 42536d803bf32edb60fb345c95c386ff7c63d199 Mon Sep 17 00:00:00 2001 From: Max Bohomolov Date: Fri, 19 Sep 2025 16:04:00 +0000 Subject: [PATCH 3/3] update persist key --- .../storage_clients/_file_system/_request_queue_client.py | 2 +- tests/unit/crawlers/_http/test_http_crawler.py | 2 +- tests/unit/crawlers/_playwright/test_playwright_crawler.py | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/src/crawlee/storage_clients/_file_system/_request_queue_client.py b/src/crawlee/storage_clients/_file_system/_request_queue_client.py index c1cb79f764..51fe9b6e8d 100644 --- a/src/crawlee/storage_clients/_file_system/_request_queue_client.py +++ b/src/crawlee/storage_clients/_file_system/_request_queue_client.py @@ -115,7 +115,7 @@ def __init__( self._state = RecoverableState[RequestQueueState]( default_state=RequestQueueState(), - persist_state_key=f'request_queue_{self._metadata.id}_state', + persist_state_key=f'__RQ_STATE_{self._metadata.id}', persistence_enabled=True, logger=logger, ) diff --git a/tests/unit/crawlers/_http/test_http_crawler.py b/tests/unit/crawlers/_http/test_http_crawler.py index 5d24bda2f0..8a3edfd986 100644 --- a/tests/unit/crawlers/_http/test_http_crawler.py +++ b/tests/unit/crawlers/_http/test_http_crawler.py @@ -565,7 +565,7 @@ async def request_handler(context: HttpCrawlingContext) -> None: kvs = await crawler.get_key_value_store() kvs_content = {} async for key_info in kvs.iterate_keys(): - # Skip any non-error snapshot keys, e.g. _state. + # Skip any non-error snapshot keys, e.g. __RQ_STATE_. if 'ERROR_SNAPSHOT' not in key_info.key: continue kvs_content[key_info.key] = await kvs.get_value(key_info.key) diff --git a/tests/unit/crawlers/_playwright/test_playwright_crawler.py b/tests/unit/crawlers/_playwright/test_playwright_crawler.py index fe6617bc0b..7e666489c8 100644 --- a/tests/unit/crawlers/_playwright/test_playwright_crawler.py +++ b/tests/unit/crawlers/_playwright/test_playwright_crawler.py @@ -631,7 +631,7 @@ async def request_handler(context: PlaywrightCrawlingContext) -> None: kvs_content = {} async for key_info in kvs.iterate_keys(): - # Skip any non-error snapshot keys, e.g. _state. + # Skip any non-error snapshot keys, e.g. __RQ_STATE_. if 'ERROR_SNAPSHOT' not in key_info.key: continue kvs_content[key_info.key] = await kvs.get_value(key_info.key)