Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,10 @@
from crawlee.crawlers import ParselCrawler
from crawlee.storage_clients import RedisStorageClient

# Create a new instance of the storage client using a connection string.
# 'redis://localhost:6379' is just a placeholder; replace it with your actual
# connection string.
storage_client = RedisStorageClient(connection_string='redis://localhost:6379')

# And pass it to the crawler.
crawler = ParselCrawler(storage_client=storage_client)
Original file line number Diff line number Diff line change
@@ -0,0 +1,27 @@
from redis.asyncio import Redis

from crawlee.configuration import Configuration
from crawlee.crawlers import ParselCrawler
from crawlee.storage_clients import RedisStorageClient

# Build the Redis client first so its custom connection settings stand out.
# Replace host and port with your actual Redis server configuration; the other
# Redis client settings can be adjusted as needed.
redis_client = Redis(
    host='localhost',
    port=6379,
    retry_on_timeout=True,
    socket_keepalive=True,
    socket_connect_timeout=10,
)

# Create the storage client on top of the pre-configured Redis client.
storage_client = RedisStorageClient(redis=redis_client)

# Create a configuration with custom settings.
configuration = Configuration(purge_on_start=False)

# Pass both the storage client and the configuration to the crawler.
crawler = ParselCrawler(storage_client=storage_client, configuration=configuration)
167 changes: 167 additions & 0 deletions docs/guides/storage_clients.mdx
Original file line number Diff line number Diff line change
Expand Up @@ -8,12 +8,15 @@ import ApiLink from '@site/src/components/ApiLink';
import Tabs from '@theme/Tabs';
import TabItem from '@theme/TabItem';
import RunnableCodeBlock from '@site/src/components/RunnableCodeBlock';
import CodeBlock from '@theme/CodeBlock';

import MemoryStorageClientBasicExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/memory_storage_client_basic_example.py';
import FileSystemStorageClientBasicExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/file_system_storage_client_basic_example.py';
import FileSystemStorageClientConfigurationExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/file_system_storage_client_configuration_example.py';
import CustomStorageClientExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/custom_storage_client_example.py';
import RegisteringStorageClientsExample from '!!raw-loader!roa-loader!./code_examples/storage_clients/registering_storage_clients_example.py';
import RedisStorageClientBasicExample from '!!raw-loader!./code_examples/storage_clients/redis_storage_client_basic_example.py';
import RedisStorageClientConfigurationExample from '!!raw-loader!./code_examples/storage_clients/redis_storage_client_configuration_example.py';

Storage clients provide a unified interface for interacting with <ApiLink to="class/Dataset">`Dataset`</ApiLink>, <ApiLink to="class/KeyValueStore">`KeyValueStore`</ApiLink>, and <ApiLink to="class/RequestQueue">`RequestQueue`</ApiLink>, regardless of the underlying implementation. They handle operations like creating, reading, updating, and deleting storage instances, as well as managing data persistence and cleanup. This abstraction makes it easy to switch between different environments, such as local development and cloud production setups.

Expand All @@ -23,6 +26,7 @@ Crawlee provides three main storage client implementations:

- <ApiLink to="class/FileSystemStorageClient">`FileSystemStorageClient`</ApiLink> - Provides persistent file system storage with in-memory caching.
- <ApiLink to="class/MemoryStorageClient">`MemoryStorageClient`</ApiLink> - Stores data in memory with no persistence.
- <ApiLink to="class/RedisStorageClient">`RedisStorageClient`</ApiLink> - Provides persistent storage using a [Redis](https://redis.io/) database (version 8.0 or higher). Requires installing the extra dependency `crawlee[redis]`.
- [`ApifyStorageClient`](https://docs.apify.com/sdk/python/reference/class/ApifyStorageClient) - Manages storage on the [Apify platform](https://apify.com), implemented in the [Apify SDK](https://github.com/apify/apify-sdk-python).

```mermaid
Expand Down Expand Up @@ -50,6 +54,8 @@ class FileSystemStorageClient

class MemoryStorageClient

class RedisStorageClient

class ApifyStorageClient

%% ========================
Expand All @@ -58,6 +64,7 @@ class ApifyStorageClient

StorageClient --|> FileSystemStorageClient
StorageClient --|> MemoryStorageClient
StorageClient --|> RedisStorageClient
StorageClient --|> ApifyStorageClient
```

Expand Down Expand Up @@ -125,6 +132,166 @@ The `MemoryStorageClient` does not persist data between runs. All data is lost w
{MemoryStorageClientBasicExample}
</RunnableCodeBlock>

## Redis storage client

:::warning Experimental feature
The <ApiLink to="class/RedisStorageClient">`RedisStorageClient`</ApiLink> is experimental. Its API and behavior may change in future releases.
:::

The <ApiLink to="class/RedisStorageClient">`RedisStorageClient`</ApiLink> provides persistent storage using a Redis database. It supports concurrent access from multiple independent clients or processes and uses native Redis data structures for efficient operations.

:::note dependencies
The <ApiLink to="class/RedisStorageClient">`RedisStorageClient`</ApiLink> is not included in the core Crawlee package.
To use it, you need to install Crawlee with the Redis extra dependency:

<code>pip install 'crawlee[redis]'</code>

Additionally, Redis version 8.0 or higher is required.
:::

The client requires either a Redis connection string or a pre-configured Redis client instance. Use a pre-configured client when you need custom Redis settings such as connection pooling, timeouts, or SSL/TLS encryption.

<CodeBlock className="language-python" language="python">
{RedisStorageClientBasicExample}
</CodeBlock>

Data is organized using Redis key patterns. Below are the main data structures used for each storage type:

```mermaid
---
config:
class:
hideEmptyMembersBox: true
---

classDiagram

%% ========================
%% Storage Client
%% ========================

class RedisDatasetClient {
<<Dataset>>
}

%% ========================
%% Dataset Keys
%% ========================

class Dataset_Keys {
datasets:[name]:items - JSON Array
datasets:[name]:metadata - JSON Object
}

class Datasets_Indexes {
datasets:id_to_name - Hash
datasets:name_to_id - Hash
}

%% ========================
%% Client to Keys arrows
%% ========================

RedisDatasetClient --> Dataset_Keys
RedisDatasetClient --> Datasets_Indexes
```

```mermaid
---
config:
class:
hideEmptyMembersBox: true
---

classDiagram

%% ========================
%% Storage Clients
%% ========================

class RedisKeyValueStoreClient {
<<Key-value store>>
}

%% ========================
%% Key-Value Store Keys
%% ========================

class Key_Value_Store_Keys {
key_value_stores:[name]:items - Hash
key_value_stores:[name]:metadata_items - Hash
key_value_stores:[name]:metadata - JSON Object
}

class Key_Value_Stores_Indexes {
key_value_stores:id_to_name - Hash
key_value_stores:name_to_id - Hash
}

%% ========================
%% Client to Keys arrows
%% ========================

RedisKeyValueStoreClient --> Key_Value_Store_Keys
RedisKeyValueStoreClient --> Key_Value_Stores_Indexes
```

```mermaid
---
config:
class:
hideEmptyMembersBox: true
---

classDiagram

%% ========================
%% Storage Clients
%% ========================

class RedisRequestQueueClient {
<<Request queue>>
}

%% ========================
%% Request Queue Keys
%% ========================

class Request_Queue_Keys{
request_queues:[name]:queue - List
request_queues:[name]:data - Hash
request_queues:[name]:in_progress - Hash
request_queues:[name]:added_bloom_filter - Bloom Filter
request_queues:[name]:handled_bloom_filter - Bloom Filter
request_queues:[name]:metadata - JSON Object
}

class Request_Queues_Indexes {
request_queues:id_to_name - Hash
request_queues:name_to_id - Hash
}

%% ========================
%% Client to Keys arrows
%% ========================

RedisRequestQueueClient --> Request_Queue_Keys
RedisRequestQueueClient --> Request_Queues_Indexes
```

Configuration options for the <ApiLink to="class/RedisStorageClient">`RedisStorageClient`</ApiLink> can be set through environment variables or the <ApiLink to="class/Configuration">`Configuration`</ApiLink> class:

- **`purge_on_start`** (env: `CRAWLEE_PURGE_ON_START`, default: `True`) - Whether to purge default storages on start.

Configuration options for the <ApiLink to="class/RedisStorageClient">`RedisStorageClient`</ApiLink> can be set via constructor arguments:

- **`connection_string`** - Redis connection string, e.g. `redis://localhost:6379/0`.
- **`redis`** - Pre-configured Redis client instance (optional; an alternative to `connection_string`).

<CodeBlock className="language-python" language="python">
{RedisStorageClientConfigurationExample}
</CodeBlock>

## Creating a custom storage client

A storage client consists of two parts: the storage client factory and individual storage type clients. The <ApiLink to="class/StorageClient">`StorageClient`</ApiLink> acts as a factory that creates specific clients (<ApiLink to="class/DatasetClient">`DatasetClient`</ApiLink>, <ApiLink to="class/KeyValueStoreClient">`KeyValueStoreClient`</ApiLink>, <ApiLink to="class/RequestQueueClient">`RequestQueueClient`</ApiLink>) where the actual storage logic is implemented.
Expand Down
4 changes: 3 additions & 1 deletion pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -48,7 +48,7 @@ dependencies = [
]

[project.optional-dependencies]
all = ["crawlee[adaptive-crawler,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel]"]
all = ["crawlee[adaptive-crawler,beautifulsoup,cli,curl-impersonate,httpx,parsel,playwright,otel,redis]"]
adaptive-crawler = [
"jaro-winkler>=2.0.3",
"playwright>=1.27.0",
Expand All @@ -71,6 +71,7 @@ otel = [
"opentelemetry-semantic-conventions>=0.54",
"wrapt>=1.17.0",
]
redis = ["redis[hiredis] >= 6.4.0"]

[project.scripts]
crawlee = "crawlee._cli:cli"
Expand All @@ -90,6 +91,7 @@ dev = [
"apify_client", # For e2e tests.
"build~=1.3.0", # For e2e tests.
"dycw-pytest-only~=2.1.0",
"fakeredis[probabilistic,json,lua]>=2.31.0",
"mypy~=1.18.1",
"pre-commit~=4.3.0",
"proxy-py~=2.4.0",
Expand Down
2 changes: 2 additions & 0 deletions src/crawlee/storage_clients/__init__.py
Original file line number Diff line number Diff line change
@@ -1,9 +1,11 @@
from ._base import StorageClient
from ._file_system import FileSystemStorageClient
from ._memory import MemoryStorageClient
from ._redis import RedisStorageClient

# Names exported as the public API of this package.
# NOTE(review): `RedisStorageClient` is imported unconditionally above, while `redis`
# is an optional extra - confirm the import succeeds when the extra is not installed.
__all__ = [
'FileSystemStorageClient',
'MemoryStorageClient',
'RedisStorageClient',
'StorageClient',
]
6 changes: 6 additions & 0 deletions src/crawlee/storage_clients/_redis/__init__.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,6 @@
from ._dataset_client import RedisDatasetClient
from ._key_value_store_client import RedisKeyValueStoreClient
from ._request_queue_client import RedisRequestQueueClient
from ._storage_client import RedisStorageClient

# Names exported as the public API of this subpackage.
__all__ = ['RedisDatasetClient', 'RedisKeyValueStoreClient', 'RedisRequestQueueClient', 'RedisStorageClient']
Loading
Loading