Skip to content

Commit

Permalink
docs: Use renderable docs format for data structures (#617)
Browse files Browse the repository at this point in the history
### Description

- Use renderable docs format for data structures.
- Remove redundant newlines in args docs.
- Use a consistent "A default constructor." title for `__init__` docstrings.

### Issues

- Closes: #616

### Testing

- API docs rendered locally.

### Checklist

- [x] CI passed
  • Loading branch information
vdusek authored Oct 24, 2024
1 parent 2a909fd commit 4a07dcc
Show file tree
Hide file tree
Showing 42 changed files with 256 additions and 268 deletions.
14 changes: 2 additions & 12 deletions src/crawlee/_autoscaling/autoscaled_pool.py
Original file line number Diff line number Diff line change
Expand Up @@ -54,38 +54,28 @@ def __init__(
scale_up_step_ratio: float = 0.05,
scale_down_step_ratio: float = 0.05,
) -> None:
"""Initialize the AutoscaledPool.
"""A default constructor.
Args:
system_status: Provides data about system utilization (load).
run_task_function: A function that performs an asynchronous resource-intensive task.
is_task_ready_function: A function that indicates whether `run_task_function` should be called. This
function is called every time there is free capacity for a new task and it should indicate whether
it should start a new task or not by resolving to either `True` or `False`. Besides its obvious use,
it is also useful for task throttling to save resources.
is_finished_function: A function that is called only when there are no tasks to be processed. If it
resolves to `True` then the pool's run finishes. Being called only when there are no tasks being
processed means that as long as `is_task_ready_function` keeps resolving to `True`,
`is_finished_function` will never be called. To abort a run, use the `abort` method.
task_timeout: Timeout in which the `run_task_function` needs to finish.
autoscale_interval: Defines how often the pool should attempt to adjust the desired concurrency based on
the latest system status. Setting it lower than 1 might have a severe impact on performance. We suggest
using a value from 5 to 20.
logging_interval: Specifies a period in which the instance logs its state, in seconds.
desired_concurrency_ratio: Minimum level of desired concurrency to reach before more scaling up is allowed.
scale_up_step_ratio: Defines the fractional amount of desired concurrency to be added with each scaling up.
scale_down_step_ratio: Defines the fractional amount of desired concurrency to be subtracted with each scaling down.
concurrency_settings: Settings of concurrency levels
concurrency_settings: Settings of concurrency levels.
"""
self._system_status = system_status

Expand Down
14 changes: 1 addition & 13 deletions src/crawlee/_autoscaling/snapshotter.py
Original file line number Diff line number Diff line change
Expand Up @@ -47,42 +47,30 @@ def __init__(
memory_warning_cooldown_period: timedelta = timedelta(milliseconds=10000),
client_rate_limit_error_retry_count: int = 2,
) -> None:
"""Creates a new instance.
"""A default constructor.
Args:
event_manager: The event manager used to emit system info events. From data provided by this event
the CPU and memory usage are read.
event_loop_snapshot_interval: The interval at which the event loop is sampled.
client_snapshot_interval: The interval at which the client is sampled.
max_used_cpu_ratio: Sets the ratio, defining the maximum CPU usage. When the CPU usage is higher than
the provided ratio, the CPU is considered overloaded.
max_memory_size: Sets the maximum amount of system memory to be used by the `AutoscaledPool`. If `None`
is provided, the max amount of memory to be used is set to one quarter of total system memory.
I.e. on a system with 8192 MB, the `AutoscaledPool` will only use up to 2048 MB of memory.
max_used_memory_ratio: Sets the ratio, defining the maximum ratio of memory usage. When the memory usage
is higher than the provided ratio of `max_memory_size`, the memory is considered overloaded.
max_event_loop_delay: Sets the maximum delay of the event loop. When the delay is higher than the provided
value, the event loop is considered overloaded.
max_client_errors: Sets the maximum number of client errors (HTTP 429). When the number of client errors
is higher than the provided number, the client is considered overloaded.
snapshot_history: Sets the time interval for which the snapshots are kept.
available_memory_ratio: The fraction of the system memory to be used if `max_memory_size` is not given.
reserve_memory_ratio: Fraction of memory kept in reserve. Used to calculate critical memory overload
threshold.
memory_warning_cooldown_period: Minimum time interval between logging successive critical memory overload
warnings.
client_rate_limit_error_retry_count: Number of retries for a client request before considering it a failure
due to rate limiting.
"""
Expand Down
7 changes: 1 addition & 6 deletions src/crawlee/_autoscaling/system_status.py
Original file line number Diff line number Diff line change
Expand Up @@ -45,23 +45,18 @@ def __init__(
event_loop_overload_threshold: float = 0.6,
client_overload_threshold: float = 0.3,
) -> None:
"""Creates a new instance.
"""A default constructor.
Args:
snapshotter: The `Snapshotter` instance to be queried for `SystemStatus`.
max_snapshot_age: Defines max age of snapshots used in the `SystemStatus.get_current_system_info`
measurement.
cpu_overload_threshold: Sets the threshold of overloaded snapshots in the CPU sample.
If the sample exceeds this threshold, the system will be considered overloaded.
memory_overload_threshold: Sets the threshold of overloaded snapshots in the memory sample.
If the sample exceeds this threshold, the system will be considered overloaded.
event_loop_overload_threshold: Sets the threshold of overloaded snapshots in the event loop sample.
If the sample exceeds this threshold, the system will be considered overloaded.
client_overload_threshold: Sets the threshold of overloaded snapshots in the Client sample.
If the sample exceeds this threshold, the system will be considered overloaded.
"""
Expand Down
104 changes: 50 additions & 54 deletions src/crawlee/_autoscaling/types.py
Original file line number Diff line number Diff line change
Expand Up @@ -10,45 +10,43 @@

@dataclass
class LoadRatioInfo:
"""Represents the load ratio of a resource.
Args:
limit_ratio: The maximum ratio of overloaded and non-overloaded samples. If the actual ratio exceeds this
value, the resource is considered as overloaded.
actual_ratio: The actual ratio of overloaded and non-overloaded samples.
"""
"""Represent the load ratio of a resource."""

limit_ratio: float
"""The maximum ratio of overloaded and non-overloaded samples. If the actual ratio exceeds this value,
the resource is considered as overloaded."""

actual_ratio: float
"""The actual ratio of overloaded and non-overloaded samples."""

@property
def is_overloaded(self) -> bool:
"""Returns whether the resource is overloaded."""
"""Indicate whether the resource is currently overloaded."""
return self.actual_ratio > self.limit_ratio


@dataclass
class SystemInfo:
"""Represents the current status of the system.
Args:
cpu_info: The CPU load ratio.
memory_info: The memory load ratio.
event_loop_info: The event loop load ratio.
client_info: The client load ratio.
created_at: The time at which the measurement was taken.
"""
"""Represent the current status of the system."""

cpu_info: LoadRatioInfo
"""The CPU load ratio."""

memory_info: LoadRatioInfo
"""The memory load ratio."""

event_loop_info: LoadRatioInfo
"""The event loop load ratio."""

client_info: LoadRatioInfo
"""The client load ratio."""

created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
"""The time at which the system load information was measured."""

@property
def is_system_idle(self) -> bool:
"""Indicates whether the system is currently idle or overloaded."""
"""Indicate whether the system is currently idle (`True`) or overloaded (`False`)."""
return (
not self.cpu_info.is_overloaded
and not self.memory_info.is_overloaded
Expand All @@ -69,90 +67,88 @@ def __str__(self) -> str:

@dataclass
class CpuSnapshot:
"""A snapshot of CPU usage.
Args:
used_ratio: The ratio of CPU currently in use.
max_used_ratio: The maximum ratio of CPU that is considered acceptable.
created_at: The time at which the measurement was taken.
"""
"""A snapshot of CPU usage."""

used_ratio: float
"""The ratio of CPU currently in use."""

max_used_ratio: float
"""The maximum ratio of CPU that is considered acceptable."""

created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
"""The time at which the system load information was measured."""

@property
def is_overloaded(self) -> bool:
"""Returns whether the CPU is considered as overloaded."""
"""Indicate whether the CPU is considered as overloaded."""
return self.used_ratio > self.max_used_ratio


@dataclass
class MemorySnapshot:
"""A snapshot of memory usage.
Args:
total_size: Total memory available in the system.
current_size: Memory usage of the current Python process and its children.
max_memory_size: The maximum memory that can be used by `AutoscaledPool`.
max_used_memory_ratio: The maximum acceptable ratio of `current_size` to `max_memory_size`.
created_at: The time at which the measurement was taken.
"""
"""A snapshot of memory usage."""

total_size: ByteSize
"""Total memory available in the system."""

current_size: ByteSize
"""Memory usage of the current Python process and its children."""

max_memory_size: ByteSize
"""The maximum memory that can be used by `AutoscaledPool`."""

max_used_memory_ratio: float
"""The maximum acceptable ratio of `current_size` to `max_memory_size`."""

created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
"""The time at which the system load information was measured."""

@property
def is_overloaded(self) -> bool:
"""Returns whether the memory is considered as overloaded."""
"""Indicate whether the memory is considered as overloaded."""
return (self.current_size / self.max_memory_size) > self.max_used_memory_ratio


@dataclass
class EventLoopSnapshot:
"""Snapshot of the state of the event loop.
Args:
delay: The current delay of the event loop.
max_delay: The maximum delay that is considered acceptable.
created_at: The time at which the measurement was taken.
"""
"""Snapshot of the state of the event loop."""

delay: timedelta
"""The current delay of the event loop."""

max_delay: timedelta
"""The maximum delay that is considered acceptable."""

created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
"""The time at which the system load information was measured."""

@property
def max_delay_exceeded(self) -> timedelta:
"""Returns the amount of time by which the delay exceeds the maximum delay."""
"""The amount of time by which the delay exceeds the maximum delay."""
return max(self.delay - self.max_delay, timedelta(seconds=0))

@property
def is_overloaded(self) -> bool:
"""Returns whether the event loop is considered as overloaded."""
"""Indicate whether the event loop is considered as overloaded."""
return self.delay > self.max_delay


@dataclass
class ClientSnapshot:
"""Snapshot of the state of the client.
Args:
error_count: The number of errors (HTTP 429) that occurred.
max_error_count: The maximum number of errors that is considered acceptable.
created_at: The time at which the measurement was taken.
"""
"""Snapshot of the state of the client."""

error_count: int
"""The number of errors (HTTP 429) that occurred."""

max_error_count: int
"""The maximum number of errors that is considered acceptable."""

created_at: datetime = field(default_factory=lambda: datetime.now(timezone.utc))
"""The time at which the system load information was measured."""

@property
def is_overloaded(self) -> bool:
"""Returns whether the client is considered as overloaded."""
"""Indicate whether the client is considered as overloaded."""
return self.error_count > self.max_error_count


Expand Down
2 changes: 1 addition & 1 deletion src/crawlee/_log_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -97,7 +97,7 @@ def __init__(
*args: Any,
**kwargs: Any,
) -> None:
"""Create a new instance.
"""A default constructor.
Args:
include_logger_name: Include logger name at the beginning of the log line.
Expand Down
Loading

0 comments on commit 4a07dcc

Please sign in to comment.