Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 7 additions & 3 deletions aiperf/common/config/user_config.py
Original file line number Diff line number Diff line change
Expand Up @@ -227,9 +227,13 @@ def _count_dataset_entries(self) -> int:

@model_validator(mode="after")
def _compute_config(self) -> Self:
"""Compute additional configuration.

This method is automatically called after the model is validated to compute additional configuration.
"""
Compute derived configuration fields and populate any missing artifact directory.

If `output.artifact_directory` was not set by the user, computes and assigns it from `_compute_artifact_directory()`.

Returns:
Self: The same UserConfig instance with computed fields applied.
"""

if "artifact_directory" not in self.output.model_fields_set:
Expand Down
135 changes: 119 additions & 16 deletions aiperf/common/enums/metric_enums.py
Original file line number Diff line number Diff line change
Expand Up @@ -205,7 +205,18 @@ class PowerMetricUnitInfo(BaseMetricUnitInfo):
watts: float

def convert_to(self, other_unit: "MetricUnitT", value: int | float) -> float:
"""Convert a value from this unit to another unit."""
"""
Convert a power quantity from this unit to another unit.

If the target unit is not a power unit, defers to the base unit conversion behavior.

Parameters:
other_unit (MetricUnitT): Target unit to convert to.
value (int | float): Power value expressed in this unit.

Returns:
float: The converted power value expressed in the target unit.
"""
if not isinstance(other_unit, PowerMetricUnit | PowerMetricUnitInfo):
return super().convert_to(other_unit, value)

Expand All @@ -228,17 +239,32 @@ class PowerMetricUnit(BaseMetricUnit):

@cached_property
def info(self) -> PowerMetricUnitInfo:
"""Get the info for the power unit."""
"""
Return the metadata object associated with this power unit.

Returns:
PowerMetricUnitInfo: Unit metadata containing attributes such as `long_name` and `watts`.
"""
return self._info # type: ignore

@cached_property
def watts(self) -> float:
"""The number of watts in the power unit."""
"""
Power of this unit expressed in watts.

Returns:
float: Number of watts represented by this power unit.
"""
return self.info.watts

@cached_property
def long_name(self) -> str:
"""The long name of the power unit."""
"""
Human-readable long name for the power unit.

Returns:
str: The long descriptive name of the unit.
"""
return self.info.long_name


Expand All @@ -249,7 +275,16 @@ class EnergyMetricUnitInfo(BaseMetricUnitInfo):
joules: float

def convert_to(self, other_unit: "MetricUnitT", value: int | float) -> float:
"""Convert a value from this unit to another unit."""
"""
Convert a numeric energy value from this unit into the specified target energy unit.

Parameters:
other_unit (MetricUnitT): Target unit; expected to be an EnergyMetricUnit or EnergyMetricUnitInfo.
value (int | float): Energy value in this unit to convert.

Returns:
float: The input `value` expressed in `other_unit`.
"""
if not isinstance(other_unit, EnergyMetricUnit | EnergyMetricUnitInfo):
return super().convert_to(other_unit, value)

Expand Down Expand Up @@ -277,7 +312,12 @@ class EnergyMetricUnit(BaseMetricUnit):

@cached_property
def info(self) -> EnergyMetricUnitInfo:
"""Get the info for the energy unit."""
"""
Get the EnergyMetricUnitInfo associated with this energy unit.

Returns:
EnergyMetricUnitInfo: The info object for this energy unit.
"""
return self._info # type: ignore

@cached_property
Expand All @@ -287,7 +327,12 @@ def joules(self) -> float:

@cached_property
def long_name(self) -> str:
"""The long name of the energy unit."""
"""
Human-readable long name of the energy unit.

Returns:
str: The long, human-readable name of the energy unit.
"""
return self.info.long_name


Expand Down Expand Up @@ -463,7 +508,15 @@ def dtype(self) -> Any:

@classmethod
def from_python_type(cls, type: type[MetricValueTypeT]) -> "MetricValueType":
"""Get the MetricValueType for a given type."""
"""
Map a Python type to the corresponding MetricValueType.

Parameters:
type (type[MetricValueTypeT]): The Python type to map (e.g., float, int, list[int], or the type variable `MetricValueTypeVarT`).

Returns:
MetricValueType: The MetricValueType that corresponds to the provided Python type. If `MetricValueTypeVarT` is provided, returns the float-backed MetricValueType.
"""
# If the type is a simple type like float or int, we have to use __name__.
# This is because using str() on float or int will return <class 'float'> or <class 'int'>, etc.
type_name = type.__name__
Expand All @@ -482,7 +535,18 @@ class FrequencyMetricUnitInfo(BaseMetricUnitInfo):
hertz: float

def convert_to(self, other_unit: "MetricUnitT", value: int | float) -> float:
"""Convert a value from this unit to another unit."""
"""
Convert a numeric value from this frequency unit to another frequency unit.

If `other_unit` is not a frequency unit, the conversion is delegated to the base implementation.

Parameters:
other_unit (MetricUnitT): Target unit (typically a FrequencyMetricUnit or FrequencyMetricUnitInfo).
value (int | float): Numeric value in this unit to convert.

Returns:
float: The input value expressed in `other_unit`.
"""
if not isinstance(other_unit, FrequencyMetricUnit | FrequencyMetricUnitInfo):
return super().convert_to(other_unit, value)

Expand Down Expand Up @@ -510,7 +574,12 @@ class FrequencyMetricUnit(BaseMetricUnit):

@cached_property
def info(self) -> FrequencyMetricUnitInfo:
"""Get the info for the frequency unit."""
"""
Access the FrequencyMetricUnit's associated FrequencyMetricUnitInfo.

Returns:
FrequencyMetricUnitInfo: The unit's info object containing metadata such as `hertz` and `long_name`.
"""
return self._info # type: ignore

@cached_property
Expand All @@ -520,7 +589,12 @@ def hertz(self) -> float:

@cached_property
def long_name(self) -> str:
"""The long name of the frequency unit."""
"""
Return the long human-readable name of the frequency unit.

Returns:
str: The unit's descriptive long name.
"""
return self.info.long_name


Expand All @@ -532,7 +606,16 @@ class TemperatureMetricUnitInfo(BaseMetricUnitInfo):
offset: float = 0.0

def convert_to(self, other_unit: "MetricUnitT", value: int | float) -> float:
"""Convert a value from this unit to another unit."""
"""
Convert a temperature value from this temperature unit to another temperature unit.

Parameters:
other_unit (MetricUnitT): Target unit. If it is not a TemperatureMetricUnit or TemperatureMetricUnitInfo, conversion is delegated to the base implementation.
value (int | float): Temperature value expressed in this unit.

Returns:
float: The temperature converted to the target unit.
"""
if not isinstance(
other_unit, TemperatureMetricUnit | TemperatureMetricUnitInfo
):
Expand Down Expand Up @@ -567,22 +650,42 @@ class TemperatureMetricUnit(BaseMetricUnit):

@cached_property
def info(self) -> TemperatureMetricUnitInfo:
"""Get the info for the temperature unit."""
"""
Return the TemperatureMetricUnitInfo associated with this unit.

Returns:
TemperatureMetricUnitInfo: Metadata and conversion rules for this temperature unit.
"""
return self._info # type: ignore

@cached_property
def celsius(self) -> float:
"""The celsius conversion factor."""
"""
Conversion factor to convert a value in this temperature unit to degrees Celsius.

Returns:
float: Multiplier to convert a value from this unit into degrees Celsius.
"""
return self.info.celsius

@cached_property
def offset(self) -> float:
"""The offset for temperature conversion."""
"""
Offset added to temperature values during conversion.

Returns:
float: The offset value for this temperature unit.
"""
return self.info.offset

@cached_property
def long_name(self) -> str:
"""The long name of the temperature unit."""
"""
Human-readable long name of the temperature unit.

Returns:
str: The long-form name of the unit.
"""
return self.info.long_name


Expand Down
7 changes: 6 additions & 1 deletion aiperf/common/messages/telemetry_messages.py
Original file line number Diff line number Diff line change
Expand Up @@ -31,7 +31,12 @@ class TelemetryRecordsMessage(BaseServiceMessage):

@property
def valid(self) -> bool:
"""Whether the telemetry collection was valid."""
"""
Indicates whether collected telemetry records are present and no error occurred.

Returns:
bool: `True` if `error` is None and there is at least one record in `records`, `False` otherwise.
"""

return self.error is None and len(self.records) > 0

Expand Down
62 changes: 32 additions & 30 deletions aiperf/common/models/telemetry_models.py
Original file line number Diff line number Diff line change
Expand Up @@ -115,11 +115,12 @@ class GpuMetricTimeSeries(AIPerfBaseModel):
)

def append_snapshot(self, metrics: dict[str, float], timestamp_ns: int) -> None:
"""Add new snapshot with all metrics at once.

Args:
metrics: Dictionary of metric_name -> value for this timestamp
timestamp_ns: Timestamp when measurements were taken
"""
Append a timestamped snapshot containing the provided metric values to the time series.

Parameters:
metrics (dict[str, float]): Mapping of metric names to values; entries with `None` values are omitted.
timestamp_ns (int): Timestamp for the snapshot in nanoseconds.
"""
snapshot = GpuTelemetrySnapshot(
timestamp_ns=timestamp_ns,
Expand All @@ -145,19 +146,21 @@ def get_metric_values(self, metric_name: str) -> list[tuple[float, int]]:
def to_metric_result(
self, metric_name: str, tag: str, header: str, unit: str
) -> MetricResult:
"""Convert metric time series to MetricResult with statistical summary.

Args:
metric_name: Name of the metric to analyze
tag: Unique identifier for this metric (used by dashboard, exports, API)
header: Human-readable name for display
unit: Unit of measurement (e.g., "W" for Watts, "%" for percentage)

"""
Create a MetricResult summarizing the time-series values for a given metric.

Parameters:
metric_name (str): Metric key to extract from the time series.
tag (str): Identifier used by dashboards, exports, and APIs for this metric.
header (str): Human-readable display name for the metric.
unit (str): Measurement unit (for example, "W" for watts or "%" for percent).

Returns:
MetricResult with min/max/avg/percentiles computed from time series

MetricResult: Aggregated statistics for the metric, including min, max, average,
standard deviation, count, and selected percentiles (1, 5, 25, 50, 75, 90, 95, 99).

Raises:
NoMetricValue: If no data points are available for the specified metric
NoMetricValue: If no data points exist for the specified metric.
"""
data_points = self.get_metric_values(metric_name)

Expand Down Expand Up @@ -205,12 +208,14 @@ class GpuTelemetryData(AIPerfBaseModel):
)

def add_record(self, record: TelemetryRecord) -> None:
"""Add telemetry record as a grouped snapshot.

Args:
record: New telemetry data point from DCGM collector

Note: Groups all metric values from the record into a single snapshot
"""
Append a grouped snapshot of present metric values from a TelemetryRecord to the GPU time series.

Parameters:
record (TelemetryRecord): Telemetry data point from the DCGM collector.

Description:
Creates a snapshot containing only metrics with non-`None` values from `record` and appends it to the underlying time series if at least one metric is present.
"""
metric_mapping = {
"gpu_power_usage": record.gpu_power_usage,
Expand Down Expand Up @@ -268,14 +273,11 @@ class TelemetryHierarchy(AIPerfBaseModel):
)

def add_record(self, record: TelemetryRecord) -> None:
"""Add telemetry record to hierarchical storage.

Args:
record: New telemetry data from GPU monitoring

Note: Automatically creates hierarchy levels as needed:
- New DCGM endpoints get empty GPU dict
- New GPUs get initialized with metadata and empty metrics
"""
Store a TelemetryRecord in the hierarchy organized by DCGM endpoint URL and GPU UUID, initializing missing endpoint entries and per-GPU metadata.

Parameters:
record (TelemetryRecord): Telemetry data point to store; may initialize a new GPU entry with its static metadata if the GPU is not yet present.
"""

if record.dcgm_url not in self.dcgm_endpoints:
Expand Down
Loading
Loading