2
2
3
3
import pynvml
4
4
5
- from packaging import version
6
-
7
5
from ..tb_dcgm_types .gpu_free_memory import GPUFreeMemory
8
6
from ..tb_dcgm_types .gpu_peak_memory import GPUPeakMemory
9
7
from ..tb_dcgm_types .gpu_power_usage import GPUPowerUsage
14
12
15
13
16
14
class NVMLMonitor (Monitor ):
17
- """
18
- Use NVML to monitor GPU metrics
19
- """
15
+ """Use NVML to monitor GPU metrics."""
20
16
21
17
# Mapping between the NVML Fields and Model Analyzer Records
22
18
# For more explanations, please refer to https://docs.nvidia.com/deploy/nvml-api/group__nvmlDeviceQueries.html
@@ -28,7 +24,8 @@ class NVMLMonitor(Monitor):
28
24
}
29
25
30
26
def __init__ (self , gpus , frequency , metrics ):
31
- """
27
+ """Initialize the NVML monitor.
28
+
32
29
Parameters
33
30
----------
34
31
gpus : list of GPUDevice
@@ -48,24 +45,13 @@ def __init__(self, gpus, frequency, metrics):
48
45
self ._gpus = gpus
49
46
# gpu handles: {gpu: handle}
50
47
self ._gpu_handles = {}
51
- self ._nvmlDeviceGetHandleByUUID = None
52
- self .check_nvml_compatibility ()
48
+ self ._nvmlDeviceGetHandleByUUID = self ._nvml .nvmlDeviceGetHandleByUUID
53
49
for gpu in self ._gpus :
54
50
self ._gpu_handles [gpu ] = self ._nvmlDeviceGetHandleByUUID (gpu .device_uuid ())
55
51
self ._records [gpu ] = {}
56
52
for metric in self ._metrics :
57
53
self ._records [gpu ][metric ] = []
58
54
59
- def check_nvml_compatibility (self ):
60
- # check pynvml version, if it is less than 11.5.0, convert uuid to bytes
61
- current_version = version .parse (pynvml .__version__ )
62
- if current_version < version .parse ("11.5.0" ):
63
- self ._nvmlDeviceGetHandleByUUID = (
64
- self ._nvmlDeviceGetHandleByUUID_for_older_pynvml
65
- )
66
- else :
67
- self ._nvmlDeviceGetHandleByUUID = self ._nvml .nvmlDeviceGetHandleByUUID
68
-
69
55
def _nvmlDeviceGetHandleByUUID_for_older_pynvml(self, uuid):
    """Look up an NVML device handle by UUID, passing the UUID as bytes.

    NOTE(review): judging by the method name, this is meant for older
    pynvml releases that expect a bytes UUID -- confirm against callers.

    Parameters
    ----------
    uuid : str or bytes
        Device UUID. A ``str`` is encoded to ASCII bytes before the call;
        any other value (e.g. ``bytes``) is forwarded unchanged.

    Returns
    -------
    object
        Whatever ``self._nvml.nvmlDeviceGetHandleByUUID`` returns.

    Raises
    ------
    UnicodeEncodeError
        If ``uuid`` is a ``str`` containing non-ASCII characters.
    """
    # Only str needs encoding; an already-encoded UUID has no .encode()
    # and would otherwise fail with AttributeError.
    raw_uuid = uuid.encode("ascii") if isinstance(uuid, str) else uuid
    return self._nvml.nvmlDeviceGetHandleByUUID(raw_uuid)
71
57
0 commit comments