Skip to content

Commit

Permalink
[PLINT-467] Added support for vSAN metric/event collection in the vSp…
Browse files Browse the repository at this point in the history
…here integration (#18117)

* [WIP] updating vsphere check to get metrics for vsan clusters

* fixed cluster_name tagging and added a config option for vsan metric collection

* added a changelog

* validated config and models

* made string formatting compatible with python 2

* adding cluster health metrics and only supporting python 3

* fix for cluster health metrics

* adding host metrics and using infrastructure cache in resource discovery

* using format() instead of f-strings

* refactored and added disk metrics

* refactored vsan api code to improve performance

* adding support for vsan events collection

* ensuring that the PR is only tested in Python 3

* removing testing file from tracking

* fixing patching

* refactoring to use existing infra cache and changed event filtering

* adding a vsan test for api.py

* lint

* adding metrics to metadata.csv and improving logging

* removing ratio keyword

* fixing count keyword

* adding cluster health metrics to metadata.csv

* sorted metadata.csv

* adding descriptions to cluster health metrics

* increased complexity of check test

* addressing comments

* quick fix

* another fix

* adding code coverage, removing manual host tags, and removing percent metric prefixes

* adding more code coverage

* lint

* additional code coverage

* addressing more comments

* simplified collect_vsan_metrics logic

* removing disk-level metrics

* removing vsan disk events

* addressed some comments

* addressing more comments

* updating licenses

* updating api.py

* removing unnecessary import

* undoing licensing changes

* further metric filtering

* fixing vsan excluded_host_tags logic

* adding get_mor_tags and get_mor_props to test_vsan_metrics_included_in_check
  • Loading branch information
rahulkaukuntla authored Nov 14, 2024
1 parent b8de861 commit 718623b
Show file tree
Hide file tree
Showing 18 changed files with 1,108 additions and 341 deletions.
6 changes: 6 additions & 0 deletions vsphere/assets/configuration/spec.yaml
Original file line number Diff line number Diff line change
Expand Up @@ -388,6 +388,12 @@ files:
value:
type: boolean
example: false
- name: collect_vsan_data
description: |
If true, the integration will collect additional metrics and events about vSAN clusters and hosts.
value:
type: boolean
example: false
- name: attributes_prefix
description: |
Custom attributes attached to vSphere resources will be prefixed with this prefix when collected.
Expand Down
1 change: 1 addition & 0 deletions vsphere/changelog.d/18117.added
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
Added support for vSAN metric/event collection in the vSphere integration
77 changes: 76 additions & 1 deletion vsphere/datadog_checks/vsphere/api.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,8 +6,9 @@
import ssl
from typing import Any, Callable, List, TypeVar, cast # noqa: F401

import vsanapiutils
from pyVim import connect
from pyVmomi import vim, vmodl
from pyVmomi import SoapStubAdapter, vim, vmodl

from datadog_checks.base.log import CheckLoggingAdapter # noqa: F401
from datadog_checks.vsphere.config import VSphereConfig # noqa: F401
Expand All @@ -17,6 +18,10 @@
MAX_QUERY_METRICS_OPTION,
MOR_TYPE_AS_STRING,
UNLIMITED_HIST_METRICS_PER_QUERY,
VSAN_EVENT_IDS,
)
from datadog_checks.vsphere.metrics import (
ENTITY_REMAPPER,
)
from datadog_checks.vsphere.types import InfrastructureData
from datadog_checks.vsphere.utils import properties_to_collect
Expand Down Expand Up @@ -98,6 +103,7 @@ def __init__(self, config, log):
self.log = log

self._conn = cast(vim.ServiceInstance, None)
self._vsan_stub = cast(SoapStubAdapter, None)
self.smart_connect()

def smart_connect(self):
Expand Down Expand Up @@ -150,6 +156,8 @@ def smart_connect(self):
connect.Disconnect(self._conn)

self._conn = conn
if self.config.collect_vsan:
self._vsan_stub = vsanapiutils.GetVsanVcStub(conn._stub, context=context)
self.log.debug("Connected to %s", version_info.fullName)

@smart_retry
Expand Down Expand Up @@ -382,3 +390,70 @@ def get_max_query_metrics(self):
return max_historical_metrics
else:
return UNLIMITED_HIST_METRICS_PER_QUERY

@smart_retry
def get_vsan_events(self, timestamp):
    """Query the vCenter event manager for vSAN events.

    The query is restricted to the event type ids listed in
    ``VSAN_EVENT_IDS`` and uses ``timestamp`` as the filter's begin time.

    :param timestamp: value passed as ``beginTime`` of the time filter.
    :return: the result of ``QueryEvents`` for that filter.
    """
    manager = self._conn.content.eventManager
    since = vim.event.EventFilterSpec.ByTime(beginTime=timestamp)
    vsan_filter = vim.event.EventFilterSpec(eventTypeId=VSAN_EVENT_IDS, time=since)
    vsan_events = manager.QueryEvents(vsan_filter)
    self.log.debug("Received %s vSAN events", len(vsan_events))
    return vsan_events

@smart_retry
def get_vsan_metrics(self, cluster_nested_elts, entity_ref_ids, id_to_tags, starting_time):
    """Collect vSAN health and performance data for each given cluster.

    For every cluster the health groups are queried first; the first group
    returned is flattened into a dict keyed by metric name. Then one
    performance query spec is built per (nested id, entity prefix) pair and
    sent in a single ``QueryVsanPerf`` call for the cluster.

    :param cluster_nested_elts: mapping of cluster reference -> iterable of
        ids nested under that cluster (presumably the cluster itself and its
        hosts; verify against the caller).
    :param entity_ref_ids: mapping indexed by ``id_to_tags[nested_id][0]``
        whose values are iterables of entity prefixes; each prefix must be a
        key of ``ENTITY_REMAPPER``.
    :param id_to_tags: mapping of id -> sequence. Element 0 selects the entry
        of ``entity_ref_ids``; the whole sequence is attached to each returned
        metric.
    :param starting_time: value passed as ``startTime`` of every query spec.
    :return: ``[health_metrics, performance_metrics]``. ``health_metrics`` has
        one dict per cluster that reported health data, and
        ``performance_metrics`` has one ``QueryVsanPerf`` result per such
        cluster.
    """
    self.log.debug('Querying vSAN metrics')
    vsan_perf_manager = vim.cluster.VsanPerformanceManager('vsan-performance-manager', self._vsan_stub)
    health_metrics = []
    performance_metrics = []
    for cluster_reference, nested_ids in cluster_nested_elts.items():
        # Read the name once per cluster: if `cluster_reference` is a pyVmomi
        # managed object, each attribute access may be a server round trip,
        # and the name is needed once per health test below.
        cluster_name = cluster_reference.name
        self.log.debug("Querying vSAN metrics for cluster %s", cluster_name)

        health_groups = vsan_perf_manager.QueryClusterHealth(cluster_reference)
        # Truthiness covers both an empty list and a `None` result.
        if not health_groups:
            self.log.debug("No health metrics returned for cluster %s", cluster_name)
            # NOTE(review): this also skips the performance query for the
            # cluster - confirm that is intended.
            continue

        # Only the first returned health group is reported.
        health_group = health_groups[0]
        group_id = health_group.groupId
        group_health = health_group.groupHealth
        cluster_health = {
            'vsphere.vsan.cluster.health.count': {
                'group_id': group_id,
                'status': group_health,
                'vsphere_cluster': cluster_name,
            }
        }
        for health_test in health_group.groupTests:
            # The last dotted component of the test id names the metric.
            test_name = health_test.testId.split('.')[-1]
            cluster_health['vsphere.vsan.cluster.health.{}.count'.format(test_name)] = {
                'group_id': group_id,
                'status': group_health,
                'test_id': health_test.testId,
                'test_status': health_test.testHealth,
                'vsphere_cluster': cluster_name,
            }
        health_metrics.append(cluster_health)

        # Entity reference ids are "<prefix><id>"; the prefixes (keys of
        # ENTITY_REMAPPER) already end with ':'.
        vsan_perf_query_spec = [
            vim.cluster.VsanPerfQuerySpec(
                entityRefId=(entity_prefix + str(nested_id)),
                labels=list(ENTITY_REMAPPER[entity_prefix]),
                startTime=starting_time,
            )
            for nested_id in nested_ids
            for entity_prefix in entity_ref_ids[id_to_tags[nested_id][0]]
        ]
        discovered_metrics = vsan_perf_manager.QueryVsanPerf(vsan_perf_query_spec, cluster_reference)
        for entity_result in discovered_metrics:
            entity_values = entity_result.value
            if not entity_values:
                continue
            # Recover the id from the returned "<prefix>:<id>" reference
            # (quotes stripped) and look its tags up once per entity.
            resource_id = entity_result.entityRefId.replace("'", "").split(':')[-1]
            resource_tags = id_to_tags[resource_id]
            for metric in entity_values:
                # The tags ride along on `dynamicProperty` so they come back
                # to the caller together with the metric.
                metric.metricId.dynamicProperty.append(resource_tags)
        performance_metrics.append(discovered_metrics)
    return [health_metrics, performance_metrics]
1 change: 1 addition & 0 deletions vsphere/datadog_checks/vsphere/config.py
Original file line number Diff line number Diff line change
Expand Up @@ -94,6 +94,7 @@ def __init__(self, instance, init_config, log):
self.tags_prefix = instance.get("tags_prefix", DEFAULT_VSPHERE_TAG_PREFIX)
self.should_collect_attributes = is_affirmative(instance.get("collect_attributes", False))
self.collect_property_metrics = is_affirmative(instance.get("collect_property_metrics", False))
self.collect_vsan = is_affirmative(instance.get("collect_vsan_data", False))
self.attr_prefix = instance.get("attributes_prefix", DEFAULT_VSPHERE_ATTR_PREFIX)
self.excluded_host_tags = instance.get("excluded_host_tags", [])
self.base_tags = instance.get("tags", []) + ["vcenter_server:{}".format(self.hostname)]
Expand Down
4 changes: 4 additions & 0 deletions vsphere/datadog_checks/vsphere/config_models/defaults.py
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,10 @@ def instance_collect_tags():
return False


def instance_collect_vsan_data():
    """Return the default for the ``collect_vsan_data`` instance option: disabled."""
    return False


def instance_collection_level():
    """Return the default for the ``collection_level`` instance option: 1."""
    return 1

Expand Down
1 change: 1 addition & 0 deletions vsphere/datadog_checks/vsphere/config_models/instance.py
Original file line number Diff line number Diff line change
Expand Up @@ -146,6 +146,7 @@ class InstanceConfig(BaseModel):
collect_per_instance_filters: Optional[CollectPerInstanceFilters] = None
collect_property_metrics: Optional[bool] = None
collect_tags: Optional[bool] = None
collect_vsan_data: Optional[bool] = None
collection_level: Optional[int] = None
collection_type: Optional[str] = None
connection_reset_timeout: Optional[int] = None
Expand Down
8 changes: 8 additions & 0 deletions vsphere/datadog_checks/vsphere/constants.py
Original file line number Diff line number Diff line change
Expand Up @@ -6,6 +6,7 @@
SOURCE_TYPE = 'vsphere'

BOTH = 'both'
NONE = 'None'
HISTORICAL = 'historical'
REALTIME = 'realtime'

Expand Down Expand Up @@ -218,4 +219,11 @@
'ScheduledTaskEvent',
]

# Event type ids used as the `eventTypeId` filter when querying vSAN events.
VSAN_EVENT_IDS = [
    'vsan.health.test.cluster.consistentconfig.event',
    'vsan.health.test.network.hostdisconnected.event',
]

# Substring looked for in an event's `eventTypeId` to recognise a vSAN event.
VSAN_EVENT_PREFIX = 'vsan'

HOSTNAME_CASE_OPTIONS = ['default', 'lower', 'upper']
5 changes: 5 additions & 0 deletions vsphere/datadog_checks/vsphere/data/conf.yaml.example
Original file line number Diff line number Diff line change
Expand Up @@ -340,6 +340,11 @@ instances:
#
# collect_property_metrics: false

## @param collect_vsan_data - boolean - optional - default: false
## If true, the integration will collect additional metrics and events about vSAN clusters and hosts.
#
# collect_vsan_data: false

## @param attributes_prefix - string - optional
## Custom attributes attached to vSphere resources will be prefixed with this prefix when collected.
## Example use cases:
Expand Down
3 changes: 3 additions & 0 deletions vsphere/datadog_checks/vsphere/event.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
MOR_TYPE_AS_STRING,
PER_RESOURCE_EVENTS,
SOURCE_TYPE,
VSAN_EVENT_PREFIX,
)


Expand Down Expand Up @@ -55,6 +56,8 @@ def _is_filtered(self):
return True

if self.event_type not in self.exclude_filters:
if self.raw_event.eventTypeId and VSAN_EVENT_PREFIX in self.raw_event.eventTypeId:
return False
return True

filters = self.exclude_filters[self.event_type]
Expand Down
96 changes: 96 additions & 0 deletions vsphere/datadog_checks/vsphere/metrics.py
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,13 @@
'vsanDomObj.readCacheHitRate.latest',
}

# vSAN performance labels singled out as percentages.
# NOTE(review): the consumer of this set is not visible here - confirm how
# the check treats these labels.
VSAN_PERCENT_METRICS = {
    'congestion',
    'coreUtilPct',
    'pcpuUsedPct',
    'pcpuUtilPct',
}

# All metrics that can be collected from VirtualMachines.
VM_METRICS = {
REALTIME: [
Expand Down Expand Up @@ -403,6 +410,90 @@
HISTORICAL: [],
}

# vSAN metric names known for clusters: capacity, client I/O and health counts.
VSAN_CLUSTER_METRICS = {
    'vsan.cluster.congestion',
    'vsan.cluster.dedupRatio',
    'vsan.cluster.free',
    'vsan.cluster.health.count',
    'vsan.cluster.health.statsdb.count',
    'vsan.cluster.health.masterexist.count',
    'vsan.cluster.health.collection.count',
    'vsan.cluster.health.hostsmissing.count',
    'vsan.cluster.health.renameddirs.count',
    'vsan.cluster.iopsRead',
    'vsan.cluster.iopsWrite',
    'vsan.cluster.latencyAvgRead',
    'vsan.cluster.latencyAvgWrite',
    'vsan.cluster.oio',
    'vsan.cluster.savedByDedup',
    'vsan.cluster.throughputRead',
    'vsan.cluster.throughputWrite',
    'vsan.cluster.total',
    'vsan.cluster.used',
}


# vSAN metric names known for hosts: client I/O and CPU usage.
VSAN_HOST_METRICS = {
    'vsan.host.clientCacheHitRate',
    'vsan.host.clientCacheHits',
    'vsan.host.congestion',
    'vsan.host.coreUtilPct',
    'vsan.host.iopsRead',
    'vsan.host.iopsUnmap',
    'vsan.host.iopsWrite',
    'vsan.host.latencyAvgRead',
    'vsan.host.latencyAvgUnmap',
    'vsan.host.latencyAvgWrite',
    'vsan.host.oio',
    'vsan.host.pcpuUsedPct',
    'vsan.host.pcpuUtilPct',
    'vsan.host.readCount',
    'vsan.host.throughputRead',
    'vsan.host.throughputUnmap',
    'vsan.host.throughputWrite',
    'vsan.host.writeCount',
}

# Performance labels requested for the `cluster-domclient` entity type.
CLUSTER_DOMCLIENT = {
    'iopsRead',
    'throughputRead',
    'latencyAvgRead',
    'iopsWrite',
    'throughputWrite',
    'latencyAvgWrite',
    'congestion',
    'oio',
}

# Performance labels requested for the `vsan-cluster-capacity` entity type.
VSAN_CLUSTER_CAPACITY = {'total', 'used', 'free', 'savedByDedup', 'dedupRatio'}

# Performance labels requested for the `host-domclient` entity type.
HOST_DOMCLIENT = {
    'iopsRead',
    'throughputRead',
    'latencyAvgRead',
    'readCount',
    'iopsWrite',
    'throughputWrite',
    'latencyAvgWrite',
    'writeCount',
    'congestion',
    'oio',
    'clientCacheHits',
    'clientCacheHitRate',
    'iopsUnmap',
    'throughputUnmap',
    'latencyAvgUnmap',
}

# Performance labels requested for the `host-cpu` entity type.
HOST_CPU = {'coreUtilPct', 'pcpuUtilPct', 'pcpuUsedPct'}

# Entity-type prefix -> labels to query for it. The keys keep their trailing
# ':' because the resource id is appended directly to form the entity ref id.
ENTITY_REMAPPER = {
    'cluster-domclient:': CLUSTER_DOMCLIENT,
    'vsan-cluster-capacity:': VSAN_CLUSTER_CAPACITY,
    'host-domclient:': HOST_DOMCLIENT,
    'host-cpu:': HOST_CPU,
}

# All metrics that can be collected from Datastores.
DATASTORE_METRICS = {
REALTIME: [],
Expand All @@ -426,6 +517,11 @@
],
}

# Resource kind -> vSAN metric names accepted for that kind.
# NOTE(review): the consumer of this mapping is not visible here - confirm
# that the keys match the resource-type strings used by the check.
ALLOWED_METRICS_FOR_VSAN = {
    'cluster': VSAN_CLUSTER_METRICS,
    'host': VSAN_HOST_METRICS,
}

# All metrics that can be collected from Datacenters.
DATACENTER_METRICS = {
'realtime': [],
Expand Down
Loading

0 comments on commit 718623b

Please sign in to comment.