Description
There is no sample OpenTelemetryCollector resource definition file for deploying the ADOT Collector with the ADOT Operator to capture Container Insights infrastructure metrics.
A CloudWatch sample does exist, but it covers Prometheus support for Container Insights, not the collection of Container Insights infrastructure metrics.
Users can create an OpenTelemetryCollector resource by referring to the manifest that deploys the ADOT Collector directly for Container Insights, but it would be better if such a configuration were provided as a sample.
The definition would probably look something like this:
# create namespace
apiVersion: v1
kind: Namespace
metadata:
  name: aws-otel-eks
  labels:
    name: aws-otel-eks
---
# create cwagent service account and role binding
apiVersion: v1
kind: ServiceAccount
metadata:
  name: aws-otel-sa
  namespace: aws-otel-eks
---
kind: ClusterRole
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: aoc-agent-role
rules:
  - apiGroups: [""]
    resources: ["pods", "nodes", "endpoints"]
    verbs: ["list", "watch", "get"]
  - apiGroups: ["apps"]
    resources: ["replicasets"]
    verbs: ["list", "watch", "get"]
  - apiGroups: ["batch"]
    resources: ["jobs"]
    verbs: ["list", "watch"]
  - apiGroups: [""]
    resources: ["nodes/proxy"]
    verbs: ["get"]
  - apiGroups: [""]
    resources: ["nodes/stats", "configmaps", "events"]
    verbs: ["create", "get"]
  - apiGroups: [""]
    resources: ["configmaps"]
    verbs: ["update"]
  - apiGroups: [""]
    resources: ["configmaps"]
    resourceNames: ["otel-container-insight-clusterleader"]
    verbs: ["get", "update", "create"]
  - apiGroups: ["coordination.k8s.io"]
    resources: ["leases"]
    verbs: ["create", "get", "update"]
  - apiGroups: ["coordination.k8s.io"]
    resources: ["leases"]
    resourceNames: ["otel-container-insight-clusterleader"]
    verbs: ["get", "update", "create"]
---
kind: ClusterRoleBinding
apiVersion: rbac.authorization.k8s.io/v1
metadata:
  name: aoc-agent-role-binding
subjects:
  - kind: ServiceAccount
    name: aws-otel-sa
    namespace: aws-otel-eks
roleRef:
  kind: ClusterRole
  name: aoc-agent-role
  apiGroup: rbac.authorization.k8s.io
---
apiVersion: opentelemetry.io/v1alpha1
kind: OpenTelemetryCollector
metadata:
  name: adot
  namespace: aws-otel-eks
spec:
  mode: daemonset
  serviceAccount: aws-otel-sa
  podAnnotations:
    prometheus.io/scrape: 'true'
    prometheus.io/port: '8888'
  resources:
    limits:
      cpu: 200m
      memory: 200Mi
    requests:
      cpu: 200m
      memory: 200Mi
  env:
    - name: K8S_NODE_NAME
      valueFrom:
        fieldRef:
          fieldPath: spec.nodeName
    - name: HOST_IP
      valueFrom:
        fieldRef:
          fieldPath: status.hostIP
    - name: HOST_NAME
      valueFrom:
        fieldRef:
          fieldPath: spec.nodeName
    - name: K8S_NAMESPACE
      valueFrom:
        fieldRef:
          fieldPath: metadata.namespace
  volumeMounts:
    - name: rootfs
      mountPath: /rootfs
      readOnly: true
    - name: dockersock
      mountPath: /var/run/docker.sock
      readOnly: true
    - name: containerdsock
      mountPath: /run/containerd/containerd.sock
    - name: varlibdocker
      mountPath: /var/lib/docker
      readOnly: true
    - name: sys
      mountPath: /sys
      readOnly: true
    - name: devdisk
      mountPath: /dev/disk
      readOnly: true
  volumes:
    - name: rootfs
      hostPath:
        path: /
    - name: dockersock
      hostPath:
        path: /var/run/docker.sock
    - name: varlibdocker
      hostPath:
        path: /var/lib/docker
    - name: containerdsock
      hostPath:
        path: /run/containerd/containerd.sock
    - name: sys
      hostPath:
        path: /sys
    - name: devdisk
      hostPath:
        path: /dev/disk/
  config: |
    extensions:
      health_check:
    receivers:
      awscontainerinsightreceiver:
    processors:
      batch/metrics:
        timeout: 60s
    exporters:
      awsemf:
        namespace: ContainerInsights
        log_group_name: '/aws/containerinsights/{ClusterName}/performance'
        log_stream_name: '{NodeName}'
        resource_to_telemetry_conversion:
          enabled: true
        dimension_rollup_option: NoDimensionRollup
        parse_json_encoded_attr_values: [Sources, kubernetes]
        metric_declarations:
          # node metrics
          - dimensions: [[NodeName, InstanceId, ClusterName]]
            metric_name_selectors:
              - node_cpu_utilization
              - node_memory_utilization
              - node_network_total_bytes
              - node_cpu_reserved_capacity
              - node_memory_reserved_capacity
              - node_number_of_running_pods
              - node_number_of_running_containers
          - dimensions: [[ClusterName]]
            metric_name_selectors:
              - node_cpu_utilization
              - node_memory_utilization
              - node_network_total_bytes
              - node_cpu_reserved_capacity
              - node_memory_reserved_capacity
              - node_number_of_running_pods
              - node_number_of_running_containers
              - node_cpu_usage_total
              - node_cpu_limit
              - node_memory_working_set
              - node_memory_limit
          # pod metrics
          - dimensions: [[PodName, Namespace, ClusterName], [Service, Namespace, ClusterName], [Namespace, ClusterName], [ClusterName]]
            metric_name_selectors:
              - pod_cpu_utilization
              - pod_memory_utilization
              - pod_network_rx_bytes
              - pod_network_tx_bytes
              - pod_cpu_utilization_over_pod_limit
              - pod_memory_utilization_over_pod_limit
          - dimensions: [[PodName, Namespace, ClusterName], [ClusterName]]
            metric_name_selectors:
              - pod_cpu_reserved_capacity
              - pod_memory_reserved_capacity
          - dimensions: [[PodName, Namespace, ClusterName]]
            metric_name_selectors:
              - pod_number_of_container_restarts
          # cluster metrics
          - dimensions: [[ClusterName]]
            metric_name_selectors:
              - cluster_node_count
              - cluster_failed_node_count
          # service metrics
          - dimensions: [[Service, Namespace, ClusterName], [ClusterName]]
            metric_name_selectors:
              - service_number_of_running_pods
          # node fs metrics
          - dimensions: [[NodeName, InstanceId, ClusterName], [ClusterName]]
            metric_name_selectors:
              - node_filesystem_utilization
          # namespace metrics
          - dimensions: [[Namespace, ClusterName], [ClusterName]]
            metric_name_selectors:
              - namespace_number_of_running_pods
    service:
      pipelines:
        metrics:
          receivers: [awscontainerinsightreceiver]
          processors: [batch/metrics]
          exporters: [awsemf]
      extensions: [health_check]
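
As a rough sketch of how this could be tried out (assuming the ADOT Operator and cert-manager are already installed, the node role or service account has CloudWatch write permissions such as CloudWatchAgentServerPolicy, and the manifest is saved under a hypothetical file name container-insights-collector.yaml):

# apply the sample and check that the operator created the collector DaemonSet
kubectl apply -f container-insights-collector.yaml
# the operator should name the DaemonSet after the custom resource, e.g. adot-collector
kubectl get daemonset -n aws-otel-eks
kubectl logs -n aws-otel-eks daemonset/adot-collector

Once the pods are running on each node, the awsemf exporter should start writing the performance metrics to the /aws/containerinsights/{ClusterName}/performance log group, where Container Insights picks them up.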