From 263390cf5ba7ae12f7232b753090ef43e81a9f12 Mon Sep 17 00:00:00 2001 From: Pawel Palucki Date: Fri, 26 Apr 2024 16:34:01 +0200 Subject: [PATCH] update README an values comments --- deployment/pcm/README.md | 115 +++++++++++++++---------------- deployment/pcm/values-metal.yaml | 4 +- deployment/pcm/values-vm.yaml | 5 +- deployment/pcm/values.yaml | 61 ++++++++-------- 4 files changed, 92 insertions(+), 93 deletions(-) diff --git a/deployment/pcm/README.md b/deployment/pcm/README.md index 3102695e..a69a19fa 100644 --- a/deployment/pcm/README.md +++ b/deployment/pcm/README.md @@ -59,9 +59,8 @@ kubectl logs ds/pcm ### Requirements -- Full set of metrics requires bare-metal or .metal instance (uncore metrics, RDT, energy, UPI), -- Core metrics (instructions, cycles are also available) on VM instances, -- /sys/fs/resctrl has to be mounted on host OS, +- Full set of metrics (uncore/UPI, RDT, energy) requires bare-metal or .metal cloud instance. +- /sys/fs/resctrl has to be mounted on host OS (for default indirect deployment method), - pod is allowed to be run with privileged capabilities (SYS_ADMIN, SYS_RAWIO) on given namespace in other words: Pod Security Standards allow to run on privileged level, ``` @@ -77,51 +76,19 @@ More information here: https://kubernetes.io/docs/tutorials/security/ns-level-ps ### Defaults -- Use Linux abstraction to access event counters (Linux Perf, resctrl) and run container in un-privileged mode. 
-- hostPort 9738 is exposed on host, (TODO: security review) -- Prometheus podMonitor is disabled - -#### Metric availability and requirements (devices/mounts/permissions) - -| Method | Used interfaces | default | Notes | -|---------------|------------------------------------------------------------| -------- | ------------------------------------------------------------------------------------- | -| indirect | perf, resctrl | v | missing energy metrics, | -| direct | msr | | requires msr module and access to /dev/cpu (non trivial) or privileged access | - - -| Metrics | Available on Hardware | Available through interface | Available through method | -| --------------------- | ----------------------------- | ---------------------------- | ------------------------ | -| core | bare-metal, VM (any) | msr or perf | any | -| uncore (UPI) | bare-metal, VM (all sockets) | msr or perf | any | -| RDT (MBW,L3OCCUP) | bare-metal, VM (all sockets) | msr or resctrl | any | -| energy, temp | bare-metal (only) | msr | direct | -| perf-topdown | | perf only | indirect | - - -| Interface | Requirements | Controlled by (env/helm value) | default helm | Used by source code | Notes | -|---------------|------------------------------------------------------------|---------------------------------|-----------------------|----------------------------------------------------------|-----------------------------------------------------| -| perf | sys_perf_open() perf_paranoid<=0/privileged/CAP_ADMIN | PCM_NO_PERF | use perf | programPerfEvent(), PerfVirtualControlRegister() | | -| perf-uncore | sys_perf_open() perf_paranoid<=0/privileged/CAP_ADMIN | PCM_USE_UNCORE_PERF | use perf for uncore | programPerfEvent(), PerfVirtualControlRegister() | | -| perf-topdown | /sys/bus/event_source/devices/cpu/events | sysMount | yes | cpucounters.cpp:perfSupportsTopDown() | TODO: conflicts with sys/fs/resctrl | -| RDT | uses "msr" or "resctrl" interface | PCM_NO_RDT | yes | 
cpucounters.cpp:isRDTDisabled()/QOSMetricAvailable() | | -| resctrl | RW: /sys/fs/resctrl | PCM_USE_RESCTRL | yes | resctrl.cpp | resctrlHostMount | -| watchdog | RO/RW: /proc/sys/kernel/nmi_watchdog | PCM_KEEP_NMI_WATCHDOG | yes (tries to disable)| src/cpucounters.cpp:disableNMIWatchdog() | | -| msr | RW: /dev/cpu/X/msr + privileged or CAP_ADMIN/CAP_RAWIO | PCM_NO_MSR | msr is disabled | msr.cpp:MsrHandle() | privileged or some method to access /dev/cpu | -| | RW: /dev/mem | ? | msr is disabled | cpucounters.cpp:initUncoreObjects, pci.cpp:PCIHandleM() | privileged or some method to access /dev/cpu | -| | RO/RW: /sys/module/msr/parameters | PCM_NO_MSR | msr is disabled | msr.cpp:MsrHandle() | sysMount | -| | RW: /proc/bus/pci | PCM_USE_UNCORE_PERF | msr is disabled | pci.cpp:PCIHandle() | pciMount | -| | RO: /sys/firmware/acpi/tables/MCFG | PCM_USE_UNCORE_PERF | msr is disabled | pci.cpp:PciHandle::openMcfgTable() | mcfgMount | -| | energy | | | cpucounters.cpp initEnergyMonitoring() | | +- Indirect method uses Linux abstraction to access event counters (Linux Perf, resctrl) and runs container in non-privileged mode. +- hostPort 9738 is exposed on host. (TODO: security review, consider TLS, together with Prometheus scraping !!). +- Prometheus podMonitor is disabled (enable it with --set podMonitor=true). ### Validation on local kind cluster - #### Requirements -- kubectl/kind/helm/jq binaries available in PATH -- docker service up and running +- kubectl/kind/helm/jq binaries available in PATH, +- docker service up and running, +- full set of metrics is available only on a bare-metal instance or cloud .metal instance.
-#### 1) Optionally mount resctrl filesystem +#### 1) (Optionally) mount resctrl filesystem (for RDT metrics) ``` mount -t resctrl resctrl /sys/fs/resctrl @@ -129,13 +96,11 @@ mount -t resctrl resctrl /sys/fs/resctrl #### 2) Create kind based Kubernetes cluster - ``` kind create cluster ``` -**Note** to be able to collect and test resctrl RDT metrics, kind cluster have to be created with additional mounts: - +**Note** to be able to collect and test RDT metrics through resctrl filesystem, kind cluster has to be created with additional mounts: ``` nodes: - role: control-plane @@ -143,8 +108,8 @@ nodes: - hostPath: /sys/fs/resctrl containerPath: /sys/fs/resctrl ``` -or (optionally), create kind cluster with local registry with [this script](https://kind.sigs.k8s.io/docs/user/local-registry/). -and apply the patch using sed: +e.g. create kind cluster with local registry with [this script](https://kind.sigs.k8s.io/docs/user/local-registry/). +and apply the patch to enable resctrl in the following way: ``` wget https://kind.sigs.k8s.io/examples/kind-with-registry.sh @@ -156,7 +121,10 @@ nodes:\ - hostPath: /sys/fs/resctrl\ containerPath: /sys/fs/resctrl\ ' kind-with-registry.sh +``` +Then create the cluster using the patched script above: +``` bash kind-with-registry.sh ``` @@ -170,8 +138,7 @@ Export kind kubeconfig as default for further kubectl commands: kind export kubeconfig ``` - -#### 3) (Optionally) Deploy Node feature discovery +#### 3) (Optionally) Deploy Node Feature Discovery (nfd) ``` # I.a. Using Kustomize: @@ -196,27 +163,23 @@ kubectl get sts prometheus-prometheus-kube-prometheus-prometheus #### 5) Deploy PCM helm chart -Deploy with defaults: ``` -# Deploy to current namespace with defaults +# a) Deploy to current namespace with defaults helm install pcm . -# Alternatively deploy with NFD and with Prometheus enabled +# b) Alternatively deploy with NFD and/or with Prometheus enabled helm install pcm .
--set podMonitor=true -kubectl get podmonitor pcm helm install pcm . --set nfd=true -# Alternatively deploy with NFD and with Prometheus enabled into own "pcm" namespace +# c) Alternatively deploy with NFD and with Prometheus enabled into own "pcm" namespace helm install pcm . --namespace pcm ``` -#### 6) Check metrics +#### 6) Check metrics are exported Run proxy in background: ``` kubectl proxy & -# for access from another host TODO to be remove (unsecure!!!) -kubectl proxy --address 0.0.0.0 & ``` Access PCM metrics directly: @@ -232,7 +195,7 @@ curl -Ls http://127.0.0.1:8001/api/v1/namespaces/default/pods/$podname/proxy/met curl -Ls http://127.0.0.1:8001/api/v1/namespaces/default/pods/$podname/proxy/metrics | grep DRAM_Joules_Consumed # source: energy ``` -or through Prometheus UI/prom tool: +or through Prometheus UI/prom tool (requires Prometheus operator to be deployed and helm install with `--set podMonitor=true`): ``` http://127.0.0.1:8001/api/v1/namespaces/default/services/prometheus-kube-prometheus-prometheus:http-web/proxy/graph promtool query range --step 1m http://127.0.0.1:8001/api/v1/namespaces/default/services/prometheus-kube-prometheus-prometheus:http-web/proxy 'rate(DRAM_Writes{aggregate="system"}[5m])/1e9' @@ -265,7 +228,7 @@ helm install pcm-vm . -f values-vm.yaml helm install pcm-metal . -f values-metal.yaml ``` -#### Direct as non-privileged container +#### Direct method as non-privileged container (not recommended) **Note** PCM requires access to /dev/cpu device in read writer mode (MSR access) but it is no possible currently to mount devices in Kubernetes pods/containers in vanila Kubernetes. Please read this isses for more information https://github.com/kubernetes/kubernetes/issues/5607. @@ -350,7 +313,39 @@ docker push localhost:5001/pcm-local helm install pcm .
-f values-local-image.yaml ``` -##### Troubleshooting +#### Troubleshooting + +##### Metric availability and requirements (devices/mounts/permissions) + +| Method | Used interfaces | default | Notes | +|---------------|------------------------------------------------------------| -------- | ------------------------------------------------------------------------------------- | +| indirect | perf, resctrl | v | missing energy metrics, | +| direct | msr | | requires msr module and access to /dev/cpu (non trivial) or privileged access | + + +| Metrics | Available on Hardware | Available through interface | Available through method | +| --------------------- | ----------------------------- | ---------------------------- | ------------------------ | +| core | bare-metal, VM (any) | msr or perf | any | +| uncore (UPI) | bare-metal, VM (all sockets) | msr or perf | any | +| RDT (MBW,L3OCCUP) | bare-metal, VM (all sockets) | msr or resctrl | any | +| energy, temp | bare-metal (only) | msr | direct | +| perf-topdown | | perf only | indirect | + + +| Interface | Requirements | Controlled by (env/helm value) | default helm | Used by source code | Notes | +|---------------|------------------------------------------------------------|---------------------------------|-----------------------|----------------------------------------------------------|-----------------------------------------------------| +| perf | sys_perf_open() perf_paranoid<=0/privileged/CAP_ADMIN | PCM_NO_PERF | use perf | programPerfEvent(), PerfVirtualControlRegister() | | +| perf-uncore | sys_perf_open() perf_paranoid<=0/privileged/CAP_ADMIN | PCM_USE_UNCORE_PERF | use perf for uncore | programPerfEvent(), PerfVirtualControlRegister() | | +| perf-topdown | /sys/bus/event_source/devices/cpu/events | sysMount | yes | cpucounters.cpp:perfSupportsTopDown() | TODO: conflicts with sys/fs/resctrl | +| RDT | uses "msr" or "resctrl" interface | PCM_NO_RDT | yes | cpucounters.cpp:isRDTDisabled()/QOSMetricAvailable() 
| | +| resctrl | RW: /sys/fs/resctrl | PCM_USE_RESCTRL | yes | resctrl.cpp | resctrlHostMount | +| watchdog | RO/RW: /proc/sys/kernel/nmi_watchdog | PCM_KEEP_NMI_WATCHDOG | yes (tries to disable)| src/cpucounters.cpp:disableNMIWatchdog() | | +| msr | RW: /dev/cpu/X/msr + privileged or CAP_ADMIN/CAP_RAWIO | PCM_NO_MSR | msr is disabled | msr.cpp:MsrHandle() | privileged or some method to access /dev/cpu | +| | RW: /dev/mem | ? | msr is disabled | cpucounters.cpp:initUncoreObjects, pci.cpp:PCIHandleM() | privileged or some method to access /dev/cpu | +| | RO/RW: /sys/module/msr/parameters | PCM_NO_MSR | msr is disabled | msr.cpp:MsrHandle() | sysMount | +| | RW: /proc/bus/pci | PCM_USE_UNCORE_PERF | msr is disabled | pci.cpp:PCIHandle() | pciMount | +| | RO: /sys/firmware/acpi/tables/MCFG | PCM_USE_UNCORE_PERF | msr is disabled | pci.cpp:PciHandle::openMcfgTable() | mcfgMount | +| | energy | | | cpucounters.cpp initEnergyMonitoring() | | One can replace pcm-sensor-server command and run pcm or sleep to investigate issue add following arguments when install helm chart ``` diff --git a/deployment/pcm/values-metal.yaml b/deployment/pcm/values-metal.yaml index daf01fc6..1ca73c1e 100644 --- a/deployment/pcm/values-metal.yaml +++ b/deployment/pcm/values-metal.yaml @@ -1,6 +1,8 @@ -#### ================ Tunning for VM ================ +#### ================ Tunning for bare-metal instances ================ +# with node-feature-discovery node affinity for non hypervisor and RDT nmiWatchdogMount: false PCM_NO_AWS_WORKAROUND: 1 PCM_KEEP_NMI_WATCHDOG: 0 nfd: true nfdBaremetalAffinity: true +nfdRDTAffinity: true diff --git a/deployment/pcm/values-vm.yaml b/deployment/pcm/values-vm.yaml index 71badb31..0c6d4139 100644 --- a/deployment/pcm/values-vm.yaml +++ b/deployment/pcm/values-vm.yaml @@ -1,4 +1,5 @@ #### ================ Tunning for VM ================ nmiWatchdogMount: true -mcfgMount: false -PCM_NO_RDT: 1 # 0 - try to collect RDT data, enables local/remote memory bandwidth 
+ llc occupancy +# Disable RDT because it is not available for VM instances +PCM_NO_RDT: 1 +resctrlHostMount: false diff --git a/deployment/pcm/values.yaml b/deployment/pcm/values.yaml index c25af6ba..1c0f9f29 100644 --- a/deployment/pcm/values.yaml +++ b/deployment/pcm/values.yaml @@ -1,4 +1,4 @@ -### -------------- Naming ----------- +### -------------- Naming ------------------- # used in # - common label: app.kubernetes.io/name otherwise "Chart name" # - also in selectorLabels together with release.name @@ -7,33 +7,28 @@ nameOverride: "" # Used as daemonset name (usually based on truncated "name + release name") fullnameOverride: "" -### -------------- Image options ----------- +### -------------- Image options ------------ image: repository: ghcr.io/opcm/pcm pullPolicy: IfNotPresent tag: "latest" # uses .Chart.AppVersion if empty imagePullSecrets: {} -### -------------- Security ---------------- -# Configures SecurityContext to not be privileged (by default) so SYS_ADMIN/SYS_RAWIO capabilietes are required for running pod +### -------------- Security ------------------ +# Configures SecurityContext to not be privileged (by default) so SYS_ADMIN/SYS_RAWIO capabilities are required for running pod privileged: false -# PCM deployment to be intergrated with NRI balloons resource policy intergration -# if true, will add special annotation to allow pcm pod use all the core, regardless NRI balloons policy rules.
-nriBalloonsPolicyIntegration: false - -### -------------- Prometheus operator integration -------------------- -# Expose run containerPort "pcm-sensor-server -p 9738" as hostPort, can be empty to disable hostPort -hostPort: 9738 -# Deploy PromtheusOperator PodMonitor -podMonitor: false +### -------------- Required OS affinity ------- +# Should only run on linux +nodeSelector: + kubernetes.io/os: linux ### -------------- Probes --------------------- probes: false -#### ================ Tune this section to handle VM or limited set of metrics +### ================ Metrics configuration ====================== -### -------------- Metrics: Uncore ------------------- +### -------------- Metrics: Uncore ------------ # required for uncore metrics, only in baremetal, not available for VM mcfgMount: false sysMount: false @@ -45,7 +40,7 @@ PCM_NO_MSR: 1 # do not use MSR PCM_NO_PERF: 0 # use Linux Perf over MSR for core metrics PCM_USE_UNCORE_PERF: 1 # use Linux Perf instead of MSR for uncore metrics (collection+detection) -### -------------- Metrics: RDT ---------------------- +### -------------- Metrics: RDT --------------- ### RDT rdt/resctrl: PCM_NO_RDT: 0 # 0 - try to collect RDT data, enables local/remote memory bandwidth + llc occupancy PCM_USE_RESCTRL: 1 # use Linux Perf instead of MSR access (more reliable) @@ -54,10 +49,8 @@ PCM_USE_RESCTRL: 1 # use Linux Perf instead of MSR access (more reli resctrlHostMount: true # mount from external host resctrlInsideMount: false # TODO: mount inside with extra call to mount, requires image with mount installed - doesn't require -### -------------- HW: vm/baremetal/aws/nmi +### -------------- Other (NMI handling and/or on VM/AWS) PCM_IGNORE_ARCH_PERFMON: 0 # After VM is detected through CPUID (hypervisor flag) - check arch_perfmon flag to be also enabled - fail if not avaiable (0 - do check, 1 - disable check) - -### -------------- Other (NMI handling and/or on AWS) # 0: Disabling NMI watchdog since it consumes one hw-PMU
counter, requires nmiWatchdogMount to be true # 1: don't disable NMI watchdog (reducing the core metrics set) - prefferd for production usage! # but even with 0 automatic AWS workround applies! @@ -81,24 +74,32 @@ cpuLimit: 100m cpuRequest: 100m memoryLimit: 512Mi memoryRequest: 256Mi -extraResources: {} # requests + limits +# both "requests" and "limits" levels need to be specified here +extraResources: {} -### =============================== NodeSelector and node-feature-discovery integration ================= -### Enables integration with node-feautre-discovery -# So configuration will be based on discovery made by NFD +### =============================== Integrations with other projects ==================================== +# +### -------------- Prometheus operator -------------------- +# Expose run containerPort "pcm-sensor-server -p 9738" as hostPort, can be empty to disable hostPort +hostPort: 9738 +# Deploy PrometheusOperator PodMonitor (requires hostPort to be not empty) +podMonitor: false + +### -------------- NRI balloons policy plugin ------------- +# PCM deployment to be integrated with NRI balloons resource policy +# if true, will add special annotation to allow pcm pod to use all the cores, regardless of NRI balloons policy rules.
+nriBalloonsPolicyIntegration: false + +### ------------- node-feature-discovery ----------------- # when enabled specific set of labels will be used as node selector (Intel vendor, RDT availability, baremetal) nfd: false -# if enabled daemonset nodeAffinity would required node without feature.node.kubernetes.io/cpu-cpuid.HYPERVISOR flag +# if enabled daemonset nodeAffinity will require node without feature.node.kubernetes.io/cpu-cpuid.HYPERVISOR flag (requires nfd=true) nfdBaremetalAffinity: false -# if enabled +# if enabled, the following RDT labels will be required for scheduling (requires nfd=true) # feature.node.kubernetes.io/cpu-rdt.RDTCMT=true # feature.node.kubernetes.io/cpu-rdt.RDTL3CA=true # feature.node.kubernetes.io/cpu-rdt.RDTMBA=true # feature.node.kubernetes.io/cpu-rdt.RDTMBM=true # feature.node.kubernetes.io/cpu-rdt.RDTMON=true -# * use nodeSelect to only match flags avaiable on metal e.g. -# feature.node.kubernetes.io/cpu-cpuid.VMX: "true" # only appears on metal instances in AWS (not reliable) +nfdRDTAffinity: false -# Should only running on linux and baremetal -nodeSelector: - kubernetes.io/os: linux