From 2f3a04d9680d2c5da45693db6269e509c2a3641f Mon Sep 17 00:00:00 2001 From: Pawel Palucki Date: Mon, 12 Feb 2024 17:57:06 +0100 Subject: [PATCH] Helm chart for pcm - initial version --- .gitignore | 4 +- Dockerfile | 5 +- KUBERNETES.md | 92 +++++ deployment/pcm/.helmignore | 25 ++ deployment/pcm/Chart.yaml | 9 + deployment/pcm/LICENSE | 30 ++ deployment/pcm/README.md | 321 ++++++++++++++++++ .../helm_chart_test_and_notes_TODO/NOTES.txt | 6 + .../_tests/test-connection.yaml | 15 + deployment/pcm/k8s-test.sh | 77 +++++ deployment/pcm/templates/_helpers.tpl | 79 +++++ deployment/pcm/templates/daemonset.yaml | 184 ++++++++++ deployment/pcm/templates/podmonitor.yaml | 38 +++ deployment/pcm/values-device-injector.yaml | 16 + deployment/pcm/values-direct.yaml | 9 + deployment/pcm/values-local-image.yaml | 4 + deployment/pcm/values-metal.yaml | 6 + .../pcm/values-smarter-devices-cpu-mem.yaml | 9 + deployment/pcm/values-vm.yaml | 4 + deployment/pcm/values.yaml | 104 ++++++ src/cpucounters.cpp | 12 +- src/pcm.cpp | 2 +- src/resctrl.cpp | 7 +- src/resctrl.h | 2 +- 24 files changed, 1047 insertions(+), 13 deletions(-) create mode 100644 KUBERNETES.md create mode 100644 deployment/pcm/.helmignore create mode 100644 deployment/pcm/Chart.yaml create mode 100644 deployment/pcm/LICENSE create mode 100644 deployment/pcm/README.md create mode 100644 deployment/pcm/helm_chart_test_and_notes_TODO/NOTES.txt create mode 100644 deployment/pcm/helm_chart_test_and_notes_TODO/_tests/test-connection.yaml create mode 100644 deployment/pcm/k8s-test.sh create mode 100644 deployment/pcm/templates/_helpers.tpl create mode 100644 deployment/pcm/templates/daemonset.yaml create mode 100644 deployment/pcm/templates/podmonitor.yaml create mode 100644 deployment/pcm/values-device-injector.yaml create mode 100644 deployment/pcm/values-direct.yaml create mode 100644 deployment/pcm/values-local-image.yaml create mode 100644 deployment/pcm/values-metal.yaml create mode 100644 
deployment/pcm/values-smarter-devices-cpu-mem.yaml create mode 100644 deployment/pcm/values-vm.yaml create mode 100644 deployment/pcm/values.yaml diff --git a/.gitignore b/.gitignore index fdf68e8c..5317d4e0 100644 --- a/.gitignore +++ b/.gitignore @@ -32,4 +32,6 @@ latex/ .vs/ .idea/ build -src/simdjson \ No newline at end of file +src/simdjson +/deployment/pcm/smarter-device-manager/ +/deployment/pcm/nri/ diff --git a/Dockerfile b/Dockerfile index c9f8e084..b854d4b2 100644 --- a/Dockerfile +++ b/Dockerfile @@ -2,9 +2,12 @@ FROM fedora:40@sha256:4e007f288dce23966216be81ef62ba05d139b9338f327c1d1c73b7167d RUN dnf -y install gcc-c++ git findutils make cmake COPY . /tmp/pcm -RUN cd /tmp/pcm && mkdir build && cd build && cmake .. && make +# TEMPORARY change to be remove before merge, to not conflict with local builds + use cache +# WARNING this approach requires me to build locally before docker build to get updated +RUN --mount=type=cache,target=/tmp/pcm/build2 cd /tmp/pcm && cd build2 && cmake -D CMAKE_BUILD_TYPE=Debug .. && make -j pcm pcm-sensor-server FROM fedora:40@sha256:4e007f288dce23966216be81ef62ba05d139b9338f327c1d1c73b7167dd47312 +RUN yum install -y strace gdb util-linux COPY --from=builder /tmp/pcm/build/bin/* /usr/local/bin/ ENV PCM_NO_PERF=1 diff --git a/KUBERNETES.md b/KUBERNETES.md new file mode 100644 index 00000000..dacd7295 --- /dev/null +++ b/KUBERNETES.md @@ -0,0 +1,92 @@ +##### Create kind based development cluster + +```sh +kind create cluster +kind export kubeconfig +``` + +##### 1) Install Prometheus operator required for PodMonitor CRD + +E.g. 
use prometheus operator helm chart from here: https://artifacthub.io/packages/helm/prometheus-community/kube-prometheus-stack + +https://prometheus-operator.dev/docs/operator/design/#podmonitor + +```sh +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts + +helm repo update +helm repo list + +# check prometheus source if needed +helm pull prometheus-community/kube-prometheus-stack +tar -xzvf kube-prometheus-stack-56.6.2.tgz + +# show/customize values e.g. pod monitor namespace selector +helm show values prometheus-community/kube-prometheus-stack +helm show values prometheus-community/kube-prometheus-stack | vim - +'set ft=yaml' +helm template prometheus-community/kube-prometheus-stack --set-json prometheus.prometheusSpec.podMonitorNamespaceSelector='{"matchLabels": {"kubernetes.io/metadata.name": "intel-pcm"}}' | vim - +'set ft=yaml' + +# Install +helm install prometheus prometheus-community/kube-prometheus-stack --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false + +helm list + +kubectl --namespace default get pods -l "release=prometheus" +``` + +To remove: +```sh +helm uninstall prometheus +helm repo remove prometheus-community +``` + +##### 2) Deploy PCM daemonset + +```sh +kubectl apply -f pcm-kubernetes.yaml + +# and verify ... 
+kubectl -n intel-pcm get daemonset +kubectl -n intel-pcm get pods +podname=`kubectl -n intel-pcm get pods -ojsonpath='{.items[0].metadata.name}'` +``` + +##### 3) Verirfy PCM metrics are collected by Prometheus + +```sh +kubectl proxy & +``` + +Access PCM metrics directly: +```sh +curl -Ls http://127.0.0.1:8001/api/v1/namespaces/intel-pcm/pods/$podname/proxy/metrics +curl -Ls http://127.0.0.1:8001/api/v1/namespaces/intel-pcm/pods/$podname/proxy/metrics | grep DRAM_Writes +``` + +or through Prometheus UI: +``` +http://127.0.0.1:8001/api/v1/namespaces/default/services/prometheus-kube-prometheus-prometheus:http-web/proxy/graph +promtool query range --step 1m http://127.0.0.1:8001/api/v1/namespaces/default/services/prometheus-kube-prometheus-prometheus:http-web/proxy 'rate(DRAM_Writes{aggregate="system"}[5m])/1e9' +promtool query instant http://127.0.0.1:8001/api/v1/namespaces/default/services/prometheus-kube-prometheus-prometheus:http-web/proxy 'avg by(__name__) ({job="pcm"})' +``` + +query metrics via promtool +```sh +promtool query range --step 1m http://127.0.0.1:8001/api/v1/namespaces/default/services/prometheus-kube-prometheus-prometheus:http-web/proxy 'rate(DRAM_Writes{aggregate="system"}[5m])/1e9' +promtool query instant http://127.0.0.1:8001/api/v1/namespaces/default/services/prometheus-kube-prometheus-prometheus:http-web/proxy 'avg by(__name__) ({job="pcm"})' +``` + +run some workloads: +```sh +kubectl run -ti stress --image=alexeiled/stress-ng --restart=Never --rm -- --stream 8 +``` + + +##### Uninstall PCM + +```sh +kubectl delete -f pcm-kubernetes.yaml +``` + + + diff --git a/deployment/pcm/.helmignore b/deployment/pcm/.helmignore new file mode 100644 index 00000000..0bbd73a0 --- /dev/null +++ b/deployment/pcm/.helmignore @@ -0,0 +1,25 @@ +# Patterns to ignore when building packages. +# This supports shell glob matching, relative path matching, and +# negation (prefixed with !). Only one pattern per line. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*.orig +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ +smarter-device-manager/ +nri/ diff --git a/deployment/pcm/Chart.yaml b/deployment/pcm/Chart.yaml new file mode 100644 index 00000000..f200feaa --- /dev/null +++ b/deployment/pcm/Chart.yaml @@ -0,0 +1,9 @@ +apiVersion: v2 +name: pcm +version: 0.1.0 +appVersion: "202403" +description: A PCM Helm chart for Kubernetes +home: https://github.com/intel/pcm +maintainers: + - name: Pawel Palucki + email: pawel.palucki@intel.com diff --git a/deployment/pcm/LICENSE b/deployment/pcm/LICENSE new file mode 100644 index 00000000..1fee9e76 --- /dev/null +++ b/deployment/pcm/LICENSE @@ -0,0 +1,30 @@ +BSD 3-Clause License + +Copyright (c) 2009-2023, Intel Corporation +Copyright (c) 2016-2020, opcm +All rights reserved. + +Redistribution and use in source and binary forms, with or without +modification, are permitted provided that the following conditions are met: + +* Redistributions of source code must retain the above copyright notice, this + list of conditions and the following disclaimer. + +* Redistributions in binary form must reproduce the above copyright notice, + this list of conditions and the following disclaimer in the documentation + and/or other materials provided with the distribution. + +* Neither the name of the copyright holder nor the names of its + contributors may be used to endorse or promote products derived from + this software without specific prior written permission. + +THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" +AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE +DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE +FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL +DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR +SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER +CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, +OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE +OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. diff --git a/deployment/pcm/README.md b/deployment/pcm/README.md new file mode 100644 index 00000000..fb2aa9c8 --- /dev/null +++ b/deployment/pcm/README.md @@ -0,0 +1,321 @@ +-------------------------------------------------------------------------------- +Helm chart instructions +-------------------------------------------------------------------------------- + +### Features: + +- privilege and non-privileged container (value: `privileged`), +- node-feature-discovery based nodeSelector and nodeAffinity (values: nfd, nfdBaremetalAffinity, nfdRDTAffinity) +- bare-metal and VM host configurations (files: values-metal.yaml, values-vm.yaml), +- Ability to deploy multiple releases alongside configured differently to handle different kinds of machines (bare-metal, VM) at the same time, +- Examples for non-privileged mode using device plugin ("smarter-devices-manager") or using NRI device-injector plugin (TODO) (file: values-smarter-devices-cpu-mem.yaml), +- Deploy Prometheus operator' PodMonitor (value: `podMonitor`) +- Integration with NRI balloons policy plugin (value: `nriBalloonsPolicyIntegration`), +- Controllable set of metrics and method of collection (RDT, uncore), support direct (msr) and indirect (Linux abstractions perf/resctrl) counter accesses (file: values-indirect.yaml) +- Linux Watchdog handling (controlled with PCM_KEEP_NMI_WATCHDOG, PCM_NO_AWS_WORKAROUND, nmiWatchdogMount values) +- Deploy to own namespace with "helm install ... 
**-n pcm --create-namespace**" +- Local image registry for development (file: values-local-image.yaml), + +TODO/Ideas: +- [ ] Refactor extra features: node-feature-discovery, NRI interegration only as extra values for generic fields (annotations, nodeSelector/nodeAffinity) +- [ ] Check if energy metrics can be accessible through perf subsystem +- [ ] GitHub actions for linter/security scanners, +- [ ] Idea: Change metrics names (follow Prometheus best practices) +- [ ] Idea: init container to check permission for all required components (devices/CPU) +- [ ] Implement Helm chart test pods + NOTES +- [ ] Test liveness/readiness probes +- [ ] Testing in Cluster Manager Systems like (e.g. Ranger,Gardener) different node types VM(1socket,all sockets), bare-metal +- [ ] Test in different cloud GCP/Azure/AWS + +### Getting started + +#### Indirect non-privileged method using Linux abstractions (perf/resctrl) default. + +```sh +helm install pcm . +``` + +#### Direct privileged method + optional nfd/podmonitor +``` +helm upgrade --install pcm . --set privileged=true -f values-metal.yaml -f values-direct.yaml +``` + +#### Node-feature-discovery + Prometheus podMonitor + +``` +helm install ... --set nfd=true --set podMonitor=true +``` + +#### DEBUGGING & BUILDING + +** NOTE: DEBUGGING: TODO to be remove before merging ** + +Build local image for tests/development + fix /pcm/resctrl mounting: +``` +# Local image "indirect" +helm install pcm . --set debugPcm=true -f values-local-image.yaml + +BUIDLING IMAGE +(cd ../.. ; (cd build ; make -j pcm pcm-sensor-server) ; docker build . -t localhost:5001/pcm-local && docker push localhost:5001/pcm-local; docker run -ti --rm --name pcmtest --entrypoint bash localhost:5001/pcm-local -c "pcm 2>&1 | head -5" ) + +helm upgrade --install pcm . 
--set privileged=false --set nfd=true --set podMonitor=true -f values-metal.yaml -f values-indirect.yaml -f values-local-image.yaml --set debugPcm=1 +``` + +### Requirements + +- Full set of metrics requires metal instance (uncore metrics, RDT, energy, UPI), +- Core metrics (instructions, cycles are also available) on VM instances, +- In both case "msr" kernel module has to be loaded in host OS, +- pod is allowed to be run with privileged capabilities (SYS_ADMIN, SYS_RAWIO) on given namespace, +- Pod Security Standards allow to run on privileged level, +``` + pod-security.kubernetes.io/enforce: privileged + pod-security.kubernetes.io/enforce-version: latest + pod-security.kubernetes.io/audit: privileged + pod-security.kubernetes.io/audit-version: latest + pod-security.kubernetes.io/warn: privileged + pod-security.kubernetes.io/warn-version: latest +``` + +More information here: https://kubernetes.io/docs/tutorials/security/ns-level-pss/ . + +### Defaults + +- Use Linux abstraction to access event counters (Linux Perf, resctrl) and run container in un-privileged mode. 
+- hostPort 9738 is exposed on host, (TODO: security review) +- Prometheus podMonitor is disabled + +#### Metric availability and requirements (devices/mounts/permissions) + +| Method | Used interfaces | default | Notes | +|---------------|------------------------------------------------------------| -------- | ------------------------------------------------------------------------------------- | +| indirect | perf, resctrl | v | missing energy metrics, requires fix for /pcm/resctrl mount | +| direct | msr | | requires msr module and access to /dev/cpu (non trivial) | + + +| Metrics | Available on Hardware | Available through interface | Available through method | +| --------------------- | ----------------------------- | ---------------------------- | ------------------------ | +| core | bare-metal, VM (any) | msr or perf | any | +| uncore (UPI) | bare-metal, VM (all sockets) | msr or perf | any | +| RDT (MBW,L3OCCUP) | bare-metal, VM (all sockets) | msr or resctrl | any | +| energy, temp | bare-metal (only) | msr | msr only! | + + +| Interface | Requirements | Controlled by (env/helm value) | default pcm/helm | Used by source code | Helm Value | +|---------------|------------------------------------------------------------|---------------------------------|-----------------------|----------------------------------------------------------|-----------------------------------------------------| +| perf | sys_perf_open() perf_paranoid<=0/privileged/CAP_ADMIN | PCM_NO_PERF | 0 / 1 (!) 
| programPerfEvent(), PerfVirtualControlRegister() | | +| perf-uncore | sys_perf_open() perf_paranoid<=0/privileged/CAP_ADMIN | PCM_USE_UNCORE_PERF | 0 / 0 | programPerfEvent(), PerfVirtualControlRegister() | | +| perf-topdown | /sys/bus/event_source/devices/cpu/events | | yes | cpucounters.cpp:perfSupportsTopDown() | sysMount (TODO: conflicts with sys/fs) | +| RDT | uses "msr" or "resctrl" interface | PCM_NO_RDT | 0 / 0 | cpucounters.cpp:isRDTDisabled()/QOSMetricAvailable() | PCM_NO_RDT | +| resctrl | RW: /sys/fs/resctrl | PCM_USE_RESCTRL | 0 / 0 | resctrl.cpp | resctrlHostMount | +| | Notes: | | | | | +| | * privileged "inside" mount or RW bind mount | | | | | +| | * /pcm/resctrl unprivileged (TODO WORKS! | | | | | +| | * unprivileged RW bind mount in sys (TODO: doesn't work!) | | | | | +| | * unprivileged "inside" mount in sys (TODO: doesn't work!) | | | | | +| watchdog | RO/RW: /proc/sys/kernel/nmi_watchdog | PCM_KEEP_NMI_WATCHDOG | yes (tries to disable)| src/cpucounters.cpp:disableNMIWatchdog() | | +| msr | RW: /dev/cpu/X/msr + privileged or CAP_ADMIN/CAP_RAWIO | PCM_NO_MSR | 0 / 0 | msr.cpp:MsrHandle() | privileged or values-device-injector.yaml | +| | RW: /dev/mem | ? | 0 / 0 | cpucounters.cpp:initUncoreObjects, pci.cpp:PCIHandleM() | privileged or values-device-injector.yaml | +| | RO/RW: /sys/module/msr/parameters | PCM_NO_MSR | 0 / 0 | msr.cpp:MsrHandle() | sysMount | +| | RW: /proc/bus/pci | PCM_USE_UNCORE_PERF ??? | 0 / 0 | pci.cpp:PCIHandle() | pciMount | +| | RO: /sys/firmware/acpi/tables/MCFG | PCM_USE_UNCORE_PERF ??? | 0 / 0 | pci.cpp:PciHandle::openMcfgTable() | mcfgMount | + +### Validation on local kind cluster + +#### 1) Optionally mount resctrl filesystem + +``` +mount -t resctrl resctrl /sys/fs/resctrl +``` + +#### 2) Create kind based Kubernetes cluster +``` +kind create cluster +kind export kubeconfig +``` + +or (optionally), create kind cluster with local registry with [this script](https://kind.sigs.k8s.io/docs/user/local-registry/). 
+ +#### 3) (Optionally) Deploy Node feature discovery + +``` +# I.a. Using Kustomize: +kubectl apply -k https://github.com/kubernetes-sigs/node-feature-discovery/deployment/overlays/default?ref=v0.16.0-devel + +# I.b. or with Helm Chart: +helm repo add nfd https://kubernetes-sigs.github.io/node-feature-discovery/charts +helm repo update +helm install nfd/node-feature-discovery --namespace node-feature-discovery --create-namespace --generate-name + +# II. Check node "labels" with CPU features are added +kubectl get node kind-control-plane -o yaml | grep feature.node +``` + +#### 4) (Optionally) Deploy Prometheus operator + +``` +helm repo add prometheus-community https://prometheus-community.github.io/helm-charts +helm install prometheus prometheus-community/kube-prometheus-stack --set prometheus.prometheusSpec.podMonitorSelectorNilUsesHelmValues=false +kubectl get sts prometheus-prometheus-kube-prometheus-prometheus +``` + +#### 5) Deploy PCM helm chart in pcm namespace + +Deploy to "pcm" namespace +``` +# Deploy to current namespace with defaults +helm install pcm . + +# Alternatively deploy with NFD and with Prometheus enabled +helm install pcm . --set nfd=true --set podMonitor=true + +# Alternatively deploy with NFD and with Prometheus enabled into own "pcm" namespace +helm install pcm . 
-n pcm --set nfd=true --set podMonitor=true +``` + +#### 6) Check metrics + +# Run proxy in background +``` +kubectl proxy & +# for access from another host TODO to be removed +kubectl proxy --address 0.0.0.0 & +``` + +Access PCM metrics directly: +```sh +kubectl get daemonset pcm +kubectl get pods +podname=`kubectl get pod -l app.kubernetes.io/component=pcm-sensor-server -ojsonpath='{.items[0].metadata.name}'` + +curl -Ls http://127.0.0.1:8001/api/v1/namespaces/default/pods/$podname/proxy/metrics +curl -Ls http://127.0.0.1:8001/api/v1/namespaces/default/pods/$podname/proxy/metrics | grep DRAM_Writes +``` + +or through Prometheus UI/prom tool: +``` +http://127.0.0.1:8001/api/v1/namespaces/default/services/prometheus-kube-prometheus-prometheus:http-web/proxy/graph +promtool query range --step 1m http://127.0.0.1:8001/api/v1/namespaces/default/services/prometheus-kube-prometheus-prometheus:http-web/proxy 'rate(DRAM_Writes{aggregate="system"}[5m])/1e9' +promtool query instant http://127.0.0.1:8001/api/v1/namespaces/default/services/prometheus-kube-prometheus-prometheus:http-web/proxy 'avg by(__name__) ({job="pcm"})' +``` + +### Deploy alternative options + +#### Direct as privileged container +``` +helm install pcm . -f values-direct.yaml --set privileged=true +``` + +#### Homogeneous bare metal instances cluster (full set of metrics) + +``` +helm install pcm . -f values-metal.yaml +``` + +#### Homogeneous VM instances cluster (limited set of core metrics) + +``` +helm install pcm . -f values-vm.yaml +``` + +#### Heterogeneous (mixed VM/metal instances) cluster + +``` +helm install pcm-vm . -f values-vm.yaml +helm install pcm-metal . -f values-metal.yaml +``` + +#### Direct as non-privileged container + +**Note** PCM requires access to the /dev/cpu device in read-write mode (MSR access), but it is currently not possible to mount devices in Kubernetes pods/containers in vanilla Kubernetes.
Please read this issue for more information: https://github.com/kubernetes/kubernetes/issues/5607. + +##### a) Device injection using 3rd party device-plugin + +To run PCM as a non-privileged pod, we can use third-party device plugins, e.g.: + +- https://github.com/smarter-project/smarter-device-manager +- https://github.com/squat/generic-device-plugin +- https://github.com/everpeace/k8s-host-device-plugin + +**Warning** These plugins were NOT audited for security concerns, **use them at your own risk**. + +Below is an example of how to pass /dev/cpu and /dev/mem using smarter-device-manager in a kind-based Kubernetes test cluster. + +``` +# Label node to deploy device plugin on that node +kubectl label node kind-control-plane smarter-device-manager=enabled + +# Install "smarter-device-manager" device plugin with only /dev/cpu and /dev/mem devices enabled: +git clone https://github.com/smarter-project/smarter-device-manager +helm install smarter-device-plugin --create-namespace --namespace smarter-device-plugin smarter-device-manager/charts/smarter-device-manager --set 'config[0].devicematch=^cpu$' --set 'config[0].nummaxdevices=1' --set 'config[1].devicematch=^mem$' --set 'config[1].nummaxdevices=1' + +# Check that cpu and mem devices are available - should return "1" +kubectl get node kind-control-plane -o json | jq .status.capacity + +# Install pcm helm chart in unprivileged mode with extraResources for cpu and memory devices. +helm install pcm . --set privileged=false -f values-direct.yaml -f values-smarter-devices-cpu-mem.yaml +``` + +##### b) Device injection using NRI device-injector plugin + +**TODO**: **Warning** This is a work in progress, because all /dev/cpu/XX/msr devices currently have to be specified manually, which is impractical in production.
+ +``` +git clone https://github.com/containerd/nri/ +(cd nri/plugins/device-injector/ && go build ) +docker cp kind-control-plane:/etc/containerd/config.toml config.toml + +cat >>config.toml < 0.09090909090909094 @[1707901856.957] +Clock_Unhalted_Ref => 1010026077.3913049 @[1707901856.957] +Clock_Unhalted_Thread => 1295730425.8695648 @[1707901856.957] +DRAM_Joules_Consumed => 0 @[1707901856.957] +DRAM_Reads => 3600814506.6666665 @[1707901856.957] +DRAM_Writes => 1974366592 @[1707901856.957] +Embedded_DRAM_Reads => 0 @[1707901856.957] +Embedded_DRAM_Writes => 0 @[1707901856.957] +Incoming_Data_Traffic_On_Link_0 => 689786624 @[1707901856.957] +Incoming_Data_Traffic_On_Link_1 => 689454432 @[1707901856.957] +Incoming_Data_Traffic_On_Link_2 => 0 @[1707901856.957] +Instructions_Retired_Any => 749013885.5739133 @[1707901856.957] +Invariant_TSC => 432975372048881700 @[1707901856.957] +L2_Cache_Hits => 3531524.973913045 @[1707901856.957] +L2_Cache_Misses => 2334387.130434784 @[1707901856.957] +L3_Cache_Hits => 1325323.1739130428 @[1707901856.957] +L3_Cache_Misses => 627863.4000000003 @[1707901856.957] +L3_Cache_Occupancy => 0 @[1707901856.957] +Local_Memory_Bandwidth => 0 @[1707901856.957] +Measurement_Interval_in_us => 14507400443881 @[1707901856.957] +Memory_Controller_IO_Requests => 0 @[1707901856.957] +Number_of_sockets => 2 @[1707901856.957] +OS_ID => 55.499999999999986 @[1707901856.957] +Outgoing_Data_And_Non_Data_Traffic_On_Link_0 => 1843333122.5 @[1707901856.957] +Outgoing_Data_And_Non_Data_Traffic_On_Link_1 => 1849219231.5 @[1707901856.957] +Outgoing_Data_And_Non_Data_Traffic_On_Link_2 => 0 @[1707901856.957] +Package_Joules_Consumed => 0 @[1707901856.957] +Persistent_Memory_Reads => 0 @[1707901856.957] +Persistent_Memory_Writes => 0 @[1707901856.957] +RawCStateResidency => 89486131.66409859 @[1707901856.957] +Remote_Memory_Bandwidth => 0 @[1707901856.957] +SMI_Count => 0 @[1707901856.957] +Thermal_Headroom => -2147483648 @[1707901856.957] 
+Utilization_Incoming_Data_Traffic_On_Link_0 => 0 @[1707901856.957] +Utilization_Incoming_Data_Traffic_On_Link_1 => 0 @[1707901856.957] +Utilization_Incoming_Data_Traffic_On_Link_2 => 0 @[1707901856.957] +Utilization_Outgoing_Data_And_Non_Data_Traffic_On_Link_0 => 0 @[1707901856.957] +Utilization_Outgoing_Data_And_Non_Data_Traffic_On_Link_1 => 0 @[1707901856.957] +Utilization_Outgoing_Data_And_Non_Data_Traffic_On_Link_2 => 0 @[1707901856.957] +``` diff --git a/deployment/pcm/templates/_helpers.tpl b/deployment/pcm/templates/_helpers.tpl new file mode 100644 index 00000000..0c05b3d5 --- /dev/null +++ b/deployment/pcm/templates/_helpers.tpl @@ -0,0 +1,79 @@ +{{/* Expand the name of the chart. */}} +{{- define "pcm.name" -}} +{{- default .Chart.Name .Values.nameOverride | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* Create a default fully qualified app name. +We truncate at 63 chars because some Kubernetes name fields are limited to this (by the DNS naming spec). +If release name contains chart name it will be used as a full name. */}} +{{- define "pcm.fullname" -}} +{{- if .Values.fullnameOverride }} +{{- .Values.fullnameOverride | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- $name := default .Chart.Name .Values.nameOverride }} +{{- if contains $name .Release.Name }} +{{- .Release.Name | trunc 63 | trimSuffix "-" }} +{{- else }} +{{- printf "%s-%s" .Release.Name $name | trunc 63 | trimSuffix "-" }} +{{- end }} +{{- end }} +{{- end }} + +{{/* Create chart name and version as used by the chart label. */}} +{{- define "pcm.chart" -}} +{{- printf "%s-%s" .Chart.Name .Chart.Version | replace "+" "_" | trunc 63 | trimSuffix "-" }} +{{- end }} + +{{/* Selector labels */}} +{{- define "pcm.selectorLabels" -}} +app.kubernetes.io/name: {{ include "pcm.name" . }} +app.kubernetes.io/instance: {{ .Release.Name }} +app.kubernetes.io/component: pcm-sensor-server +{{- end }} + +{{/* Common labels */}} +{{- define "pcm.labels" -}} +helm.sh/chart: {{ include "pcm.chart" . 
}} +{{ include "pcm.selectorLabels" . }} +{{- if .Chart.AppVersion }} +app.kubernetes.io/version: {{ .Chart.AppVersion | quote }} +{{- end }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{- end }} + +{{/* SecurityContext privileged or capabilties */}} +{{- define "pcm.securityContext" -}} +securityContext: +{{- if .Values.privileged }} + privileged: true +{{- else -}} + {{/* TODO? + readOnlyRootFilesystem: false + runAsUser: 0 + runAsGroup: 0 + ## below two doesnt work on container level! + fsGroup: 0 + supplementalGroups: [0] + seccompProfile: + #type: RuntimeDefault + type: Unconfined + */}} + capabilities: + add: + - SYS_ADMIN + - SYS_RAWIO +{{- end }} +{{- end }} + + +{{/* Probes: liveness and readiness probe */}} +{{- define "pcm.probe" -}} +failureThreshold: 3 +httpGet: + path: / + port: 9738 + scheme: HTTP +periodSeconds: 10 +successThreshold: 1 +timeoutSeconds: 1 +{{- end }} diff --git a/deployment/pcm/templates/daemonset.yaml b/deployment/pcm/templates/daemonset.yaml new file mode 100644 index 00000000..8d665d28 --- /dev/null +++ b/deployment/pcm/templates/daemonset.yaml @@ -0,0 +1,184 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + name: {{ include "pcm.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "pcm.labels" . | nindent 4 }} +spec: + selector: + matchLabels: + {{- include "pcm.selectorLabels" . | nindent 6 }} + template: + metadata: + labels: + {{- include "pcm.labels" . | nindent 8 }} + annotations: + {{- with .Values.podAnnotations }}{{- toYaml . | nindent 8 }}{{- end }} + {{- if .Values.nriBalloonsPolicyIntegration }} + cpu.preserve.resource-policy.nri.io: "true" + {{- end }} + spec: + nodeSelector: + {{- with .Values.nodeSelector -}}{{- toYaml . 
| nindent 8 -}}{{- end -}} + {{- if .Values.nfd }} + feature.node.kubernetes.io/cpu-model.vendor_id: Intel + {{- if .Values.nfdRDTAffinity }} + feature.node.kubernetes.io/cpu-rdt.RDTCMT: "true" + feature.node.kubernetes.io/cpu-rdt.RDTL3CA: "true" + feature.node.kubernetes.io/cpu-rdt.RDTMBA: "true" + feature.node.kubernetes.io/cpu-rdt.RDTMBM: "true" + feature.node.kubernetes.io/cpu-rdt.RDTMON: "true" + {{- end }} + {{- if .Values.nfdBaremetalAffinity}} + affinity: + nodeAffinity: + requiredDuringSchedulingIgnoredDuringExecution: + nodeSelectorTerms: + - matchExpressions: + - key: "feature.node.kubernetes.io/cpu-cpuid.HYPERVISOR" + operator: DoesNotExist + {{- end }} + {{- end }} {{/* if nfd */}} + {{- with .Values.imagePullSecrets }} + imagePullSecrets: + {{- toYaml . | nindent 8 }} + {{- end -}} + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + terminationGracePeriodSeconds: 0 + containers: + - name: pcm + image: "{{ .Values.image.repository }}:{{ .Values.image.tag | default .Chart.AppVersion }}" + imagePullPolicy: {{ .Values.image.pullPolicy }} + {{- include "pcm.securityContext" . 
| nindent 8 }} + {{- if .Values.debugSleep }} + command: + - /usr/bin/sleep + - inf + {{- end -}} + {{- if .Values.debugPcm }} + command: + - /bin/bash + - -c + - "/usr/local/bin/pcm 2 -r -nc -nsys" + {{- end -}} + {{- if .Values.resctrlInternalMount }} + # Ugly hack to mount resctrl inside only for baremetal when we want use resctrl abstraction and is not mounted on HOST: TBC conflicts with + command: + - /bin/bash + - -c + - "dnf install -q -y util-linux-core; mount -t resctrl resctrl /sys/fs/resctrl; /usr/local/bin/pcm-sensor-server -p 9738 -r" + {{- end -}} + {{/* ALREADY DONE by securityContext on pod level + securityContext: + {{- toYaml .Values.podSecurityContext | nindent 12 }} + */}} + resources: + requests: + {{ with .Values.cpuRequest }}cpu: {{.}}{{ end }} + {{ with .Values.memoryRequest }}memory: {{.}}{{ end }} + {{- with .Values.extraResources }} {{- toYaml .requests | nindent 12 }} {{- end }} + limits: + {{ with .Values.cpuLimit }}cpu: {{.}}{{ end }} + {{ with .Values.memoryLimit }}memory: {{.}}{{ end }} + {{- with .Values.extraResources }} {{- toYaml .limits | nindent 12 }} {{- end }} + env: + - name: PCM_NO_MSR + value: {{ .Values.PCM_NO_MSR | quote }} + - name: PCM_NO_PERF + value: {{ .Values.PCM_NO_PERF | quote }} + - name: PCM_USE_UNCORE_PERF + value: {{ .Values.PCM_USE_UNCORE_PERF | quote }} + - name: PCM_NO_RDT + value: {{ .Values.PCM_NO_RDT | quote }} + - name: PCM_USE_RESCTRL + value: {{ .Values.PCM_USE_RESCTRL | quote }} + - name: PCM_IGNORE_ARCH_PERFMON + value: {{ .Values.PCM_IGNORE_ARCH_PERFMON | quote }} + - name: PCM_KEEP_NMI_WATCHDOG + value: {{ .Values.PCM_KEEP_NMI_WATCHDOG | quote }} + - name: PCM_NO_AWS_WORKAROUND + value: {{ .Values.PCM_NO_AWS_WORKAROUND | quote }} + {{- with .Values.probes }} + livenessProbe: + {{- include "pcm.probe" . | nindent 12 }} + readinessProbe: + {{- include "pcm.probe" . | nindent 12 }} + {{- end }} + {{- with .Values.hostPort }} + ports: + - containerPort: 9738 + hostPort: {{ . 
}} + name: pcm-metrics + protocol: TCP + {{- end }} + volumeMounts: + {{- if .Values.privileged }} + - mountPath: /pcm/dev/cpu + name: dev-cpu + readOnly: false + - mountPath: /pcm/dev/mem + name: dev-mem + readOnly: false + {{- end }} + {{- if .Values.pciMount }} + - mountPath: /pcm/proc/bus/pci + name: proc-pci + {{- end }} + {{- if .Values.sysMount }} + - mountPath: /pcm/sys + name: sysfs + readOnly: true + {{- end }} + {{- if .Values.nmiWatchdogMount }} + - mountPath: /pcm/proc/sys/kernel/nmi_watchdog + name: nmi-watchdog + readOnly: true # RW? + {{- end }} + {{- if .Values.resctrlHostMount }} + #- mountPath: /sys/fs/resctrl # HACK because issues with sys mounting! + - mountPath: /pcm/resctrl + name: sysfs-resctrl + {{- end }} + {{- if .Values.mcfgMount }} + - mountPath: /pcm/sys/firmware/acpi/tables/MCFG + name: sys-acpi + readOnly: true + {{- end }} + volumes: + {{- if .Values.privileged }} + - name: dev-cpu + hostPath: + path: /dev/cpu + - name: dev-mem + hostPath: + path: /dev/mem + {{- end}} + {{- if .Values.sysMount }} + - name: sysfs + hostPath: + path: /sys + {{- end}} + {{- if .Values.pciMount }} + - name: proc-pci + hostPath: + path: /proc/bus/pci + {{- end}} + {{- if .Values.nmiWatchdogMount }} + - name: nmi-watchdog + hostPath: + path: /proc/sys/kernel/nmi_watchdog + {{- end }} + {{- if .Values.mcfgMount }} + - name: sys-acpi + hostPath: + path: /sys/firmware/acpi/tables/MCFG + {{- end }} + {{- if .Values.resctrlHostMount }} + - name: sysfs-resctrl + hostPath: + path: /sys/fs/resctrl + {{- end }} diff --git a/deployment/pcm/templates/podmonitor.yaml b/deployment/pcm/templates/podmonitor.yaml new file mode 100644 index 00000000..22cf7f01 --- /dev/null +++ b/deployment/pcm/templates/podmonitor.yaml @@ -0,0 +1,38 @@ +{{- if .Values.podMonitor }} +apiVersion: monitoring.coreos.com/v1 +kind: PodMonitor +metadata: + name: {{ include "pcm.fullname" . }} + namespace: {{ .Release.Namespace }} + labels: + {{- include "pcm.labels" . 
| nindent 4 }} + app.kubernetes.io/component: metrics + jobLabel: pcm +spec: + attachMetadata: + node: true + jobLabel: jobLabel + namespaceSelector: + matchNames: + - {{ .Release.Namespace }} + podMetricsEndpoints: + # requires hostPort to be set {{ required "A valid .Values.hostPort is required with PodMonitor enabled " .Values.hostPort }} + - enableHttp2: false + filterRunning: true + followRedirects: false + honorLabels: true + honorTimestamps: true + path: /metrics + port: pcm-metrics + interval: 1s + relabelings: + - sourceLabels: + - __meta_kubernetes_pod_node_name + targetLabel: nodename + scheme: http + selector: + matchLabels: + app.kubernetes.io/component: pcm-sensor-server + app.kubernetes.io/instance: {{ .Release.Name }} + app.kubernetes.io/name: pcm +{{- end }} diff --git a/deployment/pcm/values-device-injector.yaml b/deployment/pcm/values-device-injector.yaml new file mode 100644 index 00000000..c012161d --- /dev/null +++ b/deployment/pcm/values-device-injector.yaml @@ -0,0 +1,16 @@ +# Requires device injector nri Plugin +# https://github.com/containerd/nri/tree/main/plugins/device-injector +podAnnotations: + devices.nri.io/container.pcm: |+ + - path: /dev/cpu/0/msr + type: c + major: 202 + minor: 0 + - path: /dev/cpu/1/msr + type: c + major: 202 + minor: 1 + - path: /dev/mem + type: c + major: 1 + minor: 1 diff --git a/deployment/pcm/values-direct.yaml b/deployment/pcm/values-direct.yaml new file mode 100644 index 00000000..04aa5494 --- /dev/null +++ b/deployment/pcm/values-direct.yaml @@ -0,0 +1,9 @@ +PCM_NO_MSR: 0 # use MSR +PCM_NO_PERF: 1 # do not use Linux perf +PCM_USE_UNCORE_PERF: 0 # also use MSR for uncore +PCM_NO_RDT: 0 # Collect RDT data +PCM_USE_RESCTRL: 0 # using MSR (no resctrl) +resctrlHostMount: false # with MSR resctrl mount is not needed +resctrlInsideMount: false +sysMount: true # /pcm/sys is required +pciMount: true # /pcm/proc/bus/pci is required diff --git a/deployment/pcm/values-local-image.yaml 
b/deployment/pcm/values-local-image.yaml new file mode 100644 index 00000000..7d1c336d --- /dev/null +++ b/deployment/pcm/values-local-image.yaml @@ -0,0 +1,4 @@ +image: + repository: localhost:5001/pcm-local + tag: "latest" + pullPolicy: Always diff --git a/deployment/pcm/values-metal.yaml b/deployment/pcm/values-metal.yaml new file mode 100644 index 00000000..daf01fc6 --- /dev/null +++ b/deployment/pcm/values-metal.yaml @@ -0,0 +1,6 @@ +#### ================ Tuning for bare metal ================ +nmiWatchdogMount: false +PCM_NO_AWS_WORKAROUND: 1 +PCM_KEEP_NMI_WATCHDOG: 0 +nfd: true +nfdBaremetalAffinity: true diff --git a/deployment/pcm/values-smarter-devices-cpu-mem.yaml b/deployment/pcm/values-smarter-devices-cpu-mem.yaml new file mode 100644 index 00000000..4808cf4f --- /dev/null +++ b/deployment/pcm/values-smarter-devices-cpu-mem.yaml @@ -0,0 +1,9 @@ +# Requires smarter device manager +# https://github.com/smarter-project/smarter-device-manager +extraResources: + requests: + smarter-devices/cpu: 1 + smarter-devices/mem: 1 + limits: + smarter-devices/cpu: 1 + smarter-devices/mem: 1 diff --git a/deployment/pcm/values-vm.yaml b/deployment/pcm/values-vm.yaml new file mode 100644 index 00000000..71badb31 --- /dev/null +++ b/deployment/pcm/values-vm.yaml @@ -0,0 +1,4 @@ +#### ================ Tuning for VM ================ +nmiWatchdogMount: true +mcfgMount: false +PCM_NO_RDT: 1 # 1 - do not collect RDT data (0 - try to collect RDT data, enables local/remote memory bandwidth + llc occupancy) diff --git a/deployment/pcm/values.yaml b/deployment/pcm/values.yaml new file mode 100644 index 00000000..653d0c93 --- /dev/null +++ b/deployment/pcm/values.yaml @@ -0,0 +1,104 @@ +### -------------- Naming ----------- +# used in +# - common label: app.kubernetes.io/name otherwise "Chart name" +# - also in selectorLabels together with release.name +# defaults to "Chart.name" +nameOverride: "" +# Used as daemonset name (usually based on truncated "name + release name") +fullnameOverride: "" + +### --------------
Image options ----------- +image: + repository: ghcr.io/opcm/pcm + pullPolicy: IfNotPresent + tag: "latest" # uses .Chart.AppVersion if empty +imagePullSecrets: {} + +### -------------- Security ---------------- +# Configures SecurityContext to not be privileged (by default) so SYS_ADMIN/SYS_RAWIO capabilities are required for running pod +privileged: false + +# PCM deployment to be integrated with NRI balloons resource policy +# if true, will add special annotation to allow the pcm pod to use all the cores, regardless of NRI balloons policy rules. +nriBalloonsPolicyIntegration: false + +### -------------- Prometheus operator integration -------------------- +# Expose the containerPort used by "pcm-sensor-server -p 9738" as hostPort, can be empty to disable hostPort +hostPort: 9738 +# Deploy Prometheus Operator PodMonitor +podMonitor: false + +### -------------- Probes --------------------- +probes: false + +#### ================ Tune this section to handle VM or limited set of metrics + +### -------------- Metrics: Uncore ------------------- +# required for uncore metrics, only in baremetal, not available for VM +mcfgMount: true +sysMount: true +pciMount: true + +### linux Perf (indirect) vs msr(direct) +# Let's try "indirect" as default +PCM_NO_MSR: 1 # do not use MSR +PCM_NO_PERF: 0 # use Linux Perf over MSR for core metrics +PCM_USE_UNCORE_PERF: 1 # use Linux Perf instead of MSR for uncore metrics (collection+detection) + +### -------------- Metrics: RDT ---------------------- +### RDT rdt/resctrl: +PCM_NO_RDT: 0 # 0 - try to collect RDT data, enables local/remote memory bandwidth + llc occupancy +PCM_USE_RESCTRL: 1 # use Linux resctrl instead of direct MSR access (more reliable) +# required for indirect RDT access, only on baremetal, not available for VM +# do not mount by default; RDT can also be accessed through direct MSR programming +resctrlHostMount: true # mount from external host +resctrlInsideMount: false # TODO: mount inside with extra call to mount, requires image with
mount installed - doesn't require + +### -------------- HW: vm/baremetal/aws/nmi +PCM_IGNORE_ARCH_PERFMON: 0 # After VM is detected through CPUID (hypervisor flag) - also check that the arch_perfmon flag is enabled - fail if not available (0 - do check, 1 - disable check) + +### -------------- Other (NMI handling and/or on AWS) +# 0: disable the NMI watchdog since it consumes one hw-PMU counter, requires nmiWatchdogMount to be true +# 1: don't disable NMI watchdog (reducing the core metrics set) - preferred for production usage! +# but even with 0 the automatic AWS workaround applies! +PCM_KEEP_NMI_WATCHDOG: 0 +# workaround: after VM is detected: "INFO: Reducing the number of programmable counters to 3 to workaround the fixed cycle counter virtualization issue on AWS." +# 1: disables workaround and tries to use four programmable counters (without the workaround pcm-sensor-server will hang on VMs) +# Please do not disable (value=1) on VMs +PCM_NO_AWS_WORKAROUND: 0 + +# mounting watchdog is recommended when PCM_KEEP_NMI_WATCHDOG=0 or we expect AWS workaround to be applied +nmiWatchdogMount: true + +### =============================== Optional POD fields not related to PCM =============================== +# Pod level +podAnnotations: {} +podLabels: {} +# Container level +tolerations: [] +# Resources cpu/mem +cpuLimit: 100m +cpuRequest: 100m +memoryLimit: 512Mi +memoryRequest: 256Mi +extraResources: {} # requests + limits + +### =============================== NodeSelector and node-feature-discovery integration ================= +### Enables integration with node-feature-discovery +# So configuration will be based on discovery made by NFD +# when enabled specific set of labels will be used as node selector (Intel vendor, RDT availability, baremetal) +nfd: false +# if enabled daemonset nodeAffinity would require a node without feature.node.kubernetes.io/cpu-cpuid.HYPERVISOR flag +nfdBaremetalAffinity: false +# if enabled +#
feature.node.kubernetes.io/cpu-rdt.RDTL3CA=true +# feature.node.kubernetes.io/cpu-rdt.RDTMBA=true +# feature.node.kubernetes.io/cpu-rdt.RDTMBM=true +# feature.node.kubernetes.io/cpu-rdt.RDTMON=true +# * use nodeSelector to only match flags available on metal e.g. +# feature.node.kubernetes.io/cpu-cpuid.VMX: "true" # only appears on metal instances in AWS (not reliable) + +# Should only run on linux and baremetal +nodeSelector: + kubernetes.io/os: linux diff --git a/src/cpucounters.cpp b/src/cpucounters.cpp index f441538c..868483ac 100644 --- a/src/cpucounters.cpp +++ b/src/cpucounters.cpp @@ -551,7 +551,7 @@ bool PCM::L3CacheOccupancyMetricAvailable() const bool PCM::CoreLocalMemoryBWMetricAvailable() const { - if (cpu_model == SKX && cpu_stepping < 5) return false; // SKZ4 errata + //if (cpu_model == SKX && cpu_stepping < 5) return false; // SKZ4 errata PCM_CPUID_INFO cpuinfo; if (!(QOSMetricAvailable() && L3QOSMetricAvailable())) return false; @@ -561,7 +561,7 @@ bool PCM::CoreLocalMemoryBWMetricAvailable() const bool PCM::CoreRemoteMemoryBWMetricAvailable() const { - if (cpu_model == SKX && cpu_stepping < 5) return false; // SKZ4 errata + //if (cpu_model == SKX && cpu_stepping < 5) return false; // SKZ4 errata PCM_CPUID_INFO cpuinfo; if (!(QOSMetricAvailable() && L3QOSMetricAvailable())) return false; @@ -1864,9 +1864,9 @@ void PCM::initUncoreObjects() #ifdef _MSC_VER std::cerr << "You must have signed msr.sys driver in your current directory and have administrator rights to run this program.\n"; #else - //std::cerr << "you must have read and write permissions for /proc/bus/pci/7f/10.* and /proc/bus/pci/ff/10.* devices (the 'chown' command can help).\n"; - //std::cerr << "you must have read and write permissions for /dev/mem device (the 'chown' command can help).\n"; - //std::cerr << "you must have read permission for /sys/firmware/acpi/tables/MCFG device (the 'chmod' command can help).\n"; + std::cerr << "you must have read and write permissions for
/proc/bus/pci/7f/10.* and /proc/bus/pci/ff/10.* devices (the 'chown' command can help).\n"; + std::cerr << "you must have read and write permissions for /dev/mem device (the 'chown' command can help).\n"; + std::cerr << "you must have read permission for /sys/firmware/acpi/tables/MCFG device (the 'chmod' command can help).\n"; std::cerr << "You must be root to access server uncore counters in PCM.\n"; #endif } @@ -6830,7 +6830,7 @@ void initSocket2Bus(std::vector > & socket2bus, uint32 // match if (DEV_IDS[i] == device_id) { - // std::cout << "DEBUG: found bus " << std::hex << bus << " with device ID " << device_id << std::dec << "\n"; + std::cerr << "PCI: DEBUG: found bus " << std::hex << bus << " with device ID " << device_id << std::dec << "\n"; socket2bus.push_back(std::make_pair(group, bus)); break; } diff --git a/src/pcm.cpp b/src/pcm.cpp index deec2ef7..94110201 100644 --- a/src/pcm.cpp +++ b/src/pcm.cpp @@ -1294,7 +1294,7 @@ int mainThrows(int argc, char * argv[]) set_signal_handlers(); cerr << "\n"; - cerr << " Intel(r) Performance Counter Monitor " << PCM_VERSION << "\n"; + cerr << " FOO1 Intel(r) Performance Counter Monitor " << PCM_VERSION << "\n"; cerr << "\n"; cerr << "\n"; diff --git a/src/resctrl.cpp b/src/resctrl.cpp index 8d96858c..fc226e19 100644 --- a/src/resctrl.cpp +++ b/src/resctrl.cpp @@ -17,7 +17,8 @@ namespace pcm bool Resctrl::isMounted() { struct stat st; - if (stat("/sys/fs/resctrl/mon_groups", &st) < 0) + //if (stat("/sys/fs/resctrl/mon_groups", &st) < 0) + if (stat("/pcm/resctrl/mon_groups", &st) < 0) { return false; } @@ -27,9 +28,9 @@ namespace pcm { if (isMounted() == false) { - std::cerr << "ERROR: /sys/fs/resctrl is not mounted\n"; + std::cerr << "ERROR: /pcm/resctrl is not mounted\n"; std::cerr << "ERROR: RDT metrics (L3OCC,LMB,RMB) will not be available\n"; - std::cerr << "Mount it to make it work: mount -t resctrl resctrl /sys/fs/resctrl\n"; + std::cerr << "Mount it to make it work: mount -t resctrl resctrl /pcm/resctrl\n"; 
return; } const auto numCores = pcm.getNumCores(); diff --git a/src/resctrl.h b/src/resctrl.h index 70a87378..ca58a591 100644 --- a/src/resctrl.h +++ b/src/resctrl.h @@ -28,7 +28,7 @@ namespace pcm FileMapType L3OCC, MBL, MBT; Resctrl() = delete; size_t getMetric(const FileMapType & fileMap, int core); - static constexpr auto PCMPath = "/sys/fs/resctrl/mon_groups/pcm"; + static constexpr auto PCMPath = "/pcm/resctrl/mon_groups/pcm"; public: Resctrl(PCM & m) : pcm(m) {} bool isMounted();