From 396bf238886c7682d60ce147c8384140b9ebc3d6 Mon Sep 17 00:00:00 2001 From: "Christopher M. Cantalupo" Date: Tue, 7 Apr 2026 15:38:12 -0700 Subject: [PATCH 1/3] resctrl-mon: add NRI plugin for per-pod resctrl monitoring groups Add nri-resctrl-mon, a standalone NRI plugin that creates per-pod resctrl monitoring groups (mon_groups) to support passive monitoring of Application Energy Telemetry (AET). The plugin creates each mon_group in the PostCreateContainer hook and writes the container's init PID to it in the StartContainer hook, before the process starts running and can fork, eliminating the fork race that plagues userspace daemon approaches. RMID allocation is delegated to the kernel via mkdir/rmdir on the resctrl filesystem. Includes: - Plugin source (main.go, plugin.go, resctrl.go, state.go) - Unit tests (plugin_test.go, resctrl_test.go) - Dockerfile following nri-memory-qos pattern - Helm chart (Chart.yaml, values.yaml, templates/, schema) - Documentation (monitoring category, plugin docs, Helm docs) - Sample configuration Signed-off-by: Christopher M. Cantalupo Signed-off-by: Jedrzej Wasiukiewicz --- Makefile | 3 +- cmd/plugins/resctrl-mon/Dockerfile | 40 ++ cmd/plugins/resctrl-mon/main.go | 90 ++++ cmd/plugins/resctrl-mon/plugin.go | 384 ++++++++++++++++++ cmd/plugins/resctrl-mon/plugin_test.go | 369 +++++++++++++++++ cmd/plugins/resctrl-mon/resctrl.go | 216 ++++++++++ cmd/plugins/resctrl-mon/resctrl_test.go | 236 +++++++++++ cmd/plugins/resctrl-mon/state.go | 109 +++++ deployment/helm/resctrl-mon/.helmignore | 20 + deployment/helm/resctrl-mon/Chart.yaml | 11 + deployment/helm/resctrl-mon/README.md | 121 ++++++ .../helm/resctrl-mon/templates/_helpers.tpl | 16 + .../helm/resctrl-mon/templates/configmap.yaml | 12 + .../helm/resctrl-mon/templates/daemonset.yaml | 111 +++++ .../helm/resctrl-mon/values.schema.json | 117 ++++++ deployment/helm/resctrl-mon/values.yaml | 66 +++ docs/deployment/helm/index.md | 1 + docs/deployment/helm/resctrl-mon.md | 2 + docs/index.md | 1 + docs/monitoring/index.md | 9 + docs/monitoring/resctrl-mon.md | 160 
++++++++ sample-configs/nri-resctrl-mon.yaml | 3 + 22 files changed, 2096 insertions(+), 1 deletion(-) create mode 100644 cmd/plugins/resctrl-mon/Dockerfile create mode 100644 cmd/plugins/resctrl-mon/main.go create mode 100644 cmd/plugins/resctrl-mon/plugin.go create mode 100644 cmd/plugins/resctrl-mon/plugin_test.go create mode 100644 cmd/plugins/resctrl-mon/resctrl.go create mode 100644 cmd/plugins/resctrl-mon/resctrl_test.go create mode 100644 cmd/plugins/resctrl-mon/state.go create mode 100644 deployment/helm/resctrl-mon/.helmignore create mode 100644 deployment/helm/resctrl-mon/Chart.yaml create mode 100644 deployment/helm/resctrl-mon/README.md create mode 100644 deployment/helm/resctrl-mon/templates/_helpers.tpl create mode 100644 deployment/helm/resctrl-mon/templates/configmap.yaml create mode 100644 deployment/helm/resctrl-mon/templates/daemonset.yaml create mode 100644 deployment/helm/resctrl-mon/values.schema.json create mode 100644 deployment/helm/resctrl-mon/values.yaml create mode 100644 docs/deployment/helm/resctrl-mon.md create mode 100644 docs/monitoring/index.md create mode 100644 docs/monitoring/resctrl-mon.md create mode 100644 sample-configs/nri-resctrl-mon.yaml diff --git a/Makefile b/Makefile index 7191fba3f..3198b332e 100644 --- a/Makefile +++ b/Makefile @@ -83,7 +83,8 @@ PLUGINS ?= \ nri-memory-policy \ nri-memory-qos \ nri-memtierd \ - nri-sgx-epc + nri-sgx-epc \ + nri-resctrl-mon BINARIES ?= \ config-manager \ diff --git a/cmd/plugins/resctrl-mon/Dockerfile b/cmd/plugins/resctrl-mon/Dockerfile new file mode 100644 index 000000000..c833ca203 --- /dev/null +++ b/cmd/plugins/resctrl-mon/Dockerfile @@ -0,0 +1,40 @@ +ARG GO_VERSION=1.26 + +FROM golang:${GO_VERSION}-bookworm AS builder + +ARG IMAGE_VERSION +ARG BUILD_VERSION +ARG BUILD_BUILDID +ARG DEBUG=0 +ARG NORACE=0 +ARG SKIP_LICENSES=0 + +WORKDIR /go/builder + +# Fetch go dependencies in a separate layer for caching +COPY go.mod go.sum . 
+COPY pkg/topology/ pkg/topology/ +RUN --mount=type=cache,target=/go/pkg/mod/ go mod download + +# Build nri-resctrl-mon +COPY . . + +RUN --mount=type=cache,target=/go/pkg/mod/ \ + --mount=type=cache,target="/root/.cache/go-build" \ + make IMAGE_VERSION=${IMAGE_VERSION} \ + BUILD_VERSION=${BUILD_VERSION} \ + BUILD_BUILDID=${BUILD_BUILDID} \ + DEBUG=$DEBUG \ + NORACE=$NORACE \ + OTHER_IMAGE_TARGETS="" \ + BINARIES="" \ + PLUGINS=nri-resctrl-mon \ + clean install-go-licenses build-plugins-static licenses + +FROM gcr.io/distroless/static + +COPY --from=builder /go/builder/build/bin/nri-resctrl-mon /bin/nri-resctrl-mon +COPY --from=builder /go/builder/build/licenses/nri-resctrl-mon/ /licenses/nri-resctrl-mon/ +COPY --from=builder /go/builder/sample-configs/nri-resctrl-mon.yaml /etc/nri/resctrl-mon/config.yaml + +ENTRYPOINT ["/bin/nri-resctrl-mon", "-idx", "90", "-config", "/etc/nri/resctrl-mon/config.yaml"] diff --git a/cmd/plugins/resctrl-mon/main.go b/cmd/plugins/resctrl-mon/main.go new file mode 100644 index 000000000..eedf56599 --- /dev/null +++ b/cmd/plugins/resctrl-mon/main.go @@ -0,0 +1,90 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package main + +import ( + "context" + "flag" + "os" + + "github.com/containerd/nri/pkg/stub" + "github.com/sirupsen/logrus" +) + +var ( + log *logrus.Logger +) + +func main() { + var ( + pluginName string + pluginIdx string + configFile string + verbose bool + veryVerbose bool + err error + ) + + log = logrus.StandardLogger() + log.SetFormatter(&logrus.TextFormatter{ + PadLevelText: true, + }) + + flag.StringVar(&pluginName, "name", "", "plugin name to register to NRI") + flag.StringVar(&pluginIdx, "idx", "", "plugin index to register to NRI") + flag.StringVar(&configFile, "config", "", "configuration file name") + flag.BoolVar(&verbose, "v", false, "verbose output") + flag.BoolVar(&veryVerbose, "vv", false, "very verbose output") + flag.Parse() + + if verbose { + log.SetLevel(logrus.DebugLevel) + } + if veryVerbose { + log.SetLevel(logrus.TraceLevel) + } + + p := newPlugin() + + if configFile != "" { + log.Debugf("reading configuration from %q", configFile) + data, err := os.ReadFile(configFile) + if err != nil { + log.Fatalf("error reading configuration file %q: %s", configFile, err) + } + if err = p.setConfig(data); err != nil { + log.Fatalf("error applying configuration from file %q: %s", configFile, err) + } + } + + opts := []stub.Option{ + stub.WithOnClose(p.onClose), + } + if pluginName != "" { + opts = append(opts, stub.WithPluginName(pluginName)) + } + if pluginIdx != "" { + opts = append(opts, stub.WithPluginIdx(pluginIdx)) + } + + if p.stub, err = stub.New(p, opts...); err != nil { + log.Fatalf("failed to create plugin stub: %v", err) + } + + if err = p.stub.Run(context.Background()); err != nil { + log.Errorf("plugin exited (%v)", err) + os.Exit(1) + } +} diff --git a/cmd/plugins/resctrl-mon/plugin.go b/cmd/plugins/resctrl-mon/plugin.go new file mode 100644 index 000000000..52b067b27 --- /dev/null +++ b/cmd/plugins/resctrl-mon/plugin.go @@ -0,0 +1,384 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "context" + "fmt" + "os" + "path/filepath" + "sync" + "time" + + "sigs.k8s.io/yaml" + + "github.com/containerd/nri/pkg/api" + "github.com/containerd/nri/pkg/stub" +) + +const ( + // reconcileInterval is how often the background reconciler checks for + // orphaned mon_groups left behind by failed StopContainer removals. + reconcileInterval = 30 * time.Second +) + +// plugin implements the NRI plugin interface for resctrl monitoring groups. +type plugin struct { + stub stub.Stub + config *pluginConfig + state *podState + rdt *resctrlOps + mu sync.Mutex // serializes ensureMonGroup to prevent TOCTOU races + stopReconciler chan struct{} // closed to stop the background reconciler +} + +// pluginConfig holds the runtime configuration for the plugin. +type pluginConfig struct { + // ResctrlPath is the mount point of the resctrl filesystem. + ResctrlPath string `json:"resctrlPath"` + + // Namespaces filters mon_group creation to pods in these namespaces. + // Empty list means all namespaces. + Namespaces []string `json:"namespaces"` + + // LabelSelector filters mon_group creation to pods matching these labels. + // Empty map means all pods. 
+ LabelSelector map[string]string `json:"labelSelector"` +} + +func newPlugin() *plugin { + cfg := &pluginConfig{ + ResctrlPath: defaultResctrlPath, + } + return &plugin{ + config: cfg, + state: newPodState(), + rdt: newResctrlOps(cfg.ResctrlPath), + } +} + +// Configure handles connecting to container runtime's NRI server. +func (p *plugin) Configure(ctx context.Context, config, runtime, version string) (stub.EventMask, error) { + log.Infof("Connected to %s %s...", runtime, version) + if config != "" { + log.Debugf("loading configuration from NRI server") + if err := p.setConfig([]byte(config)); err != nil { + return 0, err + } + } + return 0, nil +} + +// onClose handles losing connection to container runtime. +func (p *plugin) onClose() { + if p.stopReconciler != nil { + close(p.stopReconciler) + } + log.Infof("Connection to the runtime lost, exiting...") + os.Exit(0) +} + +// setConfig applies new plugin configuration. +func (p *plugin) setConfig(data []byte) error { + log.Tracef("setConfig: parsing\n---8<---\n%s\n--->8---", data) + cfg := pluginConfig{ + ResctrlPath: defaultResctrlPath, + } + if err := yaml.Unmarshal(data, &cfg); err != nil { + return fmt.Errorf("setConfig: cannot parse configuration: %w", err) + } + resctrlPath := filepath.Clean(cfg.ResctrlPath) + if resctrlPath == "" || !filepath.IsAbs(resctrlPath) { + return fmt.Errorf("setConfig: resctrlPath must be an absolute path, got %q", cfg.ResctrlPath) + } + cfg.ResctrlPath = resctrlPath + p.config = &cfg + p.rdt = newResctrlOps(cfg.ResctrlPath) + log.Debugf("configuration: resctrlPath=%s namespaces=%v labelSelector=%v", + cfg.ResctrlPath, cfg.Namespaces, cfg.LabelSelector) + return nil +} + +// Synchronize is called at plugin startup with the current set of pods and containers. +// It reconciles in-memory state with what exists on the resctrl filesystem. 
+func (p *plugin) Synchronize(ctx context.Context, pods []*api.PodSandbox, containers []*api.Container) ([]*api.ContainerUpdate, error) { + log.Infof("synchronizing state: %d pods, %d containers", len(pods), len(containers)) + + // Build a lookup from sandbox ID to pod (containers reference + // pods by sandbox ID, not by Kubernetes UID). + podBySandboxID := make(map[string]*api.PodSandbox, len(pods)) + for _, pod := range pods { + podBySandboxID[pod.GetId()] = pod + } + + // Create mon_groups for running containers that don't have one, + // and write their PIDs to ensure monitoring is active after restart. + for _, ctr := range containers { + pod, ok := podBySandboxID[ctr.GetPodSandboxId()] + if !ok { + log.Debugf("Synchronize: container %s has no matching pod, skipping", ctr.GetName()) + continue + } + if !p.shouldMonitorPod(pod) { + continue + } + podUID := pod.GetUid() + rdtClass := getRDTClass(ctr) + if err := p.ensureMonGroup(podUID, ctr.GetId(), rdtClass); err != nil { + log.Warnf("Synchronize: failed to create mon_group for pod %s: %v", podUID, err) + continue + } + pid := int(ctr.GetPid()) + if pid > 0 { + monGroupDir := p.state.getMonGroupDir(podUID) + if err := p.rdt.writeTaskPID(monGroupDir, pid); err != nil { + log.Warnf("Synchronize: failed to write PID %d for pod %s: %v", pid, podUID, err) + } else { + log.Debugf("Synchronize: assigned pid %d for pod %s", pid, podUID) + } + } + } + + // Remove orphaned mon_groups from a previous plugin instance. + p.rdt.cleanOrphanedMonGroups(p.state) + + // Start the background reconciler to periodically clean up orphaned + // mon_groups that could not be removed during StopContainer. + p.startReconciler() + + log.Infof("synchronization complete: tracking %d pods", p.state.podCount()) + return nil, nil +} + +// startReconciler launches a background goroutine that periodically removes +// orphaned mon_group directories. 
This handles the case where removeMonGroup +// fails in StopContainer (e.g., kernel busy) and the directory lingers. +func (p *plugin) startReconciler() { + if p.stopReconciler != nil { + // Already running from a previous Synchronize call. + return + } + p.stopReconciler = make(chan struct{}) + go func() { + ticker := time.NewTicker(reconcileInterval) + defer ticker.Stop() + for { + select { + case <-p.stopReconciler: + return + case <-ticker.C: + p.rdt.cleanOrphanedMonGroups(p.state) + } + } + }() + log.Debugf("background reconciler started (interval=%s)", reconcileInterval) +} + +// PostCreateContainer is called after the container is created but before +// it starts executing. The container PID is NOT yet available (pid=0) because +// the init process has not been started. We create the mon_group here so it +// is ready for PID assignment in StartContainer. +func (p *plugin) PostCreateContainer(ctx context.Context, pod *api.PodSandbox, ctr *api.Container) error { + podUID := pod.GetUid() + ctrName := pprintCtr(pod, ctr) + + log.Debugf("PostCreateContainer %s: pid=%d (expected 0)", ctrName, ctr.GetPid()) + + if !p.shouldMonitorPod(pod) { + log.Debugf("PostCreateContainer %s: pod filtered out, skipping", ctrName) + return nil + } + + rdtClass := getRDTClass(ctr) + if err := p.ensureMonGroup(podUID, ctr.GetId(), rdtClass); err != nil { + log.Warnf("PostCreateContainer %s: failed to create mon_group: %v", ctrName, err) + return nil // non-fatal: don't block container creation + } + + log.Infof("PostCreateContainer %s: mon_group ready, PID will be assigned in StartContainer", ctrName) + return nil +} + +// StartContainer is called just before the container process starts executing. +// At this point the init process has been created (via runc create) and the PID +// is available, but the process is paused and has NOT forked any threads yet. 
+// This is the ideal moment to write the PID to the resctrl mon_group tasks +// file: the kernel assigns the RMID to this PID, and when the process starts +// and forks threads they all inherit the RMID automatically. +// +// If the PID is not available (should not happen at this stage), we fall back +// to PostStartContainer which will write PIDs after the process starts. +func (p *plugin) StartContainer(ctx context.Context, pod *api.PodSandbox, ctr *api.Container) error { + podUID := pod.GetUid() + ctrName := pprintCtr(pod, ctr) + pid := int(ctr.GetPid()) + + log.Debugf("StartContainer %s: pid=%d", ctrName, pid) + + if !p.shouldMonitorPod(pod) { + return nil + } + + monGroupDir := p.state.getMonGroupDir(podUID) + if monGroupDir == "" { + log.Debugf("StartContainer %s: no mon_group (pod not tracked), skipping", ctrName) + return nil + } + + if pid > 0 { + if err := p.rdt.writeTaskPID(monGroupDir, pid); err != nil { + log.Warnf("StartContainer %s: failed to write PID %d to tasks: %v", ctrName, pid, err) + } else { + log.Infof("StartContainer %s: assigned pid %d to mon_group %s (pre-start, no threads yet)", ctrName, pid, monGroupDir) + } + } else { + log.Warnf("StartContainer %s: PID not available at pre-start, will retry in PostStartContainer", ctrName) + } + + return nil +} + +// PostStartContainer is called after the container process has been started. +// It is a fallback for when StartContainer did not have the PID: the init PID +// is (re)written here whenever it is known. Tasks forked after this write +// inherit the RMID; any created before it may not be covered. 
+func (p *plugin) PostStartContainer(ctx context.Context, pod *api.PodSandbox, ctr *api.Container) error { + podUID := pod.GetUid() + ctrName := pprintCtr(pod, ctr) + pid := int(ctr.GetPid()) + + log.Debugf("PostStartContainer %s: pid=%d", ctrName, pid) + + if !p.shouldMonitorPod(pod) { + return nil + } + + monGroupDir := p.state.getMonGroupDir(podUID) + if monGroupDir == "" { + return nil + } + + // Fallback: write the init PID if StartContainer didn't. + if pid > 0 { + if err := p.rdt.writeTaskPID(monGroupDir, pid); err != nil { + log.Warnf("PostStartContainer %s: failed to write PID %d to tasks: %v", ctrName, pid, err) + } else { + log.Infof("PostStartContainer %s: fallback assigned pid %d to mon_group %s", ctrName, pid, monGroupDir) + } + } else { + log.Warnf("PostStartContainer %s: PID still 0 after start, unexpected", ctrName) + } + + return nil +} + +// StopContainer is called when a container is being stopped. +func (p *plugin) StopContainer(ctx context.Context, pod *api.PodSandbox, ctr *api.Container) ([]*api.ContainerUpdate, error) { + podUID := pod.GetUid() + ctrName := pprintCtr(pod, ctr) + + log.Debugf("StopContainer %s", ctrName) + + monGroupDir := p.state.getMonGroupDir(podUID) + if monGroupDir == "" { + return nil, nil + } + + p.state.removeContainer(podUID, ctr.GetId()) + + if p.state.podHasNoContainers(podUID) { + log.Infof("StopContainer %s: last container, removing mon_group %s", ctrName, monGroupDir) + if err := p.rdt.removeMonGroup(monGroupDir); err != nil { + log.Warnf("StopContainer %s: failed to remove mon_group (will be cleaned on next restart): %v", ctrName, err) + } + p.state.removePod(podUID) + } + + return nil, nil +} + +// ensureMonGroup creates the mon_group directory if it doesn't exist and registers +// the container in the in-memory state. +// +// Limitation: all containers in a pod share a single mon_group under the first +// container's RDT class. 
If an allocation plugin assigns different classes to +// containers in the same pod, subsequent containers use the first class. +func (p *plugin) ensureMonGroup(podUID, containerID, rdtClass string) error { + if !looksLikePodUID(podUID) { + return fmt.Errorf("invalid pod UID %q", podUID) + } + + p.mu.Lock() + defer p.mu.Unlock() + + if p.state.getMonGroupDir(podUID) != "" { + // Mon_group already exists for this pod. Just add the container. + p.state.addContainer(podUID, containerID) + return nil + } + + monGroupDir, err := p.rdt.createMonGroup(rdtClass, podUID) + if err != nil { + return err + } + + p.state.addPod(podUID, monGroupDir) + p.state.addContainer(podUID, containerID) + log.Infof("created mon_group %s for pod %s", monGroupDir, podUID) + return nil +} + +// shouldMonitorPod checks namespace and label filters. +func (p *plugin) shouldMonitorPod(pod *api.PodSandbox) bool { + if len(p.config.Namespaces) > 0 { + ns := pod.GetNamespace() + found := false + for _, allowed := range p.config.Namespaces { + if ns == allowed { + found = true + break + } + } + if !found { + return false + } + } + if len(p.config.LabelSelector) > 0 { + labels := pod.GetLabels() + for k, v := range p.config.LabelSelector { + if labels[k] != v { + return false + } + } + } + return true +} + +// getRDTClass extracts the RDT class from a container's Linux resources. +func getRDTClass(ctr *api.Container) string { + if linux := ctr.GetLinux(); linux != nil { + if res := linux.GetResources(); res != nil { + if rdt := res.GetRdtClass(); rdt != nil { + return rdt.GetValue() + } + } + } + return "" +} + +// pprintCtr returns a human-readable container identifier. 
+func pprintCtr(pod *api.PodSandbox, ctr *api.Container) string { + return fmt.Sprintf("%s/%s:%s", pod.GetNamespace(), pod.GetName(), ctr.GetName()) +} diff --git a/cmd/plugins/resctrl-mon/plugin_test.go b/cmd/plugins/resctrl-mon/plugin_test.go new file mode 100644 index 000000000..272c91754 --- /dev/null +++ b/cmd/plugins/resctrl-mon/plugin_test.go @@ -0,0 +1,369 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import ( + "context" + "os" + "path/filepath" + "testing" + + "github.com/containerd/nri/pkg/api" + "github.com/sirupsen/logrus" + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func init() { + log = logrus.StandardLogger() + log.SetLevel(logrus.TraceLevel) +} + +func newTestPlugin(resctrlPath string) *plugin { + cfg := &pluginConfig{ + ResctrlPath: resctrlPath, + } + return &plugin{ + config: cfg, + state: newPodState(), + rdt: newResctrlOps(resctrlPath), + } +} + +func makePod(uid, namespace, name string) *api.PodSandbox { + return &api.PodSandbox{ + Id: "sandbox-" + uid, // CRI sandbox ID != K8s pod UID + Uid: uid, + Namespace: namespace, + Name: name, + Labels: map[string]string{}, + } +} + +func makeContainer(id, name, podSandboxID string, pid uint32, rdtClass string) *api.Container { + ctr := &api.Container{ + Id: id, + PodSandboxId: podSandboxID, + Name: name, + Pid: pid, + Linux: &api.LinuxContainer{ + Resources: 
&api.LinuxResources{}, + }, + } + if rdtClass != "" { + ctr.Linux.Resources.RdtClass = &api.OptionalString{Value: rdtClass} + } + return ctr +} + +func TestShouldMonitorPod_NoFilters(t *testing.T) { + p := newTestPlugin("/tmp/resctrl-test") + pod := makePod("uid-1", "default", "test-pod") + assert.True(t, p.shouldMonitorPod(pod)) +} + +func TestShouldMonitorPod_NamespaceFilter(t *testing.T) { + p := newTestPlugin("/tmp/resctrl-test") + p.config.Namespaces = []string{"production", "staging"} + + pod1 := makePod("uid-1", "production", "pod1") + assert.True(t, p.shouldMonitorPod(pod1)) + + pod2 := makePod("uid-2", "kube-system", "pod2") + assert.False(t, p.shouldMonitorPod(pod2)) +} + +func TestShouldMonitorPod_LabelFilter(t *testing.T) { + p := newTestPlugin("/tmp/resctrl-test") + p.config.LabelSelector = map[string]string{"monitor": "true"} + + pod1 := makePod("uid-1", "default", "pod1") + pod1.Labels = map[string]string{"monitor": "true", "app": "web"} + assert.True(t, p.shouldMonitorPod(pod1)) + + pod2 := makePod("uid-2", "default", "pod2") + pod2.Labels = map[string]string{"app": "web"} + assert.False(t, p.shouldMonitorPod(pod2)) +} + +func TestGetRDTClass(t *testing.T) { + ctr1 := makeContainer("c1", "container1", "uid-1", 1234, "BestEffort") + assert.Equal(t, "BestEffort", getRDTClass(ctr1)) + + ctr2 := makeContainer("c2", "container2", "uid-1", 1235, "") + assert.Equal(t, "", getRDTClass(ctr2)) + + ctr3 := &api.Container{ + Id: "c3", + Name: "container3", + } + assert.Equal(t, "", getRDTClass(ctr3)) +} + +func TestPprintCtr(t *testing.T) { + pod := makePod("uid-1", "default", "my-pod") + ctr := makeContainer("c1", "my-container", "uid-1", 1234, "") + assert.Equal(t, "default/my-pod:my-container", pprintCtr(pod, ctr)) +} + +func TestPostCreateContainer_FilteredPod(t *testing.T) { + p := newTestPlugin(t.TempDir()) + p.config.Namespaces = []string{"production"} + + pod := makePod("uid-1", "default", "test-pod") + ctr := makeContainer("c1", "container1", "uid-1", 
1234, "") + + err := p.PostCreateContainer(context.Background(), pod, ctr) + require.NoError(t, err) + + // Pod should not be tracked since it's not in the production namespace. + assert.Equal(t, 0, p.state.podCount()) +} + +func TestPostCreateContainer_CreatesMonGroup(t *testing.T) { + tmpDir := t.TempDir() + p := newTestPlugin(tmpDir) + + pod := makePod("a1b2c3d4-e5f6-7890-abcd-ef1234567890", "default", "test-pod") + ctr := makeContainer("c1", "container1", "a1b2c3d4-e5f6-7890-abcd-ef1234567890", 0, "") + + err := p.PostCreateContainer(context.Background(), pod, ctr) + require.NoError(t, err) + + // Pod should be tracked. + assert.Equal(t, 1, p.state.podCount()) + monDir := p.state.getMonGroupDir("a1b2c3d4-e5f6-7890-abcd-ef1234567890") + assert.Contains(t, monDir, "mon_groups/a1b2c3d4-e5f6-7890-abcd-ef1234567890") +} + +func TestPostCreateContainer_WithRDTClass(t *testing.T) { + tmpDir := t.TempDir() + p := newTestPlugin(tmpDir) + require.NoError(t, os.Mkdir(filepath.Join(tmpDir, "BestEffort"), 0755)) + + pod := makePod("a1b2c3d4-e5f6-7890-abcd-ef1234567890", "default", "test-pod") + ctr := makeContainer("c1", "container1", "a1b2c3d4-e5f6-7890-abcd-ef1234567890", 0, "BestEffort") + + err := p.PostCreateContainer(context.Background(), pod, ctr) + require.NoError(t, err) + + monDir := p.state.getMonGroupDir("a1b2c3d4-e5f6-7890-abcd-ef1234567890") + assert.Contains(t, monDir, "BestEffort/mon_groups/a1b2c3d4-e5f6-7890-abcd-ef1234567890") +} + +func TestMultiContainerPod(t *testing.T) { + tmpDir := t.TempDir() + p := newTestPlugin(tmpDir) + podUID := "a1b2c3d4-e5f6-7890-abcd-ef1234567890" + + pod := makePod(podUID, "default", "multi-pod") + ctr1 := makeContainer("c1", "container1", podUID, 0, "") + ctr2 := makeContainer("c2", "container2", podUID, 0, "") + + // First container creates the mon_group. 
+ err := p.PostCreateContainer(context.Background(), pod, ctr1) + require.NoError(t, err) + assert.Equal(t, 1, p.state.podCount()) + + // Second container reuses the same mon_group. + err = p.PostCreateContainer(context.Background(), pod, ctr2) + require.NoError(t, err) + assert.Equal(t, 1, p.state.podCount()) // still one pod + + // Stopping first container should not remove the mon_group. + _, err = p.StopContainer(context.Background(), pod, ctr1) + require.NoError(t, err) + assert.Equal(t, 1, p.state.podCount()) + assert.False(t, p.state.podHasNoContainers(podUID)) + + // Stopping second container should remove the mon_group. + _, err = p.StopContainer(context.Background(), pod, ctr2) + require.NoError(t, err) + assert.Equal(t, 0, p.state.podCount()) +} + +func TestStopContainer_UnknownPod(t *testing.T) { + p := newTestPlugin(t.TempDir()) + + pod := makePod("unknown-uid", "default", "unknown-pod") + ctr := makeContainer("c1", "container1", "unknown-uid", 1234, "") + + updates, err := p.StopContainer(context.Background(), pod, ctr) + require.NoError(t, err) + assert.Nil(t, updates) +} + +func TestSetConfig(t *testing.T) { + p := newTestPlugin("/tmp/resctrl-test") + + configYAML := []byte(` +resctrlPath: /tmp/test-resctrl +namespaces: + - production + - staging +labelSelector: + monitor: "true" +`) + + err := p.setConfig(configYAML) + require.NoError(t, err) + assert.Equal(t, "/tmp/test-resctrl", p.config.ResctrlPath) + assert.Equal(t, []string{"production", "staging"}, p.config.Namespaces) + assert.Equal(t, map[string]string{"monitor": "true"}, p.config.LabelSelector) +} + +func TestSetConfig_InvalidYAML(t *testing.T) { + p := newTestPlugin("/tmp/resctrl-test") + + err := p.setConfig([]byte(":::invalid yaml")) + assert.Error(t, err) +} + +func TestSetConfig_RelativePath(t *testing.T) { + p := newTestPlugin("/tmp/resctrl-test") + + err := p.setConfig([]byte("resctrlPath: relative/path")) + assert.Error(t, err) + assert.Contains(t, err.Error(), "absolute path") +} 
+ +func TestSynchronize_UsesUIDNotSandboxID(t *testing.T) { + tmpDir := t.TempDir() + p := newTestPlugin(tmpDir) + podUID := "a1b2c3d4-e5f6-7890-abcd-ef1234567890" + + pod := makePod(podUID, "default", "sync-pod") + // Container references the pod by sandbox ID, not by UID. + ctr := makeContainer("c1", "container1", pod.GetId(), 0, "") + + _, err := p.Synchronize(context.Background(), []*api.PodSandbox{pod}, []*api.Container{ctr}) + require.NoError(t, err) + + // The mon_group should be keyed by the K8s pod UID, not the sandbox ID. + assert.Equal(t, 1, p.state.podCount()) + assert.True(t, p.state.hasPod(podUID)) + assert.False(t, p.state.hasPod(pod.GetId())) + + monDir := p.state.getMonGroupDir(podUID) + assert.Contains(t, monDir, podUID) +} + +func TestEnsureMonGroup_InvalidUID(t *testing.T) { + p := newTestPlugin(t.TempDir()) + + err := p.ensureMonGroup("", "c1", "") + assert.Error(t, err) + assert.Contains(t, err.Error(), "invalid pod UID") + + err = p.ensureMonGroup("not-a-uuid", "c1", "") + assert.Error(t, err) + + assert.Equal(t, 0, p.state.podCount()) +} + +func TestStartContainer_AssignsPID(t *testing.T) { + tmpDir := t.TempDir() + p := newTestPlugin(tmpDir) + podUID := "a1b2c3d4-e5f6-7890-abcd-ef1234567890" + + pod := makePod(podUID, "default", "test-pod") + ctr := makeContainer("c1", "container1", podUID, 0, "") + + // Create the mon_group via PostCreateContainer. + err := p.PostCreateContainer(context.Background(), pod, ctr) + require.NoError(t, err) + + monDir := p.state.getMonGroupDir(podUID) + require.NotEmpty(t, monDir) + + // Simulate the kernel creating the tasks file. + require.NoError(t, os.WriteFile(filepath.Join(monDir, "tasks"), nil, 0644)) + + // StartContainer with a valid PID should write it to tasks. 
+ ctrWithPid := makeContainer("c1", "container1", podUID, 42, "") + err = p.StartContainer(context.Background(), pod, ctrWithPid) + require.NoError(t, err) + + data, err := os.ReadFile(filepath.Join(monDir, "tasks")) + require.NoError(t, err) + assert.Equal(t, "42\n", string(data)) +} + +func TestStartContainer_PIDZero_FallbackToPostStart(t *testing.T) { + tmpDir := t.TempDir() + p := newTestPlugin(tmpDir) + podUID := "a1b2c3d4-e5f6-7890-abcd-ef1234567890" + + pod := makePod(podUID, "default", "test-pod") + ctr := makeContainer("c1", "container1", podUID, 0, "") + + // Create the mon_group. + err := p.PostCreateContainer(context.Background(), pod, ctr) + require.NoError(t, err) + + monDir := p.state.getMonGroupDir(podUID) + require.NotEmpty(t, monDir) + require.NoError(t, os.WriteFile(filepath.Join(monDir, "tasks"), nil, 0644)) + + // StartContainer with PID 0 should not fail (just warns). + err = p.StartContainer(context.Background(), pod, ctr) + require.NoError(t, err) + + // PostStartContainer with a valid PID should write it. + ctrWithPid := makeContainer("c1", "container1", podUID, 99, "") + err = p.PostStartContainer(context.Background(), pod, ctrWithPid) + require.NoError(t, err) + + data, err := os.ReadFile(filepath.Join(monDir, "tasks")) + require.NoError(t, err) + assert.Equal(t, "99\n", string(data)) +} + +func TestStartContainer_FilteredPod(t *testing.T) { + p := newTestPlugin(t.TempDir()) + p.config.Namespaces = []string{"production"} + + pod := makePod("a1b2c3d4-e5f6-7890-abcd-ef1234567890", "default", "test-pod") + ctr := makeContainer("c1", "container1", "a1b2c3d4-e5f6-7890-abcd-ef1234567890", 42, "") + + // Should not error even though pod is filtered. 
+ err := p.StartContainer(context.Background(), pod, ctr) + require.NoError(t, err) +} + +func TestStopContainer_RemovesStateOnRmdirFailure(t *testing.T) { + tmpDir := t.TempDir() + p := newTestPlugin(tmpDir) + podUID := "a1b2c3d4-e5f6-7890-abcd-ef1234567890" + + pod := makePod(podUID, "default", "test-pod") + ctr := makeContainer("c1", "container1", podUID, 0, "") + + // Create the mon_group. + err := p.PostCreateContainer(context.Background(), pod, ctr) + require.NoError(t, err) + assert.Equal(t, 1, p.state.podCount()) + + monDir := p.state.getMonGroupDir(podUID) + require.NotEmpty(t, monDir) + + // Put a file inside the mon_group dir so os.Remove fails (dir not empty). + require.NoError(t, os.WriteFile(filepath.Join(monDir, "tasks"), nil, 0644)) + + // StopContainer should still remove pod from state even if rmdir fails. + _, err = p.StopContainer(context.Background(), pod, ctr) + require.NoError(t, err) + assert.Equal(t, 0, p.state.podCount()) +} diff --git a/cmd/plugins/resctrl-mon/resctrl.go b/cmd/plugins/resctrl-mon/resctrl.go new file mode 100644 index 000000000..fdb37cf4d --- /dev/null +++ b/cmd/plugins/resctrl-mon/resctrl.go @@ -0,0 +1,216 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package main + +import ( + "errors" + "fmt" + "os" + "path/filepath" + "strconv" + "strings" + "syscall" +) + +const ( + defaultResctrlPath = "/sys/fs/resctrl" + monGroupsDir = "mon_groups" +) + +// resctrlOps handles filesystem operations on the resctrl mount. +type resctrlOps struct { + resctrlPath string +} + +func newResctrlOps(resctrlPath string) *resctrlOps { + return &resctrlOps{ + resctrlPath: resctrlPath, + } +} + +// createMonGroup creates a mon_group directory under the appropriate ctrl_group +// and returns the full path. If rdtClass is empty, the mon_group is created +// under the root resctrl directory. +// +// The kernel assigns an RMID to the new mon_group on mkdir. If no RMIDs are +// available, mkdir returns ENOSPC. +func (r *resctrlOps) createMonGroup(rdtClass, podUID string) (string, error) { + parentDir := r.resctrlPath + if rdtClass != "" { + if !isValidRDTClass(rdtClass) { + return "", fmt.Errorf("invalid RDT class name %q", rdtClass) + } + parentDir = filepath.Join(r.resctrlPath, rdtClass) + } + + // When an RDT class is specified, the ctrl_group must already exist + // (created by an allocation plugin). Do not create it implicitly — + // that would make an unintended ctrl_group in the resctrl filesystem. + if rdtClass != "" { + info, err := os.Stat(parentDir) + if err != nil { + return "", fmt.Errorf("ctrl_group %s does not exist: %w", parentDir, err) + } + if !info.IsDir() { + return "", fmt.Errorf("ctrl_group %s is not a directory", parentDir) + } + } + + monGroupsPath := filepath.Join(parentDir, monGroupsDir) + monGroupDir := filepath.Join(monGroupsPath, podUID) + + // Ensure the mon_groups/ directory exists. On a real resctrl mount + // this is always present. For testing, create it if needed. 
+ if err := os.MkdirAll(monGroupsPath, 0755); err != nil { + return "", fmt.Errorf("mon_groups dir not available at %s: %w", monGroupsPath, err) + } + + // Use Mkdir (not MkdirAll) for the final mon_group directory to + // avoid accidentally creating a ctrl_group if rdtClass is wrong. + if err := os.Mkdir(monGroupDir, 0755); err != nil { + if errors.Is(err, os.ErrExist) { + return monGroupDir, nil + } + if errors.Is(err, syscall.ENOSPC) { + return "", fmt.Errorf("no RMIDs available for pod %s: %w", podUID, err) + } + return "", fmt.Errorf("failed to create mon_group %s: %w", monGroupDir, err) + } + + return monGroupDir, nil +} + +// removeMonGroup removes a mon_group directory. The kernel releases the RMID. +func (r *resctrlOps) removeMonGroup(monGroupDir string) error { + err := os.Remove(monGroupDir) + if err != nil && !errors.Is(err, os.ErrNotExist) { + return fmt.Errorf("failed to remove mon_group %s: %w", monGroupDir, err) + } + return nil +} + +// writeTaskPID writes a PID to the mon_group's tasks file. The kernel assigns +// this PID (and all future child processes) to the mon_group's RMID. +func (r *resctrlOps) writeTaskPID(monGroupDir string, pid int) error { + tasksFile := filepath.Join(monGroupDir, "tasks") + f, err := os.OpenFile(tasksFile, os.O_WRONLY, 0) + if err != nil { + return fmt.Errorf("failed to open %s for pid %d: %w", tasksFile, pid, err) + } + defer func() { _ = f.Close() }() + data := []byte(strconv.Itoa(pid) + "\n") + if _, err := f.Write(data); err != nil { + return fmt.Errorf("failed to write pid %d to %s: %w", pid, tasksFile, err) + } + return nil +} + +// cleanOrphanedMonGroups removes mon_group directories that are not tracked +// in the given state. This handles cleanup after a plugin crash/restart. +func (r *resctrlOps) cleanOrphanedMonGroups(state *podState) { + // Scan root-level mon_groups. + r.cleanOrphanedInDir(filepath.Join(r.resctrlPath, monGroupsDir), state) + + // Scan ctrl_group-level mon_groups. 
+ entries, err := os.ReadDir(r.resctrlPath) + if err != nil { + log.Warnf("cleanOrphanedMonGroups: failed to read %s: %v", r.resctrlPath, err) + return + } + for _, entry := range entries { + if !entry.IsDir() { + continue + } + name := entry.Name() + // Skip non-ctrl_group entries. + if name == monGroupsDir || name == "info" || strings.HasPrefix(name, "mon_") { + continue + } + ctrlGroupMonDir := filepath.Join(r.resctrlPath, name, monGroupsDir) + r.cleanOrphanedInDir(ctrlGroupMonDir, state) + } +} + +// cleanOrphanedInDir removes mon_group directories in a specific mon_groups/ +// directory that look like pod UIDs but are not tracked in state. +func (r *resctrlOps) cleanOrphanedInDir(monGroupsPath string, state *podState) { + entries, err := os.ReadDir(monGroupsPath) + if err != nil { + if !errors.Is(err, os.ErrNotExist) { + log.Warnf("failed to read mon_groups directory %s: %v", monGroupsPath, err) + } + return + } + for _, entry := range entries { + if !entry.IsDir() { + continue + } + name := entry.Name() + // Only clean directories that look like pod UIDs (contain dashes like UUIDs). + if !looksLikePodUID(name) { + continue + } + orphanDir := filepath.Join(monGroupsPath, name) + trackedDir := state.getMonGroupDir(name) + if trackedDir == orphanDir { + // This is the active mon_group for this pod. + continue + } + log.Infof("removing orphaned mon_group %s", orphanDir) + if err := os.Remove(orphanDir); err != nil && !errors.Is(err, os.ErrNotExist) { + log.Warnf("failed to remove orphaned mon_group %s: %v", orphanDir, err) + } + } +} + +// looksLikePodUID returns true if the name looks like a Kubernetes pod UID +// (UUID format with dashes, e.g., a1b2c3d4-e5f6-7890-abcd-ef1234567890). +func looksLikePodUID(name string) bool { + if len(name) != 36 { + return false + } + // Check for UUID-like pattern: 8-4-4-4-12 hex chars. 
+ parts := strings.Split(name, "-") + if len(parts) != 5 { + return false + } + expectedLens := []int{8, 4, 4, 4, 12} + for i, part := range parts { + if len(part) != expectedLens[i] { + return false + } + for _, c := range part { + if (c < '0' || c > '9') && (c < 'a' || c > 'f') && (c < 'A' || c > 'F') { + return false + } + } + } + return true +} + +// isValidRDTClass returns true if the name is a safe resctrl ctrl_group name. +// It rejects path separators, dot-segments, and empty strings to prevent +// path traversal outside the resctrl mount. +func isValidRDTClass(name string) bool { + if name == "" || name == "." || name == ".." { + return false + } + for _, c := range name { + if c == '/' || c == 0 { + return false + } + } + return true +} diff --git a/cmd/plugins/resctrl-mon/resctrl_test.go b/cmd/plugins/resctrl-mon/resctrl_test.go new file mode 100644 index 000000000..9d2dbedb1 --- /dev/null +++ b/cmd/plugins/resctrl-mon/resctrl_test.go @@ -0,0 +1,236 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package main + +import ( + "os" + "path/filepath" + "testing" + + "github.com/stretchr/testify/assert" + "github.com/stretchr/testify/require" +) + +func TestCreateMonGroup_RootClass(t *testing.T) { + tmpDir := t.TempDir() + r := newResctrlOps(tmpDir) + + dir, err := r.createMonGroup("", "pod-uid-1") + require.NoError(t, err) + assert.Equal(t, filepath.Join(tmpDir, "mon_groups", "pod-uid-1"), dir) + + // Directory should exist. + info, err := os.Stat(dir) + require.NoError(t, err) + assert.True(t, info.IsDir()) +} + +func TestCreateMonGroup_WithRDTClass(t *testing.T) { + tmpDir := t.TempDir() + r := newResctrlOps(tmpDir) + require.NoError(t, os.Mkdir(filepath.Join(tmpDir, "BestEffort"), 0755)) + + dir, err := r.createMonGroup("BestEffort", "pod-uid-2") + require.NoError(t, err) + assert.Equal(t, filepath.Join(tmpDir, "BestEffort", "mon_groups", "pod-uid-2"), dir) + + info, err := os.Stat(dir) + require.NoError(t, err) + assert.True(t, info.IsDir()) +} + +func TestCreateMonGroup_MissingCtrlGroup(t *testing.T) { + tmpDir := t.TempDir() + r := newResctrlOps(tmpDir) + + // Attempt to create a mon_group under a non-existent ctrl_group. + _, err := r.createMonGroup("NoSuchClass", "pod-uid-3") + assert.Error(t, err) + assert.Contains(t, err.Error(), "ctrl_group") + + // Verify the ctrl_group was NOT created. 
+ _, err = os.Stat(filepath.Join(tmpDir, "NoSuchClass")) + assert.True(t, os.IsNotExist(err)) +} + +func TestCreateMonGroup_Idempotent(t *testing.T) { + tmpDir := t.TempDir() + r := newResctrlOps(tmpDir) + + dir1, err := r.createMonGroup("", "pod-uid-1") + require.NoError(t, err) + + dir2, err := r.createMonGroup("", "pod-uid-1") + require.NoError(t, err) + + assert.Equal(t, dir1, dir2) +} + +func TestRemoveMonGroup(t *testing.T) { + tmpDir := t.TempDir() + r := newResctrlOps(tmpDir) + + dir, err := r.createMonGroup("", "pod-uid-1") + require.NoError(t, err) + + err = r.removeMonGroup(dir) + require.NoError(t, err) + + _, err = os.Stat(dir) + assert.True(t, os.IsNotExist(err)) +} + +func TestRemoveMonGroup_NotExist(t *testing.T) { + tmpDir := t.TempDir() + r := newResctrlOps(tmpDir) + + err := r.removeMonGroup(filepath.Join(tmpDir, "mon_groups", "nonexistent")) + assert.NoError(t, err) +} + +func TestWriteTaskPID(t *testing.T) { + tmpDir := t.TempDir() + r := newResctrlOps(tmpDir) + + dir, err := r.createMonGroup("", "pod-uid-1") + require.NoError(t, err) + + // In real resctrl, the kernel creates the tasks file when the + // mon_group directory is created. Simulate that here. + tasksFile := filepath.Join(dir, "tasks") + require.NoError(t, os.WriteFile(tasksFile, nil, 0644)) + + err = r.writeTaskPID(dir, 12345) + require.NoError(t, err) + + data, err := os.ReadFile(tasksFile) + require.NoError(t, err) + assert.Equal(t, "12345\n", string(data)) +} + +func TestCleanOrphanedMonGroups(t *testing.T) { + tmpDir := t.TempDir() + r := newResctrlOps(tmpDir) + state := newPodState() + + // Create a mon_group that IS tracked. + trackedUID := "a1b2c3d4-e5f6-7890-abcd-ef1234567890" + dir, err := r.createMonGroup("", trackedUID) + require.NoError(t, err) + state.addPod(trackedUID, dir) + + // Create a mon_group that is NOT tracked (orphan). 
+ orphanUID := "deadbeef-dead-beef-dead-beefdeadbeef" + _, err = r.createMonGroup("", orphanUID) + require.NoError(t, err) + + r.cleanOrphanedMonGroups(state) + + // Tracked should still exist. + _, err = os.Stat(filepath.Join(tmpDir, "mon_groups", trackedUID)) + assert.NoError(t, err) + + // Orphan should be removed. + _, err = os.Stat(filepath.Join(tmpDir, "mon_groups", orphanUID)) + assert.True(t, os.IsNotExist(err)) +} + +func TestCleanOrphanedMonGroups_CtrlGroup(t *testing.T) { + tmpDir := t.TempDir() + r := newResctrlOps(tmpDir) + state := newPodState() + + // Create orphan under a ctrl_group. + orphanUID := "deadbeef-dead-beef-dead-beefdeadbeef" + require.NoError(t, os.Mkdir(filepath.Join(tmpDir, "BestEffort"), 0755)) + _, err := r.createMonGroup("BestEffort", orphanUID) + require.NoError(t, err) + + r.cleanOrphanedMonGroups(state) + + _, err = os.Stat(filepath.Join(tmpDir, "BestEffort", "mon_groups", orphanUID)) + assert.True(t, os.IsNotExist(err)) +} + +func TestCleanOrphanedMonGroups_StaleLocation(t *testing.T) { + tmpDir := t.TempDir() + r := newResctrlOps(tmpDir) + state := newPodState() + + podUID := "a1b2c3d4-e5f6-7890-abcd-ef1234567890" + + // Create a mon_group under BestEffort (simulates previous run). + require.NoError(t, os.Mkdir(filepath.Join(tmpDir, "BestEffort"), 0755)) + _, err := r.createMonGroup("BestEffort", podUID) + require.NoError(t, err) + + // Track the pod at the root class (simulates current run with different RDT class). + rootDir, err := r.createMonGroup("", podUID) + require.NoError(t, err) + state.addPod(podUID, rootDir) + + r.cleanOrphanedMonGroups(state) + + // Root mon_group (tracked) should still exist. + _, err = os.Stat(rootDir) + assert.NoError(t, err) + + // BestEffort mon_group (stale) should be removed. 
+ _, err = os.Stat(filepath.Join(tmpDir, "BestEffort", "mon_groups", podUID)) + assert.True(t, os.IsNotExist(err)) +} + +func TestLooksLikePodUID(t *testing.T) { + assert.True(t, looksLikePodUID("a1b2c3d4-e5f6-7890-abcd-ef1234567890")) + assert.True(t, looksLikePodUID("DEADBEEF-DEAD-BEEF-DEAD-BEEFDEADBEEF")) + assert.True(t, looksLikePodUID("00000000-0000-0000-0000-000000000000")) + + assert.False(t, looksLikePodUID("short")) + assert.False(t, looksLikePodUID("not-a-uuid-at-all-nope-notthisone!")) + assert.False(t, looksLikePodUID("a1b2c3d4-e5f6-7890-abcd-ef123456789")) // too short last segment + assert.False(t, looksLikePodUID("g1b2c3d4-e5f6-7890-abcd-ef1234567890")) // 'g' is not hex + assert.False(t, looksLikePodUID("a1b2c3d4-e5f6-7890-abcd-ef1234567890x")) // too long +} + +func TestIsValidRDTClass(t *testing.T) { + assert.True(t, isValidRDTClass("BestEffort")) + assert.True(t, isValidRDTClass("Guaranteed")) + assert.True(t, isValidRDTClass("COS1")) + assert.True(t, isValidRDTClass("my-class_v2")) + + assert.False(t, isValidRDTClass("")) + assert.False(t, isValidRDTClass(".")) + assert.False(t, isValidRDTClass("..")) + assert.False(t, isValidRDTClass("../../etc")) + assert.False(t, isValidRDTClass("foo/bar")) + assert.False(t, isValidRDTClass("class\x00name")) +} + +func TestCreateMonGroup_PathTraversal(t *testing.T) { + tmpDir := t.TempDir() + r := newResctrlOps(tmpDir) + + _, err := r.createMonGroup("../../etc", "a1b2c3d4-e5f6-7890-abcd-ef1234567890") + assert.Error(t, err) + assert.Contains(t, err.Error(), "invalid RDT class") + + _, err = r.createMonGroup("foo/bar", "a1b2c3d4-e5f6-7890-abcd-ef1234567890") + assert.Error(t, err) + assert.Contains(t, err.Error(), "invalid RDT class") + + _, err = r.createMonGroup("..", "a1b2c3d4-e5f6-7890-abcd-ef1234567890") + assert.Error(t, err) + assert.Contains(t, err.Error(), "invalid RDT class") +} diff --git a/cmd/plugins/resctrl-mon/state.go b/cmd/plugins/resctrl-mon/state.go new file mode 100644 index 
000000000..47dba1f1a --- /dev/null +++ b/cmd/plugins/resctrl-mon/state.go @@ -0,0 +1,109 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package main + +import "sync" + +// podInfo tracks the mon_group directory and container set for a single pod. +type podInfo struct { + monGroupDir string + containers map[string]struct{} // container IDs +} + +// podState tracks all pods with active mon_groups. +type podState struct { + mu sync.Mutex + pods map[string]*podInfo // keyed by pod UID +} + +func newPodState() *podState { + return &podState{ + pods: make(map[string]*podInfo), + } +} + +// addPod registers a new pod with its mon_group directory. +// If the pod already exists, the existing entry is preserved. +func (s *podState) addPod(podUID, monGroupDir string) { + s.mu.Lock() + defer s.mu.Unlock() + if _, ok := s.pods[podUID]; ok { + return + } + s.pods[podUID] = &podInfo{ + monGroupDir: monGroupDir, + containers: make(map[string]struct{}), + } +} + +// addContainer adds a container ID to an existing pod's tracking. +func (s *podState) addContainer(podUID, containerID string) { + s.mu.Lock() + defer s.mu.Unlock() + if info, ok := s.pods[podUID]; ok { + info.containers[containerID] = struct{}{} + } +} + +// removeContainer removes a container ID from a pod's tracking. 
+func (s *podState) removeContainer(podUID, containerID string) { + s.mu.Lock() + defer s.mu.Unlock() + if info, ok := s.pods[podUID]; ok { + delete(info.containers, containerID) + } +} + +// removePod removes all tracking for a pod. +func (s *podState) removePod(podUID string) { + s.mu.Lock() + defer s.mu.Unlock() + delete(s.pods, podUID) +} + +// getMonGroupDir returns the mon_group directory for a pod, or empty string. +func (s *podState) getMonGroupDir(podUID string) string { + s.mu.Lock() + defer s.mu.Unlock() + if info, ok := s.pods[podUID]; ok { + return info.monGroupDir + } + return "" +} + +// podHasNoContainers returns true if the pod has no remaining containers. +func (s *podState) podHasNoContainers(podUID string) bool { + s.mu.Lock() + defer s.mu.Unlock() + if info, ok := s.pods[podUID]; ok { + return len(info.containers) == 0 + } + return true +} + +// hasPod returns true if the pod UID is being tracked. +func (s *podState) hasPod(podUID string) bool { + s.mu.Lock() + defer s.mu.Unlock() + _, ok := s.pods[podUID] + return ok +} + +// podCount returns the number of tracked pods. +func (s *podState) podCount() int { + s.mu.Lock() + defer s.mu.Unlock() + return len(s.pods) +} diff --git a/deployment/helm/resctrl-mon/.helmignore b/deployment/helm/resctrl-mon/.helmignore new file mode 100644 index 000000000..bf4d580e7 --- /dev/null +++ b/deployment/helm/resctrl-mon/.helmignore @@ -0,0 +1,20 @@ +# Patterns to ignore when building packages. 
+.DS_Store +# Common VCS dirs +.git/ +.gitignore +.bzr/ +.bzrignore +.hg/ +.hgignore +.svn/ +# Common backup files +*.swp +*.bak +*.tmp +*~ +# Various IDEs +.project +.idea/ +*.tmproj +.vscode/ diff --git a/deployment/helm/resctrl-mon/Chart.yaml b/deployment/helm/resctrl-mon/Chart.yaml new file mode 100644 index 000000000..f1e3e0d70 --- /dev/null +++ b/deployment/helm/resctrl-mon/Chart.yaml @@ -0,0 +1,11 @@ +apiVersion: v2 +appVersion: unstable +description: | + The resctrl-mon NRI plugin creates per-pod resctrl monitoring groups + (mon_groups) for Application Energy Telemetry via Kepler passive mode. +name: nri-resctrl-mon +sources: + - https://github.com/containers/nri-plugins +home: https://github.com/containers/nri-plugins +type: application +version: v0.0.0 diff --git a/deployment/helm/resctrl-mon/README.md b/deployment/helm/resctrl-mon/README.md new file mode 100644 index 000000000..206e3eced --- /dev/null +++ b/deployment/helm/resctrl-mon/README.md @@ -0,0 +1,121 @@ +# Resctrl-Mon Plugin + +This chart deploys the resctrl-mon Node Resource Interface (NRI) plugin. The +resctrl-mon NRI plugin creates per-pod resctrl monitoring groups (mon_groups) +to support Application Energy Telemetry (AET) via Kepler passive mode. + +## Prerequisites + +- Kubernetes 1.24+ +- Helm 3.0.0+ +- Intel CPU with RDT monitoring support (CMT/MBM and/or AET) +- resctrl filesystem mounted at `/sys/fs/resctrl` +- Container runtime: + - containerd: + - At least [containerd 1.7.0](https://github.com/containerd/containerd/releases/tag/v1.7.0) + release version to use the NRI feature. + + - Enable NRI feature by following + [these](https://github.com/containerd/containerd/blob/main/docs/NRI.md#enabling-nri-support-in-containerd) + detailed instructions. You can optionally enable the NRI in containerd + using the Helm chart during the chart installation simply by setting the + `nri.runtime.patchConfig` parameter. 
For instance, + + ```sh + helm install my-resctrl-mon nri-plugins/nri-resctrl-mon --set nri.runtime.patchConfig=true --namespace kube-system + ``` + + Enabling `nri.runtime.patchConfig` creates an init container that turns on + the NRI feature in containerd, and only after that proceeds with the plugin + installation. + + - CRI-O + - At least [v1.26.0](https://github.com/cri-o/cri-o/releases/tag/v1.26.0) + release version to use the NRI feature + - Enable NRI feature by following + [these](https://github.com/cri-o/cri-o/blob/main/docs/crio.conf.5.md#crionri-table) + detailed instructions. You can optionally enable the NRI in CRI-O using + the Helm chart during the chart installation simply by setting the + `nri.runtime.patchConfig` parameter. For instance, + + ```sh + helm install my-resctrl-mon nri-plugins/nri-resctrl-mon --namespace kube-system --set nri.runtime.patchConfig=true + ``` + +## Installing the Chart + +Path to the chart: `nri-resctrl-mon`. + +```sh +helm repo add nri-plugins https://containers.github.io/nri-plugins +helm install my-resctrl-mon nri-plugins/nri-resctrl-mon --namespace kube-system +``` + +The command above deploys resctrl-mon NRI plugin on the Kubernetes cluster +within the `kube-system` namespace with default configuration. To customize the +available parameters as described in the [Configuration options](#configuration-options) +below, you have two options: you can use the `--set` flag or create a custom +values.yaml file and provide it using the `-f` flag.
For example: + +```sh +# Install the resctrl-mon plugin with custom values provided using the --set option +helm install my-resctrl-mon nri-plugins/nri-resctrl-mon --namespace kube-system --set nri.runtime.patchConfig=true +``` + +```sh +# Install the resctrl-mon plugin with custom values specified in a custom values.yaml file +cat <<EOF > myPath/values.yaml +nri: + runtime: + patchConfig: true + plugin: + index: 90 + +tolerations: +- key: "node-role.kubernetes.io/control-plane" + operator: "Exists" + effect: "NoSchedule" +EOF + +helm install my-resctrl-mon nri-plugins/nri-resctrl-mon --namespace kube-system -f myPath/values.yaml +``` + +## Uninstalling the Chart + +To uninstall the resctrl-mon plugin run the following command: + +```sh +helm delete my-resctrl-mon --namespace kube-system +``` + +## Security + +The DaemonSet runs with `hostPID: true` because the plugin must write +host-namespace PIDs into resctrl `tasks` files. Without host PID +visibility the kernel rejects the write (`ESRCH`). The container also +requires `SYS_ADMIN` and `DAC_OVERRIDE` capabilities to manage resctrl +`mon_group` directories. + +## Configuration options + +The tables below present an overview of the parameters available for users to +customize with their own values, along with the default values.
+ +| Name | Default | Description | +| ------------------------ | ----------------------------------------------------------------------------------------------------------------------------- | ---------------------------------------------------- | +| `image.name` | [ghcr.io/containers/nri-plugins/nri-resctrl-mon](https://ghcr.io/containers/nri-plugins/nri-resctrl-mon) | container image name | +| `image.tag` | unstable | container image tag | +| `image.pullPolicy` | Always | image pull policy | +| `resources.cpu` | 10m | cpu resources for the Pod | +| `resources.memory` | 50Mi | memory quota for the Pod | +| `nri.runtime.config.pluginRegistrationTimeout` | "" | set NRI plugin registration timeout in NRI config of containerd or CRI-O | +| `nri.runtime.config.pluginRequestTimeout` | "" | set NRI plugin request timeout in NRI config of containerd or CRI-O | +| `nri.runtime.patchConfig` | false | patch NRI configuration in containerd or CRI-O | +| `nri.plugin.index` | 90 | NRI plugin index to register with | +| `initContainerImage.name` | [ghcr.io/containers/nri-plugins/nri-config-manager](https://ghcr.io/containers/nri-plugins/nri-config-manager) | init container image name | +| `initContainerImage.tag` | unstable | init container image tag | +| `initContainerImage.pullPolicy` | Always | init container image pull policy | +| `tolerations` | [] | specify taint toleration key, operator and effect | +| `affinity` | [] | specify node affinity | +| `nodeSelector` | [] | specify node selector labels | +| `podPriorityClassNodeCritical` | true | enable [marking Pod as node critical](https://kubernetes.io/docs/tasks/administer-cluster/guaranteed-scheduling-critical-addon-pods/#marking-pod-as-critical) | diff --git a/deployment/helm/resctrl-mon/templates/_helpers.tpl b/deployment/helm/resctrl-mon/templates/_helpers.tpl new file mode 100644 index 000000000..9b4239372 --- /dev/null +++ b/deployment/helm/resctrl-mon/templates/_helpers.tpl @@ -0,0 +1,16 @@ +{{/* +Common labels 
+*/}} +{{- define "nri-plugin.labels" -}} +helm.sh/chart: {{ .Chart.Name }}-{{ .Chart.Version }} +app.kubernetes.io/managed-by: {{ .Release.Service }} +{{ include "nri-plugin.selectorLabels" . }} +{{- end -}} + +{{/* +Selector labels +*/}} +{{- define "nri-plugin.selectorLabels" -}} +app.kubernetes.io/name: nri-resctrl-mon +app.kubernetes.io/instance: {{ .Release.Name }} +{{- end -}} diff --git a/deployment/helm/resctrl-mon/templates/configmap.yaml b/deployment/helm/resctrl-mon/templates/configmap.yaml new file mode 100644 index 000000000..3ad305503 --- /dev/null +++ b/deployment/helm/resctrl-mon/templates/configmap.yaml @@ -0,0 +1,12 @@ +apiVersion: v1 +kind: ConfigMap +metadata: + name: nri-resctrl-mon-config.default + namespace: {{ .Release.Namespace }} + labels: + {{- include "nri-plugin.labels" . | nindent 4 }} +data: + config.yaml: | + resctrlPath: /sys/fs/resctrl + namespaces: [] + labelSelector: {} diff --git a/deployment/helm/resctrl-mon/templates/daemonset.yaml b/deployment/helm/resctrl-mon/templates/daemonset.yaml new file mode 100644 index 000000000..7b3dd4ead --- /dev/null +++ b/deployment/helm/resctrl-mon/templates/daemonset.yaml @@ -0,0 +1,111 @@ +apiVersion: apps/v1 +kind: DaemonSet +metadata: + labels: + {{- include "nri-plugin.labels" . | nindent 4 }} + name: nri-resctrl-mon + namespace: {{ .Release.Namespace }} +spec: + selector: + matchLabels: + {{- include "nri-plugin.selectorLabels" . | nindent 6 }} + template: + metadata: + labels: + {{- include "nri-plugin.labels" . | nindent 8 }} + spec: + {{- with .Values.tolerations }} + tolerations: + {{- toYaml . | nindent 8 }} + {{- end }} + {{- with .Values.affinity }} + affinity: + {{- toYaml . | nindent 8 }} + {{- end }} + hostPID: true + nodeSelector: + kubernetes.io/os: "linux" + {{- with .Values.nodeSelector }} + {{- toYaml . 
| nindent 8 }} + {{- end }} + {{- if .Values.nri.runtime.patchConfig }} + initContainers: + - name: patch-runtime + {{- if (not (or (eq .Values.nri.runtime.config nil) (eq .Values.nri.runtime.config.pluginRegistrationTimeout ""))) }} + args: + - -nri-plugin-registration-timeout + - {{ .Values.nri.runtime.config.pluginRegistrationTimeout }} + - -nri-plugin-request-timeout + - {{ .Values.nri.runtime.config.pluginRequestTimeout }} + {{- end }} + image: {{ .Values.initContainerImage.name }}:{{ .Values.initContainerImage.tag | default .Chart.AppVersion }} + imagePullPolicy: {{ .Values.initContainerImage.pullPolicy }} + volumeMounts: + - name: containerd-config + mountPath: /etc/containerd + - name: crio-config + mountPath: /etc/crio/crio.conf.d + - name: dbus-socket + mountPath: /var/run/dbus/system_bus_socket + securityContext: + privileged: true + {{- end }} + containers: + - name: nri-resctrl-mon + command: + - nri-resctrl-mon + - --idx + - "{{ .Values.nri.plugin.index | int | printf "%02d" }}" + - --config + - /etc/nri/resctrl-mon/config.yaml + - -v + image: {{ .Values.image.name }}:{{ .Values.image.tag | default .Chart.AppVersion }} + imagePullPolicy: {{ .Values.image.pullPolicy }} + resources: + requests: + cpu: {{ .Values.resources.cpu }} + memory: {{ .Values.resources.memory }} + securityContext: + allowPrivilegeEscalation: false + capabilities: + drop: + - ALL + add: + - SYS_ADMIN + - DAC_OVERRIDE + volumeMounts: + - name: resctrl-mon-config-vol + mountPath: /etc/nri/resctrl-mon + - name: nrisockets + mountPath: /var/run/nri + - name: resctrlfs + mountPath: /sys/fs/resctrl + {{- if .Values.podPriorityClassNodeCritical }} + priorityClassName: system-node-critical + {{- end }} + volumes: + - name: resctrl-mon-config-vol + configMap: + name: nri-resctrl-mon-config.default + - name: nrisockets + hostPath: + path: /var/run/nri + type: DirectoryOrCreate + - name: resctrlfs + hostPath: + path: /sys/fs/resctrl + type: Directory + {{- if .Values.nri.runtime.patchConfig 
}} + - name: containerd-config + hostPath: + path: /etc/containerd/ + type: DirectoryOrCreate + - name: crio-config + hostPath: + path: /etc/crio/crio.conf.d/ + type: DirectoryOrCreate + - name: dbus-socket + hostPath: + path: /var/run/dbus/system_bus_socket + type: Socket + {{- end }} diff --git a/deployment/helm/resctrl-mon/values.schema.json b/deployment/helm/resctrl-mon/values.schema.json new file mode 100644 index 000000000..07514a2c3 --- /dev/null +++ b/deployment/helm/resctrl-mon/values.schema.json @@ -0,0 +1,117 @@ +{ + "$schema": "http://json-schema.org/schema#", + "required": [ + "image", + "resources" + ], + "properties": { + "image": { + "type": "object", + "required": [ + "name", + "pullPolicy" + ], + "properties": { + "name": { + "type": "string" + }, + "tag": { + "type": "string" + }, + "pullPolicy": { + "type": "string", + "enum": ["Never", "Always", "IfNotPresent"] + } + } + }, + "initContainerImage": { + "type": "object", + "required": [ + "name", + "pullPolicy" + ], + "properties": { + "name": { + "type": "string" + }, + "tag": { + "type": "string" + }, + "pullPolicy": { + "type": "string", + "enum": ["Never", "Always", "IfNotPresent"] + } + } + }, + "resources": { + "type": "object", + "required": [ + "cpu", + "memory" + ], + "properties": { + "cpu": { + "type": "string" + }, + "memory": { + "type": "string" + } + } + }, + "nri": { + "type": "object", + "required": [ + "plugin", + "runtime" + ], + "properties": { + "plugin": { + "type": "object", + "required": [ + "index" + ], + "properties": { + "index": { + "type": "integer", + "minimum": 0, + "maximum": 99 + } + } + }, + "runtime": { + "type": "object", + "required": [ + "patchConfig" + ], + "properties": { + "patchConfig": { + "type": "boolean" + }, + "config": { + "type": "object", + "required": [ + "pluginRegistrationTimeout", + "pluginRequestTimeout" + ], + "properties": { + "pluginRegistrationTimeout": { + "type": "string", + "$comment": "allowed range is 5-30s", + "pattern": 
"^(([5-9])|([1-2][0-9])|(30))s$" + }, + "pluginRequestTimeout": { + "type": "string", + "$comment": "allowed range is 2-30s", + "pattern": "^(([2-9])|([1-2][0-9])|(30))s$" + } + } + } + } + } + } + }, + "podPriorityClassNodeCritical": { + "type": "boolean" + } + } +} diff --git a/deployment/helm/resctrl-mon/values.yaml b/deployment/helm/resctrl-mon/values.yaml new file mode 100644 index 000000000..58e0915bd --- /dev/null +++ b/deployment/helm/resctrl-mon/values.yaml @@ -0,0 +1,66 @@ +# Default values for resctrl-mon. +# This is a YAML-formatted file. +# Declare variables to be passed into your templates. +--- +image: + name: ghcr.io/containers/nri-plugins/nri-resctrl-mon + # tag, if defined will use the given image tag, otherwise Chart.AppVersion will be used + #tag: unstable + pullPolicy: Always + +resources: + cpu: 10m + memory: 50Mi + +nri: + plugin: + index: 90 + runtime: + patchConfig: false +# config: +# pluginRegistrationTimeout: 5s +# pluginRequestTimeout: 2s + +initContainerImage: + name: ghcr.io/containers/nri-plugins/nri-config-manager + # If not defined Chart.AppVersion will be used + #tag: unstable + pullPolicy: Always + +tolerations: [] +# +# Example: +# +# tolerations: +# - key: "node-role.kubernetes.io/control-plane" +# operator: "Exists" +# effect: "NoSchedule" + +affinity: [] +# +# Example: +# +# affinity: +# nodeAffinity: +# requiredDuringSchedulingIgnoredDuringExecution: +# nodeSelectorTerms: +# - matchExpressions: +# - key: feature.node.kubernetes.io/cpu-rdt.mon +# operator: In +# values: +# - "true" + +nodeSelector: [] +# +# Example: +# +# nodeSelector: +# feature.node.kubernetes.io/cpu-rdt.mon: "true" + +# NRI plugins should be considered as part of the container runtime. +# By default we make them part of the system-node-critical priority +# class. This should mitigate the potential risk of a plugin getting +# evicted under heavy system load. 
It should also ensure that during +# autoscaling enough new nodes are brought up to leave room for the +# plugin on each new node. +podPriorityClassNodeCritical: true diff --git a/docs/deployment/helm/index.md b/docs/deployment/helm/index.md index 183dacfe0..d34d9d84e 100644 --- a/docs/deployment/helm/index.md +++ b/docs/deployment/helm/index.md @@ -52,5 +52,6 @@ template.md memory-qos.md memtierd.md sgx-epc.md +resctrl-mon.md resource-annotator.md ``` diff --git a/docs/deployment/helm/resctrl-mon.md b/docs/deployment/helm/resctrl-mon.md new file mode 100644 index 000000000..8e87b8257 --- /dev/null +++ b/docs/deployment/helm/resctrl-mon.md @@ -0,0 +1,2 @@ +```{include} ../../../deployment/helm/resctrl-mon/README.md +``` diff --git a/docs/index.md b/docs/index.md index d6e4a7a36..dcb17f837 100644 --- a/docs/index.md +++ b/docs/index.md @@ -8,6 +8,7 @@ caption: Contents introduction.md resource-policy/index.rst memory/index.md +monitoring/index.md deployment/index.md contributing.md Project GitHub repository diff --git a/docs/monitoring/index.md b/docs/monitoring/index.md new file mode 100644 index 000000000..33ef9f208 --- /dev/null +++ b/docs/monitoring/index.md @@ -0,0 +1,9 @@ +# Monitoring plugins + +```{toctree} +--- +maxdepth: 2 +caption: Contents +--- +resctrl-mon.md +``` diff --git a/docs/monitoring/resctrl-mon.md b/docs/monitoring/resctrl-mon.md new file mode 100644 index 000000000..09eebf2d6 --- /dev/null +++ b/docs/monitoring/resctrl-mon.md @@ -0,0 +1,160 @@ +# Resctrl-Mon NRI Plugin + +The resctrl-mon NRI plugin creates per-pod resctrl monitoring groups +(`mon_groups`) to support Kepler's passive mode for Application Energy +Telemetry (AET). + +When a container is created, the plugin assigns its init process to a +`mon_group` before the process starts executing. The Linux kernel then +propagates the RMID (Resource Monitoring ID) to all child processes +automatically, eliminating the fork race that affects userspace-based +approaches. 
+ +## How It Works + +1. The container runtime creates a container process (paused). +2. The NRI `PostCreateContainer` hook fires. +3. The plugin creates a `mon_group` named with the pod's UID under + the appropriate resctrl control group. +4. The NRI `StartContainer` hook fires with the container's init PID. +5. The plugin writes the init PID to the `mon_group`'s `tasks` file. + (If the PID is not yet available, `PostStartContainer` retries.) +6. The runtime starts the container. All child processes inherit the RMID. +7. Kepler scans the resctrl filesystem and reads monitoring data. +8. When the last container in a pod stops, the plugin removes the `mon_group`. + +The plugin DaemonSet runs with `hostPID: true` so that it can write +host-namespace PIDs to the resctrl `tasks` file. Without `hostPID`, +the kernel rejects the write with `ESRCH` because the PID does not +exist in the plugin's PID namespace. + +## Mon_Group Naming + +Mon_groups are named with the Kubernetes pod UID: + +``` +/sys/fs/resctrl/[<rdt-class>/]mon_groups/<pod-uid>/ +``` + +This enables Kepler to correlate monitoring data with Kubernetes metadata +by querying the K8s API using the pod UID extracted from the directory name. + +## Plugin Configuration + +Configuration is loaded from a YAML file specified with the `-config` flag +or pushed by the container runtime via NRI. + +```yaml +# Path to the resctrl filesystem. Override for testing. +resctrlPath: /sys/fs/resctrl + +# Namespace filter: only create mon_groups for pods in these namespaces. +# Empty list = all namespaces. +namespaces: [] + +# Pod label selector: only create mon_groups for pods matching these labels. +# Empty = all pods. +labelSelector: {} +``` + +## Coexistence with Allocation Plugins + +If an NRI resource allocation plugin (balloons, topology-aware) is running, +it assigns containers to RDT classes via `SetLinuxRDTClass`.
The resctrl-mon +plugin reads the effective RDT class from the NRI container spec and creates +`mon_groups` under the corresponding control group: + +``` +/sys/fs/resctrl/<rdt-class>/mon_groups/<pod-uid>/ +``` + +The container keeps its CLOSID (allocation) and gets a distinct RMID +(monitoring). If no allocation plugin is active, `mon_groups` are created +under the root resctrl directory. + +## RMID Management + +RMID allocation is delegated entirely to the Linux kernel: + +- **Allocation**: `mkdir` on a `mon_group` directory assigns an RMID. If + none are available, the kernel returns `ENOSPC` and the plugin logs a + warning and skips the pod. +- **Deallocation**: `rmdir` releases the RMID. The kernel handles the + hardware recycling window. + +## Developer's Guide + +### Prerequisites + +- Containerd v1.7+ or CRI-O v1.26+ +- Enable NRI in /etc/containerd/config.toml: + + ```toml + [plugins."io.containerd.nri.v1.nri"] + disable = false + disable_connections = false + plugin_config_path = "/etc/nri/conf.d" + plugin_path = "/opt/nri/plugins" + plugin_registration_timeout = "5s" + plugin_request_timeout = "2s" + socket_path = "/var/run/nri/nri.sock" + ``` + +- Intel CPU with RDT monitoring support +- resctrl filesystem mounted at `/sys/fs/resctrl` + +### Build + +```bash +make PLUGINS=nri-resctrl-mon build-plugins +``` + +### Run + +```bash +./build/bin/nri-resctrl-mon -config sample-configs/nri-resctrl-mon.yaml -idx 90 -vv +``` + +### Manual Test + +Verify that `mon_groups` are created when pods start: + +```bash +# Start a test pod +kubectl run test-pod --image=busybox -- sleep 3600 + +# Check that a mon_group was created with the pod UID +POD_UID=$(kubectl get pod test-pod -o jsonpath='{.metadata.uid}') + +# Without an RDT allocation plugin, mon_groups are under the root class: +MON_GROUP_BASE=/sys/fs/resctrl/mon_groups +# With an allocation plugin that assigns an RDT class (e.g.
BestEffort): +# MON_GROUP_BASE=/sys/fs/resctrl/BestEffort/mon_groups + +ls "$MON_GROUP_BASE/$POD_UID/" + +# Verify monitoring data is available +cat "$MON_GROUP_BASE/$POD_UID/mon_data/mon_L3_00/llc_occupancy" +``` + +### Debug + +```bash +go install github.com/go-delve/delve/cmd/dlv@latest +dlv exec build/bin/nri-resctrl-mon -- -config sample-configs/nri-resctrl-mon.yaml -idx 90 +(dlv) break plugin.PostCreateContainer +(dlv) continue +``` + +### Deploy + +Build an image, import it on the node, and deploy the plugin by +running the following in `nri-plugins`: + +```bash +rm -rf build +make clean +make PLUGINS=nri-resctrl-mon IMAGE_VERSION=devel images +ctr -n k8s.io images import build/images/nri-resctrl-mon-image-*.tar +kubectl create -f build/images/nri-resctrl-mon-deployment.yaml +``` diff --git a/sample-configs/nri-resctrl-mon.yaml b/sample-configs/nri-resctrl-mon.yaml new file mode 100644 index 000000000..9cbd339a0 --- /dev/null +++ b/sample-configs/nri-resctrl-mon.yaml @@ -0,0 +1,3 @@ +resctrlPath: /sys/fs/resctrl +namespaces: [] +labelSelector: {} From f7883b9ec17e70567764db26eb9a081a01db2714 Mon Sep 17 00:00:00 2001 From: "Christopher M. Cantalupo" Date: Thu, 4 Jun 2026 14:21:26 -0700 Subject: [PATCH 2/3] resctrl-mon: read container PID from CRI-O pidfile CRI-O versions up to and including 1.35 do not populate Container.Pid in NRI events (the field was added to containerToNRI in CRI-O main, commit 19d319695, not yet in any release branch). This causes GetPid() to return 0 in PostStartContainer and Synchronize, preventing the plugin from writing PIDs to the resctrl mon_group tasks file. Add a readContainerPID() fallback that reads the init PID from CRI-O's container pidfile at: /run/containers/storage/overlay-containers/<container-id>/userdata/pidfile The fallback is only attempted when GetPid() returns 0, so it is a no-op on runtimes that populate the PID (containerd, future CRI-O). On non-CRI-O runtimes, the pidfile simply doesn't exist and the read returns 0 gracefully.
Signed-off-by: Christopher M. Cantalupo --- cmd/plugins/resctrl-mon/plugin.go | 39 ++++++++++++++++++++++++++++--- 1 file changed, 36 insertions(+), 3 deletions(-) diff --git a/cmd/plugins/resctrl-mon/plugin.go b/cmd/plugins/resctrl-mon/plugin.go index 52b067b27..b90530db5 100644 --- a/cmd/plugins/resctrl-mon/plugin.go +++ b/cmd/plugins/resctrl-mon/plugin.go @@ -19,6 +19,8 @@ import ( "fmt" "os" "path/filepath" + "strconv" + "strings" "sync" "time" @@ -141,6 +143,9 @@ func (p *plugin) Synchronize(ctx context.Context, pods []*api.PodSandbox, contai continue } pid := int(ctr.GetPid()) + if pid == 0 { + pid = readContainerPID(ctr.GetId()) + } if pid > 0 { monGroupDir := p.state.getMonGroupDir(podUID) if err := p.rdt.writeTaskPID(monGroupDir, pid); err != nil { @@ -270,15 +275,23 @@ func (p *plugin) PostStartContainer(ctx context.Context, pod *api.PodSandbox, ct return nil } - // Fallback: write the init PID if StartContainer didn't. + // Fallback: if the NRI event has no PID (CRI-O <= 1.35 does not + // populate Container.Pid), read it from the CRI-O pidfile. 
+ if pid == 0 { + pid = readContainerPID(ctr.GetId()) + if pid > 0 { + log.Debugf("PostStartContainer %s: read pid %d from pidfile", ctrName, pid) + } + } + if pid > 0 { if err := p.rdt.writeTaskPID(monGroupDir, pid); err != nil { log.Warnf("PostStartContainer %s: failed to write PID %d to tasks: %v", ctrName, pid, err) } else { - log.Infof("PostStartContainer %s: fallback assigned pid %d to mon_group %s", ctrName, pid, monGroupDir) + log.Infof("PostStartContainer %s: assigned pid %d to mon_group %s", ctrName, pid, monGroupDir) } } else { - log.Warnf("PostStartContainer %s: PID still 0 after start, unexpected", ctrName) + log.Warnf("PostStartContainer %s: PID not available from NRI or pidfile", ctrName) } return nil @@ -382,3 +395,23 @@ func getRDTClass(ctr *api.Container) string { func pprintCtr(pod *api.PodSandbox, ctr *api.Container) string { return fmt.Sprintf("%s/%s:%s", pod.GetNamespace(), pod.GetName(), ctr.GetName()) } + +// readContainerPID reads the container init PID from the CRI-O pidfile. +// CRI-O versions <= 1.35 do not populate Container.Pid in NRI events. +// As a fallback, we read the PID from the container's pidfile at +// /run/containers/storage/overlay-containers//userdata/pidfile. +// Returns 0 if the PID cannot be read (e.g., running under containerd). +func readContainerPID(containerID string) int { + pidfile := filepath.Join("/run/containers/storage/overlay-containers", containerID, "userdata/pidfile") + data, err := os.ReadFile(pidfile) + if err != nil { + log.Debugf("readContainerPID: cannot read %s: %v", pidfile, err) + return 0 + } + pid, err := strconv.Atoi(strings.TrimSpace(string(data))) + if err != nil { + log.Debugf("readContainerPID: invalid pid in %s: %v", pidfile, err) + return 0 + } + return pid +} From 9c51639f7f759fdb1740eeace1ec6d20de36b06e Mon Sep 17 00:00:00 2001 From: "Christopher M. 
Cantalupo" Date: Fri, 5 Jun 2026 08:41:20 -0700 Subject: [PATCH 3/3] resctrl-mon: address review feedback - Add Kepler project link in documentation (docs/monitoring/resctrl-mon.md) - Make resctrlPath configurable via Helm values.yaml instead of hardcoding in the ConfigMap template - Log warning when removeContainer/podHasNoContainers is called for an untracked pod (defensive corner-case visibility) Signed-off-by: Christopher M. Cantalupo --- cmd/plugins/resctrl-mon/state.go | 4 ++++ deployment/helm/resctrl-mon/templates/configmap.yaml | 2 +- deployment/helm/resctrl-mon/values.schema.json | 8 +++++++- deployment/helm/resctrl-mon/values.yaml | 2 ++ docs/monitoring/resctrl-mon.md | 4 ++-- 5 files changed, 16 insertions(+), 4 deletions(-) diff --git a/cmd/plugins/resctrl-mon/state.go b/cmd/plugins/resctrl-mon/state.go index 47dba1f1a..b3c58d8ab 100644 --- a/cmd/plugins/resctrl-mon/state.go +++ b/cmd/plugins/resctrl-mon/state.go @@ -63,6 +63,8 @@ func (s *podState) removeContainer(podUID, containerID string) { defer s.mu.Unlock() if info, ok := s.pods[podUID]; ok { delete(info.containers, containerID) + } else { + log.Warnf("removeContainer: pod %s not tracked (container %s)", podUID, containerID) } } @@ -84,12 +86,14 @@ func (s *podState) getMonGroupDir(podUID string) string { } // podHasNoContainers returns true if the pod has no remaining containers. +// Returns true for untracked pods since there is nothing to protect. 
func (s *podState) podHasNoContainers(podUID string) bool { s.mu.Lock() defer s.mu.Unlock() if info, ok := s.pods[podUID]; ok { return len(info.containers) == 0 } + log.Warnf("podHasNoContainers: pod %s not tracked, treating as empty", podUID) return true } diff --git a/deployment/helm/resctrl-mon/templates/configmap.yaml b/deployment/helm/resctrl-mon/templates/configmap.yaml index 3ad305503..75562a9bf 100644 --- a/deployment/helm/resctrl-mon/templates/configmap.yaml +++ b/deployment/helm/resctrl-mon/templates/configmap.yaml @@ -7,6 +7,6 @@ metadata: {{- include "nri-plugin.labels" . | nindent 4 }} data: config.yaml: | - resctrlPath: /sys/fs/resctrl + resctrlPath: {{ .Values.resctrlPath }} namespaces: [] labelSelector: {} diff --git a/deployment/helm/resctrl-mon/values.schema.json b/deployment/helm/resctrl-mon/values.schema.json index 07514a2c3..8fdae0d8f 100644 --- a/deployment/helm/resctrl-mon/values.schema.json +++ b/deployment/helm/resctrl-mon/values.schema.json @@ -2,9 +2,15 @@ "$schema": "http://json-schema.org/schema#", "required": [ "image", - "resources" + "resources", + "resctrlPath" ], "properties": { + "resctrlPath": { + "type": "string", + "description": "Mount point of the resctrl filesystem", + "default": "/sys/fs/resctrl" + }, "image": { "type": "object", "required": [ diff --git a/deployment/helm/resctrl-mon/values.yaml b/deployment/helm/resctrl-mon/values.yaml index 58e0915bd..26abf0cd0 100644 --- a/deployment/helm/resctrl-mon/values.yaml +++ b/deployment/helm/resctrl-mon/values.yaml @@ -2,6 +2,8 @@ # This is a YAML-formatted file. # Declare variables to be passed into your templates. 
--- +resctrlPath: /sys/fs/resctrl + image: name: ghcr.io/containers/nri-plugins/nri-resctrl-mon # tag, if defined will use the given image tag, otherwise Chart.AppVersion will be used diff --git a/docs/monitoring/resctrl-mon.md b/docs/monitoring/resctrl-mon.md index 09eebf2d6..7cd6c3198 100644 --- a/docs/monitoring/resctrl-mon.md +++ b/docs/monitoring/resctrl-mon.md @@ -1,8 +1,8 @@ # Resctrl-Mon NRI Plugin The resctrl-mon NRI plugin creates per-pod resctrl monitoring groups -(`mon_groups`) to support Kepler's passive mode for Application Energy -Telemetry (AET). +(`mon_groups`) to support [Kepler](https://sustainable-computing.io/)'s +passive mode for Application Energy Telemetry (AET). When a container is created, the plugin assigns its init process to a `mon_group` before the process starts executing. The Linux kernel then