Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
55 changes: 28 additions & 27 deletions cmd/plugins/resctrl-mon/plugin.go
Original file line number Diff line number Diff line change
Expand Up @@ -74,6 +74,9 @@ func newPlugin() *plugin {
// Configure handles connecting to container runtime's NRI server.
func (p *plugin) Configure(ctx context.Context, config, runtime, version string) (stub.EventMask, error) {
log.Infof("Connected to %s %s...", runtime, version)
if err := checkRuntimeVersion(runtime, version); err != nil {
return 0, err
}
if config != "" {
log.Debugf("loading configuration from NRI server")
if err := p.setConfig([]byte(config)); err != nil {
Expand Down Expand Up @@ -143,9 +146,6 @@ func (p *plugin) Synchronize(ctx context.Context, pods []*api.PodSandbox, contai
continue
}
pid := int(ctr.GetPid())
if pid == 0 {
pid = readContainerPID(ctr.GetId())
}
if pid > 0 {
monGroupDir := p.state.getMonGroupDir(podUID)
if err := p.rdt.writeTaskPID(monGroupDir, pid); err != nil {
Expand Down Expand Up @@ -275,23 +275,14 @@ func (p *plugin) PostStartContainer(ctx context.Context, pod *api.PodSandbox, ct
return nil
}

// Fallback: if the NRI event has no PID (CRI-O <= 1.35 does not
// populate Container.Pid), read it from the CRI-O pidfile.
if pid == 0 {
pid = readContainerPID(ctr.GetId())
if pid > 0 {
log.Debugf("PostStartContainer %s: read pid %d from pidfile", ctrName, pid)
}
}

if pid > 0 {
if err := p.rdt.writeTaskPID(monGroupDir, pid); err != nil {
log.Warnf("PostStartContainer %s: failed to write PID %d to tasks: %v", ctrName, pid, err)
} else {
log.Infof("PostStartContainer %s: assigned pid %d to mon_group %s", ctrName, pid, monGroupDir)
}
} else {
log.Warnf("PostStartContainer %s: PID not available from NRI or pidfile", ctrName)
log.Warnf("PostStartContainer %s: PID=0, cannot assign to mon_group (runtime did not provide PID via NRI)", ctrName)
}

return nil
Expand Down Expand Up @@ -396,22 +387,32 @@ func pprintCtr(pod *api.PodSandbox, ctr *api.Container) string {
return fmt.Sprintf("%s/%s:%s", pod.GetNamespace(), pod.GetName(), ctr.GetName())
}

// readContainerPID reads the container init PID from the CRI-O pidfile.
// CRI-O versions <= 1.35 do not populate Container.Pid in NRI events.
// As a fallback, we read the PID from the container's pidfile at
// /run/containers/storage/overlay-containers/<id>/userdata/pidfile.
// Returns 0 if the PID cannot be read (e.g., running under containerd).
func readContainerPID(containerID string) int {
pidfile := filepath.Join("/run/containers/storage/overlay-containers", containerID, "userdata/pidfile")
data, err := os.ReadFile(pidfile)
// checkRuntimeVersion verifies that the container runtime provides PIDs via NRI.
// CRI-O versions before 1.36 do not populate Container.Pid in NRI events,
// making the plugin unable to assign tasks to monitoring groups.
func checkRuntimeVersion(runtime, version string) error {
if !strings.EqualFold(runtime, "cri-o") {
return nil
}
// Normalize: strip leading "v" and any pre-release/build suffix.
version = strings.TrimPrefix(version, "v")
if idx := strings.IndexAny(version, "-+"); idx != -1 {
version = version[:idx]
}
parts := strings.SplitN(version, ".", 3)
if len(parts) < 2 {
return fmt.Errorf("CRI-O version %q: unable to parse; require >= 1.36 for NRI PID support", version)
}
major, err := strconv.Atoi(parts[0])
if err != nil {
log.Debugf("readContainerPID: cannot read %s: %v", pidfile, err)
return 0
return fmt.Errorf("CRI-O version %q: unable to parse major version: %w", version, err)
}
pid, err := strconv.Atoi(strings.TrimSpace(string(data)))
minor, err := strconv.Atoi(parts[1])
if err != nil {
log.Debugf("readContainerPID: invalid pid in %s: %v", pidfile, err)
return 0
return fmt.Errorf("CRI-O version %q: unable to parse minor version: %w", version, err)
}
if major < 1 || (major == 1 && minor < 36) {
return fmt.Errorf("CRI-O %s does not provide container PIDs via NRI (requires >= 1.36)", version)
}
return pid
return nil
}
37 changes: 37 additions & 0 deletions cmd/plugins/resctrl-mon/plugin_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -367,3 +367,40 @@ func TestStopContainer_RemovesStateOnRmdirFailure(t *testing.T) {
require.NoError(t, err)
assert.Equal(t, 0, p.state.podCount())
}

func TestCheckRuntimeVersion(t *testing.T) {
tests := []struct {
name string
runtime string
version string
wantErr bool
}{
{"containerd any version", "containerd", "2.0.0", false},
{"cri-o 1.36.0", "cri-o", "1.36.0", false},
{"cri-o 1.37.0", "cri-o", "1.37.0", false},
{"cri-o 2.0.0", "cri-o", "2.0.0", false},
{"cri-o 1.35.0 rejected", "cri-o", "1.35.0", true},
{"cri-o 1.35.2 rejected", "cri-o", "1.35.2", true},
{"cri-o 1.31.5 rejected", "cri-o", "1.31.5", true},
{"cri-o 0.99.0 rejected", "cri-o", "0.99.0", true},
{"CRI-O case insensitive", "CRI-O", "1.35.0", true},
{"cri-o no patch", "cri-o", "1.36", false},
{"cri-o unparsable", "cri-o", "latest", true},
{"cri-o v prefix accepted", "cri-o", "v1.36.0", false},
{"cri-o v prefix rejected", "cri-o", "v1.35.0", true},
{"cri-o strips pre-release suffix", "cri-o", "1.36.0-rc1", false},
{"cri-o strips pre-release old version", "cri-o", "1.35.0-beta.1", true},
{"cri-o strips build metadata", "cri-o", "1.36.0+build123", false},
{"cri-o strips v prefix and pre-release", "cri-o", "v1.35.0-alpha.0", true},
}
for _, tt := range tests {
t.Run(tt.name, func(t *testing.T) {
err := checkRuntimeVersion(tt.runtime, tt.version)
if tt.wantErr {
assert.Error(t, err)
} else {
assert.NoError(t, err)
}
})
}
}
6 changes: 4 additions & 2 deletions deployment/helm/resctrl-mon/README.md
Original file line number Diff line number Diff line change
Expand Up @@ -30,8 +30,10 @@ to support Application Energy Telemetry (AET) via Kepler passive mode.
installation.

- CRI-O
- At least [v1.26.0](https://github.com/cri-o/cri-o/releases/tag/v1.26.0)
release version to use the NRI feature
- At least [v1.36.0](https://github.com/cri-o/cri-o/releases/tag/v1.36.0)
release version. CRI-O >= 1.36 is required because earlier versions do
not provide container PIDs via NRI, which prevents the plugin from
assigning tasks to monitoring groups.
- Enable NRI feature by following
[these](https://github.com/cri-o/cri-o/blob/main/docs/crio.conf.5.md#crionri-table)
detailed instructions. You can optionally enable the NRI in CRI-O using
Expand Down
16 changes: 14 additions & 2 deletions docs/monitoring/resctrl-mon.md
Original file line number Diff line number Diff line change
Expand Up @@ -86,8 +86,10 @@ RMID allocation is delegated entirely to the Linux kernel:

### Prerequisites

- Containerd v1.7+ or CRI-O v1.26+
- Enable NRI in /etc/containerd/config.toml:
- Containerd v1.7+ or CRI-O v1.36+
- Enable NRI in the container runtime:

**containerd** — in `/etc/containerd/config.toml`:

```toml
[plugins."io.containerd.nri.v1.nri"]
Expand All @@ -100,6 +102,16 @@ RMID allocation is delegated entirely to the Linux kernel:
socket_path = "/var/run/nri/nri.sock"
```

**CRI-O** — in `/etc/crio/crio.conf.d/10-nri.conf` (or equivalent):

```toml
[crio.nri]
enable_nri = true
```

See the [CRI-O NRI documentation](https://github.com/cri-o/cri-o/blob/main/docs/crio.conf.5.md#crionri-table)
for additional options.

- Intel CPU with RDT monitoring support
- resctrl filesystem mounted at `/sys/fs/resctrl`

Expand Down
Loading