diff --git a/cmd/plugins/balloons/Dockerfile b/cmd/plugins/balloons/Dockerfile index d81fe949c..1c44dc0b2 100644 --- a/cmd/plugins/balloons/Dockerfile +++ b/cmd/plugins/balloons/Dockerfile @@ -19,10 +19,17 @@ RUN --mount=type=cache,target=/go/pkg/mod/ \ GOBIN=/debug-extras/bin go install -tags osusergo,netgo -ldflags "-extldflags=-static" github.com/go-delve/delve/cmd/dlv@latest; \ fi -# Fetch go dependencies in a separate layer for caching +# Fetch go dependencies in a separate layer for caching. +# If go.mod has a replace directive that points to an absolute local +# path, skip the download; the build then relies on vendor/ instead. COPY go.mod go.sum . COPY pkg/topology/ pkg/topology/ -RUN --mount=type=cache,target=/go/pkg/mod/ go mod download +RUN --mount=type=cache,target=/go/pkg/mod/ \ + if grep -q '^replace .* => /' go.mod 2>/dev/null; then \ + echo "go.mod contains local replace; will rely on vendor/"; \ + else \ + go mod download; \ + fi # Build nri-resource-policy COPY . . diff --git a/cmd/plugins/balloons/policy/balloons-policy.go b/cmd/plugins/balloons/policy/balloons-policy.go index a2f31323f..db8e61372 100644 --- a/cmd/plugins/balloons/policy/balloons-policy.go +++ b/cmd/plugins/balloons/policy/balloons-policy.go @@ -28,7 +28,7 @@ import ( "github.com/containers/nri-plugins/pkg/kubernetes" logger "github.com/containers/nri-plugins/pkg/log" "github.com/containers/nri-plugins/pkg/resmgr/cache" - cpucontrol "github.com/containers/nri-plugins/pkg/resmgr/control/cpu" + "github.com/containers/nri-plugins/pkg/resmgr/cpuclass" "github.com/containers/nri-plugins/pkg/resmgr/events" libmem "github.com/containers/nri-plugins/pkg/resmgr/lib/memory" policy "github.com/containers/nri-plugins/pkg/resmgr/policy" @@ -91,6 +91,7 @@ type balloons struct { cpuAllocator cpuallocator.CPUAllocator // CPU allocator used by the policy memAllocator *libmem.Allocator // memory allocator used by the policy + cpuClasses *cpuclass.Handler // CPU class handler (cpufreq + PCT internals) 
loadVirtDev map[string]*loadClassVirtDev // map LoadClasses to virtual devices } @@ -252,6 +253,7 @@ func (p *balloons) Start() error { func (p *balloons) Sync(add []cache.Container, del []cache.Container) error { p.BlockMeters() defer p.UnblockMeters() + defer p.commitCpuClasses() log.Debugf("synchronizing state...") for _, c := range del { @@ -275,6 +277,7 @@ func (p *balloons) Sync(add []cache.Container, del []cache.Container) error { func (p *balloons) AllocateResources(c cache.Container) error { p.BlockMeters() defer p.UnblockMeters() + defer p.commitCpuClasses() if c.PreserveCpuResources() { log.Infof("not handling resources of container %s, preserving CPUs %q and memory %q", c.PrettyName(), c.GetCpusetCpus(), c.GetCpusetMems()) @@ -328,6 +331,7 @@ func (p *balloons) AllocateResources(c cache.Container) error { func (p *balloons) ReleaseResources(c cache.Container) error { p.BlockMeters() defer p.UnblockMeters() + defer p.commitCpuClasses() log.Debugf("releasing container %s...", c.PrettyName()) if bln := p.balloonByContainer(c); bln != nil { @@ -361,6 +365,7 @@ func (p *balloons) ReleaseResources(c cache.Container) error { func (p *balloons) UpdateResources(c cache.Container) error { p.BlockMeters() defer p.UnblockMeters() + defer p.commitCpuClasses() log.Debugf("(not) updating container %s...", c.PrettyName()) return nil @@ -573,6 +578,40 @@ func (p *balloons) GetTopologyZones() []*policy.TopologyZone { return zones } +// GetExtendedResources returns the node-level extended resources +// the balloons policy publishes for the local Node. 
+func (p *balloons) GetExtendedResources() map[string]int64 { + out := map[string]int64{} + if p.cpuClasses == nil || !p.cpuClasses.PctActive() { + return out + } + if p.bpoptions == nil { + return out + } + for _, cc := range p.bpoptions.CPUClasses { + if cc == nil || !cc.PublishExtendedResource { + continue + } + if cc.PctPriority == "" && cc.PctClosID == nil { + log.Warnf("ignoring publishExtendedResource on non-PCT cpuClass %q", cc.Name) + continue + } + held := cpuset.New() + for _, bln := range p.balloons { + if p.resolveCpuClassName(bln.Def.CpuClass) == cc.Name { + continue + } + held = held.Union(bln.Cpus) + } + free := p.cpuClasses.PctFreeClassCapacity(cc.Name, held) + if free < 0 { + free = 0 + } + out["cpuclass.balloons.nri.io/"+cc.Name] = int64(free) + } + return out +} + // balloonByContainer returns a balloon that contains a container. func (p *balloons) balloonByContainer(c cache.Container) *Balloon { podID := c.GetPodID() @@ -788,83 +827,104 @@ func largest(sliceLen int, valueOf func(i int) int) ([]int, int) { return largestIndices, largestValue } +// defaultCpuClassName is the name of the implicit "default" +// CPU class. When IdleCpuClass or a balloon type's CpuClass is left +// unset and a class with this name is configured, that class is used +// as the implicit fallback. This balloons-specific convention is kept +// out of the policy-neutral cpuclass package and applied here. +const defaultCpuClassName = "default" + +// resolveCpuClassName substitutes the configured "default" CPU class +// for an empty name when such a class exists. Non-empty names are +// returned unchanged. +func (p *balloons) resolveCpuClassName(name string) string { + if name != "" { + return name + } + for _, cc := range p.bpoptions.CPUClasses { + if cc.Name == defaultCpuClassName { + return defaultCpuClassName + } + } + return name +} + // resetCpuClass resets CPU configurations globally. All balloons can // be ignored, their CPU configurations will be applied later. 
func (p *balloons) resetCpuClass() error { - // Usual inputs: - // - p.allowed (cpuset.CPUset): all CPUs available for this - // policy. - // - p.IdleCpuClass (string): CPU class for allowed CPUs. - // - // Other inputs, if needed: - // - p.reserved (cpuset.CPUset): CPUs of ReservedResources - // (typically for kube-system containers). - // - // Note: p.useCpuClass(balloon) will be called before assigning - // containers on the balloon, including the reserved balloon. - // - // TODO: don't depend on cpu controller directly - if err := cpucontrol.Assign(p.cch, p.bpoptions.IdleCpuClass, p.allowed.UnsortedList()...); err != nil { + // p.useCpuClass(balloon) will be called later for every balloon, + // including the reserved balloon, to set the per-balloon CPU + // class. Here we only assign the idle class to all allowed CPUs. + if p.cpuClasses == nil { + return nil + } + idle := p.resolveCpuClassName(p.bpoptions.IdleCpuClass) + if err := p.cpuClasses.UseClass(idle, p.allowed); err != nil { log.Warnf("failed to reset class of available cpus: %v", err) } else { - log.Debugf("reset class of available cpus: %q (reserved: %q)", p.allowed, p.reserved) + log.Debugf("reset class of available cpus: %q to idle class %q (reserved: %q)", + p.allowed, idle, p.reserved) } return nil } -// useCpuClass configures CPUs of a balloon. +// commitCpuClasses flushes any pending cpufreq, cpuidle and uncore +// sysfs writes accumulated by previous UseClass / Configure calls +// since the last commit. Called from the deferred path of the +// public balloons lifecycle entry points so multiple class +// reassignments within one NRI request batch coalesce into a +// minimal set of writes. +func (p *balloons) commitCpuClasses() { + if p.cpuClasses == nil { + return + } + if err := p.cpuClasses.Commit(); err != nil { + log.Warnf("cpu class commit produced an error: %v", err) + } +} + +// useCpuClass configures CPUs of a balloon by delegating to the CPU +// class handler. 
func (p *balloons) useCpuClass(bln *Balloon) error { - // Usual inputs: - // - CPUs that cpuallocator has reserved for this balloon: - // bln.Cpus (cpuset.CPUSet). - // - User-defined CPU configuration for CPUs of balloon of this type: - // bln.Def.CpuClass (string). - // - Current configuration(?): feel free to add data - // structure for this. For instance policy-global p.cpuConfs, - // or balloon-local bln.cpuConfs. - // - // Other input examples, if needed: - // - Requested CPU resources by all containers in the balloon: - // p.requestedMilliCpus(bln). - // - Free CPU resources in the balloon: p.freeMilliCpus(bln). - // - Number of assigned containers: bln.ContainerCount(). - // - Container details: access p.cch with bln.ContainerIDs(). - // - User-defined CPU AllocatorPriority: bln.Def.AllocatorPriority. - // - All existing balloon instances: p.balloons. - // - CPU configurations by user: bln.Def.CpuClass (for bln in p.balloons) if len(bln.components) > 0 { - // If this is a composite balloon, CPU class is - // defined in the component balloons. - log.Debugf("apply CPU class %q on CPUs %s of composite balloon %q", - bln.Def.CpuClass, bln.Cpus, bln.PrettyName()) + // Composite balloon: each component carries its own CpuClass. 
+ log.Debugf("apply CPU classes of components of composite balloon %q on CPUs %s", + bln.PrettyName(), bln.Cpus) for _, compBln := range bln.components { if err := p.useCpuClass(compBln); err != nil { log.Warnf("failed to apply CPU class %q on CPUs %s of %q in composite balloon %q: %v", compBln.Def.CpuClass, compBln.Cpus, compBln.PrettyName(), bln.PrettyName(), err) } - } return nil } - if err := cpucontrol.Assign(p.cch, bln.Def.CpuClass, bln.Cpus.UnsortedList()...); err != nil { - log.Warnf("failed to apply class %q on CPUs %q: %v", bln.Def.CpuClass, bln.Cpus, err) - } else { - log.Debugf("apply CPU class %q on CPUs %q of %q", bln.Def.CpuClass, bln.Cpus, bln.PrettyName()) + if p.cpuClasses == nil { + return nil + } + cpuClass := p.resolveCpuClassName(bln.Def.CpuClass) + log.Debugf("apply CPU class %q on CPUs %q of %q", cpuClass, bln.Cpus, bln.PrettyName()) + if err := p.cpuClasses.UseClass(cpuClass, bln.Cpus); err != nil { + log.Warnf("failed to apply class %q on CPUs %q: %v", cpuClass, bln.Cpus, err) } return nil } // forgetCpuClass is called when CPUs of a balloon are released from duty. +// It reassigns those CPUs to the configured idle class - the handler +// has no separate "forget" concept; every CPU is always in some class. func (p *balloons) forgetCpuClass(bln *Balloon) { - // Use p.IdleCpuClass for bln.Cpus. 
- // Usual inputs: see useCpuClass - if err := cpucontrol.Assign(p.cch, p.bpoptions.IdleCpuClass, bln.Cpus.UnsortedList()...); err != nil { - log.Warnf("failed to forget class %q of cpus %q: %v", bln.Def.CpuClass, bln.Cpus, err) + if p.cpuClasses == nil { + return + } + idle := p.resolveCpuClassName(p.bpoptions.IdleCpuClass) + if err := p.cpuClasses.UseClass(idle, bln.Cpus); err != nil { + log.Warnf("failed to forget class of cpus %q (idle class %q): %v", bln.Cpus, idle, err) } else { if len(bln.components) > 0 { - log.Debugf("forget classes of composite balloon %q cpus %q", bln.Def.Name, bln.Cpus) + log.Debugf("forget classes of composite balloon %q cpus %q (idle class %q)", + bln.Def.Name, bln.Cpus, idle) } else { - log.Debugf("forget class %q of cpus %q", bln.Def.CpuClass, bln.Cpus) + log.Debugf("forget class of cpus %q (idle class %q)", bln.Cpus, idle) } } } @@ -1063,8 +1123,8 @@ func (p *balloons) newBalloon(blnDef *BalloonDef, confCpus bool) (*Balloon, erro allocatorOptions := cpuTreeAllocatorOptions{ topologyBalancing: p.bpoptions.AllocatorTopologyBalancing, preferSpreadOnPhysicalCores: p.bpoptions.PreferSpreadOnPhysicalCores, - preferCloseToDevices: blnDef.PreferCloseToDevices, - preferFarFromDevices: blnDef.PreferFarFromDevices, + preferCloseToDevices: append([]string(nil), blnDef.PreferCloseToDevices...), + preferFarFromDevices: append([]string(nil), blnDef.PreferFarFromDevices...), virtDevCpusets: map[string][]cpuset.CPUSet{ virtDevReservedCpus: {p.reserved}, virtDevIsolatedCpus: {p.options.System.Isolated()}, @@ -1072,6 +1132,7 @@ func (p *balloons) newBalloon(blnDef *BalloonDef, confCpus bool) (*Balloon, erro virtDevPCores: {p.cpuAllocator.GetCPUPriorities()[cpuallocator.PriorityHigh]}, }, } + p.applyCpuClassHints(&allocatorOptions, p.resolveCpuClassName(blnDef.CpuClass), cpuset.New(), 0) if blnDef.AllocatorTopologyBalancing != nil { allocatorOptions.topologyBalancing = *blnDef.AllocatorTopologyBalancing } @@ -1399,14 +1460,28 @@ func 
changesBalloons(opts0, opts1 *BalloonsOptions) bool { } o0 := opts0.DeepCopy() o1 := opts1.DeepCopy() - // Ignore differences in CPU class names. Every other change - // potentially changes balloons or workloads. + // Ignore differences in BalloonsOptions that do not affect + // CPU-to-balloon or container-to-balloon mapping. Such + // differences include: + // + // 1. CPUClass related parameters o0.IdleCpuClass = "" o1.IdleCpuClass = "" + o0.TurboDomain = "" + o1.TurboDomain = "" + o0.CPUClasses = nil + o1.CPUClasses = nil for i := range o0.BalloonDefs { o0.BalloonDefs[i].CpuClass = "" o1.BalloonDefs[i].CpuClass = "" } + // 2. Schedulingpolicy parameters + o0.SchedulingClasses = nil + o1.SchedulingClasses = nil + for i := range o0.BalloonDefs { + o0.BalloonDefs[i].SchedulingClass = "" + o1.BalloonDefs[i].SchedulingClass = "" + } return utils.DumpJSON(o0) != utils.DumpJSON(o1) } @@ -1424,6 +1499,9 @@ func changesCpuClasses(opts0, opts1 *BalloonsOptions) bool { if opts0.IdleCpuClass != opts1.IdleCpuClass { return true } + if opts0.TurboDomain != opts1.TurboDomain { + return true + } if len(opts0.BalloonDefs) != len(opts1.BalloonDefs) { return true } @@ -1432,12 +1510,20 @@ func changesCpuClasses(opts0, opts1 *BalloonsOptions) bool { return true } } + // Detect changes in CPUClasses definitions (turbo attributes, frequencies, etc.) + if len(opts0.CPUClasses) != len(opts1.CPUClasses) { + return true + } + if utils.DumpJSON(opts0.CPUClasses) != utils.DumpJSON(opts1.CPUClasses) { + return true + } return false } func (p *balloons) Reconfigure(newCfg interface{}) error { p.BlockMeters() defer p.UnblockMeters() + defer p.commitCpuClasses() balloonsOptions, ok := newCfg.(*BalloonsOptions) if !ok { @@ -1454,6 +1540,19 @@ func (p *balloons) Reconfigure(newCfg interface{}) error { log.Infof("no configuration changes") } else { log.Infof("configuration changes only on CPU classes") + // Update CPUClasses definitions. 
+ p.bpoptions.CPUClasses = newBalloonsOptions.CPUClasses + p.bpoptions.IdleCpuClass = newBalloonsOptions.IdleCpuClass + p.bpoptions.TurboDomain = newBalloonsOptions.TurboDomain + if p.cpuClasses != nil { + if err := p.cpuClasses.Configure(cpuclass.ConfigSpec{ + Classes: p.bpoptions.CPUClasses, + TurboDomain: p.bpoptions.TurboDomain, + Allowed: p.allowed, + }); err != nil { + log.Warnf("failed to reconfigure CPU class handler: %v", err) + } + } // Update new CPU classes to existing balloon // definitions. The same BalloonDef instances // must be kept in use, because each Balloon @@ -1600,6 +1699,95 @@ func (p *balloons) validateConfig(bpoptions *BalloonsOptions) error { if len(undefinedSchedulingClasses) > 0 { return balloonsError("schedulingClass(es) defined in balloonTypes but missing from schedulingClasses: %v", undefinedSchedulingClasses) } + // Validate CPUClasses. + cpuClassNames := map[string]struct{}{} + pctManaged := map[string]string{} // class name -> "high"/"low" + pctAssocOnly := map[string]int{} // class name -> CLOS id + for _, cc := range bpoptions.CPUClasses { + if cc.Name == "" { + return balloonsError("missing or empty name in a cpuClasses entry") + } + if _, dup := cpuClassNames[cc.Name]; dup { + return balloonsError("duplicate cpuClasses name: %q", cc.Name) + } + cpuClassNames[cc.Name] = struct{}{} + // Validate PCT fields. 
+ if cc.PctPriority != "" && cc.PctClosID != nil { + return balloonsError("cpuClass %q: pctPriority and pctClosID are mutually exclusive", cc.Name) + } + switch cc.PctPriority { + case "", "high", "low": + default: + return balloonsError("cpuClass %q: invalid pctPriority %q (allowed: \"high\", \"low\")", cc.Name, cc.PctPriority) + } + if cc.PctPriority != "" { + pctManaged[cc.Name] = cc.PctPriority + } + if cc.PctClosID != nil { + if *cc.PctClosID < 0 { + return balloonsError("cpuClass %q: pctClosID must be >= 0, got %d", cc.Name, *cc.PctClosID) + } + pctAssocOnly[cc.Name] = *cc.PctClosID + } + // pctMinFreq/pctMaxFreq only take effect in managed + // mode (pctPriority); they program the SST CLOS that + // balloons owns. With pctClosID the CLOS is + // pre-programmed by intel-speed-select/BIOS, and + // without any PCT field the cpuClass is not a PCT + // class at all. In both cases these fields are silent + // no-ops; reject them so users don't tweak values that + // have no effect. + if cc.PctMinFreq != 0 || cc.PctMaxFreq != 0 { + switch { + case cc.PctClosID != nil: + return balloonsError("cpuClass %q: pctMinFreq/pctMaxFreq require pctPriority (managed mode); they are incompatible with pctClosID, where the SST CLOS is pre-programmed by intel-speed-select/BIOS", cc.Name) + case cc.PctPriority == "": + return balloonsError("cpuClass %q: pctMinFreq/pctMaxFreq require pctPriority (managed mode); the cpuClass is currently not a PCT class", cc.Name) + } + } + // publishExtendedResource only makes sense for PCT + // classes -- the agent computes capacity from a PCT + // plan. Reject it on non-PCT classes so users don't + // expect a node-level resource that will never be + // published. 
+ if cc.PublishExtendedResource && cc.PctPriority == "" && cc.PctClosID == nil { + return balloonsError("cpuClass %q: publishExtendedResource requires the cpuClass to be a PCT class (set pctPriority or pctClosID)", cc.Name) + } + } + if len(pctManaged) > 0 && len(pctAssocOnly) > 0 { + return balloonsError("mixing managed (pctPriority) and assoc-only (pctClosID) PCT cpuClasses is not allowed: managed=%v, assocOnly=%v", pctManaged, pctAssocOnly) + } + if len(pctManaged) > 0 { + hpClasses, lpClasses := []string{}, []string{} + for name, prio := range pctManaged { + if prio == "high" { + hpClasses = append(hpClasses, name) + } else { + lpClasses = append(lpClasses, name) + } + } + if len(hpClasses) > 1 { + return balloonsError("at most one managed PCT cpuClass with pctPriority=high allowed, got %d: %v", len(hpClasses), hpClasses) + } + if len(lpClasses) > 1 { + return balloonsError("at most one managed PCT cpuClass with pctPriority=low allowed, got %d: %v", len(lpClasses), lpClasses) + } + } + // Verify that cpuClass references in balloon types are + // defined in cpuClasses. Using the legacy control.cpu.classes + // configuration is discouraged and it is possibly out-of-date + // at this point because resource-manager starts controllers + // only after policies. + for _, blnDef := range bpoptions.BalloonDefs { + if blnDef.CpuClass == "" { + continue + } + _, inCPUClasses := cpuClassNames[blnDef.CpuClass] + if !inCPUClasses { + log.Warnf("cpuClass %q referenced by balloon type %q is not defined in cpuClasses", + blnDef.CpuClass, blnDef.Name) + } + } var circularCheck func(name string, seen map[string]int) error circularCheck = func(name string, seen map[string]int) error { if seen[name] > 0 { @@ -1671,6 +1859,20 @@ func (p *balloons) setConfig(bpoptions *BalloonsOptions) error { setOmittedDefaults(bpoptions) + // Set bpoptions early so the turbo allocator construction below + // has access to CPUClasses. 
+ p.bpoptions = bpoptions + + // Construct the CPU class handler that fronts both cpufreq and + // PCT internals. + if p.cpuClasses == nil { + h, err := cpuclass.New(p.options.System) + if err != nil { + return balloonsError("failed to create CPU class handler: %w", err) + } + p.cpuClasses = h + } + reservedBalloonDef, defaultBalloonDef, err := p.fillBuiltinBalloonDefs(bpoptions) if err != nil { return err @@ -1678,6 +1880,16 @@ func (p *balloons) setConfig(bpoptions *BalloonsOptions) error { if err = p.validateConfig(bpoptions); err != nil { return balloonsError("invalid configuration: %w", err) } + // Configure the CPU class handler. Done after validation so we + // don't program platform state (e.g. SST CLOSes) if the + // user-facing config is malformed. + if err := p.cpuClasses.Configure(cpuclass.ConfigSpec{ + Classes: bpoptions.CPUClasses, + TurboDomain: bpoptions.TurboDomain, + Allowed: p.allowed, + }); err != nil { + return balloonsError("failed to configure CPU class handler: %w", err) + } p.fillLoadVirtDevices(bpoptions.LoadClasses) p.fillCloseToDevices(bpoptions.BalloonDefs) p.fillFarFromDevices(bpoptions.BalloonDefs) @@ -1873,6 +2085,95 @@ func (p *balloons) fillCloseToDevices(blnDefs []*BalloonDef) { } } +// cpuClassHintDevPrefix is the prefix used for synthetic virtual +// device names that carry cpuClass placement hints. All such entries +// are owned exclusively by applyCpuClassHints and are discarded on +// every new allocation round, because hints are only valid for the +// allocation they were requested for. +const cpuClassHintDevPrefix = "__cls_" + +// applyCpuClassHints queries the CPU class handler for placement +// hints for an upcoming allocation under cpuClass and merges them +// into opts as synthetic virtual devices. The names start with the +// reserved cpuClassHintDevPrefix so they cannot collide with +// user-configured device names. 
+// +// Any stale cpuClass hints left in opts from a previous allocation +// round are removed first: hints reflect the cpuClass handler's +// view at one specific moment and must not accumulate across +// resize cycles. +// +// - opts: allocator options to extend in place. +// - cpuClass: the cpuClass that the upcoming allocation will use. +// - currentCpus: CPUs the balloon already owns (excluded from HP +// room accounting in PCT hints). +// - requestedCount: number of CPUs the upcoming allocation wants. +// Pass 0 when unknown (e.g. balloon creation before sizing). +func (p *balloons) applyCpuClassHints(opts *cpuTreeAllocatorOptions, cpuClass string, currentCpus cpuset.CPUSet, requestedCount int) { + if p.cpuClasses == nil || opts == nil { + return + } + mergeCpuClassHints(opts, p.cpuClasses, cpuclass.AllocationIntent{ + ClassName: cpuClass, + CurrentCpus: currentCpus, + FreeCpus: p.freeCpus, + RequestedCount: requestedCount, + }) +} + +// cpuClassHints is the minimum surface of cpuclass.Handler that +// policy code relies on for placement hints. It exists so tests +// can substitute a fake provider. +type cpuClassHints interface { + Hints(cpuclass.AllocationIntent) cpuclass.AllocationHints +} + +// mergeCpuClassHints queries provider for placement hints described +// by intent and merges them into opts. It first removes any cpuClass +// hint entries left in opts from a previous allocation round so +// hints from this round are the only ones in effect. 
+func mergeCpuClassHints(opts *cpuTreeAllocatorOptions, provider cpuClassHints, intent cpuclass.AllocationIntent) { + if opts == nil || provider == nil { + return + } + if opts.virtDevCpusets == nil { + opts.virtDevCpusets = map[string][]cpuset.CPUSet{} + } + opts.preferCloseToDevices = filterOutHintDevs(opts.preferCloseToDevices) + opts.preferFarFromDevices = filterOutHintDevs(opts.preferFarFromDevices) + for name := range opts.virtDevCpusets { + if strings.HasPrefix(name, cpuClassHintDevPrefix) { + delete(opts.virtDevCpusets, name) + } + } + hints := provider.Hints(intent) + for i, pref := range hints.Prefer { + name := fmt.Sprintf("%spref_%d_%s", cpuClassHintDevPrefix, i, pref.Name) + opts.virtDevCpusets[name] = []cpuset.CPUSet{pref.Cpus} + opts.preferCloseToDevices = append(opts.preferCloseToDevices, name) + log.Debugf("cpuclass hint: prefer %q -> %s", name, pref.Cpus) + } + for i, av := range hints.Avoid { + name := fmt.Sprintf("%savoid_%d_%s", cpuClassHintDevPrefix, i, av.Name) + opts.virtDevCpusets[name] = []cpuset.CPUSet{av.Cpus} + opts.preferFarFromDevices = append(opts.preferFarFromDevices, name) + log.Debugf("cpuclass hint: avoid %q -> %s", name, av.Cpus) + } +} + +// filterOutHintDevs returns devs with all cpuClass hint device names +// (those carrying cpuClassHintDevPrefix) removed. The returned slice +// reuses devs' backing array. +func filterOutHintDevs(devs []string) []string { + out := devs[:0] + for _, d := range devs { + if !strings.HasPrefix(d, cpuClassHintDevPrefix) { + out = append(out, d) + } + } + return out +} + // fillFarFromDevices adds BalloonDefs implicit device anti-affinities // towards devices that other BalloonDefs prefer to be close to. 
func (p *balloons) fillFarFromDevices(blnDefs []*BalloonDef) { @@ -2010,6 +2311,7 @@ func (p *balloons) resizeBalloon(bln *Balloon, newMilliCpus int) error { } }() p.updateLoadedVirtDevsInAllocatorOptions(&bln.cpuTreeAlloc.options, bln.Def.Loads) + p.applyCpuClassHints(&bln.cpuTreeAlloc.options, p.resolveCpuClassName(bln.Def.CpuClass), bln.Cpus, cpuCountDelta) if cpuCountDelta > 0 { // Inflate the balloon. addFromCpus, _, err := bln.cpuTreeAlloc.ResizeCpus(bln.Cpus, p.freeCpus, cpuCountDelta) diff --git a/cmd/plugins/balloons/policy/cpuclass_test.go b/cmd/plugins/balloons/policy/cpuclass_test.go new file mode 100644 index 000000000..c2aa63345 --- /dev/null +++ b/cmd/plugins/balloons/policy/cpuclass_test.go @@ -0,0 +1,175 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package balloons + +import ( + "strings" + "testing" + + "github.com/containers/nri-plugins/pkg/resmgr/cpuclass" + "github.com/containers/nri-plugins/pkg/utils/cpuset" +) + +// fakeHintProvider returns a scripted sequence of cpuclass.AllocationHints, +// one per Hints() call. After the script is exhausted it keeps +// returning the last entry. 
+type fakeHintProvider struct { + script []cpuclass.AllocationHints + calls int +} + +func (f *fakeHintProvider) Hints(_ cpuclass.AllocationIntent) cpuclass.AllocationHints { + i := f.calls + if i >= len(f.script) { + i = len(f.script) - 1 + } + f.calls++ + return f.script[i] +} + +func countHintDevs(devs []string) int { + n := 0 + for _, d := range devs { + if strings.HasPrefix(d, cpuClassHintDevPrefix) { + n++ + } + } + return n +} + +func countHintMapKeys(m map[string][]cpuset.CPUSet) int { + n := 0 + for k := range m { + if strings.HasPrefix(k, cpuClassHintDevPrefix) { + n++ + } + } + return n +} + +// TestMergeCpuClassHintsNoAccumulation verifies that repeated +// allocation rounds do not cause cpuClass hint entries to accumulate +// in cpuTreeAllocatorOptions. Each round must leave behind exactly +// the hint count reported by the provider on that round, regardless +// of how many earlier rounds added different hints. +func TestMergeCpuClassHintsNoAccumulation(t *testing.T) { + cpusA := cpuset.MustParse("2-3") + cpusB := cpuset.MustParse("4-5") + cpusC := cpuset.MustParse("6-7") + cpusAvoid := cpuset.MustParse("0-1") + + provider := &fakeHintProvider{ + script: []cpuclass.AllocationHints{ + // Round 1: one prefer (A), one avoid. + { + Prefer: []cpuclass.CpuPreference{{Name: "hp-reserve", Cpus: cpusA}}, + Avoid: []cpuclass.CpuPreference{{Name: "lp-clos", Cpus: cpusAvoid}}, + }, + // Round 2: two prefers (A, B) - different name at index 1 + // so the slot-0 name stays stable, slot-1 is new. + { + Prefer: []cpuclass.CpuPreference{ + {Name: "hp-reserve", Cpus: cpusA}, + {Name: "extra", Cpus: cpusB}, + }, + Avoid: []cpuclass.CpuPreference{{Name: "lp-clos", Cpus: cpusAvoid}}, + }, + // Round 3: name at slot 0 CHANGES to C - without proper + // cleanup the stale "__cls_pref_0_hp-reserve" map key from + // rounds 1+2 would survive into round 3. 
+ { + Prefer: []cpuclass.CpuPreference{{Name: "third", Cpus: cpusC}}, + Avoid: nil, + }, + }, + } + + opts := &cpuTreeAllocatorOptions{ + preferCloseToDevices: []string{"user-dev-A", "user-dev-B"}, + preferFarFromDevices: []string{"user-far"}, + virtDevCpusets: map[string][]cpuset.CPUSet{}, + } + + for round := 1; round <= 3; round++ { + mergeCpuClassHints(opts, provider, cpuclass.AllocationIntent{}) + + gotPrefDevs := countHintDevs(opts.preferCloseToDevices) + gotFarDevs := countHintDevs(opts.preferFarFromDevices) + gotMapKeys := countHintMapKeys(opts.virtDevCpusets) + + var expPref, expFar int + switch round { + case 1: + expPref, expFar = 1, 1 + case 2: + expPref, expFar = 2, 1 + case 3: + expPref, expFar = 1, 0 + } + if gotPrefDevs != expPref { + t.Errorf("round %d: preferCloseToDevices hint count = %d, want %d (slice=%v)", + round, gotPrefDevs, expPref, opts.preferCloseToDevices) + } + if gotFarDevs != expFar { + t.Errorf("round %d: preferFarFromDevices hint count = %d, want %d (slice=%v)", + round, gotFarDevs, expFar, opts.preferFarFromDevices) + } + if gotMapKeys != expPref+expFar { + t.Errorf("round %d: virtDevCpusets hint key count = %d, want %d (keys=%v)", + round, gotMapKeys, expPref+expFar, mapKeys(opts.virtDevCpusets)) + } + } + + // Sanity: user-supplied (non-hint) devices must survive untouched. 
+ if got := userDevs(opts.preferCloseToDevices); len(got) != 2 || + got[0] != "user-dev-A" || got[1] != "user-dev-B" { + t.Errorf("user preferCloseToDevices were modified: got %v", got) + } + if got := userDevs(opts.preferFarFromDevices); len(got) != 1 || got[0] != "user-far" { + t.Errorf("user preferFarFromDevices were modified: got %v", got) + } +} + +func TestFilterOutHintDevs(t *testing.T) { + in := []string{"a", "__cls_pref_0_x", "b", "__cls_avoid_0_y", "c"} + got := filterOutHintDevs(in) + want := []string{"a", "b", "c"} + if len(got) != len(want) { + t.Fatalf("filterOutHintDevs len=%d, want %d: got=%v", len(got), len(want), got) + } + for i := range want { + if got[i] != want[i] { + t.Errorf("filterOutHintDevs[%d] = %q, want %q", i, got[i], want[i]) + } + } +} + +func userDevs(devs []string) []string { + out := []string{} + for _, d := range devs { + if !strings.HasPrefix(d, cpuClassHintDevPrefix) { + out = append(out, d) + } + } + return out +} + +func mapKeys(m map[string][]cpuset.CPUSet) []string { + out := make([]string, 0, len(m)) + for k := range m { + out = append(out, k) + } + return out +} diff --git a/cmd/plugins/balloons/policy/flags.go b/cmd/plugins/balloons/policy/flags.go index 19889ea15..e39809ca3 100644 --- a/cmd/plugins/balloons/policy/flags.go +++ b/cmd/plugins/balloons/policy/flags.go @@ -24,6 +24,8 @@ type ( BalloonDef = cfgapi.BalloonDef LoadClass = cfgapi.LoadClass SchedulingClass = cfgapi.SchedulingClass + CPUClass = cfgapi.CPUClass + Frequency = cfgapi.Frequency CPUTopologyLevel = cfgapi.CPUTopologyLevel ) diff --git a/cmd/plugins/template/policy/template-policy.go b/cmd/plugins/template/policy/template-policy.go index 4a31092b9..cb9ede8c1 100644 --- a/cmd/plugins/template/policy/template-policy.go +++ b/cmd/plugins/template/policy/template-policy.go @@ -119,6 +119,12 @@ func (p *policy) GetTopologyZones() []*policyapi.TopologyZone { return nil } +// GetExtendedResources returns the node-level extended resources +// to publish for 
this policy. The template policy publishes none. +func (p *policy) GetExtendedResources() map[string]int64 { + return nil +} + // ExportResourceData provides resource data to export for the container. func (p *policy) ExportResourceData(c cache.Container) map[string]string { return nil diff --git a/cmd/plugins/topology-aware/policy/topology-aware-policy.go b/cmd/plugins/topology-aware/policy/topology-aware-policy.go index becdac4fb..65616bc62 100644 --- a/cmd/plugins/topology-aware/policy/topology-aware-policy.go +++ b/cmd/plugins/topology-aware/policy/topology-aware-policy.go @@ -395,6 +395,12 @@ func (p *policy) GetTopologyZones() []*policyapi.TopologyZone { return zones } +// GetExtendedResources returns the node-level extended resources +// to publish for this policy. The topology-aware policy publishes none. +func (p *policy) GetExtendedResources() map[string]int64 { + return nil +} + // ExportResourceData provides resource data to export for the container. func (p *policy) ExportResourceData(c cache.Container) map[string]string { grant, ok := p.allocations.grants[c.GetID()] diff --git a/config/crd/bases/config.nri_balloonspolicies.yaml b/config/crd/bases/config.nri_balloonspolicies.yaml index 321f3f417..2a6f6402a 100644 --- a/config/crd/bases/config.nri_balloonspolicies.yaml +++ b/config/crd/bases/config.nri_balloonspolicies.yaml @@ -733,6 +733,127 @@ spec: type: boolean type: object type: object + cpuClasses: + description: |- + CPUClasses define CPU frequency, C-state, and turbo + attributes for CPU classes referenced by balloon types. + Exclusive turbo frequency access is controlled via + turboPriority. + items: + description: |- + CPUClass specifies CPU frequency, C-state, and turbo attributes + for a CPU class. + properties: + disabledCstates: + description: |- + DisabledCstates lists C-states disabled for CPUs in this class. 
+ Example: ["C4", "C6", "C8", "C10"] + items: + type: string + type: array + energyPerformancePreference: + description: EnergyPerformancePreference for CPUs in this class. + minimum: 0 + type: integer + freqGovernor: + description: |- + FreqGovernor is the CPUFreq governor for this class + (e.g., "performance", "powersave", "schedutil"). + type: string + maxFreq: + description: |- + MaxFreq is the maximum CPU frequency for this class. + Same format and symbolic names as MinFreq. + type: string + minFreq: + description: |- + MinFreq is the minimum CPU frequency for this class. + Accepts values with units: "3.2GHz", "2900MHz", "2900000kHz", + or a plain number in kHz. Also accepts symbolic names: "min" + (platform minimum), "base" (CPU base frequency), "turbo" + (maximum turbo frequency), resolved at runtime from sysfs. + When turboPriority is set, "turbo" resolves to actual turbo + only for the highest-priority active class; others get base. + type: string + name: + description: Name of the CPU class. + type: string + pctClosID: + description: |- + PctClosID pins this class to a specific SST-CP CLOS ID + (0..ClosCount-1, typically 0..3) and signals "assoc-only" + mode: nri-plugin will only associate this class's CPUs to + the given CLOS, without touching the SoC-wide SST state + (no CPReset, no TFEnable, no CLOS reconfiguration). Use + this when an operator or the BIOS has pre-configured the + CLOSes. Mutually exclusive with PctPriority. + minimum: 0 + type: integer + pctMaxFreq: + description: |- + PctMaxFreq overrides the CLOS maximum frequency that + nri-plugin programs in managed mode. Defaults to MaxFreq. + Same caveat as PctMinFreq. + type: string + pctMinFreq: + description: |- + PctMinFreq overrides the CLOS minimum frequency that + nri-plugin programs in managed mode. Defaults to MinFreq. 
+ Uses the same format as MinFreq but resolves "turbo" + directly to the hardware maximum turbo frequency, + without participating in the soft turboPriority + arbitration. Ignored in assoc-only mode. + type: string + pctPriority: + description: |- + PctPriority requests Intel Priority Core Turbo (PCT) + hardware support, via SST-CP CLOSes, for CPUs in this + class. "high" associates the CPUs to the high-priority + CLOS (HP cores, typically running at Pmax). "low" + associates them to the low-priority CLOS (LP cores, + typically capped at P1). Unset = PCT is not requested + for this class. Mutually exclusive with PctClosID. + enum: + - high + - low + type: string + publishExtendedResource: + description: |- + PublishExtendedResource opts this CPU class into publishing + a node-level extended resource named + "cpuclass.balloons.nri.io/" whose value reflects + the number of logical CPUs that the balloons policy is + currently able to route into this class on the node. The + scheduler can then bin-pack/spread balloons by adding the + same resource to pod requests, avoiding HP-CPU + over-subscription on a single node. Has effect only when + the class also carries PctPriority or PctClosID. Experimental. + type: boolean + turboPriority: + description: |- + TurboPriority controls exclusive turbo frequency access. + Among CPU classes with active balloons, only the class with + the highest turboPriority gets the symbolic frequency "turbo" + resolved to the actual turbo frequency. All other classes get + "turbo" resolved to the base frequency instead. + If all classes have turboPriority 0 (default), every class + gets actual turbo frequencies -- no competition occurs. + minimum: 0 + type: integer + uncoreMaxFreq: + description: |- + UncoreMaxFreq is the maximum uncore frequency for this class. + Accepts values with units like MinFreq. + type: string + uncoreMinFreq: + description: |- + UncoreMinFreq is the minimum uncore frequency for this class. 
+ Accepts values with units like MinFreq. + type: string + required: + - name + type: object + type: array idleCPUClass: description: |- IdleCpuClass controls how unusded CPUs outside any a @@ -1076,6 +1197,20 @@ spec: options has no effect unless agent:NodeResourceTopology enables basic topology exposure. type: boolean + turboDomain: + default: package + description: |- + TurboDomain selects the scope over which TurboPriority + arbitration happens. The default is "package": every CPU + package independently picks its own TurboPriority winner, + so a low-priority balloon on one socket can keep turbo even + when a higher-priority balloon is running on another + socket. Set to "system" to pick single TurboPriority winner + for the whole system. + enum: + - package + - system + type: string required: - reservedResources type: object diff --git a/deployment/helm/balloons/crds/config.nri_balloonspolicies.yaml b/deployment/helm/balloons/crds/config.nri_balloonspolicies.yaml index 321f3f417..2a6f6402a 100644 --- a/deployment/helm/balloons/crds/config.nri_balloonspolicies.yaml +++ b/deployment/helm/balloons/crds/config.nri_balloonspolicies.yaml @@ -733,6 +733,127 @@ spec: type: boolean type: object type: object + cpuClasses: + description: |- + CPUClasses define CPU frequency, C-state, and turbo + attributes for CPU classes referenced by balloon types. + Exclusive turbo frequency access is controlled via + turboPriority. + items: + description: |- + CPUClass specifies CPU frequency, C-state, and turbo attributes + for a CPU class. + properties: + disabledCstates: + description: |- + DisabledCstates lists C-states disabled for CPUs in this class. + Example: ["C4", "C6", "C8", "C10"] + items: + type: string + type: array + energyPerformancePreference: + description: EnergyPerformancePreference for CPUs in this class. 
+ minimum: 0 + type: integer + freqGovernor: + description: |- + FreqGovernor is the CPUFreq governor for this class + (e.g., "performance", "powersave", "schedutil"). + type: string + maxFreq: + description: |- + MaxFreq is the maximum CPU frequency for this class. + Same format and symbolic names as MinFreq. + type: string + minFreq: + description: |- + MinFreq is the minimum CPU frequency for this class. + Accepts values with units: "3.2GHz", "2900MHz", "2900000kHz", + or a plain number in kHz. Also accepts symbolic names: "min" + (platform minimum), "base" (CPU base frequency), "turbo" + (maximum turbo frequency), resolved at runtime from sysfs. + When turboPriority is set, "turbo" resolves to actual turbo + only for the highest-priority active class; others get base. + type: string + name: + description: Name of the CPU class. + type: string + pctClosID: + description: |- + PctClosID pins this class to a specific SST-CP CLOS ID + (0..ClosCount-1, typically 0..3) and signals "assoc-only" + mode: nri-plugin will only associate this class's CPUs to + the given CLOS, without touching the SoC-wide SST state + (no CPReset, no TFEnable, no CLOS reconfiguration). Use + this when an operator or the BIOS has pre-configured the + CLOSes. Mutually exclusive with PctPriority. + minimum: 0 + type: integer + pctMaxFreq: + description: |- + PctMaxFreq overrides the CLOS maximum frequency that + nri-plugin programs in managed mode. Defaults to MaxFreq. + Same caveat as PctMinFreq. + type: string + pctMinFreq: + description: |- + PctMinFreq overrides the CLOS minimum frequency that + nri-plugin programs in managed mode. Defaults to MinFreq. + Uses the same format as MinFreq but resolves "turbo" + directly to the hardware maximum turbo frequency, + without participating in the soft turboPriority + arbitration. Ignored in assoc-only mode. 
+ type: string + pctPriority: + description: |- + PctPriority requests Intel Priority Core Turbo (PCT) + hardware support, via SST-CP CLOSes, for CPUs in this + class. "high" associates the CPUs to the high-priority + CLOS (HP cores, typically running at Pmax). "low" + associates them to the low-priority CLOS (LP cores, + typically capped at P1). Unset = PCT is not requested + for this class. Mutually exclusive with PctClosID. + enum: + - high + - low + type: string + publishExtendedResource: + description: |- + PublishExtendedResource opts this CPU class into publishing + a node-level extended resource named + "cpuclass.balloons.nri.io/" whose value reflects + the number of logical CPUs that the balloons policy is + currently able to route into this class on the node. The + scheduler can then bin-pack/spread balloons by adding the + same resource to pod requests, avoiding HP-CPU + over-subscription on a single node. Has effect only when + the class also carries PctPriority or PctClosID. Experimental. + type: boolean + turboPriority: + description: |- + TurboPriority controls exclusive turbo frequency access. + Among CPU classes with active balloons, only the class with + the highest turboPriority gets the symbolic frequency "turbo" + resolved to the actual turbo frequency. All other classes get + "turbo" resolved to the base frequency instead. + If all classes have turboPriority 0 (default), every class + gets actual turbo frequencies -- no competition occurs. + minimum: 0 + type: integer + uncoreMaxFreq: + description: |- + UncoreMaxFreq is the maximum uncore frequency for this class. + Accepts values with units like MinFreq. + type: string + uncoreMinFreq: + description: |- + UncoreMinFreq is the minimum uncore frequency for this class. + Accepts values with units like MinFreq. 
+ type: string + required: + - name + type: object + type: array idleCPUClass: description: |- IdleCpuClass controls how unusded CPUs outside any a @@ -1076,6 +1197,20 @@ spec: options has no effect unless agent:NodeResourceTopology enables basic topology exposure. type: boolean + turboDomain: + default: package + description: |- + TurboDomain selects the scope over which TurboPriority + arbitration happens. The default is "package": every CPU + package independently picks its own TurboPriority winner, + so a low-priority balloon on one socket can keep turbo even + when a higher-priority balloon is running on another + socket. Set to "system" to pick single TurboPriority winner + for the whole system. + enum: + - package + - system + type: string required: - reservedResources type: object diff --git a/deployment/helm/balloons/templates/clusterrole.yaml b/deployment/helm/balloons/templates/clusterrole.yaml index 3c40d3e47..4ff4199b1 100644 --- a/deployment/helm/balloons/templates/clusterrole.yaml +++ b/deployment/helm/balloons/templates/clusterrole.yaml @@ -12,6 +12,14 @@ rules: verbs: - get - watch +- apiGroups: + - "" + resources: + - nodes/status + verbs: + - get + - patch + - update - apiGroups: - topology.node.k8s.io resources: diff --git a/deployment/helm/balloons/templates/daemonset.yaml b/deployment/helm/balloons/templates/daemonset.yaml index 2d190fb94..0af965d6e 100644 --- a/deployment/helm/balloons/templates/daemonset.yaml +++ b/deployment/helm/balloons/templates/daemonset.yaml @@ -99,6 +99,9 @@ spec: image: {{ .Values.image.name }}:{{ .Values.image.tag | default .Chart.AppVersion }} imagePullPolicy: {{ .Values.image.pullPolicy }} securityContext: + {{- if .Values.pct }} + privileged: true + {{- else }} allowPrivilegeEscalation: false capabilities: drop: @@ -108,6 +111,7 @@ spec: - SYS_ADMIN - DAC_OVERRIDE {{- end }} + {{- end }} resources: requests: cpu: {{ .Values.resources.cpu }} @@ -124,6 +128,10 @@ spec: - name: pod-resources-socket mountPath: 
/var/lib/kubelet/pod-resources readOnly: true + {{- if .Values.pct }} + - name: hostdev + mountPath: /host/dev + {{- end }} {{- if .Values.podPriorityClassNodeCritical }} priorityClassName: system-node-critical {{- end }} @@ -147,6 +155,12 @@ spec: hostPath: path: /var/lib/kubelet/pod-resources type: DirectoryOrCreate + {{- if .Values.pct }} + - name: hostdev + hostPath: + path: /dev + type: Directory + {{- end }} {{- if .Values.nri.runtime.patchConfig }} - name: containerd-config hostPath: diff --git a/deployment/helm/balloons/values.yaml b/deployment/helm/balloons/values.yaml index a4e00d23d..e7f9bd595 100644 --- a/deployment/helm/balloons/values.yaml +++ b/deployment/helm/balloons/values.yaml @@ -160,6 +160,14 @@ nodeSelector: [] # nodeSelector: # kubernetes.io/disk: "ssd" +# Enable support for Intel Speed Select Technology (SST), required by +# the Priority Core Turbo (PCT) feature of the balloons policy. When +# true, the plugin pod is granted access to the host SST device by +# running as privileged and mounting /dev from the host at /host/dev. +# Enable this only on nodes where PCT cpuClasses (with pctPriority or +# pctClosID) are used. +pct: false + # NRI plugins should be considered as part of the container runtime. # By default we make them part of the system-node-critical priority # class. 
This should mitigate the potential risk of a plugin getting diff --git a/docs/resource-policy/policy/balloons-pct-example-auto.md b/docs/resource-policy/policy/balloons-pct-example-auto.md new file mode 100644 index 000000000..05881312c --- /dev/null +++ b/docs/resource-policy/policy/balloons-pct-example-auto.md @@ -0,0 +1,1046 @@ +# Balloons + Priority Core Turbo (managed) example + +This example demonstrates how to let the balloons policy **own** the +Intel Speed Select Technology - Core Power (SST-CP) and Speed +Select Technology - Turbo Frequency (SST-TF) configuration on a +node, so that some containers run on **High Priority (HP)** cores +that reach maximum turbo frequency while others run on **Low +Priority (LP)** cores that are capped at base. This is the +"managed" PCT mode: the operator configures cpuClasses with +`pctPriority: high` and `pctPriority: low`, and the balloons +plugin programs the corresponding SST-CP CLOSes, enables SST-TF, +and associates container CPUs to the right CLOS at admission time. + +A companion document, +[balloons-pct-example-manual.md](balloons-pct-example-manual.md), +walks through the same demo with the "assoc-only" PCT mode in +which the operator owns the SST configuration and balloons only +associates CPUs. The two documents share build steps and pod +YAMLs; the differences are concentrated in the BalloonsPolicy +(step 4) and the inspection step (step 6). + +For background on the feature, see the +[Intel(R) Xeon(R) 6 with Priority Core Turbo Technical +Brief](https://www.intel.com/content/www/us/en/products/docs/processors/xeon/6-priority-core-turbo-brief.html), +the [PCT section of the balloons policy +documentation](balloons.md#priority-core-turbo-pct), and the +[Intel Speed Select kernel +documentation](https://docs.kernel.org/admin-guide/pm/intel-speed-select.html). + +The full session below is meant to be copy-pasted into a bash prompt +on a workstation that has `kubectl` configured to talk to a single +target node. 
Commands that must run *on the node itself* are marked +with `# node:`. + +## What you will see + +Four HP pods and one LP pod running the same benchmark image, on +the same node, in balloons that pin them to SST-CP CLOSes +programmed by the balloons policy itself. The HP balloons spread +across separate SST power domains (punits), so each gets its own +SST-TF turbo budget. Each pod prints, once per `sysbench cpu` +iteration, the CPUs it is pinned to, the sysbench thread count, +sysbench events/s and the average `Bzy_MHz` (APERF/MPERF-derived +effective frequency, sampled by `turbostat` from inside the pod) +across the pinned CPUs: + +```text +[hp-1] cpus=<...> threads=<...> events_per_sec=<...> mhz_avg=<...> +[hp-2] cpus=<...> threads=<...> events_per_sec=<...> mhz_avg=<...> +[hp-3] cpus=<...> threads=<...> events_per_sec=<...> mhz_avg=<...> +[hp-4] cpus=<...> threads=<...> events_per_sec=<...> mhz_avg=<...> +[lp] cpus=<...> threads=<...> events_per_sec=<...> mhz_avg=<...> +``` + +With PCT in effect, `mhz_avg` and per-thread `events_per_sec` are +visibly higher in the HP pods than in the LP pod. + +## 1. Prerequisites + +Hardware and platform: + +- A server with Intel(R) Xeon(R) 6 CPUs that support SST-PP, SST-CP + and SST-TF. This example was written against a dual-socket Xeon + 6776P. +- SST features enabled at the platform level (SST-PP profile + selected so that SST-TF is available; on most platforms this is + the default). The balloons policy will turn SST-CP and SST-TF + on at runtime, but it does not select an SST-PP profile. +- A Linux kernel with the `isst_if_*` (or `isst_tpmi_*`) modules + loaded. Modern distro kernels include them. +- The `msr` kernel module loaded on the node so the in-pod + `turbostat` can read APERF/MPERF (`sudo modprobe msr`; see + step 2). + +Kubernetes: + +- A working cluster. All commands target a single node; on a + multi-node cluster, schedule the demo pods on the PCT-capable node + (e.g. with `nodeSelector` or by tainting other nodes).
+- Container runtime: containerd 1.7+ or CRI-O 1.26+ with NRI + enabled (the default in current versions). +- The balloons policy installed with PCT enabled (see step 4). + +Optional tools used in this example: + +- `intel-speed-select` on the node, **only for inspection** + (step 6). In managed mode the balloons policy programs SST-CP + and SST-TF for you; you do not need to invoke + `intel-speed-select` to *configure* anything. Most Linux + distributions package it as part of `linux-tools` or + `intel-speed-select`; otherwise build it from the Linux source + tree under `tools/power/x86/intel-speed-select/` (see the + upstream [documentation](https://docs.kernel.org/admin-guide/pm/intel-speed-select.html)). +- `turbostat`. The benchmark image already includes it (from the + Debian `linux-cpupower` package) and the demo pods use it to + report `Bzy_MHz` from inside the container. You only need + `turbostat` on the *node* if you want to cross-check the demo + numbers from outside the pod. +- `crictl` and `ctr` (containerd) or `podman` (CRI-O) on the node + for loading the benchmark image without a registry. + +> **No manual SST step.** Unlike the +> [assoc-only example](balloons-pct-example-manual.md), there is +> no `intel-speed-select turbo-freq enable -a` step here. +> Programming SST-CP CLOS bounds, enabling SST-CP in ordered +> priority mode, and enabling SST-TF on every package are all +> done by the balloons policy when it processes the cpuClasses in +> step 4. Pre-configuring SST in BIOS or via `intel-speed-select` +> is still compatible; the balloons policy resets and reprograms +> SST when it enters managed mode. + +## 2. Build the benchmark image + +The benchmark image runs `sysbench cpu` in a loop and prints one +status line per iteration. The effective frequency is measured with +`turbostat --cpu` over the same time window as the `sysbench` run, +restricted to the CPUs the container is pinned to. 
+ +`turbostat` is used instead of `scaling_cur_freq` / +`/proc/cpuinfo`'s `cpu MHz` because the latter reflect what the OS +*requests* from the firmware; on HWP/`intel_pstate` kernels they +can lag or under-report when the firmware boosts autonomously. +`Bzy_MHz` is derived from the `APERF`/`MPERF` MSRs over the +sampling window and is the actual *busy* frequency the cores ran +at. + +Reading those MSRs requires access to `/dev/cpu/*/msr` and +`CAP_SYS_RAWIO`. In a standard Kubernetes cluster the simplest way +to get both is to run the benchmark pod as `privileged: true` with +the host `/dev` mounted. The pod yaml in step 5 does that. Make +sure the `msr` kernel module is loaded on the node: + +```bash +# node: +sudo modprobe msr +ls /dev/cpu/0/msr # must exist +``` + +Create the build context: + +```bash +mkdir -p pct-reporter && cd pct-reporter + +cat > reporter.sh <<'EOF' +#!/bin/bash +# Continuously run sysbench cpu and report, per iteration: +# label, cpus the container is pinned to (from /proc/self/status, +# which is correct even when running as privileged), thread count, +# sysbench events/s, and the average Bzy_MHz across the pinned +# CPUs as measured by turbostat over the same interval. +set -u +LABEL="${LABEL:-reporter}" +INTERVAL="${INTERVAL:-5}" + +CPUS_LIST="$(awk '/Cpus_allowed_list/ {print $2}' /proc/self/status)" + +expand_count() { + local list="$1" n=0 part lo hi + IFS="," read -ra parts <<< "$list" + for part in "${parts[@]}"; do + if [[ "$part" == *-* ]]; then + lo="${part%-*}"; hi="${part#*-}" + n=$(( n + hi - lo + 1 )) + else + n=$(( n + 1 )) + fi + done + echo "$n" +} +# Default: one sysbench thread per pinned logical CPU. Override +# with THREADS env (used by the A/B pod in step 7). 
+NTHREADS="${THREADS:-$(expand_count "$CPUS_LIST")}" + +echo "[$LABEL] starting: cpus=$CPUS_LIST threads=$NTHREADS interval=${INTERVAL}s" + +while true; do + TS_OUT="$(mktemp)" + turbostat --quiet --cpu "$CPUS_LIST" --show CPU,Bzy_MHz \ + --num_iterations 1 --interval "$INTERVAL" \ + > "$TS_OUT" 2>/dev/null & + TS_PID=$! + + SB_OUT="$(sysbench cpu --threads="$NTHREADS" --time="$INTERVAL" \ + run 2>/dev/null)" + wait "$TS_PID" + + EVPS="$(echo "$SB_OUT" | awk -F: '/events per second/ {gsub(/ /,"",$2); print $2}')" + # Average Bzy_MHz across the requested CPUs. Skip header and + # turbostat's "-" all-CPUs summary row. + MHZ_AVG="$(awk 'NR>1 && $1 ~ /^[0-9]+$/ {s+=$2; n++} END {if (n) printf "%.0f", s/n}' "$TS_OUT")" + rm -f "$TS_OUT" + + printf '[%s] cpus=%s threads=%d events_per_sec=%s mhz_avg=%s\n' \ + "$LABEL" "$CPUS_LIST" "$NTHREADS" "${EVPS:-?}" "${MHZ_AVG:-?}" +done +EOF +chmod +x reporter.sh + +cat > Dockerfile <<'EOF' +FROM debian:stable-slim +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + sysbench linux-cpupower util-linux ca-certificates \ + && rm -rf /var/lib/apt/lists/* +COPY reporter.sh /usr/local/bin/reporter.sh +ENTRYPOINT ["/usr/local/bin/reporter.sh"] +EOF +``` + +`linux-cpupower` ships `/usr/sbin/turbostat`. `util-linux` provides +`taskset` and the rest of the standard userspace. + +Build the image. Use whichever tool is available on your build host. +With docker, prefix with `sudo` if your user is not in the `docker` +group: + +```bash +# With docker: +docker build -t localhost/pct-reporter:demo . + +# Or with podman: +podman build -t localhost/pct-reporter:demo . +``` + +If the build host is behind an HTTP proxy, pass it through: + +```bash +docker build \ + --build-arg http_proxy=$http_proxy \ + --build-arg https_proxy=$https_proxy \ + -t localhost/pct-reporter:demo . +``` + +## 3. 
Make the image available to the kubelet (no registry) + +If you built the image on the same machine as the kubelet, import it +directly into the container runtime's image store. Pick the +subsection that matches your runtime. + +### 3.1. containerd + +```bash +# On the build host: +docker save localhost/pct-reporter:demo -o /tmp/pct-reporter.tar +# (or: podman save -o /tmp/pct-reporter.tar localhost/pct-reporter:demo) + +# node: +sudo ctr -n k8s.io images import /tmp/pct-reporter.tar +sudo crictl images | grep pct-reporter +``` + +The `-n k8s.io` namespace is the one kubelet uses; without it the +image will not be visible to Kubernetes. + +### 3.2. CRI-O + +```bash +# On the build host: +docker save localhost/pct-reporter:demo -o /tmp/pct-reporter.tar +# (or: podman save -o /tmp/pct-reporter.tar localhost/pct-reporter:demo) + +# node: +sudo podman --root /var/lib/containers/storage load -i /tmp/pct-reporter.tar +sudo crictl images | grep pct-reporter +``` + +`--root /var/lib/containers/storage` makes `podman` load the image +into the same storage CRI-O reads from. If you built the image +directly on the node with `sudo podman build`, this step is not +needed. + +The demo pods set `imagePullPolicy: IfNotPresent` and use the image +reference `localhost/pct-reporter:demo`, so the kubelet will not +attempt to pull from a registry. Note that the kubelet garbage- +collects unused local images: re-import the image if pod creation +later fails with `ErrImagePull`. + +## 4. Install / reconfigure the balloons policy with PCT enabled + +The Intel SST device (`/dev/isst_interface`) is owned by `root` and +is normally not visible inside non-privileged Kubernetes pods. The +balloons Helm chart exposes a `pct` value that grants the plugin +pod the access it needs to drive PCT. 
PCT cpuClass support +(`pctPriority`, `pctClosID`) is not in a released balloons chart +yet, so install the unstable build that includes it: + +```bash +helm install \ + --devel \ + -n kube-system \ + balloons \ + oci://ghcr.io/askervin/nri-plugins/helm-charts/nri-resource-policy-balloons \ + --version v0.12-pct2-unstable \ + --set image.name=ghcr.io/askervin/nri-plugins/nri-resource-policy-balloons \ + --set image.tag=v0.12-pct2-unstable \ + --set image.pullPolicy=Always \ + --set pct=true +``` + +`--set pct=true` makes the plugin pod privileged and mounts the +host `/dev` at `/host/dev`. Enable it only on nodes where PCT +cpuClasses are used. + +Once PCT support is in a released balloons chart, the equivalent +install command will be the standard one from +[balloons.md](balloons.md#deployment) plus `--set pct=true`: + +```bash +# Stable form (use this once PCT support is released): +helm repo add nri-plugins https://containers.github.io/nri-plugins +helm repo update +helm install balloons nri-plugins/nri-resource-policy-balloons \ + --namespace kube-system \ + --set pct=true +``` + +Verify the plugin pod has the privileged settings the chart's +`pct=true` flag enables: + +```bash +kubectl -n kube-system get pod \ + -l app.kubernetes.io/name=nri-resource-policy-balloons \ + -o jsonpath='{.items[0].spec.containers[0].securityContext}{"\n"}' +# Expect: {"privileged":true} + +kubectl -n kube-system get pod \ + -l app.kubernetes.io/name=nri-resource-policy-balloons \ + -o jsonpath='{.items[0].spec.containers[0].volumeMounts[?(@.name=="hostdev")]}{"\n"}' +# Expect a mount of /host/dev. +``` + +Now apply the policy configuration. The `BalloonsPolicy` below +defines three cpuClasses. Two of them (`hp-pct`, `lp-pct`) use +`pctPriority` -- this is what selects **managed** mode for the +PCT allocator: + +- `hp-pct` requests `pctPriority: high`. 
balloons assigns it to + CLOS 0, programs that CLOS with min frequency `base` and max + frequency `turbo` (which resolves to the hardware maximum turbo + frequency on this SKU, 4600 MHz on Xeon 6776P), and enables + SST-TF on every package so the bucket-0 turbo budget becomes + available. +- `lp-pct` requests `pctPriority: low`. balloons assigns it to + CLOS 3 and programs that CLOS with min frequency `min` and max + frequency `base`, so LP cores are capped at base while idle LP + cores still drop to Pmin (freeing turbo budget for HP cores). +- `default` has no PCT fields. It is the implicit fallback for + idle CPUs and balloons that do not specify their `cpuClass`. + In managed mode, when an LP class is defined, balloons routes + these CPUs to the LP CLOS automatically (logged as `pct: + fallback CLOS for non-PCT CPUs set to N (LP)`). This is + essential: leaving idle CPUs on the HP CLOS would inflate the + SST-TF active-HP-core count per punit and prevent bucket-0 + turbo selection on punits that also host an LP balloon. + +`pctMinFreq` / `pctMaxFreq` accept the same symbolic names as +`minFreq` / `maxFreq` (`min`, `base`, `turbo`) and also explicit +values like `3.2GHz`. In managed mode, `turbo` resolves directly +to the hardware turbo maximum (not subject to `turboPriority` +arbitration). + +The HP cpuClass additionally disables the deep C-states `C6` and +`C6P`. The HP cores in this demo are continuously busy with +`sysbench`, so C-state entry would normally not happen anyway; +the setting is included because removing C-state wake-up latency +is the typical reason latency-sensitive workloads ask for priority +cores. List the C-state names available on the node with +`grep . /sys/devices/system/cpu/cpu0/cpuidle/state*/name`. **Do +not** disable C-states on the `default` / `lp-pct` classes: idle +CPUs in deep C-states do not count toward the package's active- +core count and therefore free turbo budget for the HP cores. 
+ +The HP balloon type uses `preferNewBalloons: true` and +`maxCPUs: 8` (the SST-TF bucket-0 HP-core limit per punit on +Xeon 6776P), so each HP pod lands in its own balloon and the +balloons spread across separate punits. `minCPUs` is left unset +so the balloon size equals what the pod requests; with no +`hideHyperthreads` the container sees exactly the logical CPUs +the balloon allocated. + +`agent.nodeResourceTopology: true` and `showContainersInNrt: true` +make the plugin publish per-balloon and per-container CPU sets in +the cluster's `NodeResourceTopology` (NRT) CRs. The verification +queries in step 6 read those CRs to confirm exactly which CPUs +each pod's container ended up pinned to. The NRT CRD must exist +in the cluster (`kubectl get crd +noderesourcetopologies.topology.node.k8s.io`). + +`availableResources` is intentionally left unset: balloons manages +all CPUs of the node, as in the normal mode of operation. The +`reservedResources` covers physical CPU 0 (`0` and its SMT sibling +`128`) and physical CPU 1 (`1` and its SMT sibling `129`); adjust +the sibling numbers if your topology differs (`lscpu -e` shows +them). 
+ +```bash +cat > balloons-pct-managed.yaml < idle/default CPUs follow the fallback CLOS + - name: hp-pct + pctPriority: high + pctMinFreq: base + pctMaxFreq: turbo + disabledCstates: [C6, C6P] + - name: lp-pct + pctPriority: low + pctMinFreq: min + pctMaxFreq: base + + log: + debug: + - policy + - cpu +EOF + +kubectl apply -f balloons-pct-managed.yaml +``` + +Confirm that balloons picked up the configuration and entered +managed mode -- you should see `mode=managed`, `programmed CLOS N` +lines for every CLOS used by a PCT cpuClass, `PrepareManagedMode +done` (which resets SST-CP, enables SST-TF and sets ordered +priority), and `EnableCP done`: + +```bash +kubectl -n kube-system logs ds/nri-resource-policy-balloons \ + | grep -E 'pct(:| mock:)' | tail -n 20 +``` + +Expected: + +```text +pct: SST discovered: pkg=0 punit=0 level=1 cpus=<...> ... +pct: mode=managed, 2 PCT cpuClass(es), 4 punit(s) across 2 package(s) +pct: programmed CLOS 0 min=2300000 max=4600000 kHz +pct: programmed CLOS 3 min=800000 max=2300000 kHz +pct: cpuClass "hp-pct" classified HP (managed: pctPriority=high, CLOS 0) +pct: cpuClass "lp-pct" classified LP (managed: pctPriority=low, CLOS 3) +pct: fallback CLOS for non-PCT CPUs set to 3 (LP) +``` + +## 5. Deploy the HP and LP pods + +Four HP pods and one LP pod. Each HP pod requests 2 CPUs; with +`preferNewBalloons: true` and `maxCPUs: 8` on `hp-bln`, each pod +gets its own balloon, and PCT placement spreads the balloons +across separate punits (one per HP pod, up to four on a +dual-socket Xeon 6776P). Because `hideHyperthreads` is not set, +the container sees exactly the requested logical CPUs and the +reporter starts that many sysbench threads. + +The pods are `privileged: true` and mount the host `/dev` because +`turbostat` inside the container reads `/dev/cpu/*/msr` to compute +`Bzy_MHz` (see step 2). 
+ +```bash +for i in 1 2 3 4; do +cat > pod-hp-$i.yaml < pod-lp.yaml < max= kHz` +- `pct: programmed CLOS 3 min= max= kHz` +- `pct: cpuClass "hp-pct" classified HP (managed: pctPriority=high, CLOS 0)` +- `pct: cpuClass "lp-pct" classified LP (managed: pctPriority=low, CLOS 3)` +- one `associated cpus <...> to CLOS 0` per HP pod admitted +- one `associated cpus <...> to CLOS 3` per LP pod admitted + +### 6.2. From the node with `intel-speed-select` + +These commands read SST state directly from the hardware. They +must agree with what the plugin logged. None of them write +anything. + +```bash +# node: + +# SST-PP profile (must report a level that has TF supported / enabled). +sudo intel-speed-select perf-profile info 2>&1 \ + | grep -E 'current|speed-select-turbo-freq|speed-select-core-power' + +# SST-CP state per package (must report enable-status: enabled +# and priority-type: 1 (ordered) -- this is what +# PrepareManagedMode + EnableCP set). +sudo intel-speed-select core-power info 2>&1 \ + | grep -E 'package-|powerdomain-|enable-status|priority-type' + +# SST-CP CLOS bounds. CLOS 0 should show max-frequency matching +# the "programmed CLOS 0 max=<...> kHz" line in the plugin log; +# CLOS 3 the corresponding LP cap. +sudo intel-speed-select core-power get-config -c 0 2>&1 \ + | grep -E 'powerdomain-|clos-min|clos-max' +sudo intel-speed-select core-power get-config -c 3 2>&1 \ + | grep -E 'powerdomain-|clos-min|clos-max' + +# SST-TF enable state on every punit (plugin called TFEnable in +# PrepareManagedMode). +sudo intel-speed-select perf-profile info 2>&1 \ + | grep -E 'package-|powerdomain-|speed-select-turbo-freq:' +``` + +### 6.3. Per-CPU associations + +```bash +# node: + +# Build the list of pinned CPUs from the pods. (Bash expansion +# below assumes a single-container pod; adjust if you changed the +# layout.) 
+HP_CPUS=$(for p in pct-hp-1 pct-hp-2 pct-hp-3 pct-hp-4; do + kubectl logs $p 2>/dev/null | awk -F'cpus=| ' '/starting/ {print $4}' +done | paste -sd,) +LP_CPUS=$(kubectl logs pct-lp 2>/dev/null \ + | awk -F'cpus=| ' '/starting/ {print $4}') +echo "HP_CPUS=$HP_CPUS" +echo "LP_CPUS=$LP_CPUS" + +# Expected: clos:0 for every CPU in HP_CPUS, clos:3 for every CPU +# in LP_CPUS. +sudo intel-speed-select -c "$HP_CPUS" core-power get-assoc 2>&1 | grep -E 'cpu-|clos:' +sudo intel-speed-select -c "$LP_CPUS" core-power get-assoc 2>&1 | grep -E 'cpu-|clos:' +``` + +### 6.4. Verify punit spread + +The four HP balloons should each land on a different punit. The +mapping from CPU to punit is visible in `sst info`: + +```bash +# node: +sudo ./sst info | awk '/SST-PP/,/SST-BF/' | grep -E '^\s+[0-9]' +# Sample on Xeon 6776P: +# 0 0 0-31,128-159 +# 0 1 32-63,160-191 +# 1 0 64-95,192-223 +# 1 1 96-127,224-255 +``` + +The pinned CPUs of `pct-hp-1` .. `pct-hp-4` should each fall in a +different `(pkg, punit)` row. + +### 6.5. Verify container-to-balloon-to-CPU mapping via NRT + +The `agent.nodeResourceTopology: true` and `showContainersInNrt: +true` settings in step 4 make the plugin publish per-balloon and +per-container CPU sets in the +`noderesourcetopologies.topology.node.k8s.io` CR for the node. 
+Print every balloon (zone type `balloon`) and every container +(zone type `allocation for container`) assigned to it: + +```bash +kubectl get noderesourcetopologies.topology.node.k8s.io -o json | jq -r ' + ["NODE","BALLOON","CPUSET"], + ( + .items.[] as $node + | $node.zones[] + | select(.type == "balloon") + | [ + $node.metadata.name, + .name, + (.attributes[] | select(.name=="cpuset") | .value) + ] + ) | @tsv' + +kubectl get noderesourcetopologies.topology.node.k8s.io -o json | jq -r ' + ["NODE","BALLOON","CONTAINER","CPUS"], + ( + .items.[] as $node + | $node.zones[] + | select(.type == "allocation for container") + | [ + $node.metadata.name, + .parent, + .name, + (.attributes[] | select(.name=="cpuset") | .value) + ] + ) | @tsv' +``` + +Expected: + +- Four `hp-bln[0]`..`hp-bln[3]` zones, each with a 2-CPU set on a + distinct punit, and the matching `pct-hp-N/bench` container + pinned to that same set. +- One `lp-bln[0]` zone with the 8-CPU set, and `pct-lp/bench` + pinned to the same set. +- A `reserved[0]` zone covering the currently-used subset of the + reserved pool (the SMT pair of physical CPU 0 -- `0,128` -- is the + typical outcome on this layout; balloons compacts the reserved + balloon to what its containers actually need). +- An empty `default[0]` zone may also appear; it is the unused + default balloon and can be ignored. + +The CPU sets here must match the `cpus=` value printed by the +benchmark inside each pod (step 7), the `clos:0` / `clos:3` +reported by `core-power get-assoc` (step 6.3), and the punit +mapping (step 6.4). + +## 7. 
Observe performance + +Tail every pod's log: + +```bash +for p in pct-hp-1 pct-hp-2 pct-hp-3 pct-hp-4 pct-lp; do + kubectl logs -f --prefix=true --max-log-requests=5 $p & +done +wait +``` + +Sample shape on a dual-socket Intel(R) Xeon(R) 6776P (replace +`<...>` with your own measurements): + +```text +[hp-1] cpus=32,160 threads=2 events_per_sec=4155.16 mhz_avg=4600 +[hp-2] cpus=64,192 threads=2 events_per_sec=4153.55 mhz_avg=4600 +[hp-3] cpus=100,228 threads=2 events_per_sec=4152.00 mhz_avg=4600 +[hp-4] cpus=10,138 threads=2 events_per_sec=4155.50 mhz_avg=4600 +[lp] cpus=65-68,193-196 threads=8 events_per_sec=8296.69 mhz_avg=2138 +``` + +Per-thread throughput on this run: + +| Tag | threads | mhz_avg | events_per_sec | events_per_sec per thread | +|--------|---------|---------|----------------|---------------------------| +| hp-1 | 2 | 4600 | 4155.16 | 2077.58 | +| hp-2 | 2 | 4600 | 4153.55 | 2076.78 | +| hp-3 | 2 | 4600 | 4152.00 | 2076.00 | +| hp-4 | 2 | 4600 | 4155.50 | 2077.75 | +| lp | 8 | 2138 | 8296.69 | 1037.09 | + +Optionally cross-check the same numbers from outside the pod with +`turbostat` on the node: + +```bash +# node: +sudo turbostat --show CPU,Bzy_MHz --quiet -c -i 2 -n 2 +``` + +## 8. A/B comparison + +Run the same 2-thread workload on the LP CLOS instead of an HP +CLOS. The pod below pins to the LP balloon (CLOS 3, base +frequency cap) and uses `THREADS=2` to keep the sysbench workload +identical to a single `pct-hp-*`: + +```bash +kubectl delete pod pct-lp --now # free LP-balloon CPUs + +cat > pod-hp-on-lp.yaml < TFDisable -> CPDisable` per package and +return the platform to its initial SST state. 
In practice +`helm uninstall` may not give the daemonset enough termination +grace for that hook to complete, so always verify and, if SST +is still enabled, run the teardown explicitly: + +```bash +# node: +sudo intel-speed-select core-power info 2>&1 \ + | grep -E 'enable-status' | sort -u +sudo intel-speed-select perf-profile info 2>&1 \ + | grep -E 'speed-select-turbo-freq:' | sort -u + +# If any value above is "enabled", reset: +sudo intel-speed-select turbo-freq disable -a +sudo intel-speed-select core-power disable + +# Re-verify (both expected to be disabled): +sudo intel-speed-select core-power info 2>&1 \ + | grep -E 'enable-status' | sort -u +# Expect (both lines): +# clos-enable-status:disabled +# enable-status:disabled +sudo intel-speed-select perf-profile info 2>&1 \ + | grep -E 'speed-select-turbo-freq:' | sort -u +# Expect: speed-select-turbo-freq:disabled +``` + +### 9.3. Restore `cpufreq` defaults on the node + +The managed-mode plugin does not write `scaling_min_freq` / +`scaling_max_freq`, but earlier workloads or kernel modules might +have. Reset them to the hardware limits as a precaution: + +```bash +# node: +for f in /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq; do + base=${f%scaling_max_freq}cpuinfo_max_freq + sudo tee "$f" < "$base" > /dev/null +done +for f in /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq; do + base=${f%scaling_min_freq}cpuinfo_min_freq + sudo tee "$f" < "$base" > /dev/null +done + +# Verify (should print exactly the hardware min and the hardware +# max in kHz): +for i in $(seq 0 $(($(nproc) - 1))); do + cat /sys/devices/system/cpu/cpu$i/cpufreq/scaling_max_freq \ + /sys/devices/system/cpu/cpu$i/cpufreq/scaling_min_freq +done | sort -u +``` + +### 9.4. 
Remove leftover files + +```bash +rm -f balloons-pct-managed.yaml \ + pod-hp-1.yaml pod-hp-2.yaml pod-hp-3.yaml pod-hp-4.yaml \ + pod-lp.yaml pod-hp-on-lp.yaml +# Optional: +rm -rf pct-reporter +# Optional, on the node, free disk used by the demo image: +# sudo crictl rmi localhost/pct-reporter:demo +``` + +## 10. Optional: help the scheduler avoid HP over-subscription (experimental) + +The default Kubernetes scheduler is unaware of how many CPUs on +a node can become HP cores. Two HP pods can land on the same +node even when a second node would have given them HP capacity, +and HP pods can pile up beyond the platform's HP budget. + +The balloons policy ships an experimental opt-in that publishes +a per-cpuClass extended resource on the local Node so the +default scheduler can bin-pack on it. Set +`publishExtendedResource: true` on every PCT-enabled cpuClass +(i.e. classes that carry `pctClosID` or `pctPriority`) and the +agent advertises: + +```text +status.capacity: + cpuclass.balloons.nri.io/: +``` + +The capacity reflects "CPUs eligible for this class that are +not currently held by balloons of other classes", and is +re-published on every container create/update/release, so +cross-class consumption (e.g. an LP balloon eating CPUs that +would otherwise have been available for HP) is reflected +immediately. + +For managed-mode HP classes, the per-punit cap used in the +capacity formula is the *guaranteed top-turbo HP CPU count* +(the smallest non-zero SST-TF bucket `HighPriorityCoreCount`, +or the SST-BF `HighPriorityCPUs` count when TF is +unsupported). This is the number of HP CPUs per punit that +can simultaneously sustain the highest turbo frequency this +platform exposes -- not the larger `MaxHpCpus` the allocator +uses internally. On a Xeon 6 with four 8-core SST-TF buckets +per punit and four active punits, that is 4 x 8 = 32 HP CPUs +of guaranteed top-turbo headroom, which is what the scheduler +should bin-pack on. 
+ +Add the flag to the policy: + +```yaml + cpuClasses: + - name: hp-pct + pctPriority: high + pctMinFreq: base + pctMaxFreq: turbo + disabledCstates: [C6, C6P] + publishExtendedResource: true # experimental + - name: lp-pct + pctPriority: low + pctMinFreq: min + pctMaxFreq: base + publishExtendedResource: true # experimental +``` + +...and to every HP/LP pod, alongside the existing `cpu` request: + +```yaml + resources: + requests: + cpu: "2" + memory: "128Mi" + cpuclass.balloons.nri.io/hp-pct: "2" + limits: + cpu: "2" + memory: "128Mi" + cpuclass.balloons.nri.io/hp-pct: "2" +``` + +Verify on the node after applying: + +```bash +kubectl get node -o jsonpath='{.items[0].status.capacity}' \ + | jq 'with_entries(select(.key | startswith("cpuclass")))' +``` + +A pod whose request exceeds the published capacity gets +`FailedScheduling: Insufficient cpuclass.balloons.nri.io/` +and stays `Pending` until another pod releases the resource. + +This is an experimental flag: the resource name, semantics +(capacity vs. allocatable, conservative-on-grow), and update +cadence may change before becoming stable. + +## 11. Troubleshooting + +- Plugin pod log shows `Speed Select Technology (SST) support not + detected`: the pod cannot access `/dev/isst_interface`. Re-install + the chart with `--set pct=true`. Verify with `kubectl -n + kube-system get pod -l app.kubernetes.io/name=nri-resource-policy-balloons + -o jsonpath='{.items[0].spec.containers[0].securityContext}'` + that it shows `privileged:true`. +- Plugin log shows `pct: failed to prepare managed mode` or + `pct: failed to configure CLOS N`: another agent on the node may + already hold SST exclusively, or a previous balloons instance + exited without releasing it. Try + `sudo intel-speed-select core-power disable` on the node, then + restart the plugin (`kubectl -n kube-system delete pod -l + app.kubernetes.io/name=nri-resource-policy-balloons`). 
+- Validation error `cpuClass "X": pctPriority and pctClosID are + mutually exclusive`: only one of the two PCT fields may be set + on a cpuClass. For managed mode use `pctPriority` and leave + `pctClosID` unset. +- Validation error `at most one managed PCT cpuClass with + pctPriority=high allowed`: in managed mode balloons programs + exactly one HP and one LP CLOS, so at most one cpuClass with + `pctPriority: high` and one with `pctPriority: low` may be + defined. +- Validation error `pct: cannot mix managed (pctPriority) and + assoc-only (pctClosID) modes`: the configuration mixes the two + modes. Pick one and apply it to every PCT cpuClass. +- Pods stuck in `ErrImagePull` with image + `localhost/pct-reporter:demo`: the image was not imported into + the kubelet's container runtime store, or the kubelet has + garbage-collected it. Repeat step 3, then `kubectl delete pod + ...` to retry. +- Pod log shows `turbostat: no /dev/cpu/0/msr` or `mhz_avg=?`: the + `msr` kernel module is not loaded on the node. Run `sudo modprobe + msr` on the node and recreate the pod. If the pod is not + privileged or `/dev` is not mounted, fix the pod yaml (step 5). +- HP CPUs do not reach Pmax under load: confirm `turbo-freq info` + reports `enable-status: enabled` on every punit and that the HP + balloons each landed on a different punit (step 6.4). Two HP + balloons on the same punit share the bucket-0 turbo budget and + may run below Pmax. +- All four HP balloons end up on the same punit: confirm + `preferNewBalloons: true` on `hp-bln` and that the plugin + build includes PCT-aware balloon placement. The plugin log + prints the punit each new balloon is assigned to. +- `mhz_avg` for HP equals the standard turbo (not bucket-0 turbo): + SST-TF did not enable. Check the plugin log for + `PrepareManagedMode done` and `TFEnable`/`TFDisable` errors, and + the host SST-PP profile (`intel-speed-select perf-profile info`) + to confirm the selected SST-PP level lists SST-TF as supported. 
diff --git a/docs/resource-policy/policy/balloons-pct-example-manual.md b/docs/resource-policy/policy/balloons-pct-example-manual.md new file mode 100644 index 000000000..049e6d6ce --- /dev/null +++ b/docs/resource-policy/policy/balloons-pct-example-manual.md @@ -0,0 +1,1046 @@ +# Balloons + Priority Core Turbo (assoc-only) example + +This example demonstrates how to use the balloons policy to associate +container CPUs to *pre-configured* Intel Speed Select Technology - +Core Power (SST-CP) classes of service (CLOSes), so that some +containers run on **High Priority (HP)** cores that reach maximum +turbo frequency while others run on **Low Priority (LP)** cores that +are capped at base. This is the "assoc-only" PCT mode: the operator +(or BIOS) owns the SST-CP configuration; balloons only associates +container CPUs to the chosen CLOSes and does not reconfigure SST-CP. + +For background on the feature, see the +[Intel(R) Xeon(R) 6 with Priority Core Turbo Technical +Brief](https://www.intel.com/content/www/us/en/products/docs/processors/xeon/6-priority-core-turbo-brief.html), +the [PCT section of the balloons policy +documentation](balloons.md#priority-core-turbo-pct), and the +[Intel Speed Select kernel +documentation](https://docs.kernel.org/admin-guide/pm/intel-speed-select.html). + +The full session below is meant to be copy-pasted into a bash prompt +on a workstation that has `kubectl` configured to talk to a single +target node. Commands that must run *on the node itself* are marked +with `# node:`. + +## What you will see + +Four HP pods and one LP pod running the same benchmark image, on +the same node, in balloons that pin them to different SST-CP +CLOSes. The HP balloons spread across separate SST power domains +(punits), so each gets its own SST-TF turbo budget. 
Each pod +prints, once per `sysbench cpu` iteration, the CPUs it is pinned +to, the sysbench thread count, sysbench events/s and the average +`Bzy_MHz` (APERF/MPERF-derived effective frequency, sampled by +`turbostat` from inside the pod) across the pinned CPUs: + +```text +[hp-1] cpus=<...> threads= events_per_sec=<...> mhz_avg= +[hp-2] cpus=<...> threads= events_per_sec=<...> mhz_avg= +[hp-3] cpus=<...> threads= events_per_sec=<...> mhz_avg= +[hp-4] cpus=<...> threads= events_per_sec=<...> mhz_avg= +[lp] cpus=<...> threads= events_per_sec=<...> mhz_avg= +``` + +With PCT in effect, `mhz_avg` and per-thread `events_per_sec` are +visibly higher in the HP pods than in the LP pod. + +## 1. Prerequisites + +Hardware and platform: + +- A server with Intel(R) Xeon(R) 6 CPUs that support SST-PP and SST-CP. + This example was written against a dual-socket Xeon 6776P. +- SST-PP and SST-CP enabled on the platform (see step 2). +- A Linux kernel with the `isst_if_*` (or `isst_tpmi_*`) modules + loaded. Modern distro kernels include them. + +Kubernetes: + +- A working cluster. All commands target a single node; on a + multi-node cluster, schedule the demo pods on the PCT-capable node + (e.g. with `nodeSelector` or by tainting other nodes). +- Container runtime: containerd 1.7+ or CRI-O 1.26+ with NRI + enabled (the default in current versions). +- The balloons policy installed with PCT enabled (see step 5). + +Optional tools used in this example: + +- `intel-speed-select`. Most Linux distributions package it as part + of `linux-tools` or `intel-speed-select`; otherwise build it from + the Linux source tree under + `tools/power/x86/intel-speed-select/` (see the upstream + [documentation](https://docs.kernel.org/admin-guide/pm/intel-speed-select.html)). + Used only to configure and inspect SST-CP. Configuration via BIOS + is an alternative (see step 2.3). +- `turbostat`.
The benchmark image already includes it (from the + Debian `linux-cpupower` package) and the demo pods use it to + report `Bzy_MHz` from inside the container. You only need + `turbostat` on the *node* if you want to cross-check the demo + numbers from outside the pod; in that case install it from your + distro's kernel-tools package. +- `crictl` and `ctr` (containerd) or `podman` (CRI-O) on the node + for loading the benchmark image without a registry. + +## 2. Prepare the node + +### 2.1. Clear stale `cpufreq` caps + +The Linux `cpufreq` subsystem caps each CPU at the lower of its +per-CPU `scaling_max_freq` and the SST-CP CLOS max. If a previous +workload (e.g. another resource policy, an earlier balloons run +with a `maxFreq:` cpuClass, or a manual `cpupower frequency-set`) +left `scaling_max_freq` below the hardware maximum on some CPUs, +those CPUs will stay capped even after they are associated to +CLOS 0. Reset every CPU's per-CPU `scaling_min_freq` and +`scaling_max_freq` to the hardware limits before starting the +demo: + +```bash +# node: +for f in /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq; do + base=${f%scaling_max_freq}cpuinfo_max_freq + sudo tee "$f" < "$base" > /dev/null +done +for f in /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq; do + base=${f%scaling_min_freq}cpuinfo_min_freq + sudo tee "$f" < "$base" > /dev/null +done + +# Verify (should print exactly two lines: the hardware min and max +# in kHz, e.g. "800000" and "4600000" on Xeon 6776P). +for i in $(seq 0 $(($(nproc) - 1))); do + cat /sys/devices/system/cpu/cpu$i/cpufreq/scaling_max_freq \ + /sys/devices/system/cpu/cpu$i/cpufreq/scaling_min_freq +done | sort -u +``` + +The cpuClasses below (step 5) deliberately leave `minFreq` / +`maxFreq` unset, so balloons will not write to these files; once +reset they stay at the hardware limits and the SST-CP CLOS bounds +become the effective frequency caps. 
This is also what the Linux +SST documentation recommends: *"Once associated, avoid changing +Linux cpufreq subsystem scaling frequency limits."* + +### 2.2. Enable SST-TF and SST-CP + +In assoc-only mode the balloons policy does **not** enable SST +features or program CLOS frequency bounds. Those must be in place +*before* deploying pods. With SST-TF enabled in ordered priority +mode the CLOS frequency bounds come from the SST-TF buckets +themselves (CLOS 0 = the bucket-0 HP turbo limit; CLOS 3 = the LP +clip frequency), so no manual `core-power config` is needed. + +`intel-speed-select turbo-freq enable --auto` enables, on every +punit that contains at least one of the CPUs passed via `-c`: + +- SST-TF (so HP cores can exceed the standard turbo-ratio bucket + limit), +- SST-CP with `priority-type:ordered`, +- the initial CPU-to-CLOS association (the passed CPUs -> CLOS 0, + every other CPU on the punit -> CLOS 3). + +The balloons policy overwrites the CPU-to-CLOS associations at pod +admission time, but it does **not** enable SST-TF or SST-CP for +you, so the initial designation must cover every punit you plan to +run HP pods on. Pick one CPU per punit on the node: + +```bash +# node: +# Discover punits and one representative CPU each. The "sst" tool +# (https://github.com/intel/intel-speed-select) prints this cleanly; +# you can also read it from goresctrl debug or from sysfs. +sudo ./sst info | awk '/SST-PP/,/SST-BF/' | grep -E '^\s+[0-9]' +# Output on a dual-socket Xeon 6776P: +# 0 0 0-31,128-159 +# 0 1 32-63,160-191 +# 1 0 64-95,192-223 +# 1 1 96-127,224-255 + +# Pick one CPU from each of the four punits, then: +export TF_INIT_CPUS=2,34,66,98 +sudo intel-speed-select -c $TF_INIT_CPUS turbo-freq enable -a +``` + +### 2.3. Configure SST-TF from BIOS (alternative) + +Many OEM BIOSes for Intel Xeon 6 expose SST-PP profile selection +and SST-TF enablement directly in Setup. 
If your platform supports +it, do the equivalent of the `turbo-freq enable -a` command from +BIOS and skip the `intel-speed-select` step. Consult your server +vendor's BIOS guide for the exact menu paths. + +### 2.4. Verify + +```bash +# node: +# SST-TF status on every punit (should print "enabled" for every +# punit you intend to host HP pods on). +sudo intel-speed-select perf-profile info 2>&1 \ + | grep -E 'package-|powerdomain-|speed-select-turbo-freq:' + +# Per-CPU CLOS association (initial; balloons will overwrite later). +sudo intel-speed-select -c 0,2,34,66,98 core-power get-assoc 2>&1 \ + | grep -E 'cpu-|clos:' +``` + +`get-assoc` should show `clos:0` for the CPUs in `$TF_INIT_CPUS` +and `clos:3` for every other CPU (including CPU 0, even though +its punit received an HP designation, because CPU 0 itself was +not in `$TF_INIT_CPUS`). + +## 3. Build the benchmark image + +The benchmark image runs `sysbench cpu` in a loop and prints one +status line per iteration. The effective frequency is measured with +`turbostat --cpu` over the same time window as the `sysbench` run, +restricted to the CPUs the container is pinned to. + +`turbostat` is used instead of `scaling_cur_freq` / +`/proc/cpuinfo`'s `cpu MHz` because the latter reflect what the OS +*requests* from the firmware; on HWP/`intel_pstate` kernels they +can lag or under-report when the firmware boosts autonomously. +`Bzy_MHz` is derived from the `APERF`/`MPERF` MSRs over the +sampling window and is the actual *busy* frequency the cores ran +at. + +Reading those MSRs requires access to `/dev/cpu/*/msr` and +`CAP_SYS_RAWIO`. In a standard Kubernetes cluster the simplest way +to get both is to run the benchmark pod as `privileged: true` with +the host `/dev` mounted. The pod yaml in step 6 does that. 
Make +sure the `msr` kernel module is loaded on the node: + +```bash +# node: +sudo modprobe msr +ls /dev/cpu/0/msr # must exist +``` + +Create the build context: + +```bash +mkdir -p pct-reporter && cd pct-reporter + +cat > reporter.sh <<'EOF' +#!/bin/bash +# Continuously run sysbench cpu and report, per iteration: +# label, cpus the container is pinned to (from /proc/self/status, +# which is correct even when running as privileged), thread count, +# sysbench events/s, and the average Bzy_MHz across the pinned +# CPUs as measured by turbostat over the same interval. +set -u +LABEL="${LABEL:-reporter}" +INTERVAL="${INTERVAL:-5}" + +CPUS_LIST="$(awk '/Cpus_allowed_list/ {print $2}' /proc/self/status)" + +expand_count() { + local list="$1" n=0 part lo hi + IFS="," read -ra parts <<< "$list" + for part in "${parts[@]}"; do + if [[ "$part" == *-* ]]; then + lo="${part%-*}"; hi="${part#*-}" + n=$(( n + hi - lo + 1 )) + else + n=$(( n + 1 )) + fi + done + echo "$n" +} +# Default: one sysbench thread per pinned logical CPU. Override +# with THREADS env (used by the A/B pod in step 8). +NTHREADS="${THREADS:-$(expand_count "$CPUS_LIST")}" + +echo "[$LABEL] starting: cpus=$CPUS_LIST threads=$NTHREADS interval=${INTERVAL}s" + +while true; do + TS_OUT="$(mktemp)" + turbostat --quiet --cpu "$CPUS_LIST" --show CPU,Bzy_MHz \ + --num_iterations 1 --interval "$INTERVAL" \ + > "$TS_OUT" 2>/dev/null & + TS_PID=$! + + SB_OUT="$(sysbench cpu --threads="$NTHREADS" --time="$INTERVAL" \ + run 2>/dev/null)" + wait "$TS_PID" + + EVPS="$(echo "$SB_OUT" | awk -F: '/events per second/ {gsub(/ /,"",$2); print $2}')" + # Average Bzy_MHz across the requested CPUs. Skip header and + # turbostat's "-" all-CPUs summary row. 
+ MHZ_AVG="$(awk 'NR>1 && $1 ~ /^[0-9]+$/ {s+=$2; n++} END {if (n) printf "%.0f", s/n}' "$TS_OUT")" + rm -f "$TS_OUT" + + printf '[%s] cpus=%s threads=%d events_per_sec=%s mhz_avg=%s\n' \ + "$LABEL" "$CPUS_LIST" "$NTHREADS" "${EVPS:-?}" "${MHZ_AVG:-?}" +done +EOF +chmod +x reporter.sh + +cat > Dockerfile <<'EOF' +FROM debian:stable-slim +RUN apt-get update \ + && apt-get install -y --no-install-recommends \ + sysbench linux-cpupower util-linux ca-certificates \ + && rm -rf /var/lib/apt/lists/* +COPY reporter.sh /usr/local/bin/reporter.sh +ENTRYPOINT ["/usr/local/bin/reporter.sh"] +EOF +``` + +`linux-cpupower` ships `/usr/sbin/turbostat`. `util-linux` provides +`taskset` and the rest of the standard userspace. + +Build the image. Use whichever tool is available on your build host. +With docker, prefix with `sudo` if your user is not in the `docker` +group: + +```bash +# With docker: +docker build -t localhost/pct-reporter:demo . + +# Or with podman: +podman build -t localhost/pct-reporter:demo . +``` + +If the build host is behind an HTTP proxy, pass it through: + +```bash +docker build \ + --build-arg http_proxy=$http_proxy \ + --build-arg https_proxy=$https_proxy \ + -t localhost/pct-reporter:demo . +``` + +## 4. Make the image available to the kubelet (no registry) + +If you built the image on the same machine as the kubelet, import it +directly into the container runtime's image store. Pick the +subsection that matches your runtime. + +### 4.1. containerd + +```bash +# On the build host: +docker save localhost/pct-reporter:demo -o /tmp/pct-reporter.tar +# (or: podman save -o /tmp/pct-reporter.tar localhost/pct-reporter:demo) + +# node: +sudo ctr -n k8s.io images import /tmp/pct-reporter.tar +sudo crictl images | grep pct-reporter +``` + +The `-n k8s.io` namespace is the one kubelet uses; without it the +image will not be visible to Kubernetes. + +### 4.2. 
CRI-O + +```bash +# On the build host: +docker save localhost/pct-reporter:demo -o /tmp/pct-reporter.tar +# (or: podman save -o /tmp/pct-reporter.tar localhost/pct-reporter:demo) + +# node: +sudo podman --root /var/lib/containers/storage load -i /tmp/pct-reporter.tar +sudo crictl images | grep pct-reporter +``` + +`--root /var/lib/containers/storage` makes `podman` load the image +into the same storage CRI-O reads from. If you built the image +directly on the node with `sudo podman build`, this step is not +needed. + +The demo pods set `imagePullPolicy: IfNotPresent` and use the image +reference `localhost/pct-reporter:demo`, so the kubelet will not +attempt to pull from a registry. Note that the kubelet garbage- +collects unused local images: re-import the image if pod creation +later fails with `ErrImagePull`. + +## 5. Install / reconfigure the balloons policy with PCT enabled + +The Intel SST device (`/dev/isst_interface`) is owned by `root` and +is normally not visible inside non-privileged Kubernetes pods. The +balloons Helm chart exposes a `pct` value that grants the plugin +pod the access it needs to drive PCT. PCT cpuClass support +(`pctPriority`, `pctClosID`) is not in a released balloons chart +yet, so install the unstable build that includes it: + +```bash +helm install \ + --devel \ + -n kube-system \ + balloons \ + oci://ghcr.io/askervin/nri-plugins/helm-charts/nri-resource-policy-balloons \ + --version v0.12-pct2-unstable \ + --set image.name=ghcr.io/askervin/nri-plugins/nri-resource-policy-balloons \ + --set image.tag=v0.12-pct2-unstable \ + --set image.pullPolicy=Always \ + --set pct=true +``` + +`--set pct=true` makes the plugin pod privileged and mounts the +host `/dev` at `/host/dev`. Enable it only on nodes where PCT +cpuClasses are used. 
+ +Once PCT support is in a released balloons chart, the equivalent +install command will be the standard one from +[balloons.md](balloons.md#deployment) plus `--set pct=true`: + +```bash +# Stable form (use this once PCT support is released): +helm repo add nri-plugins https://containers.github.io/nri-plugins +helm repo update +helm install balloons nri-plugins/nri-resource-policy-balloons \ + --namespace kube-system \ + --set pct=true +``` + +Verify the plugin pod has the privileged settings the chart's +`pct=true` flag enables: + +```bash +kubectl -n kube-system get pod \ + -l app.kubernetes.io/name=nri-resource-policy-balloons \ + -o jsonpath='{.items[0].spec.containers[0].securityContext}{"\n"}' +# Expect: {"privileged":true} + +kubectl -n kube-system get pod \ + -l app.kubernetes.io/name=nri-resource-policy-balloons \ + -o jsonpath='{.items[0].spec.containers[0].volumeMounts[?(@.name=="hostdev")]}{"\n"}' +# Expect a mount of /host/dev. +``` + +Now apply the policy configuration. The `BalloonsPolicy` below +defines three cpuClasses with only `pctClosID` set (no +`pctPriority`, no frequency overrides), which selects assoc-only +mode for PCT and lets the SST-CP CLOS bounds -- set by the +`intel-speed-select turbo-freq enable -a` recipe in step 2 -- define +the actual frequency caps. Following the Linux SST guidance, the +cpuClasses do not touch `minFreq` / `maxFreq` at all. + +The CLOS layout matches what `turbo-freq enable -a` programs in +ordered priority mode: + +- CLOS 0 -- HP -- bucket-0 turbo (Pmax), +- CLOS 3 -- LP -- LP clip (= base on this platform), +- the class named `default` is the implicit fallback for idle CPUs + and balloons that do not specify their `cpuClass`. It is mapped + to the LP CLOS so idle CPUs do not consume HP turbo budget. + +The HP cpuClass additionally disables the deep C-states `C6` and +`C6P`. 
The HP cores in this demo are continuously busy with +`sysbench`, so C-state entry would normally not happen anyway; +the setting is included because removing C-state wake-up latency +is the typical reason latency-sensitive workloads ask for priority +cores. List the C-state names available on the node with +`grep . /sys/devices/system/cpu/cpu0/cpuidle/state*/name`. **Do +not** disable C-states on the default / LP classes: idle CPUs in +deep C-states do not count toward the package's active-core count +and therefore free turbo budget for the HP cores. + +The HP balloon type uses `preferNewBalloons: true` and +`maxCPUs: 8` (the SST-TF bucket-0 HP-core limit per punit), so +each HP pod lands in its own balloon and the balloons spread +across separate punits. `minCPUs` is intentionally left unset so +the balloon size equals what the pod requests; with no +`hideHyperthreads` the container sees exactly the logical CPUs the +balloon allocated. + +`agent.nodeResourceTopology: true` and `showContainersInNrt: true` +make the plugin publish per-balloon and per-container CPU sets in +the cluster's `NodeResourceTopology` (NRT) CRs. The verification +queries in step 7 read those CRs to confirm exactly which CPUs +each pod's container ended up pinned to. The NRT CRD must exist +in the cluster (`kubectl get crd +noderesourcetopologies.topology.node.k8s.io`). + +`availableResources` is intentionally left unset: balloons manages +all CPUs of the node, as in the normal mode of operation. The +`reservedResources` covers physical CPU 0 (`0` and its SMT sibling +`128`) and physical CPU 1 (`1` and its SMT sibling `129`); adjust +the sibling numbers if your topology differs (`lscpu -e` shows +them). + +```bash +cat > balloons-pct-assoconly.yaml < ... +pct: mode=assoc-only, 3 PCT cpuClass(es), 4 punit(s) across 2 package(s) +pct: assoc-only: CLOS 0 programmed min=0 max= kHz +pct: assoc-only: CLOS 3 programmed min=0 max= kHz +pct: cpuClass "hp-clos0" classified HP (assoc-only: CLOS 0 ...) 
+``` + +The `assoc-only: CLOS N programmed` lines record permissive +(min=0, max=hardware ceiling) bounds that the plugin writes when +entering assoc-only mode; they leave the SST-CP CLOS bounds that +`turbo-freq enable -a` programmed in step 2 unchanged in practice, +because the effective frequency is the minimum of the per-CLOS +cap and the SST-TF bucket-0 limit. The plugin only classifies one +cpuClass per priority bucket on the same CLOS, so when both +`default` and `lp-clos3` use CLOS 3 only one of them is reported in +the classification log. + +If any punit you intend to host HP pods on shows up with an +`assoc-only: SST-TF disabled on pkg=N punit=M` warning, repeat +step 2 with a CPU from that punit included in `$TF_INIT_CPUS`. + +## 6. Deploy the HP and LP pods + +Four HP pods and one LP pod. Each HP pod requests 2 CPUs; with +`preferNewBalloons: true` and `maxCPUs: 8` on `hp-bln`, each pod +gets its own balloon, and PCT placement spreads the balloons +across separate punits (one per HP pod, up to four on a +dual-socket Xeon 6776P). Because `hideHyperthreads` is not set, +the container sees exactly the requested logical CPUs and the +reporter starts that many sysbench threads. + +The pods are `privileged: true` and mount the host `/dev` because +`turbostat` inside the container reads `/dev/cpu/*/msr` to compute +`Bzy_MHz` (see step 3). 
+ +```bash +for i in 1 2 3 4; do +cat > pod-hp-$i.yaml < pod-lp.yaml <` with your own measurements): + +```text +[hp-1] cpus=32,160 threads=2 events_per_sec=4154.72 mhz_avg=4600 +[hp-2] cpus=100,228 threads=2 events_per_sec=4152.06 mhz_avg=4600 +[hp-3] cpus=10,138 threads=2 events_per_sec=4154.54 mhz_avg=4600 +[hp-4] cpus=64,192 threads=2 events_per_sec=4151.79 mhz_avg=4600 +[lp] cpus=65-68,193-196 threads=8 events_per_sec=8295.63 mhz_avg=2300 +``` + +Per-thread throughput on this run: + +| Tag | threads | mhz_avg | events_per_sec | events_per_sec per thread | +|--------|---------|---------|----------------|---------------------------| +| hp-1 | 2 | 4600 | 4154.72 | 2077.36 | +| hp-2 | 2 | 4600 | 4152.06 | 2076.03 | +| hp-3 | 2 | 4600 | 4154.54 | 2077.27 | +| hp-4 | 2 | 4600 | 4151.79 | 2075.89 | +| lp | 8 | 2300 | 8295.63 | 1036.95 | + +Verify that the four HP balloons landed on four distinct punits. +With the policy's `cpu` debug log enabled, balloons logs the +(pkg, punit) of each balloon at admission time. You can also map +the `cpus` line of each HP pod back to a punit through the +`sst info` output from step 2 -- each HP pod's CPUs should fall +into a different punit row. + +Optionally cross-check the same numbers from outside the pod with +`turbostat` on the node: + +```bash +# node: +# Replace the CPU list with the union of cpus= reported by the +# five pods. +sudo turbostat --show CPU,Bzy_MHz --quiet -c -i 2 -n 2 +``` + +The pod-reported `mhz_avg` and the node-side `Bzy_MHz` come from +the same source (APERF/MPERF), so they should agree to within a +few MHz. + +Verify the CLOS association of the pinned CPUs: + +```bash +# node: +sudo intel-speed-select -c core-power get-assoc 2>&1 \ + | grep -E 'cpu-|clos:' +``` + +Expected: `clos:0` for every CPU in any HP pod, `clos:3` for every +CPU in the LP pod. 
+ +Confirm the policy decision from its log: + +```bash +kubectl -n kube-system logs ds/nri-resource-policy-balloons \ + | grep -E 'assigning container|associated cpus .* to CLOS' +``` + +### 7.1. Verify container-to-balloon-to-CPU mapping via NRT + +The `agent.nodeResourceTopology: true` and `showContainersInNrt: +true` settings in step 5 make the plugin publish per-balloon and +per-container CPU sets in the +`noderesourcetopologies.topology.node.k8s.io` CR for the node. +Print every balloon (zone type `balloon`) with its CPU set, and +every container assigned to it (zone type `allocation for +container`): + +```bash +kubectl get noderesourcetopologies.topology.node.k8s.io -o json | jq -r ' + ["NODE","BALLOON","CPUSET"], + ( + .items.[] as $node + | $node.zones[] + | select(.type == "balloon") + | [ + $node.metadata.name, + .name, + (.attributes[] | select(.name=="cpuset") | .value) + ] + ) | @tsv' + +kubectl get noderesourcetopologies.topology.node.k8s.io -o json | jq -r ' + ["NODE","BALLOON","CONTAINER","CPUS"], + ( + .items.[] as $node + | $node.zones[] + | select(.type == "allocation for container") + | [ + $node.metadata.name, + .parent, + .name, + (.attributes[] | select(.name=="cpuset") | .value) + ] + ) | @tsv' +``` + +Expected (one row per balloon and one row per pod's container): + +- One `hp-bln[0]`..`hp-bln[3]` zone, each with a 2-CPU set on a + distinct punit, and the corresponding `pct-hp-N/bench` container + pinned to that exact set. +- One `lp-bln[0]` zone with the 8-CPU set, and `pct-lp/bench` + pinned to the same set. +- A `reserved[0]` zone covering the currently-used subset of the + reserved pool (the SMT pair of physical CPU 0 -- `0,128` -- is the + typical outcome on this layout; balloons compacts the reserved + balloon to what its containers actually need). +- An empty `default[0]` zone may also appear; it is the unused + default balloon and can be ignored. 
+ +The CPU sets here must match the `cpus=` value printed by the +benchmark inside each pod (step 7) and the `clos:N` reported by +`core-power get-assoc` for those same CPUs. + +## 8. A/B comparison + +Run the same 2-thread workload on the LP CLOS instead of an HP +CLOS. The pod below pins to the LP balloon (CLOS 3, base +frequency cap) and uses `THREADS=2` to keep the sysbench workload +identical to a single `pct-hp-*`: + +```bash +kubectl delete pod pct-lp --now # free LP-balloon CPUs + +cat > pod-hp-on-lp.yaml <&1 \ + | grep -E 'enable-status' | sort -u +# Expect (both lines): +# clos-enable-status:disabled +# enable-status:disabled + +sudo intel-speed-select perf-profile info 2>&1 \ + | grep -E 'speed-select-turbo-freq:' | sort -u +# Expect: speed-select-turbo-freq:disabled +``` + +### 9.3. Restore `cpufreq` defaults on the node + +```bash +# node: +for f in /sys/devices/system/cpu/cpu*/cpufreq/scaling_max_freq; do + base=${f%scaling_max_freq}cpuinfo_max_freq + sudo tee "$f" < "$base" > /dev/null +done +for f in /sys/devices/system/cpu/cpu*/cpufreq/scaling_min_freq; do + base=${f%scaling_min_freq}cpuinfo_min_freq + sudo tee "$f" < "$base" > /dev/null +done + +# Verify (should print exactly the hardware min and the hardware +# max in kHz): +for i in $(seq 0 $(($(nproc) - 1))); do + cat /sys/devices/system/cpu/cpu$i/cpufreq/scaling_max_freq \ + /sys/devices/system/cpu/cpu$i/cpufreq/scaling_min_freq +done | sort -u +``` + +### 9.4. Remove leftover files + +```bash +rm -f balloons-pct-assoconly.yaml \ + pod-hp-1.yaml pod-hp-2.yaml pod-hp-3.yaml pod-hp-4.yaml \ + pod-lp.yaml pod-hp-on-lp.yaml +# Optional: +rm -rf pct-reporter +# Optional, on the node, free disk used by the demo image: +# sudo crictl rmi localhost/pct-reporter:demo +``` + +## 10. 
Optional: help the scheduler avoid HP over-subscription (experimental) + +By default the Kubernetes scheduler is unaware of how many CPUs +on a node can become HP cores: it sees the BalloonsPolicy +neither as a CRD it understands nor as a resource it can +bin-pack on. Two HP pods can therefore land on the same node +even if a second node would have given them HP capacity, and +HP pods can pile up beyond the platform's actual HP budget. + +The balloons policy ships an experimental opt-in that publishes +a per-cpuClass extended resource on the local Node so that the +default scheduler can do that bin-packing for you. Set +`publishExtendedResource: true` on every PCT-enabled cpuClass +(i.e. classes that carry `pctClosID` or `pctPriority`) and the +agent advertises: + +```text +status.capacity: + cpuclass.balloons.nri.io/: +``` + +The capacity reflects "CPUs eligible for this class that are +not currently held by balloons of other classes", and is +re-published on every container create/update/release, so +cross-class consumption (e.g. an LP balloon eating CPUs that +would otherwise have been available for HP) is reflected +immediately. + +For HP classes, the per-punit cap used in the capacity +formula is the *guaranteed top-turbo HP CPU count* (the +smallest non-zero SST-TF bucket `HighPriorityCoreCount`, or +the SST-BF `HighPriorityCPUs` count when TF is unsupported) +-- not the larger `MaxHpCpus`. That is the number of HP CPUs +per punit that can simultaneously sustain the highest turbo +frequency this platform exposes, which is the right figure +for the scheduler to bin-pack on. In assoc-only mode a punit +contributes to HP capacity only when SST-TF is currently +enabled on it (the operator's responsibility -- typically via +`intel-speed-select ... 
turbo-freq enable -a`); a punit where +SST-TF is disabled cannot exceed the standard turbo-ratio +bucket frequency and contributes `0`, so the scheduler will +not bin-pack HP pods onto nodes that cannot deliver top +turbo. Same-class consumption inside HP is intentionally not +subtracted (an admitted HP pod does not shrink the published +HP capacity); only cross-class consumption is. LP capacity +equals `|Allowed \ held|`. + +Add the flag to the policy: + +```yaml + cpuClasses: + - name: hp-clos0 + pctClosID: 0 + disabledCstates: [C6, C6P] + publishExtendedResource: true # experimental + - name: lp-clos3 + pctClosID: 3 + publishExtendedResource: true # experimental +``` + +...and to every HP/LP pod, alongside the existing `cpu` +request: + +```yaml + resources: + requests: + cpu: "2" + memory: "128Mi" + cpuclass.balloons.nri.io/hp-clos0: "2" + limits: + cpu: "2" + memory: "128Mi" + cpuclass.balloons.nri.io/hp-clos0: "2" +``` + +Verify on the node after applying: + +```bash +kubectl get node -o jsonpath='{.items[0].status.capacity}' \ + | jq 'with_entries(select(.key | startswith("cpuclass")))' +# Expect (HP capacity = sum_punit GuaranteedHpCpus over +# SST-TF-enabled punits; LP capacity = |Allowed \ held|): +# { +# "cpuclass.balloons.nri.io/hp-clos0": "", +# "cpuclass.balloons.nri.io/lp-clos3": "" +# } +``` + +A pod whose request exceeds the published capacity gets +`FailedScheduling: Insufficient cpuclass.balloons.nri.io/` +and stays `Pending` until another pod releases the resource. + +This is an experimental flag: the resource name, semantics +(capacity vs. allocatable, conservative-on-grow), and update +cadence may change before becoming stable. + +## 11. Troubleshooting + +- Plugin pod log shows `Speed Select Technology (SST) support not + detected`: the pod cannot access `/dev/isst_interface`. Re-install + the chart with `--set pct=true`. 
Verify with `kubectl -n + kube-system get pod -l app.kubernetes.io/name=nri-resource-policy-balloons + -o jsonpath='{.items[0].spec.containers[0].securityContext}'` + that it shows `privileged:true`. +- Plugin log shows `pct: assoc-only: SST-TF disabled on pkg=N + punit=M`: that punit has SST-TF off, so HP cores on it cannot + exceed the standard turbo-ratio bucket frequency even when + associated to CLOS 0. Add a CPU from that punit to + `$TF_INIT_CPUS` in step 2 and rerun `intel-speed-select -c + $TF_INIT_CPUS turbo-freq enable -a`. +- `intel-speed-select --info` reports SST-TF as *not supported*: the + `isst_if_*` or `isst_tpmi_*` kernel modules may be missing; load + them or use a more recent distro kernel. On some platforms SST + features must be enabled in BIOS first. +- Pods stuck in `ErrImagePull` with image + `localhost/pct-reporter:demo`: the image was not imported into + the kubelet's container runtime store, or the kubelet has + garbage-collected it. Repeat step 4, then `kubectl delete pod + ...` to retry. +- Pod log shows `turbostat: no /dev/cpu/0/msr` or `mhz_avg=?`: the + `msr` kernel module is not loaded on the node. Run `sudo modprobe + msr` on the node and recreate the pod. If the pod is not + privileged or `/dev` is not mounted, fix the pod yaml (step 6). +- HP CPUs do not reach Pmax under load: another HP pod on the same + punit may be consuming the bucket-0 turbo budget. Verify the + per-punit HP CPU count stays within the SST-TF bucket-0 limit + (8 on this platform; check with `sst info`) and that each HP + balloon really ended up on a different punit (see step 7). + Cross-check with `turbostat --show CPU,Bzy_MHz`. +- All four HP balloons end up on the same punit: confirm + `preferNewBalloons: true` on `hp-bln` and that the plugin + build includes PCT-aware balloon placement. The plugin log + prints the punit each new balloon is assigned to. 
+- Validation error `pctPriority and pctClosID are mutually + exclusive`: only one of the two may be set on a cpuClass. For + assoc-only mode use `pctClosID` and leave `pctPriority` unset. diff --git a/docs/resource-policy/policy/balloons-pct-quickstart.md b/docs/resource-policy/policy/balloons-pct-quickstart.md new file mode 100644 index 000000000..546b0e15b --- /dev/null +++ b/docs/resource-policy/policy/balloons-pct-quickstart.md @@ -0,0 +1,337 @@ +# Quick start: Priority Core Turbo (PCT) in Kubernetes with Balloons + +Balloons policy is an NRI plugin to container runtimes, containerd and +CRI-O. The policy creates balloons that associate sets of containers +with sets of CPUs. Containers belonging to a balloon are allowed to +run on the CPUs belonging to the same balloon and not on CPUs +belonging to other balloons. Balloons policy allows different CPU +tunings on different balloons. This guide shows how to run +high-priority containers with maximum turbo frequencies in Kubernetes +by scheduling them on nodes with free PCT capacity, and by tuning +their CPUs. + +This short guide shows the minimum required to: + +1. install the balloons NRI policy in a Kubernetes cluster, +2. configure it so that nodes with Intel Priority Core Turbo (PCT) + hardware publish a `cpuclass.balloons.nri.io/hp-pct` extended + resource the scheduler can bin-pack on, and +3. run two Burstable pods on the same node -- one that asks for HP + cores and runs at the platform's top turbo frequency, and one + that does not and runs at the base frequency -- and observe the + performance difference with one `kubectl exec` per pod. + +This guide uses the balloons plugin's **managed** PCT mode, that is, the +plugin owns the SST-CP/SST-TF configuration, overriding any existing +configuration from BIOS settings and/or the intel-speed-select tool. +Balloons also supports **assoc-only mode**, which uses a pre-defined +SST configuration and only associates balloons' CPUs to existing +CLOSes. 
+ +This document does not cover manual configuration of the underlying SST +technology, step-by-step validation of real CPU frequencies, +CPUs-to-containers mapping, or benchmarking. These are covered in longer +balloons PCT examples written separately for +[managed-mode](balloons-pct-example-auto.md) and +[assoc-only-mode](balloons-pct-example-manual.md). + + +## Prerequisites + +- A Kubernetes cluster (1.27 or newer) with NRI enabled in every + node's container runtime (containerd >= 1.7 or CRI-O >= 1.26). +- At least one node that supports Intel SST-CP and SST-TF, e.g. + Intel Xeon 6700P/6900P. Nodes without PCT-capable hardware + will simply not publish the `cpuclass.balloons.nri.io/hp-pct` + extended resource, so HP pods naturally land on nodes that do. +- `kubectl` configured to talk to the cluster. + +## 1. Install balloons with PCT enabled + +NOTE: PCT cpuClass support (`pctPriority`, `pctClosID`, +`publishExtendedResource`) is not in a released balloons chart yet, so +this guide installs balloons from an unstable build. + +```bash +helm install \ + --devel \ + -n kube-system \ + balloons \ + oci://ghcr.io/askervin/nri-plugins/helm-charts/nri-resource-policy-balloons \ + --version v0.12-pct2-unstable \ + --set image.name=ghcr.io/askervin/nri-plugins/nri-resource-policy-balloons \ + --set image.tag=v0.12-pct2-unstable \ + --set image.pullPolicy=Always \ + --set pct=true +``` + +`--set pct=true` gives the plugin pod the `privileged: true` +security context and `/dev` mount it needs to talk to +`/dev/isst_interface`. + +Wait for the daemonset to be Ready (one Pod per node): + +```bash +kubectl -n kube-system rollout status ds/nri-resource-policy-balloons +``` + +## 2. Apply the minimal PCT policy + +The policy below defines two cpuClasses and four balloonTypes. + +cpuClasses: + +- `default` -- the implicit fallback class. It carries + `pctPriority: low`, which makes it the **LP class**: idle + CPUs and every balloon whose `cpuClass` is unset (i.e. 
uses + `default`) run on cores capped at base frequency. Defining + an LP class is required: balloons routes idle CPUs to the LP + CLOS so they do not inflate the active-HP-core count on each + PCT power domain (punit). +- `hp-pct` -- `pctPriority: high`. Containers in this class + run on cores programmed for top turbo frequency. The + `publishExtendedResource: true` flag is what makes the + scheduler see the per-node HP capacity. + +balloonTypes (this part mirrors a typical Kubernetes +CPU-manager-style split, with one extra balloon for HP pods): + +- `reserved` -- the implicit kube-system balloon that runs on + `reservedResources.cpu`. +- `hp-bln` -- picked by the + `balloon.balloons.resource-policy.nri.io: hp-bln` pod + annotation. Uses the `hp-pct` cpuClass, so its containers + get top-turbo HP cores. `preferNewBalloons: true` puts each + HP pod in a fresh balloon on a separate PCT power domain, + and `maxCPUs: 8` keeps the balloon within one bucket-0 HP + budget per punit on Xeon 6. +- `guaranteed` -- picked by Kubernetes pod QoS class + `Guaranteed`. Containers get exclusive CPUs in the same + spirit as Kubernetes' built-in CPU manager. +- `burstable` -- picked by Kubernetes pod QoS class + `Burstable`. Containers share CPUs with other burstables. + `shareIdleCPUsInSame: package` lets a burstable container + burst onto every otherwise-idle CPU in the same CPU package, + giving the largest pool of burst CPUs that still preserves + data locality (good balance between memory latency and + bandwidth). + +```bash +cat > balloons-pct.yaml < **Bookkeeping rule.** When using +> `publishExtendedResource`, the number of HP CPUs requested +> via `cpuclass.balloons.nri.io/hp-pct` **must equal the pod's +> `cpu` request**. 
Each HP CPU consumed by the container has +> to be counted once in the scheduler's extended-resource +> bookkeeping and once in normal CPU bookkeeping; mismatched +> counts let the scheduler oversubscribe HP CPUs on the node +> or, conversely, leave them stranded. + +```bash +cat > pods.yaml <&1 | tail -2 +kubectl exec plain-app -- openssl speed -seconds 5 -evp aes-128-cbc 2>&1 | tail -2 +``` + +Example output (Xeon 6776P, 4.6 GHz HP vs. 2.3 GHz LP): + +```text +# hp-app +type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes +AES-128-CBC 1788838.33k 2156328.92k 2205063.68k 2217644.85k 2221249.33k 2221617.97k + +# plain-app +type 16 bytes 64 bytes 256 bytes 1024 bytes 8192 bytes 16384 bytes +AES-128-CBC 893471.74k 1075806.28k 1100445.44k 1106751.28k 1108534.89k 1108836.35k +``` + +The HP pod processes AES-128-CBC at roughly 2x the throughput +of the plain pod (`2221617.97k` vs. `1108836.35k` bytes/s on +the 16 KB block size). The ratio mirrors the HP/LP frequency +ratio of the node and is reproducible from one invocation to +the next. + +## 5. Clean up + +```bash +kubectl delete -f pods.yaml +kubectl delete -f balloons-pct.yaml +helm -n kube-system uninstall balloons +rm -f balloons-pct.yaml pods.yaml +``` + +## What next + +- For a deeper, fully verified walk-through of managed mode -- + including `intel-speed-select` inspection, `NodeResourceTopology` + verification, and per-pod `sysbench`/`turbostat` reporting -- + see [balloons-pct-example-auto.md](balloons-pct-example-auto.md). +- For the assoc-only PCT mode, where the operator owns the + SST-CP/SST-TF configuration and balloons only associates CPUs + to operator-programmed CLOSes, see + [balloons-pct-example-manual.md](balloons-pct-example-manual.md). +- For the full balloons policy reference (other cpuClass + fields, C-state control, cpufreq, scheduling-class + integration), see [balloons.md](balloons.md). 
diff --git a/docs/resource-policy/policy/balloons.md b/docs/resource-policy/policy/balloons.md index 741fad8de..9060127dc 100644 --- a/docs/resource-policy/policy/balloons.md +++ b/docs/resource-policy/policy/balloons.md @@ -851,23 +851,33 @@ memory-type.resource-policy.nri.io/container.CONTAINER_NAME: HBM,DRAM These options configure CPU behavior and power management. **`cpuClass`** (string) -- References a CPU class defined in `control.cpu.classes` - (policy-level configuration). +- References a CPU class name defined in `cpuClasses` (preferred) or + in `control.cpu.classes` (legacy). - Applied when balloon is created, inflated, or deflated. - Configures frequency scaling and C-states for CPUs in the balloon. +- If left unset and a `cpuClasses` entry named `default` exists, that + `default` class is applied instead. **`idleCPUClass`** (string, policy-level configuration) - CPU class for idle CPUs (not in any balloon). - Applied when CPUs are removed from balloons. - -**`control.cpu.classes`** (object, policy-level configuration): - -Each CPU class (keyed by name) can define: - -- `minFreq` (integer): Minimum CPU frequency in kHz. -- `maxFreq` (integer): Maximum CPU frequency in kHz. -- `uncoreMinFreq` (integer): Minimum uncore frequency in kHz. -- `uncoreMaxFreq` (integer): Maximum uncore frequency in kHz. +- If left unset and a `cpuClasses` entry named `default` exists, that + `default` class is applied to idle CPUs instead. + +**`cpuClasses`** (list, policy-level configuration): + +User-friendly CPU class definitions. Each class is an object with: + +- `name` (string): Class name referenced by `cpuClass` in balloon types. +- `minFreq` (string): Minimum CPU frequency. Accepts values with + units: `"3.2GHz"`, `"2900MHz"`, `"2900000kHz"`, or a string + containing a plain number in kHz: `"2900000"`. 
Also accepts symbolic + names: `"min"` (platform minimum), `"base"` (CPU base frequency), + `"turbo"` (maximum turbo frequency), which are resolved at runtime + from sysfs. +- `maxFreq` (string): Maximum CPU frequency (same format). +- `uncoreMinFreq` / `uncoreMaxFreq` (string): Uncore frequency limits + (same format). - `disabledCstates` (list): C-state names to disable (e.g., `["C6", "C8"]`). - Disabling deep C-states reduces latency by preventing deep sleep. - Disabling intermediate C-states keeps CPU more responsive longer @@ -875,6 +885,29 @@ Each CPU class (keyed by name) can define: not needed. - List available C-states: `grep . /sys/devices/system/cpu/cpu0/cpuidle/state*/name`. +- `energyPerformancePreference` (integer): EPP value for CPUs. +- `freqGovernor` (string): CPUFreq governor (e.g., `"performance"`). +- `turboPriority` (integer): Controls exclusive turbo frequency + access. Among CPU classes with active balloons, only the class + with the highest `turboPriority` gets the symbolic frequency + `"turbo"` resolved to the actual turbo frequency. All other + classes get `"turbo"` resolved to the base frequency. When the + highest-priority class no longer has active balloons, the next + highest-priority class regains turbo. If all classes have + `turboPriority` 0 (default), every class gets real turbo -- no + competition occurs. `turboPriority` arbitration is scoped to a + *turbo domain* (see `turboDomain` below), so on multi-socket + systems a low-priority class on one socket can keep turbo even + when a higher-priority class is active on another socket. + +**`turboDomain`** (string, policy-level configuration): + +Selects the scope over which `turboPriority` arbitration happens. The +default is `"package"`: every package independently picks its own +`turboPriority` winner. Set to `"system"` if the highest `turboPriority` +class anywhere should suppress turbo on every other class, +regardless of where its CPU cores are located. 
On single-socket systems the two +modes behave identically. ```yaml balloonTypes: @@ -884,6 +917,113 @@ balloonTypes: cpuClass: normal idleCPUClass: powersave +cpuClasses: +- name: turbo + minFreq: "turbo" + maxFreq: "turbo" + disabledCstates: [C6, C8, C10] + turboPriority: 10 +- name: normal + minFreq: "min" + maxFreq: "turbo" + turboPriority: 1 +- name: powersave + minFreq: "min" + maxFreq: "1.2GHz" +``` + +#### Priority Core Turbo (PCT) + +On Intel Xeon CPUs that support [Intel Speed Select +Technology](https://docs.kernel.org/admin-guide/pm/intel-speed-select.html) +(SST), the balloons policy can additionally drive *Priority Core +Turbo* (PCT) on a per-cpuClass basis. PCT lets a small number of +*High Priority* (HP) cores reach the maximum turbo frequency +while the remaining *Low Priority* (LP) cores are capped. The +mapping between cpuClasses and the underlying SST-CP CLOSes is +managed by the *PCT allocator* using the +[goresctrl SST library](https://github.com/intel/goresctrl). + +Two new fields on a `cpuClasses` entry enable PCT: + +- `pctPriority` (string, optional): `"high"` or `"low"`. When set, + the balloons policy enters **managed mode** for PCT: it + performs the full SoC-wide SST setup (CP reset, TF enable, CLOS + configuration, CP enable) and associates CPUs of any balloon + using this cpuClass to the HP CLOS (default CLOS 0) or the LP + CLOS (default CLOS 3). At most one managed `high` and one + managed `low` cpuClass is allowed. +- `pctClosID` (integer, optional, 0..*ClosCount-1*): pins this + cpuClass to a specific CLOS slot and selects **assoc-only + mode**: the policy only associates CPUs to the given CLOS + without reconfiguring the SoC-wide SST state. Use this when an + operator or the BIOS has already configured the CLOSes. + +`pctPriority` and `pctClosID` are **mutually exclusive** on the +same cpuClass. Managed and assoc-only cpuClasses cannot be mixed +in the same configuration. 
+ +By default the CLOS minimum/maximum frequencies programmed in +managed mode come from the cpuClass's own `minFreq`/`maxFreq`. +Two optional overrides exist for cases where the hardware CLOS +bounds should differ from the OS-visible cpufreq limits: + +- `pctMinFreq` (string, optional): CLOS minimum frequency, + defaults to `minFreq`. Accepts the same units and symbolic + names. Resolves `"turbo"` directly to the hardware maximum + turbo frequency, regardless of soft `turboPriority` + arbitration. +- `pctMaxFreq` (string, optional): CLOS maximum frequency, + defaults to `maxFreq`. Same caveats as `pctMinFreq`. + +On hosts without SST support the PCT fields are ignored with a +warning, so a single cpuClass YAML can be portable across PCT and +non-PCT systems. + +**Allocation behaviour.** PCT settings also bias CPU selection: + +- For each `pctClosID: N` referenced by any cpuClass, a static + virtual device `SST CLOS N` is registered with the CPUs the SST + hardware currently maps to that CLOS. Balloon types using that + cpuClass prefer to be close to it; other balloon types + automatically prefer to be far from it. +- In managed mode a dynamic virtual device `SST PCT HP reserve` + is registered with the CPUs of the package that has the most + free HP-capable CPUs. Balloon types whose cpuClass has + `pctPriority: high` prefer to be close to it (so their + containers actually enjoy PCT turbo), while every other balloon + type prefers to be far from it (so they do not drain the package + on which an HP container relies for turbo budget). The membership + is recomputed on every balloon resize. + +```yaml +cpuClasses: +- name: rt-hp + minFreq: "turbo" + maxFreq: "turbo" + pctPriority: high +- name: bg-lp + minFreq: "min" + maxFreq: "base" + pctPriority: low +``` + +**`control.cpu.classes`** (object, legacy policy-level configuration): + +This is the original low-level CPU class configuration. It continues +to work for backwards compatibility. 
If a class name is defined in +both `cpuClasses` and `control.cpu.classes`, the `cpuClasses` +definition takes precedence. + +Each CPU class (keyed by name) can define: + +- `minFreq` (integer): Minimum CPU frequency in kHz. +- `maxFreq` (integer): Maximum CPU frequency in kHz. +- `uncoreMinFreq` (integer): Minimum uncore frequency in kHz. +- `uncoreMaxFreq` (integer): Maximum uncore frequency in kHz. +- `disabledCstates` (list): C-state names to disable (e.g., `["C6", "C8"]`). + +```yaml control: cpu: classes: @@ -1352,21 +1492,19 @@ spec: overloadsLevelInBalloon: false # Share L2 between CPUs within balloon # CPU classes for frequency and C-state control - control: - cpu: - classes: - ultra-low-latency: - minFreq: 3500000 - maxFreq: 3900000 - uncoreMinFreq: 2400000 - uncoreMaxFreq: 2400000 - disabledCstates: [C6, C7, C8, C10] - normal: - minFreq: 800000 - maxFreq: 2500000 - powersave: - minFreq: 800000 - maxFreq: 800000 + cpuClasses: + - name: ultra-low-latency + minFreq: "base" + maxFreq: "turbo" + uncoreMinFreq: "2.4GHz" + uncoreMaxFreq: "2.4GHz" + disabledCstates: [C6, C7, C8, C10] + - name: normal + minFreq: "min" + maxFreq: "base" + - name: powersave + minFreq: "min" + maxFreq: "min" # Scheduling for high priority schedulingClasses: diff --git a/go.mod b/go.mod index ef04cf5a0..878276984 100644 --- a/go.mod +++ b/go.mod @@ -10,7 +10,7 @@ require ( github.com/containers/nri-plugins/pkg/topology v0.0.0 github.com/coreos/go-systemd/v22 v22.5.0 github.com/fsnotify/fsnotify v1.6.0 - github.com/intel/goresctrl v0.12.0 + github.com/intel/goresctrl v0.13.0 github.com/intel/memtierd v0.1.1 github.com/k8stopologyawareschedwg/noderesourcetopology-api v0.1.2 github.com/onsi/ginkgo/v2 v2.21.0 @@ -19,19 +19,19 @@ require ( github.com/prometheus/client_golang v1.23.0 github.com/sirupsen/logrus v1.9.3 github.com/stretchr/testify v1.11.1 - go.opentelemetry.io/otel v1.42.0 + go.opentelemetry.io/otel v1.43.0 go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc 
v1.42.0 - go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.42.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0 go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.19.0 go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.19.0 go.opentelemetry.io/otel/exporters/prometheus v0.60.0 - go.opentelemetry.io/otel/metric v1.42.0 - go.opentelemetry.io/otel/sdk v1.42.0 - go.opentelemetry.io/otel/sdk/metric v1.42.0 - go.opentelemetry.io/otel/trace v1.42.0 - golang.org/x/sys v0.41.0 + go.opentelemetry.io/otel/metric v1.43.0 + go.opentelemetry.io/otel/sdk v1.43.0 + go.opentelemetry.io/otel/sdk/metric v1.43.0 + go.opentelemetry.io/otel/trace v1.43.0 + golang.org/x/sys v0.42.0 golang.org/x/time v0.9.0 - google.golang.org/grpc v1.79.3 + google.golang.org/grpc v1.80.0 k8s.io/api v0.31.2 k8s.io/apimachinery v0.33.1 k8s.io/client-go v0.31.2 @@ -86,17 +86,17 @@ require ( github.com/x448/float16 v0.8.4 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.19.0 // indirect - go.opentelemetry.io/proto/otlp v1.9.0 // indirect - golang.org/x/mod v0.32.0 // indirect - golang.org/x/net v0.51.0 // indirect + go.opentelemetry.io/proto/otlp v1.10.0 // indirect + golang.org/x/mod v0.33.0 // indirect + golang.org/x/net v0.52.0 // indirect golang.org/x/oauth2 v0.35.0 // indirect - golang.org/x/sync v0.19.0 // indirect - golang.org/x/term v0.40.0 // indirect - golang.org/x/text v0.34.0 // indirect - golang.org/x/tools v0.41.0 // indirect + golang.org/x/sync v0.20.0 // indirect + golang.org/x/term v0.41.0 // indirect + golang.org/x/text v0.35.0 // indirect + golang.org/x/tools v0.42.0 // indirect golang.org/x/tools/go/packages/packagestest v0.1.1-deprecated // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20260209200024-4cfbd4190f57 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57 // indirect + 
google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 // indirect google.golang.org/protobuf v1.36.11 // indirect gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect diff --git a/go.sum b/go.sum index 7aabb344a..551d203bf 100644 --- a/go.sum +++ b/go.sum @@ -840,8 +840,8 @@ github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1: github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/imdario/mergo v0.3.6 h1:xTNEAn+kxVO7dTZGu0CegyqKZmoWFI0rF8UxjlB2d28= github.com/imdario/mergo v0.3.6/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA= -github.com/intel/goresctrl v0.12.0 h1:F44m7jiVgOdqWfTTWaREF+5HTeX3i06qhvpuzpnrBko= -github.com/intel/goresctrl v0.12.0/go.mod h1:5GWtmPY4BWl/a9rU8apGED9Xul5b5WoLtg/qOWaghWU= +github.com/intel/goresctrl v0.13.0 h1:5fhKjNq4V5MYDFHa//6M6x0jP6Iq5EXwZc6/eYxdEtQ= +github.com/intel/goresctrl v0.13.0/go.mod h1:KFHS91JGOmeeuEog+nTQcsGjLC81nRqdsdhcqf69fjU= github.com/intel/memtierd v0.1.1 h1:hGSN0+dzjaUkwgkJrk6B9SU4dntggXLpXgs9Dm+jfz4= github.com/intel/memtierd v0.1.1/go.mod h1:NFDBvjoDS42gBK/c9q/CYCJ2pt/+g7UQwOOBvQli4z0= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= @@ -986,12 +986,12 @@ go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= go.opentelemetry.io/otel v1.19.0/go.mod h1:i0QyjOq3UPoTzff0PJB2N66fb4S0+rSbSB15/oyH9fY= -go.opentelemetry.io/otel v1.42.0 h1:lSQGzTgVR3+sgJDAU/7/ZMjN9Z+vUip7leaqBKy4sho= -go.opentelemetry.io/otel v1.42.0/go.mod h1:lJNsdRMxCUIWuMlVJWzecSMuNjE7dOYyWlqOXWkdqCc= +go.opentelemetry.io/otel v1.43.0 
h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I= +go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.42.0 h1:MdKucPl/HbzckWWEisiNqMPhRrAOQX8r4jTuGr636gk= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.42.0/go.mod h1:RolT8tWtfHcjajEH5wFIZ4Dgh5jpPdFXYV9pTAk/qjc= -go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.42.0 h1:H7O6RlGOMTizyl3R08Kn5pdM06bnH8oscSj7o11tmLA= -go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.42.0/go.mod h1:mBFWu/WOVDkWWsR7Tx7h6EpQB8wsv7P0Yrh0Pb7othc= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0 h1:w1K+pCJoPpQifuVpsKamUdn9U0zM3xUziVOqsGksUrY= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0/go.mod h1:HBy4BjzgVE8139ieRI75oXm3EcDN+6GhD88JT1Kjvxg= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.19.0 h1:Mne5On7VWdx7omSrSSZvM4Kw7cS7NQkOOmLcgscI51U= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.19.0/go.mod h1:IPtUMKL4O3tH5y+iXVyAXqpAwMuzC1IrxVS81rummfE= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.19.0 h1:3d+S281UTjM+AbF31XSOYn1qXn3BgIdWl8HNEpx08Jk= @@ -1001,21 +1001,21 @@ go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.19.0/go.mod h go.opentelemetry.io/otel/exporters/prometheus v0.60.0 h1:cGtQxGvZbnrWdC2GyjZi0PDKVSLWP/Jocix3QWfXtbo= go.opentelemetry.io/otel/exporters/prometheus v0.60.0/go.mod h1:hkd1EekxNo69PTV4OWFGZcKQiIqg0RfuWExcPKFvepk= go.opentelemetry.io/otel/metric v1.19.0/go.mod h1:L5rUsV9kM1IxCj1MmSdS+JQAcVm319EUrDVLrt7jqt8= -go.opentelemetry.io/otel/metric v1.42.0 h1:2jXG+3oZLNXEPfNmnpxKDeZsFI5o4J+nz6xUlaFdF/4= -go.opentelemetry.io/otel/metric v1.42.0/go.mod h1:RlUN/7vTU7Ao/diDkEpQpnz3/92J9ko05BIwxYa2SSI= +go.opentelemetry.io/otel/metric v1.43.0 h1:d7638QeInOnuwOONPp4JAOGfbCEpYb+K6DVWvdxGzgM= +go.opentelemetry.io/otel/metric v1.43.0/go.mod 
h1:RDnPtIxvqlgO8GRW18W6Z/4P462ldprJtfxHxyKd2PY= go.opentelemetry.io/otel/sdk v1.19.0/go.mod h1:NedEbbS4w3C6zElbLdPJKOpJQOrGUJ+GfzpjUvI0v1A= -go.opentelemetry.io/otel/sdk v1.42.0 h1:LyC8+jqk6UJwdrI/8VydAq/hvkFKNHZVIWuslJXYsDo= -go.opentelemetry.io/otel/sdk v1.42.0/go.mod h1:rGHCAxd9DAph0joO4W6OPwxjNTYWghRWmkHuGbayMts= -go.opentelemetry.io/otel/sdk/metric v1.42.0 h1:D/1QR46Clz6ajyZ3G8SgNlTJKBdGp84q9RKCAZ3YGuA= -go.opentelemetry.io/otel/sdk/metric v1.42.0/go.mod h1:Ua6AAlDKdZ7tdvaQKfSmnFTdHx37+J4ba8MwVCYM5hc= +go.opentelemetry.io/otel/sdk v1.43.0 h1:pi5mE86i5rTeLXqoF/hhiBtUNcrAGHLKQdhg4h4V9Dg= +go.opentelemetry.io/otel/sdk v1.43.0/go.mod h1:P+IkVU3iWukmiit/Yf9AWvpyRDlUeBaRg6Y+C58QHzg= +go.opentelemetry.io/otel/sdk/metric v1.43.0 h1:S88dyqXjJkuBNLeMcVPRFXpRw2fuwdvfCGLEo89fDkw= +go.opentelemetry.io/otel/sdk/metric v1.43.0/go.mod h1:C/RJtwSEJ5hzTiUz5pXF1kILHStzb9zFlIEe85bhj6A= go.opentelemetry.io/otel/trace v1.19.0/go.mod h1:mfaSyvGyEJEI0nyV2I4qhNQnbBOUUmYZpYojqMnX2vo= -go.opentelemetry.io/otel/trace v1.42.0 h1:OUCgIPt+mzOnaUTpOQcBiM/PLQ/Op7oq6g4LenLmOYY= -go.opentelemetry.io/otel/trace v1.42.0/go.mod h1:f3K9S+IFqnumBkKhRJMeaZeNk9epyhnCmQh/EysQCdc= +go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09nk+3A= +go.opentelemetry.io/otel/trace v1.43.0/go.mod h1:/QJhyVBUUswCphDVxq+8mld+AvhXZLhe+8WVFxiFff0= go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqeYNgFYFoEGnI= go.opentelemetry.io/proto/otlp v0.15.0/go.mod h1:H7XAot3MsfNsj7EXtrA2q5xSNQ10UqI405h3+duxN4U= go.opentelemetry.io/proto/otlp v0.19.0/go.mod h1:H7XAot3MsfNsj7EXtrA2q5xSNQ10UqI405h3+duxN4U= -go.opentelemetry.io/proto/otlp v1.9.0 h1:l706jCMITVouPOqEnii2fIAuO3IVGBRPV5ICjceRb/A= -go.opentelemetry.io/proto/otlp v1.9.0/go.mod h1:xE+Cx5E/eEHw+ISFkwPLwCZefwVjY+pqKg1qcK03+/4= +go.opentelemetry.io/proto/otlp v1.10.0 h1:IQRWgT5srOCYfiWnpqUYz9CVmbO8bFmKcwYxpuCSL2g= +go.opentelemetry.io/proto/otlp v1.10.0/go.mod h1:/CV4QoCR/S9yaPj8utp3lvQPoqMtxXdzn7ozvvozVqk= 
go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= @@ -1086,8 +1086,8 @@ golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91 golang.org/x/mod v0.7.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.9.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= -golang.org/x/mod v0.32.0 h1:9F4d3PHLljb6x//jOyokMv3eX+YDeepZSEo3mFJy93c= -golang.org/x/mod v0.32.0/go.mod h1:SgipZ/3h2Ci89DlEtEXWUk/HteuRin+HHhN+WbNhguU= +golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8= +golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -1145,8 +1145,8 @@ golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= -golang.org/x/net v0.51.0 h1:94R/GTO7mt3/4wIKpcR5gkGmRLOuE/2hNGeWq/GBIFo= -golang.org/x/net v0.51.0/go.mod h1:aamm+2QF5ogm02fjy5Bb7CQ0WMt1/WVM7FtyaTLlA9Y= +golang.org/x/net v0.52.0 h1:He/TN1l0e4mmR3QqHMT2Xab3Aj3L9qjbhRm78/6jrW0= +golang.org/x/net v0.52.0/go.mod h1:R1MAz7uMZxVMualyPXb+VaqGSa3LIaUqk0eEt3w36Sw= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= 
golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -1194,8 +1194,8 @@ golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20220819030929-7fc1605a5dde/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220929204114-8fcdb60fdcc0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= -golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sync v0.20.0 h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= +golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -1279,8 +1279,8 @@ golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k= -golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo= +golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod 
h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.2.0/go.mod h1:TVmDHMZPmdnySmBfhjOoOdhjzdE1h4u1VwSiw2l1Nuc= @@ -1289,8 +1289,8 @@ golang.org/x/term v0.4.0/go.mod h1:9P2UbLfCdcvo3p/nzKvsmas4TnlujnuoV9hGgYzW1lQ= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= -golang.org/x/term v0.40.0 h1:36e4zGLqU4yhjlmxEaagx2KuYbJq3EwY8K943ZsHcvg= -golang.org/x/term v0.40.0/go.mod h1:w2P8uVp06p2iyKKuvXIm7N/y0UCRt3UfJTfZ7oOpglM= +golang.org/x/term v0.41.0 h1:QCgPso/Q3RTJx2Th4bDLqML4W6iJiaXFq2/ftQF13YU= +golang.org/x/term v0.41.0/go.mod h1:3pfBgksrReYfZ5lvYM0kSO0LIkAl4Yl2bXOkKP7Ec2A= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -1307,8 +1307,8 @@ golang.org/x/text v0.6.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= -golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk= -golang.org/x/text v0.34.0/go.mod h1:homfLqTYRFyVYemLBFl5GgL/DWEiH5wcsQ5gSh1yziA= +golang.org/x/text v0.35.0 h1:JOVx6vVDFokkpaq1AEptVzLTpDe9KGpj5tR4/X+ybL8= +golang.org/x/text v0.35.0/go.mod h1:khi/HExzZJ2pGnjenulevKNX1W67CUy0AsXcNubPGCA= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time 
v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= @@ -1380,8 +1380,8 @@ golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc golang.org/x/tools v0.3.0/go.mod h1:/rWhSS2+zyEVwoJf8YAX6L2f0ntZ7Kn/mGgAWcipA5k= golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= golang.org/x/tools v0.7.0/go.mod h1:4pg6aUX35JBAogB10C9AtvVL+qowtN4pT3CGSQex14s= -golang.org/x/tools v0.41.0 h1:a9b8iMweWG+S0OBnlU36rzLp20z1Rp10w+IY2czHTQc= -golang.org/x/tools v0.41.0/go.mod h1:XSY6eDqxVNiYgezAVqqCeihT4j1U2CCsqvH3WhQpnlg= +golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k= +golang.org/x/tools v0.42.0/go.mod h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0= golang.org/x/tools/go/expect v0.1.0-deprecated h1:jY2C5HGYR5lqex3gEniOQL0r7Dq5+VGVgY1nudX5lXY= golang.org/x/tools/go/expect v0.1.0-deprecated/go.mod h1:eihoPOH+FgIqa3FpoTwguz/bVUSGBlGQU67vpBeOrBY= golang.org/x/tools/go/packages/packagestest v0.1.1-deprecated h1:1h2MnaIAIXISqTFKdENegdpAgUXz6NrPEsbIeWaBRvM= @@ -1398,8 +1398,8 @@ gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJ gonum.org/v1/gonum v0.8.2/go.mod h1:oe/vMfY3deqTw+1EZJhuvEW2iwGF1bW9wwu7XCu0+v0= gonum.org/v1/gonum v0.9.3/go.mod h1:TZumC3NeyVQskjXqmyWt4S3bINhy7B4eYwW69EbyX+0= gonum.org/v1/gonum v0.11.0/go.mod h1:fSG4YDCxxUZQJ7rKsQrj0gMOg00Il0Z96/qMA4bVQhA= -gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= -gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= +gonum.org/v1/gonum v0.17.0 h1:VbpOemQlsSMrYmn7T2OUvQ4dqxQXU+ouZFQsZOx50z4= +gonum.org/v1/gonum v0.17.0/go.mod h1:El3tOrEuMpv2UdMrbNlKEh9vd86bmQ6vqIcDwxEOc1E= gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= gonum.org/v1/plot 
v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc= gonum.org/v1/plot v0.9.0/go.mod h1:3Pcqqmp6RHvJI72kgb8fThyUnav364FOsdDo2aGW5lY= @@ -1603,12 +1603,12 @@ google.golang.org/genproto v0.0.0-20230525234025-438c736192d0/go.mod h1:9ExIQyXL google.golang.org/genproto v0.0.0-20230526161137-0005af68ea54/go.mod h1:zqTuNwFlFRsw5zIts5VnzLQxSRqh+CGOTVMlYbY0Eyk= google.golang.org/genproto/googleapis/api v0.0.0-20230525234020-1aefcd67740a/go.mod h1:ts19tUU+Z0ZShN1y3aPyq2+O3d5FUNNgT6FtOzmrNn8= google.golang.org/genproto/googleapis/api v0.0.0-20230525234035-dd9d682886f9/go.mod h1:vHYtlOoi6TsQ3Uk2yxR7NI5z8uoV+3pZtR4jmHIkRig= -google.golang.org/genproto/googleapis/api v0.0.0-20260209200024-4cfbd4190f57 h1:JLQynH/LBHfCTSbDWl+py8C+Rg/k1OVH3xfcaiANuF0= -google.golang.org/genproto/googleapis/api v0.0.0-20260209200024-4cfbd4190f57/go.mod h1:kSJwQxqmFXeo79zOmbrALdflXQeAYcUbgS7PbpMknCY= +google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 h1:VPWxll4HlMw1Vs/qXtN7BvhZqsS9cdAittCNvVENElA= +google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:7QBABkRtR8z+TEnmXTqIqwJLlzrZKVfAUm7tY3yGv0M= google.golang.org/genproto/googleapis/rpc v0.0.0-20230525234015-3fc162c6f38a/go.mod h1:xURIpW9ES5+/GZhnV6beoEtxQrnkRGIfP5VQG2tCBLc= google.golang.org/genproto/googleapis/rpc v0.0.0-20230525234030-28d5490b6b19/go.mod h1:66JfowdXAEgad5O9NnYcsNPLCPZJD++2L9X0PCMODrA= -google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57 h1:mWPCjDEyshlQYzBpMNHaEof6UX1PmHcaUODUywQ0uac= -google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57/go.mod h1:j9x/tPzZkyxcgEFkiKEEGxfvyumM01BEtsW8xzOahRQ= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 h1:m8qni9SQFH0tJc1X0vmnpw/0t+AImlSvp30sEupozUg= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8= google.golang.org/grpc v1.19.0/go.mod 
h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= @@ -1650,8 +1650,8 @@ google.golang.org/grpc v1.52.3/go.mod h1:pu6fVzoFb+NBYNAvQL08ic+lvB2IojljRYuun5v google.golang.org/grpc v1.53.0/go.mod h1:OnIrk0ipVdj4N5d9IUoFUx72/VlD7+jUsHwZgwSMQpw= google.golang.org/grpc v1.54.0/go.mod h1:PUSEXI6iWghWaB6lXM4knEgpJNu2qUcKfDtNci3EC2g= google.golang.org/grpc v1.57.0/go.mod h1:Sd+9RMTACXwmub0zcNY2c4arhtrbBYD1AUHI/dt16Mo= -google.golang.org/grpc v1.79.3 h1:sybAEdRIEtvcD68Gx7dmnwjZKlyfuc61Dyo9pGXXkKE= -google.golang.org/grpc v1.79.3/go.mod h1:KmT0Kjez+0dde/v2j9vzwoAScgEPx/Bw1CYChhHLrHQ= +google.golang.org/grpc v1.80.0 h1:Xr6m2WmWZLETvUNvIUmeD5OAagMw3FiKmMlTdViWsHM= +google.golang.org/grpc v1.80.0/go.mod h1:ho/dLnxwi3EDJA4Zghp7k2Ec1+c2jqup0bFkw07bwF4= google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.1.0/go.mod h1:6Kw0yEErY5E/yWrBtf03jp27GLLJujG4z/JK95pnjjw= google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8= google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0= diff --git a/pkg/agent/agent.go b/pkg/agent/agent.go index ac0f8a649..93ab82bc8 100644 --- a/pkg/agent/agent.go +++ b/pkg/agent/agent.go @@ -258,6 +258,10 @@ func (a *Agent) Stop() { defer a.stopLock.Unlock() if a.stopC != nil { + // Remove any extended resources we own on this node so + // a graceful shutdown does not leave orphan capacity + // entries behind. 
+ a.ClearNodeExtendedResources() close(a.stopC) <-a.doneC a.stopC = nil @@ -597,6 +601,10 @@ func (a *Agent) updateGroupConfig(obj runtime.Object) { func (a *Agent) updateConfig(cfg metav1.Object) { if cfg == nil { log.Warnf("node (%s) has no effective configuration", a.nodeName) + // With no effective configuration there is nothing left to + // publish, so drop any extended resources we currently own + // on the node. + a.ClearNodeExtendedResources() return } diff --git a/pkg/agent/node-extended-resources.go b/pkg/agent/node-extended-resources.go new file mode 100644 index 000000000..7378c2aef --- /dev/null +++ b/pkg/agent/node-extended-resources.go @@ -0,0 +1,309 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package agent + +import ( + "context" + "encoding/json" + "fmt" + "strings" + "sync" + "time" + + "k8s.io/apimachinery/pkg/api/resource" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" +) + +// extendedResourcesLock serializes concurrent node.status PATCHes +// emitted by the policy on container events. Last writer wins. +var extendedResourcesLock sync.Mutex + +// lastPublishedExtendedResources tracks the resources we currently +// own on this node, so that we can issue 'remove' patches for +// resources that the policy stops reporting. 
+var lastPublishedExtendedResources = map[string]int64{} + +// extendedResourcesSynced is set after the first successful +// node-status scan. Until then, every publish will first try to +// seed lastPublishedExtendedResources from the node so that +// resources left over by a prior plugin process (helm reinstall, +// pod crash, switch to a different policy, etc.) get pruned by +// the regular diff logic on the next publish. +var extendedResourcesSynced bool + +// extendedResourceDomain is the per-domain prefix the agent owns. +// Only resources whose name starts with this prefix are touched +// by the agent (other extended resources advertised by other +// controllers are left alone). +const extendedResourceDomain = "cpuclass.balloons.nri.io/" + +// UpdateNodeExtendedResources publishes the given resource map +// to Node.status.capacity using a JSON patch. Resources previously +// owned by the agent but absent from 'resources' are removed. +// Runs asynchronously to avoid stalling NRI request paths. +func (a *Agent) UpdateNodeExtendedResources(resources map[string]int64) error { + if a.hasLocalConfig() { + return nil + } + if a.k8sCli == nil || a.nodeName == "" { + return nil + } + // Snapshot inputs and run in the background; node-status + // PATCHes can be slow under apiserver load and we never + // want NRI hooks to block on them. + snapshot := make(map[string]int64, len(resources)) + for k, v := range resources { + snapshot[k] = v + } + go func() { + if err := a.updateNodeExtendedResources(snapshot); err != nil { + log.Errorf("failed to publish extended resources: %v", err) + } + }() + return nil +} + +func (a *Agent) updateNodeExtendedResources(resources map[string]int64) error { + extendedResourcesLock.Lock() + defer extendedResourcesLock.Unlock() + + // First call after process start: scan the node for keys we + // already own (from a prior plugin process), so the diff + // below can prune any that the current policy no longer + // publishes. 
Failure is non-fatal -- we just fall back to + // "trust our in-memory state". + if !extendedResourcesSynced { + if err := a.syncExtendedResourcesFromNode(); err != nil { + log.Warnf("extended-resource startup sync failed (orphans from a prior plugin process may persist): %v", err) + } + extendedResourcesSynced = true + } + + // Compute the patch: add/replace keys present in 'resources', + // remove keys we owned before but are now gone. + type jsonPatchOp struct { + Op string `json:"op"` + Path string `json:"path"` + Value interface{} `json:"value,omitempty"` + } + + ops := []jsonPatchOp{} + for name, qty := range resources { + if !strings.HasPrefix(name, extendedResourceDomain) { + log.Warnf("refusing to publish resource %q: not in domain %q", + name, extendedResourceDomain) + continue + } + q := resource.NewQuantity(qty, resource.DecimalSI) + ops = append(ops, jsonPatchOp{ + Op: "add", + Path: "/status/capacity/" + escapeJSONPointer(name), + Value: q.String(), + }) + } + for name := range lastPublishedExtendedResources { + if _, kept := resources[name]; kept { + continue + } + ops = append(ops, jsonPatchOp{ + Op: "remove", + Path: "/status/capacity/" + escapeJSONPointer(name), + }) + } + + if len(ops) == 0 { + return nil + } + + body, err := json.Marshal(ops) + if err != nil { + return fmt.Errorf("marshal patch: %w", err) + } + + ctx := context.Background() + _, err = a.k8sCli.CoreV1().Nodes().Patch( + ctx, a.nodeName, types.JSONPatchType, body, + metav1.PatchOptions{}, "status") + if err != nil { + // JSON patch "add" on a missing path fails when the + // node has no prior resource of that name -- 'add' + // requires the parent to exist, but for a map value + // it should create the key. In practice apiservers + // behave correctly here. If we ever hit issues, fall + // back to a strategic merge patch. + return fmt.Errorf("patch node %s status: %w", a.nodeName, err) + } + + // Record current set for next diff. 
+ lastPublishedExtendedResources = make(map[string]int64, len(resources)) + for k, v := range resources { + lastPublishedExtendedResources[k] = v + } + + publishedSummary := summarizeExtendedResources(resources) + if publishedSummary != "" { + log.Infof("published node extended resources: %s", publishedSummary) + } + return nil +} + +// escapeJSONPointer escapes '~' and '/' per RFC 6901 so that a +// resource name containing slashes survives as a single JSON +// Pointer segment. +func escapeJSONPointer(s string) string { + s = strings.ReplaceAll(s, "~", "~0") + s = strings.ReplaceAll(s, "/", "~1") + return s +} + +// summarizeExtendedResources formats the map deterministically +// for logs: "name1=N1, name2=N2, ...". +func summarizeExtendedResources(m map[string]int64) string { + if len(m) == 0 { + return "" + } + keys := make([]string, 0, len(m)) + for k := range m { + keys = append(keys, k) + } + // stable order without pulling in sort + for i := 1; i < len(keys); i++ { + for j := i; j > 0 && keys[j-1] > keys[j]; j-- { + keys[j-1], keys[j] = keys[j], keys[j-1] + } + } + parts := make([]string, 0, len(keys)) + for _, k := range keys { + parts = append(parts, fmt.Sprintf("%s=%d", k, m[k])) + } + return strings.Join(parts, ", ") +} + +// syncExtendedResourcesFromNode reads Node.status.capacity and +// seeds lastPublishedExtendedResources with every entry whose +// key carries extendedResourceDomain. Caller must hold +// extendedResourcesLock. 
+func (a *Agent) syncExtendedResourcesFromNode() error { + ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second) + defer cancel() + node, err := a.k8sCli.CoreV1().Nodes().Get(ctx, a.nodeName, metav1.GetOptions{}) + if err != nil { + return fmt.Errorf("get node %s: %w", a.nodeName, err) + } + owned := map[string]int64{} + for name, q := range node.Status.Capacity { + key := string(name) + if !strings.HasPrefix(key, extendedResourceDomain) { + continue + } + v, ok := q.AsInt64() + if !ok { + v = q.Value() + } + owned[key] = v + if _, ours := lastPublishedExtendedResources[key]; !ours { + lastPublishedExtendedResources[key] = v + } + } + if len(owned) > 0 { + log.Infof("extended-resource startup sync: found %d existing key(s) on node %s: %s", + len(owned), a.nodeName, summarizeExtendedResources(owned)) + } + return nil +} + +// ClearNodeExtendedResources removes every node-status key the +// agent currently owns (every key in lastPublishedExtendedResources +// plus, for safety, every key currently present on the node that +// carries our domain prefix). Best-effort and synchronous, with a +// short timeout; intended for Agent.Stop() so a graceful shutdown +// does not leave orphan capacity entries behind. +func (a *Agent) ClearNodeExtendedResources() { + if a.hasLocalConfig() { + return + } + if a.k8sCli == nil || a.nodeName == "" { + return + } + + extendedResourcesLock.Lock() + defer extendedResourcesLock.Unlock() + + toRemove := map[string]struct{}{} + for k := range lastPublishedExtendedResources { + toRemove[k] = struct{}{} + } + + // Also fold in anything currently on the node under our + // domain that we may not be tracking (e.g., startup sync + // never ran because no publish happened before Stop). Best + // effort: ignore the read error and fall back to the + // in-memory set. 
+ ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + if node, err := a.k8sCli.CoreV1().Nodes().Get(ctx, a.nodeName, metav1.GetOptions{}); err == nil { + for name := range node.Status.Capacity { + key := string(name) + if strings.HasPrefix(key, extendedResourceDomain) { + toRemove[key] = struct{}{} + } + } + } + cancel() + + if len(toRemove) == 0 { + return + } + + type jsonPatchOp struct { + Op string `json:"op"` + Path string `json:"path"` + } + ops := make([]jsonPatchOp, 0, len(toRemove)) + keys := make([]string, 0, len(toRemove)) + for k := range toRemove { + ops = append(ops, jsonPatchOp{ + Op: "remove", + Path: "/status/capacity/" + escapeJSONPointer(k), + }) + keys = append(keys, k) + } + + body, err := json.Marshal(ops) + if err != nil { + log.Warnf("ClearNodeExtendedResources: marshal patch: %v", err) + return + } + + pctx, pcancel := context.WithTimeout(context.Background(), 5*time.Second) + defer pcancel() + _, err = a.k8sCli.CoreV1().Nodes().Patch( + pctx, a.nodeName, types.JSONPatchType, body, + metav1.PatchOptions{}, "status") + if err != nil { + log.Warnf("ClearNodeExtendedResources: patch node %s: %v", a.nodeName, err) + return + } + + // Stable order in the log + for i := 1; i < len(keys); i++ { + for j := i; j > 0 && keys[j-1] > keys[j]; j-- { + keys[j-1], keys[j] = keys[j], keys[j-1] + } + } + log.Infof("cleared node extended resources on shutdown: %s", strings.Join(keys, ", ")) + + lastPublishedExtendedResources = map[string]int64{} +} diff --git a/pkg/apis/config/v1alpha1/balloons-policy.go b/pkg/apis/config/v1alpha1/balloons-policy.go index 259e1afaa..12ab646fa 100644 --- a/pkg/apis/config/v1alpha1/balloons-policy.go +++ b/pkg/apis/config/v1alpha1/balloons-policy.go @@ -14,8 +14,17 @@ package v1alpha1 +import ( + "sort" + + cpucfg "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/control/cpu" + policyapi "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/policy" + logger 
"github.com/containers/nri-plugins/pkg/log" +) + var ( - _ ResmgrConfig = &BalloonsPolicy{} + _ ResmgrConfig = &BalloonsPolicy{} + bplog = logger.NewLogger("config-v1alpha1") ) func (c *BalloonsPolicy) AgentConfig() *AgentConfig { @@ -39,13 +48,98 @@ func (c *BalloonsPolicy) CommonConfig() *CommonConfig { } } +// PolicyConfig returns the balloons-specific configuration handed to +// the policy. Before returning, any legacy control.cpu.classes +// entries are folded into Spec.Config.CPUClasses (without overriding +// entries with matching names). The legacy CPU controller is no +// longer used by the balloons policy; this reverse merge preserves +// backwards compatibility so existing configurations keep working +// while users migrate to the cpuClasses syntax. func (c *BalloonsPolicy) PolicyConfig() interface{} { if c == nil { return nil } + mergeLegacyCpuClasses(&c.Spec) return &c.Spec.Config } +// mergeLegacyCpuClasses appends synthetic CPUClass entries derived +// from spec.Control.CPU.Classes for names that do not already exist +// in spec.Config.CPUClasses. Conflicting names log a single warning +// per name. Idempotent: repeated calls do not add duplicate entries +// and do not warn again for the same conflict. +func mergeLegacyCpuClasses(spec *BalloonsPolicySpec) { + legacy := spec.Control.CPU.Classes + if len(legacy) == 0 { + return + } + existing := map[string]*policyapi.CPUClass{} + for _, cc := range spec.CPUClasses { + existing[cc.Name] = cc + } + // Sort the legacy class names so warning order is deterministic. + names := make([]string, 0, len(legacy)) + for name := range legacy { + names = append(names, name) + } + sort.Strings(names) + added := []string{} + for _, name := range names { + cc := legacy[name] + if prev, ok := existing[name]; ok { + // Skip silently when the explicit entry already + // has the exact values converted from the legacy + // entry. That happens when a prior PolicyConfig() + // call already merged this spec. 
+ if cpuClassMatchesLegacy(prev, cc) { + continue + } + bplog.Warn("control.cpu.classes entry %q overridden by cpuClasses entry; remove the legacy entry to silence this warning", name) + continue + } + synth := &policyapi.CPUClass{ + Name: name, + MinFreq: policyapi.Frequency(cc.MinFreq), + MaxFreq: policyapi.Frequency(cc.MaxFreq), + EnergyPerformancePreference: cc.EnergyPerformancePreference, + UncoreMinFreq: policyapi.Frequency(cc.UncoreMinFreq), + UncoreMaxFreq: policyapi.Frequency(cc.UncoreMaxFreq), + FreqGovernor: cc.FreqGovernor, + DisabledCstates: append([]string(nil), cc.DisabledCstates...), + } + spec.CPUClasses = append(spec.CPUClasses, synth) + existing[name] = synth + added = append(added, name) + } + if len(added) > 0 { + bplog.Warn("control.cpu.classes is deprecated; converted to cpuClasses: %v", added) + } +} + +// cpuClassMatchesLegacy reports whether cc has the exact field +// values that the reverse converter would produce for legacy. Used +// to suppress spurious "override" warnings when the same spec is +// processed more than once. 
+func cpuClassMatchesLegacy(cc *policyapi.CPUClass, legacy cpucfg.Class) bool { + if cc.MinFreq != policyapi.Frequency(legacy.MinFreq) || + cc.MaxFreq != policyapi.Frequency(legacy.MaxFreq) || + cc.EnergyPerformancePreference != legacy.EnergyPerformancePreference || + cc.UncoreMinFreq != policyapi.Frequency(legacy.UncoreMinFreq) || + cc.UncoreMaxFreq != policyapi.Frequency(legacy.UncoreMaxFreq) || + cc.FreqGovernor != legacy.FreqGovernor { + return false + } + if len(cc.DisabledCstates) != len(legacy.DisabledCstates) { + return false + } + for i := range cc.DisabledCstates { + if cc.DisabledCstates[i] != legacy.DisabledCstates[i] { + return false + } + } + return true +} + func (c *BalloonsPolicy) Validate() error { if c == nil { return nil diff --git a/pkg/apis/config/v1alpha1/balloons-policy_test.go b/pkg/apis/config/v1alpha1/balloons-policy_test.go new file mode 100644 index 000000000..fec21c3cc --- /dev/null +++ b/pkg/apis/config/v1alpha1/balloons-policy_test.go @@ -0,0 +1,109 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package v1alpha1 + +import ( + "testing" + + control "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/control" + cpucfg "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/control/cpu" + policyapi "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/policy" + balloonscfg "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/policy/balloons" +) + +// mkSpec builds a BalloonsPolicySpec carrying the given cpuClasses +// list and legacy control.cpu.classes map. Other fields are left at +// zero values. +func mkSpec(cpuClasses []*policyapi.CPUClass, legacy map[string]cpucfg.Class) *BalloonsPolicySpec { + return &BalloonsPolicySpec{ + Config: balloonscfg.Config{ + CPUClasses: cpuClasses, + }, + Control: control.Config{ + CPU: cpucfg.Config{ + Classes: legacy, + }, + }, + } +} + +// TestMergeLegacy_AddsMissingNames verifies that legacy entries +// whose names do not appear in cpuClasses are appended. +func TestMergeLegacy_AddsMissingNames(t *testing.T) { + spec := mkSpec(nil, map[string]cpucfg.Class{ + "old": {MinFreq: 1_000_000, MaxFreq: 2_000_000, FreqGovernor: "performance"}, + }) + mergeLegacyCpuClasses(spec) + if len(spec.CPUClasses) != 1 { + t.Fatalf("want 1 cpuClass after merge, got %d", len(spec.CPUClasses)) + } + cc := spec.CPUClasses[0] + if cc.Name != "old" || cc.MinFreq.KHz() != 1_000_000 || cc.MaxFreq.KHz() != 2_000_000 || cc.FreqGovernor != "performance" { + t.Errorf("merged class wrong: %+v", cc) + } +} + +// TestMergeLegacy_ExplicitWins verifies that explicit cpuClasses +// entries take precedence over legacy entries with the same name. 
+func TestMergeLegacy_ExplicitWins(t *testing.T) { + explicit := &policyapi.CPUClass{ + Name: "hp", + MinFreq: policyapi.FrequencyBase, + MaxFreq: policyapi.FrequencyTurbo, + } + spec := mkSpec( + []*policyapi.CPUClass{explicit}, + map[string]cpucfg.Class{ + "hp": {MinFreq: 800_000, MaxFreq: 1_500_000}, + }, + ) + mergeLegacyCpuClasses(spec) + if len(spec.CPUClasses) != 1 { + t.Fatalf("want 1 cpuClass (explicit unchanged), got %d", len(spec.CPUClasses)) + } + cc := spec.CPUClasses[0] + if cc != explicit { + t.Errorf("explicit entry was replaced") + } + if cc.MinFreq != policyapi.FrequencyBase { + t.Errorf("explicit symbolic MinFreq overwritten, got %v", cc.MinFreq) + } +} + +// TestMergeLegacy_Idempotent verifies that running the merge twice +// does not duplicate appended entries. +func TestMergeLegacy_Idempotent(t *testing.T) { + spec := mkSpec(nil, map[string]cpucfg.Class{ + "a": {MinFreq: 1_000_000}, + "b": {MaxFreq: 2_000_000}, + }) + mergeLegacyCpuClasses(spec) + first := len(spec.CPUClasses) + mergeLegacyCpuClasses(spec) + if len(spec.CPUClasses) != first { + t.Errorf("second merge added entries: first=%d second=%d", first, len(spec.CPUClasses)) + } +} + +// TestMergeLegacy_NoLegacy_NoChange verifies that an empty legacy +// map leaves cpuClasses untouched. 
+func TestMergeLegacy_NoLegacy_NoChange(t *testing.T) { + keep := &policyapi.CPUClass{Name: "x"} + spec := mkSpec([]*policyapi.CPUClass{keep}, nil) + mergeLegacyCpuClasses(spec) + if len(spec.CPUClasses) != 1 || spec.CPUClasses[0] != keep { + t.Errorf("cpuClasses unexpectedly modified: %+v", spec.CPUClasses) + } +} diff --git a/pkg/apis/config/v1alpha1/resmgr/policy/balloons/config.go b/pkg/apis/config/v1alpha1/resmgr/policy/balloons/config.go index 496f851a6..a8d6230d6 100644 --- a/pkg/apis/config/v1alpha1/resmgr/policy/balloons/config.go +++ b/pkg/apis/config/v1alpha1/resmgr/policy/balloons/config.go @@ -32,6 +32,8 @@ type ( CPUTopologyLevel = policy.CPUTopologyLevel ComponentCreationStrategy = policy.ComponentCreationStrategy SchedulingClass = policy.SchedulingClass + CPUClass = policy.CPUClass + Frequency = policy.Frequency ) const ( @@ -135,6 +137,21 @@ type Config struct { // SchedulingClasses specify scheduling classes available in // balloon types. SchedulingClasses []*SchedulingClass `json:"schedulingClasses,omitempty"` + // CPUClasses define CPU frequency, C-state, and turbo + // attributes for CPU classes referenced by balloon types. + // Exclusive turbo frequency access is controlled via + // turboPriority. + CPUClasses []*CPUClass `json:"cpuClasses,omitempty"` + // TurboDomain selects the scope over which TurboPriority + // arbitration happens. The default is "package": every CPU + // package independently picks its own TurboPriority winner, + // so a low-priority balloon on one socket can keep turbo even + // when a higher-priority balloon is running on another + // socket. Set to "system" to pick single TurboPriority winner + // for the whole system. + // +kubebuilder:validation:Enum=package;system + // +kubebuilder:default=package + TurboDomain string `json:"turboDomain,omitempty"` } // BalloonDef contains a balloon definition. 
diff --git a/pkg/apis/config/v1alpha1/resmgr/policy/balloons/zz_generated.deepcopy.go b/pkg/apis/config/v1alpha1/resmgr/policy/balloons/zz_generated.deepcopy.go index 74276ce1d..e4b9ceae1 100644 --- a/pkg/apis/config/v1alpha1/resmgr/policy/balloons/zz_generated.deepcopy.go +++ b/pkg/apis/config/v1alpha1/resmgr/policy/balloons/zz_generated.deepcopy.go @@ -185,6 +185,17 @@ func (in *Config) DeepCopyInto(out *Config) { } } } + if in.CPUClasses != nil { + in, out := &in.CPUClasses, &out.CPUClasses + *out = make([]*CPUClass, len(*in)) + for i := range *in { + if (*in)[i] != nil { + in, out := &(*in)[i], &(*out)[i] + *out = new(CPUClass) + (*in).DeepCopyInto(*out) + } + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Config. diff --git a/pkg/apis/config/v1alpha1/resmgr/policy/cpuclass.go b/pkg/apis/config/v1alpha1/resmgr/policy/cpuclass.go new file mode 100644 index 000000000..aab89472c --- /dev/null +++ b/pkg/apis/config/v1alpha1/resmgr/policy/cpuclass.go @@ -0,0 +1,98 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package policy + +// CPUClass specifies CPU frequency, C-state, and turbo attributes +// for a CPU class. +// +k8s:deepcopy-gen=true +type CPUClass struct { + // Name of the CPU class. + // +kubebuilder:validation:Required + Name string `json:"name"` + // MinFreq is the minimum CPU frequency for this class. 
+ // Accepts values with units: "3.2GHz", "2900MHz", "2900000kHz", + // or a plain number in kHz. Also accepts symbolic names: "min" + // (platform minimum), "base" (CPU base frequency), "turbo" + // (maximum turbo frequency), resolved at runtime from sysfs. + // When turboPriority is set, "turbo" resolves to actual turbo + // only for the highest-priority active class; others get base. + MinFreq Frequency `json:"minFreq,omitempty"` + // MaxFreq is the maximum CPU frequency for this class. + // Same format and symbolic names as MinFreq. + MaxFreq Frequency `json:"maxFreq,omitempty"` + // EnergyPerformancePreference for CPUs in this class. + // +kubebuilder:validation:Minimum=0 + EnergyPerformancePreference uint `json:"energyPerformancePreference,omitempty"` + // UncoreMinFreq is the minimum uncore frequency for this class. + // Accepts values with units like MinFreq. + UncoreMinFreq Frequency `json:"uncoreMinFreq,omitempty"` + // UncoreMaxFreq is the maximum uncore frequency for this class. + // Accepts values with units like MinFreq. + UncoreMaxFreq Frequency `json:"uncoreMaxFreq,omitempty"` + // FreqGovernor is the CPUFreq governor for this class + // (e.g., "performance", "powersave", "schedutil"). + FreqGovernor string `json:"freqGovernor,omitempty"` + // DisabledCstates lists C-states disabled for CPUs in this class. + // Example: ["C4", "C6", "C8", "C10"] + DisabledCstates []string `json:"disabledCstates,omitempty"` + // TurboPriority controls exclusive turbo frequency access. + // Among CPU classes with active balloons, only the class with + // the highest turboPriority gets the symbolic frequency "turbo" + // resolved to the actual turbo frequency. All other classes get + // "turbo" resolved to the base frequency instead. + // If all classes have turboPriority 0 (default), every class + // gets actual turbo frequencies -- no competition occurs. 
+ // +kubebuilder:validation:Minimum=0 + TurboPriority int `json:"turboPriority,omitempty"` + // PctPriority requests Intel Priority Core Turbo (PCT) + // hardware support, via SST-CP CLOSes, for CPUs in this + // class. "high" associates the CPUs to the high-priority + // CLOS (HP cores, typically running at Pmax). "low" + // associates them to the low-priority CLOS (LP cores, + // typically capped at P1). Unset = PCT is not requested + // for this class. Mutually exclusive with PctClosID. + // +kubebuilder:validation:Enum=high;low + PctPriority string `json:"pctPriority,omitempty"` + // PctClosID pins this class to a specific SST-CP CLOS ID + // (0..ClosCount-1, typically 0..3) and signals "assoc-only" + // mode: nri-plugin will only associate this class's CPUs to + // the given CLOS, without touching the SoC-wide SST state + // (no CPReset, no TFEnable, no CLOS reconfiguration). Use + // this when an operator or the BIOS has pre-configured the + // CLOSes. Mutually exclusive with PctPriority. + // +kubebuilder:validation:Minimum=0 + PctClosID *int `json:"pctClosID,omitempty"` + // PctMinFreq overrides the CLOS minimum frequency that + // nri-plugin programs in managed mode. Defaults to MinFreq. + // Uses the same format as MinFreq but resolves "turbo" + // directly to the hardware maximum turbo frequency, + // without participating in the soft turboPriority + // arbitration. Ignored in assoc-only mode. + PctMinFreq Frequency `json:"pctMinFreq,omitempty"` + // PctMaxFreq overrides the CLOS maximum frequency that + // nri-plugin programs in managed mode. Defaults to MaxFreq. + // Same caveat as PctMinFreq. + PctMaxFreq Frequency `json:"pctMaxFreq,omitempty"` + // PublishExtendedResource opts this CPU class into publishing + // a node-level extended resource named + // "cpuclass.balloons.nri.io/" whose value reflects + // the number of logical CPUs that the balloons policy is + // currently able to route into this class on the node. 
The + // scheduler can then bin-pack/spread balloons by adding the + // same resource to pod requests, avoiding HP-CPU + // over-subscription on a single node. Has effect only when + // the class also carries PctPriority or PctClosID. Experimental. + PublishExtendedResource bool `json:"publishExtendedResource,omitempty"` +} diff --git a/pkg/apis/config/v1alpha1/resmgr/policy/frequency.go b/pkg/apis/config/v1alpha1/resmgr/policy/frequency.go new file mode 100644 index 000000000..0095117ec --- /dev/null +++ b/pkg/apis/config/v1alpha1/resmgr/policy/frequency.go @@ -0,0 +1,205 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package policy + +import ( + "encoding/json" + "fmt" + "math" + "regexp" + "strconv" + "strings" +) + +// Frequency represents a CPU frequency value that can be specified +// with human-readable units in YAML/JSON configuration. 
// Supported formats:
//   - "3.2G" or "3.2GHz" = 3200000 (kHz)
//   - "2900M" or "2900MHz" = 2900000 (kHz)
//   - "2900000k" or "2900000kHz" = 2900000 (kHz)
//   - "2900000" (bare number) = 2900000 (kHz, backwards compatible)
//   - 2900000 (JSON number) = 2900000 (kHz, backwards compatible)
//   - "min" = platform minimum frequency (resolved at runtime)
//   - "base" = CPU base frequency (resolved at runtime)
//   - "turbo" = maximum turbo frequency (resolved at runtime)
//
// The internal representation is always in kHz (the unit used by Linux
// kernel sysfs cpufreq interface). Symbolic values ("min", "base",
// "turbo") are stored as sentinel constants and must be resolved with
// Resolve() before being passed to the CPU controller. Numeric input
// that does not fit below the sentinel range is rejected when parsing,
// so a number can never be mistaken for a symbolic value.
// +kubebuilder:validation:Type=string
type Frequency uint

const (
	// FrequencyMin is a sentinel indicating the platform minimum frequency.
	FrequencyMin Frequency = math.MaxUint - 2
	// FrequencyBase is a sentinel indicating the CPU base frequency.
	FrequencyBase Frequency = math.MaxUint - 1
	// FrequencyTurbo is a sentinel indicating the maximum turbo frequency.
	FrequencyTurbo Frequency = math.MaxUint
)

var frequencyRegexp = regexp.MustCompile(`(?i)^\s*([0-9]*\.?[0-9]+)\s*(GHz|G|MHz|M|kHz|k)?\s*$`)

// numericFrequency rounds kHz to the nearest integer and converts it
// to a Frequency. It reports false when the rounded value is NaN,
// negative, or not strictly below FrequencyMin. The upper bound keeps
// the float-to-uint conversion in range (out-of-range conversions are
// implementation-dependent in Go) and keeps numeric values clear of
// the three sentinel constants.
func numericFrequency(kHz float64) (Frequency, bool) {
	rounded := math.Round(kHz)
	if math.IsNaN(rounded) || rounded < 0 || rounded >= float64(FrequencyMin) {
		return 0, false
	}
	return Frequency(rounded), true
}

// parseFrequency parses a frequency string into kHz.
//
// An empty (or all-whitespace) string yields 0 with no error. The
// symbolic names "min", "base" and "turbo" (case-insensitive) yield
// the corresponding sentinel. Anything else must be an unsigned
// decimal number with an optional GHz/G, MHz/M or kHz/k suffix; a bare
// number is taken as kHz. An error is returned for malformed input,
// for a positive value that rounds to 0 kHz, and for a value too large
// to be stored as a numeric Frequency.
func parseFrequency(s string) (Frequency, error) {
	s = strings.TrimSpace(s)
	if s == "" {
		return 0, nil
	}

	// Check for symbolic frequency names.
	switch strings.ToLower(s) {
	case "min":
		return FrequencyMin, nil
	case "base":
		return FrequencyBase, nil
	case "turbo":
		return FrequencyTurbo, nil
	}

	matches := frequencyRegexp.FindStringSubmatch(s)
	if matches == nil {
		return 0, fmt.Errorf("invalid frequency %q: expected number with optional unit (GHz, MHz, kHz) or symbolic name (min, base, turbo)", s)
	}

	numStr := matches[1]
	unit := strings.ToLower(matches[2])

	// The regexp admits only unsigned digits, so val cannot be negative
	// here; numericFrequency still guards against it.
	val, err := strconv.ParseFloat(numStr, 64)
	if err != nil {
		return 0, fmt.Errorf("invalid frequency %q: %w", s, err)
	}

	var kHz float64
	switch unit {
	case "ghz", "g":
		kHz = val * 1_000_000
	case "mhz", "m":
		kHz = val * 1_000
	case "khz", "k":
		kHz = val
	case "":
		// Bare number: interpret as kHz for backwards compatibility
		// with the existing uint config fields.
		kHz = val
	}

	result, ok := numericFrequency(kHz)
	if !ok {
		return 0, fmt.Errorf("invalid frequency %q: value out of range", s)
	}
	if result == 0 && val > 0 {
		return 0, fmt.Errorf("invalid frequency %q: value too small to represent in kHz", s)
	}

	return result, nil
}

// UnmarshalJSON implements json.Unmarshaler. Accepts both JSON strings
// with units (e.g., "3.2GHz") and plain JSON numbers (interpreted as kHz).
// Negative numbers and numbers too large to be stored as a numeric
// Frequency are rejected; on error the receiver is left unmodified.
func (f *Frequency) UnmarshalJSON(data []byte) error {
	// Try string first (quoted value with optional unit).
	var s string
	if err := json.Unmarshal(data, &s); err == nil {
		parsed, err := parseFrequency(s)
		if err != nil {
			return err
		}
		*f = parsed
		return nil
	}

	// Try plain number (backwards compatible with uint kHz).
	var n float64
	if err := json.Unmarshal(data, &n); err == nil {
		if n < 0 {
			return fmt.Errorf("invalid frequency: negative value %v", n)
		}
		parsed, ok := numericFrequency(n)
		if !ok {
			return fmt.Errorf("invalid frequency: value %v out of range", n)
		}
		*f = parsed
		return nil
	}

	return fmt.Errorf("invalid frequency: expected string or number, got %s", string(data))
}

// MarshalJSON implements json.Marshaler.
Symbolic frequencies are +// marshaled as their string name; numeric values as plain numbers (kHz) +// for backwards compatibility. +func (f Frequency) MarshalJSON() ([]byte, error) { + switch f { + case FrequencyMin: + return json.Marshal("min") + case FrequencyBase: + return json.Marshal("base") + case FrequencyTurbo: + return json.Marshal("turbo") + } + return json.Marshal(uint(f)) +} + +// KHz returns the frequency value in kHz. For symbolic frequencies +// (min, base, turbo) this returns the sentinel value; use Resolve() +// first to obtain the actual platform frequency. +func (f Frequency) KHz() uint { + return uint(f) +} + +// IsSymbolic returns true if this frequency is a symbolic name +// (min, base, or turbo) that requires runtime resolution. +func (f Frequency) IsSymbolic() bool { + return f == FrequencyMin || f == FrequencyBase || f == FrequencyTurbo +} + +// Resolve converts a symbolic frequency to its concrete kHz value +// using platform frequency information. For non-symbolic frequencies, +// the value is returned unchanged. The parameters are: +// - minKHz: platform minimum frequency (cpufreq/cpuinfo_min_freq) +// - baseKHz: CPU base frequency (cpufreq/base_frequency) +// - turboKHz: maximum turbo frequency (cpufreq/cpuinfo_max_freq) +func (f Frequency) Resolve(minKHz, baseKHz, turboKHz uint) uint { + switch f { + case FrequencyMin: + return minKHz + case FrequencyBase: + return baseKHz + case FrequencyTurbo: + return turboKHz + } + return uint(f) +} + +// String returns a human-readable representation. 
+func (f Frequency) String() string { + switch f { + case FrequencyMin: + return "min" + case FrequencyBase: + return "base" + case FrequencyTurbo: + return "turbo" + } + kHz := uint(f) + if kHz == 0 { + return "0" + } + if kHz >= 1_000_000 && kHz%1_000_000 == 0 { + return fmt.Sprintf("%dGHz", kHz/1_000_000) + } + if kHz >= 1_000 && kHz%1_000 == 0 { + return fmt.Sprintf("%dMHz", kHz/1_000) + } + return fmt.Sprintf("%dkHz", kHz) +} diff --git a/pkg/apis/config/v1alpha1/resmgr/policy/frequency_test.go b/pkg/apis/config/v1alpha1/resmgr/policy/frequency_test.go new file mode 100644 index 000000000..53d8d1cc4 --- /dev/null +++ b/pkg/apis/config/v1alpha1/resmgr/policy/frequency_test.go @@ -0,0 +1,156 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package policy + +import ( + "encoding/json" + "testing" +) + +func TestFrequencyResolve(t *testing.T) { + const ( + minHz uint = 800000 + baseHz uint = 2400000 + turboHz uint = 3800000 + ) + cases := []struct { + name string + f Frequency + want uint + }{ + {"min sentinel", FrequencyMin, minHz}, + {"base sentinel", FrequencyBase, baseHz}, + {"turbo sentinel", FrequencyTurbo, turboHz}, + {"concrete value passed through", Frequency(1500000), 1500000}, + {"zero stays zero", Frequency(0), 0}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got := tc.f.Resolve(minHz, baseHz, turboHz) + if got != tc.want { + t.Errorf("Resolve = %d, want %d", got, tc.want) + } + }) + } +} + +func TestFrequencyIsSymbolic(t *testing.T) { + if !FrequencyMin.IsSymbolic() { + t.Errorf("FrequencyMin.IsSymbolic = false, want true") + } + if !FrequencyBase.IsSymbolic() { + t.Errorf("FrequencyBase.IsSymbolic = false, want true") + } + if !FrequencyTurbo.IsSymbolic() { + t.Errorf("FrequencyTurbo.IsSymbolic = false, want true") + } + if Frequency(3000000).IsSymbolic() { + t.Errorf("concrete frequency must not be IsSymbolic") + } + if Frequency(0).IsSymbolic() { + t.Errorf("zero must not be IsSymbolic") + } +} + +func TestFrequencyUnmarshalJSON(t *testing.T) { + cases := []struct { + name string + input string + want Frequency + wantErr bool + }{ + {"symbolic min", `"min"`, FrequencyMin, false}, + {"symbolic base", `"base"`, FrequencyBase, false}, + {"symbolic turbo", `"turbo"`, FrequencyTurbo, false}, + {"symbolic uppercase", `"TURBO"`, FrequencyTurbo, false}, + {"GHz fractional", `"3.2GHz"`, Frequency(3200000), false}, + {"GHz short", `"2G"`, Frequency(2000000), false}, + {"MHz", `"2900MHz"`, Frequency(2900000), false}, + {"MHz short", `"2900M"`, Frequency(2900000), false}, + {"kHz explicit", `"2900000kHz"`, Frequency(2900000), false}, + {"kHz short", `"2900000k"`, Frequency(2900000), false}, + {"bare number as kHz", `"2900000"`, Frequency(2900000), false}, + {"json 
number as kHz", `2900000`, Frequency(2900000), false}, + {"empty string", `""`, Frequency(0), false}, + {"invalid unit", `"3GBz"`, 0, true}, + {"negative number", `-1000`, 0, true}, + {"garbage", `"abc"`, 0, true}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + var f Frequency + err := json.Unmarshal([]byte(tc.input), &f) + if tc.wantErr { + if err == nil { + t.Errorf("Unmarshal(%s) = nil err, want error", tc.input) + } + return + } + if err != nil { + t.Fatalf("Unmarshal(%s) unexpected err: %v", tc.input, err) + } + if f != tc.want { + t.Errorf("Unmarshal(%s) = %d, want %d", tc.input, uint(f), uint(tc.want)) + } + }) + } +} + +func TestFrequencyMarshalJSON(t *testing.T) { + cases := []struct { + name string + f Frequency + want string + }{ + {"min", FrequencyMin, `"min"`}, + {"base", FrequencyBase, `"base"`}, + {"turbo", FrequencyTurbo, `"turbo"`}, + {"concrete", Frequency(2900000), `2900000`}, + {"zero", Frequency(0), `0`}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + b, err := json.Marshal(tc.f) + if err != nil { + t.Fatalf("Marshal err: %v", err) + } + if string(b) != tc.want { + t.Errorf("Marshal = %s, want %s", string(b), tc.want) + } + }) + } +} + +func TestFrequencyRoundTrip(t *testing.T) { + cases := []Frequency{ + FrequencyMin, FrequencyBase, FrequencyTurbo, + Frequency(0), Frequency(2900000), Frequency(3800000), + } + for _, f := range cases { + t.Run(f.String(), func(t *testing.T) { + b, err := json.Marshal(f) + if err != nil { + t.Fatalf("Marshal err: %v", err) + } + var got Frequency + if err := json.Unmarshal(b, &got); err != nil { + t.Fatalf("Unmarshal err: %v", err) + } + if got != f { + t.Errorf("round-trip: got %d, want %d", uint(got), uint(f)) + } + }) + } +} diff --git a/pkg/apis/config/v1alpha1/resmgr/policy/zz_generated.deepcopy.go b/pkg/apis/config/v1alpha1/resmgr/policy/zz_generated.deepcopy.go index 3bef85a34..dc92ae349 100644 --- 
a/pkg/apis/config/v1alpha1/resmgr/policy/zz_generated.deepcopy.go +++ b/pkg/apis/config/v1alpha1/resmgr/policy/zz_generated.deepcopy.go @@ -20,6 +20,31 @@ package policy import () +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CPUClass) DeepCopyInto(out *CPUClass) { + *out = *in + if in.DisabledCstates != nil { + in, out := &in.DisabledCstates, &out.DisabledCstates + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.PctClosID != nil { + in, out := &in.PctClosID, &out.PctClosID + *out = new(int) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CPUClass. +func (in *CPUClass) DeepCopy() *CPUClass { + if in == nil { + return nil + } + out := new(CPUClass) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *SchedulingClass) DeepCopyInto(out *SchedulingClass) { *out = *in diff --git a/pkg/resmgr/cpuclass/cpuclass.go b/pkg/resmgr/cpuclass/cpuclass.go new file mode 100644 index 000000000..21da445aa --- /dev/null +++ b/pkg/resmgr/cpuclass/cpuclass.go @@ -0,0 +1,286 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package cpuclass is the resource-manager-wide CPU class handler. 
+// It owns the per-CPU frequency, c-state, uncore-frequency and +// Intel Priority Core Turbo state implied by a list of user-facing +// CPU class definitions. +// +// Policies talk to a single *Handler, constructed with New(sys). +// Configure(spec) installs (or replaces) the class set; UseClass +// pins given CPUs to a named class; Commit() flushes deferred +// per-CPU sysfs writes; Hints() returns placement preferences a +// policy can use when picking new CPUs for an allocation. +package cpuclass + +import ( + "fmt" + "sort" + + policyapi "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/policy" + logger "github.com/containers/nri-plugins/pkg/log" + "github.com/containers/nri-plugins/pkg/resmgr/cpuclass/internal/cpufreq" + "github.com/containers/nri-plugins/pkg/resmgr/cpuclass/internal/cpuidle" + "github.com/containers/nri-plugins/pkg/resmgr/cpuclass/internal/pct" + "github.com/containers/nri-plugins/pkg/resmgr/cpuclass/internal/types" + "github.com/containers/nri-plugins/pkg/resmgr/cpuclass/internal/uncorefreq" + "github.com/containers/nri-plugins/pkg/sysfs" + "github.com/containers/nri-plugins/pkg/utils/cpuset" +) + +var log = logger.NewLogger("cpuclass") + +// AllocationIntent describes an upcoming CPU allocation for which +// the caller wants placement preferences. +type AllocationIntent = types.AllocationIntent + +// AllocationHints carries technology-agnostic placement preferences +// returned by Handler.Hints. +type AllocationHints = types.AllocationHints + +// CpuPreference is a named CPU set carrying a single placement +// preference (prefer or avoid). +type CpuPreference = types.CpuPreference + +// ConfigSpec carries cpuclass configuration applied via +// Handler.Configure. Idleness is intentionally absent: the caller +// decides which class name (if any) means "idle" and applies it via +// UseClass. +type ConfigSpec struct { + // Classes is the user-facing list of CPU classes. 
+ Classes []*policyapi.CPUClass + // TurboDomain selects the per-domain turbo arbitration scope. + // Empty resolves to "package". + TurboDomain string + // Allowed bounds every cpuclass operation. CPUs outside this + // set are silently dropped by Configure, UseClass and Hints. + Allowed cpuset.CPUSet +} + +// Handler is the sole cpuclass entry point for policy code. It owns +// construction and configuration of the per-technology allocators +// (cpufreq, pct) and writers (cpufreq, cpuidle, uncorefreq). +type Handler struct { + sys sysfs.System + allowed cpuset.CPUSet + + cpufreq *cpufreq.Allocator + pct *pct.Allocator + + // defs maps synthetic class name -> resolved class definition. + // Populated by SetClassDef calls from the cpufreq allocator. + defs map[string]types.ClassDef + // cpuClass maps cpu id -> synthetic class name. Value "" means + // "explicitly assigned to no class". Absent CPUs are unmanaged. + cpuClass map[int]string + // dirtyCPUs tracks CPUs whose class assignment or whose class + // definition changed since the last Commit(). + dirtyCPUs map[int]bool + + freqWriter *cpufreq.Writer + idleWriter *cpuidle.Writer + uncoreWriter *uncorefreq.Writer +} + +// New constructs a Handler with both internal allocators (cpufreq +// and pct) ready in a "no configuration applied" state. Configure +// must be called before the handler is usable. 
+func New(sys sysfs.System) (*Handler, error) { + h := &Handler{ + sys: sys, + defs: map[string]types.ClassDef{}, + cpuClass: map[int]string{}, + dirtyCPUs: map[int]bool{}, + freqWriter: cpufreq.NewWriter(cpufreq.Hooks{}), + idleWriter: cpuidle.NewWriter(cpuidle.Hooks{}), + uncoreWriter: uncorefreq.NewWriter(uncorefreq.Hooks{}), + } + freq, err := cpufreq.New(sys, h) + if err != nil { + return nil, fmt.Errorf("cpuclass: failed to create cpufreq allocator: %w", err) + } + pctA, err := pct.NewAllocator(sys) + if err != nil { + return nil, fmt.Errorf("cpuclass: failed to create pct allocator: %w", err) + } + h.cpufreq = freq + h.pct = pctA + return h, nil +} + +// PctFreeClassCapacity returns the number of logical CPUs that the +// PCT allocator can still route into the named cpuClass on this +// node, given that 'held' lists CPUs already consumed by some +// balloon belonging to any other cpuClass. Returns 0 if PCT is +// inactive or the class has no PCT plan. +func (h *Handler) PctFreeClassCapacity(className string, held cpuset.CPUSet) int { + if h == nil || h.pct == nil { + return 0 + } + return h.pct.FreeClassCapacity(className, held) +} + +// PctActive reports whether PCT is in effect on this node. +func (h *Handler) PctActive() bool { + return h != nil && h.pct != nil && h.pct.Active() +} + +// Configure (re)applies a configuration spec. Idempotent: may be +// called repeatedly with changed classes, turbo-domain mode, or +// allowed set. 
+func (h *Handler) Configure(spec ConfigSpec) error { + h.allowed = spec.Allowed + h.defs = map[string]types.ClassDef{} + h.cpuClass = map[int]string{} + h.dirtyCPUs = map[int]bool{} + h.freqWriter.Reset() + h.uncoreWriter.Reset() + if err := h.cpufreq.Configure(spec.Classes, spec.TurboDomain, spec.Allowed); err != nil { + return fmt.Errorf("cpuclass: cpufreq configure: %w", err) + } + if name, needs := uncorefreq.RequiresAvailable(h.defs); needs && !h.uncoreWriter.Available() { + return uncorefreq.UnavailableError(name) + } + if err := h.pct.Configure(spec.Classes, spec.Allowed); err != nil { + return fmt.Errorf("cpuclass: pct configure: %w", err) + } + return nil +} + +// SetClassDef records a class definition keyed by its synthetic +// name. If the definition materially changes, every CPU currently +// assigned to that synthetic class is marked dirty. Implements the +// cpufreq.Sink interface. +func (h *Handler) SetClassDef(name string, def types.ClassDef) { + if name == "" { + return + } + prev, had := h.defs[name] + h.defs[name] = def + if had && prev.Equal(def) { + return + } + for cpu, cls := range h.cpuClass { + if cls == name { + h.dirtyCPUs[cpu] = true + } + } +} + +// AssignCPUs updates the (cpu -> synthetic class) map for the given +// CPUs. CPUs whose class changes are added to the dirty set. An +// empty class name means "no class". Implements the cpufreq.Sink +// interface. +func (h *Handler) AssignCPUs(name string, cpus []int) { + for _, cpu := range cpus { + prev, had := h.cpuClass[cpu] + if had && prev == name { + continue + } + h.cpuClass[cpu] = name + h.dirtyCPUs[cpu] = true + } +} + +// Commit flushes pending cpufreq, cpuidle and uncore changes to +// sysfs. Per-property writes are deduplicated against the writers' +// lastWritten caches. 
+func (h *Handler) Commit() error { + if h == nil || len(h.dirtyCPUs) == 0 { + return nil + } + perClass := map[string][]int{} + for cpu := range h.dirtyCPUs { + name, ok := h.cpuClass[cpu] + if !ok || name == "" { + continue + } + perClass[name] = append(perClass[name], cpu) + } + var firstErr error + for name, cpus := range perClass { + sort.Ints(cpus) + def, ok := h.defs[name] + if !ok { + log.Debugf("cpuclass: Commit: no definition for class %q; skipping cpus %v", name, cpus) + continue + } + if err := h.freqWriter.Enforce(name, def, cpus); err != nil && firstErr == nil { + firstErr = err + } + if err := h.idleWriter.Enforce(name, def.DisabledCstates, cpus); err != nil && firstErr == nil { + firstErr = err + } + } + dirtyDies := uncorefreq.DiesForCpus(h.sys, h.dirtyCPUs) + if err := h.uncoreWriter.Enforce(h.sys, h.defs, h.cpuClass, dirtyDies); err != nil && firstErr == nil { + firstErr = err + } + h.dirtyCPUs = map[int]bool{} + return firstErr +} + +// UseClass applies className to the given CPUs across every internal +// allocator. An empty className means "no class". CPUs outside the +// configured Allowed set are silently dropped. +func (h *Handler) UseClass(className string, cpus cpuset.CPUSet) error { + if err := h.cpufreq.UseClass(className, cpus); err != nil { + log.Warnf("cpuclass: cpufreq failed to apply class %q on CPUs %s: %v", className, cpus, err) + } + if err := h.pct.UseClass(className, cpus); err != nil { + log.Warnf("cpuclass: pct failed to apply class %q on CPUs %s: %v", className, cpus, err) + } + return nil +} + +// Hints returns technology-agnostic placement preferences for an +// upcoming CPU allocation. The returned CpuPreference sets are +// always subsets of the configured Allowed set. 
+func (h *Handler) Hints(intent AllocationIntent) AllocationHints { + hints := h.pct.Hints(intent) + if h.allowed.Size() > 0 { + hints = intersectHints(hints, h.allowed) + } + return hints +} + +// Shutdown releases any platform-level resources owned by the +// handler. Safe to call multiple times. +func (h *Handler) Shutdown() error { + if h == nil || h.pct == nil { + return nil + } + return h.pct.Shutdown() +} + +// intersectHints returns a copy of hints with every CpuPreference +// constrained to the given bound. Empty preferences are dropped. +func intersectHints(hints AllocationHints, bound cpuset.CPUSet) AllocationHints { + out := AllocationHints{} + for _, p := range hints.Prefer { + s := p.Cpus.Intersection(bound) + if s.IsEmpty() { + continue + } + out.Prefer = append(out.Prefer, CpuPreference{Name: p.Name, Cpus: s}) + } + for _, p := range hints.Avoid { + s := p.Cpus.Intersection(bound) + if s.IsEmpty() { + continue + } + out.Avoid = append(out.Avoid, CpuPreference{Name: p.Name, Cpus: s}) + } + return out +} diff --git a/pkg/resmgr/cpuclass/handler_commit_test.go b/pkg/resmgr/cpuclass/handler_commit_test.go new file mode 100644 index 000000000..b2b4d37d7 --- /dev/null +++ b/pkg/resmgr/cpuclass/handler_commit_test.go @@ -0,0 +1,347 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package cpuclass + +import ( + "sync" + "testing" + + idset "github.com/intel/goresctrl/pkg/utils" + + "github.com/containers/nri-plugins/pkg/resmgr/cpuclass/internal/cpufreq" + "github.com/containers/nri-plugins/pkg/resmgr/cpuclass/internal/cpuidle" + "github.com/containers/nri-plugins/pkg/resmgr/cpuclass/internal/types" + "github.com/containers/nri-plugins/pkg/resmgr/cpuclass/internal/uncorefreq" + "github.com/containers/nri-plugins/pkg/sysfs" + "github.com/containers/nri-plugins/pkg/utils/cpuset" +) + +// dieFakePackage extends the package fake with die support so the +// uncore writer can enumerate (pkg, die) tuples. +type dieFakePackage struct { + sysfs.CPUPackage + id idset.ID + cpus cpuset.CPUSet + dies []idset.ID + dieCpus map[idset.ID]cpuset.CPUSet +} + +func (p *dieFakePackage) ID() idset.ID { return p.id } +func (p *dieFakePackage) CPUSet() cpuset.CPUSet { return p.cpus } +func (p *dieFakePackage) DieIDs() []idset.ID { return p.dies } +func (p *dieFakePackage) DieCPUSet(d idset.ID) cpuset.CPUSet { return p.dieCpus[d] } + +// dieFakeCPU augments the cpu fake with package id. +type dieFakeCPU struct { + sysfs.CPU + id idset.ID + pkg idset.ID +} + +func (c *dieFakeCPU) ID() idset.ID { return c.id } +func (c *dieFakeCPU) PackageID() idset.ID { return c.pkg } + +// dieFakeSys is the minimum sysfs.System surface used by the +// uncore writer (Package, CPU, PackageIDs, DieIDs, DieCPUSet). +// Unimplemented methods panic via the embedded nil interface. 
+type dieFakeSys struct { + sysfs.System + packages map[idset.ID]*dieFakePackage + cpuPkg map[int]idset.ID +} + +func (s *dieFakeSys) PackageIDs() []idset.ID { + ids := make([]idset.ID, 0, len(s.packages)) + for id := range s.packages { + ids = append(ids, id) + } + return ids +} + +func (s *dieFakeSys) Package(id idset.ID) sysfs.CPUPackage { + if p, ok := s.packages[id]; ok { + return p + } + return nil +} + +func (s *dieFakeSys) CPU(id idset.ID) sysfs.CPU { + pkg, ok := s.cpuPkg[int(id)] + if !ok { + return nil + } + return &dieFakeCPU{id: id, pkg: pkg} +} + +// dieFakeCpu specifies the (pkg, die) location of a single CPU when +// building a dieFakeSys. +type dieFakeCpu struct { + pkg int + die int +} + +// newDieFakeSys builds a dieFakeSys from a map cpu -> (pkg, die). +func newDieFakeSys(cpus map[int]dieFakeCpu) *dieFakeSys { + pkgs := map[idset.ID]*dieFakePackage{} + cpuPkg := map[int]idset.ID{} + type pkgDieKey struct{ pkg, die int } + dieCpus := map[pkgDieKey]cpuset.CPUSet{} + pkgCpus := map[int]cpuset.CPUSet{} + pkgDies := map[int]map[int]bool{} + for cpu, loc := range cpus { + cpuPkg[cpu] = idset.ID(loc.pkg) + pkgCpus[loc.pkg] = pkgCpus[loc.pkg].Union(cpuset.New(cpu)) + k := pkgDieKey(loc) + dieCpus[k] = dieCpus[k].Union(cpuset.New(cpu)) + if pkgDies[loc.pkg] == nil { + pkgDies[loc.pkg] = map[int]bool{} + } + pkgDies[loc.pkg][loc.die] = true + } + for pkg, dies := range pkgDies { + dList := make([]idset.ID, 0, len(dies)) + for d := range dies { + dList = append(dList, idset.ID(d)) + } + dc := map[idset.ID]cpuset.CPUSet{} + for d := range dies { + dc[idset.ID(d)] = dieCpus[pkgDieKey{pkg, d}] + } + pkgs[idset.ID(pkg)] = &dieFakePackage{ + id: idset.ID(pkg), + cpus: pkgCpus[pkg], + dies: dList, + dieCpus: dc, + } + } + return &dieFakeSys{packages: pkgs, cpuPkg: cpuPkg} +} + +// recordingWriters captures the per-CPU and per-die writes issued by +// Commit() so tests can assert exactly what was programmed. 
+type recordingWriters struct { + mu sync.Mutex + minF map[int]int + maxF map[int]int + gov map[int]string + minU map[uncorefreq.DieKey]int + maxU map[uncorefreq.DieKey]int + minCnt int + maxCnt int + govCnt int + uMinCnt int + uMaxCnt int +} + +func newRecordingWriters() *recordingWriters { + return &recordingWriters{ + minF: map[int]int{}, + maxF: map[int]int{}, + gov: map[int]string{}, + minU: map[uncorefreq.DieKey]int{}, + maxU: map[uncorefreq.DieKey]int{}, + } +} + +// installOn replaces the cpufreq and uncore writers of h with +// in-memory recorders. The cpuidle writer is replaced by a no-op so +// tests do not need a real cstates handle. +func (r *recordingWriters) installOn(h *Handler) { + h.freqWriter = cpufreq.NewWriter(cpufreq.Hooks{ + SetMin: func(cpu, freq int) error { + r.mu.Lock() + defer r.mu.Unlock() + r.minF[cpu] = freq + r.minCnt++ + return nil + }, + SetMax: func(cpu, freq int) error { + r.mu.Lock() + defer r.mu.Unlock() + r.maxF[cpu] = freq + r.maxCnt++ + return nil + }, + SetGov: func(cpu int, g string) error { + r.mu.Lock() + defer r.mu.Unlock() + r.gov[cpu] = g + r.govCnt++ + return nil + }, + }) + h.uncoreWriter = uncorefreq.NewWriter(uncorefreq.Hooks{ + SetMin: func(pkg, die, freq int) error { + r.mu.Lock() + defer r.mu.Unlock() + r.minU[uncorefreq.DieKey{Pkg: pkg, Die: die}] = freq + r.uMinCnt++ + return nil + }, + SetMax: func(pkg, die, freq int) error { + r.mu.Lock() + defer r.mu.Unlock() + r.maxU[uncorefreq.DieKey{Pkg: pkg, Die: die}] = freq + r.uMaxCnt++ + return nil + }, + }) + h.idleWriter = cpuidle.NewWriter(cpuidle.Hooks{}) +} + +// newBareHandler returns a Handler with empty state, no sysfs +// topology (callers may set h.sys), and the recording writers +// installed. The cpuidle writer is left in a state where Enforce +// will return early because no class has DisabledCstates. 
+func newBareHandler() (*Handler, *recordingWriters) { + h := &Handler{ + defs: map[string]types.ClassDef{}, + cpuClass: map[int]string{}, + dirtyCPUs: map[int]bool{}, + } + r := newRecordingWriters() + r.installOn(h) + return h, r +} + +// TestCommitIdempotentCpufreq verifies that a second Commit() with +// no state change re-issues zero sysfs writes. +func TestCommitIdempotentCpufreq(t *testing.T) { + h, r := newBareHandler() + h.SetClassDef("hp@d0", types.ClassDef{MinFreq: 800_000, MaxFreq: 4_600_000, FreqGovernor: "performance"}) + h.AssignCPUs("hp@d0", []int{0, 1}) + if err := h.Commit(); err != nil { + t.Fatalf("first Commit: %v", err) + } + if r.minCnt != 2 || r.maxCnt != 2 || r.govCnt != 2 { + t.Fatalf("expected 2 of each write, got min=%d max=%d gov=%d", r.minCnt, r.maxCnt, r.govCnt) + } + if err := h.Commit(); err != nil { + t.Fatalf("second Commit: %v", err) + } + if r.minCnt != 2 || r.maxCnt != 2 || r.govCnt != 2 { + t.Fatalf("second Commit should be no-op, got min=%d max=%d gov=%d", r.minCnt, r.maxCnt, r.govCnt) + } +} + +// TestClassDefChangeDirtiesAssignedCpus verifies that updating a +// class definition reprograms the CPUs already assigned to that +// class on the next Commit, without requiring a re-assign. +func TestClassDefChangeDirtiesAssignedCpus(t *testing.T) { + h, r := newBareHandler() + h.SetClassDef("hp@d0", types.ClassDef{MinFreq: 800_000, MaxFreq: 4_000_000}) + h.AssignCPUs("hp@d0", []int{0, 1}) + if err := h.Commit(); err != nil { + t.Fatalf("Commit#1: %v", err) + } + h.SetClassDef("hp@d0", types.ClassDef{MinFreq: 800_000, MaxFreq: 4_600_000}) + if err := h.Commit(); err != nil { + t.Fatalf("Commit#2: %v", err) + } + for _, cpu := range []int{0, 1} { + if r.maxF[cpu] != 4_600_000 { + t.Errorf("cpu%d max=%d, want 4_600_000", cpu, r.maxF[cpu]) + } + } +} + +// TestAssignToEmptyClassDoesNotWriteCpufreq verifies that moving a +// CPU to the empty class leaves the writers untouched. 
+func TestAssignToEmptyClassDoesNotWriteCpufreq(t *testing.T) { + h, r := newBareHandler() + h.SetClassDef("hp@d0", types.ClassDef{MinFreq: 800_000, MaxFreq: 4_000_000, FreqGovernor: "performance"}) + h.AssignCPUs("hp@d0", []int{0}) + if err := h.Commit(); err != nil { + t.Fatalf("Commit#1: %v", err) + } + r.maxCnt, r.minCnt, r.govCnt = 0, 0, 0 + h.AssignCPUs("", []int{0}) + if err := h.Commit(); err != nil { + t.Fatalf("Commit#2: %v", err) + } + if r.minCnt+r.maxCnt+r.govCnt != 0 { + t.Errorf("empty class should not write to cpufreq, got min=%d max=%d gov=%d", r.minCnt, r.maxCnt, r.govCnt) + } +} + +// TestUncoreSkipBothZero verifies that a die with effective min=0 +// and max=0 produces no uncore writes. +func TestUncoreSkipBothZero(t *testing.T) { + sys := newDieFakeSys(map[int]dieFakeCpu{ + 0: {pkg: 0, die: 0}, + 1: {pkg: 0, die: 0}, + }) + h, r := newBareHandler() + h.sys = sys + h.SetClassDef("idle@d0", types.ClassDef{MinFreq: 800_000}) + h.AssignCPUs("idle@d0", []int{0, 1}) + if err := h.Commit(); err != nil { + t.Fatalf("Commit: %v", err) + } + if r.uMinCnt != 0 || r.uMaxCnt != 0 { + t.Errorf("uncore should not be written when both limits are 0, got min=%d max=%d", r.uMinCnt, r.uMaxCnt) + } +} + +// TestUncoreMaxWinsAcrossClasses verifies the per-die max-wins +// reduction when multiple classes are active on the same die. 
+func TestUncoreMaxWinsAcrossClasses(t *testing.T) { + sys := newDieFakeSys(map[int]dieFakeCpu{ + 0: {pkg: 0, die: 0}, + 1: {pkg: 0, die: 0}, + }) + h, r := newBareHandler() + h.sys = sys + h.SetClassDef("lo@d0", types.ClassDef{UncoreMinFreq: 800_000, UncoreMaxFreq: 1_500_000}) + h.SetClassDef("hi@d0", types.ClassDef{UncoreMinFreq: 1_200_000, UncoreMaxFreq: 2_400_000}) + h.AssignCPUs("lo@d0", []int{0}) + h.AssignCPUs("hi@d0", []int{1}) + if err := h.Commit(); err != nil { + t.Fatalf("Commit: %v", err) + } + key := uncorefreq.DieKey{Pkg: 0, Die: 0} + if got := r.maxU[key]; got != 2_400_000 { + t.Errorf("uncore max = %d, want 2_400_000 (hi class wins)", got) + } + if got := r.minU[key]; got != 1_200_000 { + t.Errorf("uncore min = %d, want 1_200_000 (hi class wins)", got) + } +} + +// TestUncoreRecomputesOnAssignmentChange verifies that removing the +// winner class from a die triggers a fresh write with the loser's +// (lower) values. +func TestUncoreRecomputesOnAssignmentChange(t *testing.T) { + sys := newDieFakeSys(map[int]dieFakeCpu{ + 0: {pkg: 0, die: 0}, + 1: {pkg: 0, die: 0}, + }) + h, r := newBareHandler() + h.sys = sys + h.SetClassDef("lo@d0", types.ClassDef{UncoreMaxFreq: 1_500_000}) + h.SetClassDef("hi@d0", types.ClassDef{UncoreMaxFreq: 2_400_000}) + h.AssignCPUs("lo@d0", []int{0}) + h.AssignCPUs("hi@d0", []int{1}) + if err := h.Commit(); err != nil { + t.Fatalf("Commit#1: %v", err) + } + h.AssignCPUs("lo@d0", []int{1}) + if err := h.Commit(); err != nil { + t.Fatalf("Commit#2: %v", err) + } + if got := r.maxU[uncorefreq.DieKey{Pkg: 0, Die: 0}]; got != 1_500_000 { + t.Errorf("uncore max after hi removed = %d, want 1_500_000", got) + } +} diff --git a/pkg/resmgr/cpuclass/internal/cpufreq/cpufreq.go b/pkg/resmgr/cpuclass/internal/cpufreq/cpufreq.go new file mode 100644 index 000000000..aee4fbe82 --- /dev/null +++ b/pkg/resmgr/cpuclass/internal/cpufreq/cpufreq.go @@ -0,0 +1,338 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. 
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package cpufreq owns the cpufreq-side CPU-class lifecycle:
// resolution of symbolic frequencies (min/base/turbo), turbo-priority
// winner selection per turbo domain, and the per-CPU sysfs writes
// that follow. The package is consumed by the cpuclass handler and
// exposes no behavior to user-facing code.
package cpufreq

import (
	"fmt"
	"sort"

	policyapi "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/policy"
	logger "github.com/containers/nri-plugins/pkg/log"
	"github.com/containers/nri-plugins/pkg/resmgr/cpuclass/internal/types"
	"github.com/containers/nri-plugins/pkg/sysfs"
	"github.com/containers/nri-plugins/pkg/utils/cpuset"
)

// log is the package logger; it shares the "cpuclass" source name.
var log = logger.NewLogger("cpuclass")

// Sink is the back-channel through which the allocator publishes
// resolved class definitions and per-CPU class assignments to its
// owner (the cpuclass handler). The handler turns these into per-CPU
// dirty bits and sysfs writes performed by its Commit().
type Sink interface {
	// SetClassDef publishes the resolved definition of the class called
	// name. The allocator passes per-turbo-domain synthetic names here
	// (see syntheticName), not the user-facing class name.
	SetClassDef(name string, def types.ClassDef)
	// AssignCPUs reports that cpus now belong to the class called name.
	// An empty name is passed when the CPUs are moved to "no class".
	AssignCPUs(name string, cpus []int)
}

// Allocator owns the per-turbo-domain class state for cpufreq.
+type Allocator struct { + sys sysfs.System + sink Sink + classes []*policyapi.CPUClass + classByName map[string]*policyapi.CPUClass + turboDomain string + turboInfo *platformTurboInfo + allowed cpuset.CPUSet + + cpuDomain map[int]domainID + domains []domainID + + // activeCpus[d][className] is the set of CPUs in turbo domain d + // currently assigned to className. + activeCpus map[domainID]map[string]cpuset.CPUSet + + // winnerPrio[d] is the highest TurboPriority among classes that + // had any active CPUs in domain d the last time + // recalculateTurbo(d) ran. -1 forces the first recalculation. + winnerPrio map[domainID]int +} + +// domainID identifies one turbo arbitration domain. +type domainID int + +const systemDomainID domainID = 0 + +const ( + turboDomainPackage = "package" + turboDomainSystem = "system" +) + +// New returns an Allocator that publishes class definitions and +// per-CPU assignments to sink. The constructor does not push any +// class definitions; the caller follows up with Configure(). +func New(sys sysfs.System, sink Sink) (*Allocator, error) { + if sys == nil { + return nil, fmt.Errorf("cpufreq: missing required argument sys") + } + if sink == nil { + return nil, fmt.Errorf("cpufreq: missing required argument sink") + } + a := &Allocator{ + sys: sys, + sink: sink, + activeCpus: map[domainID]map[string]cpuset.CPUSet{}, + winnerPrio: map[domainID]int{}, + } + a.discoverPlatformInfo() + return a, nil +} + +// Configure replaces the CPU class set, turbo domain mode and the +// set of allowed CPUs. Resets per-domain turbo winners and +// re-publishes class definitions to the sink. 
+func (a *Allocator) Configure(classes []*policyapi.CPUClass, turboDomain string, allowed cpuset.CPUSet) error { + a.classes = classes + a.classByName = make(map[string]*policyapi.CPUClass, len(classes)) + for _, cc := range classes { + a.classByName[cc.Name] = cc + } + switch turboDomain { + case "", turboDomainPackage, turboDomainSystem: + a.turboDomain = turboDomain + default: + return fmt.Errorf("cpufreq: unsupported turboDomain %q (expected %q or %q)", + turboDomain, turboDomainPackage, turboDomainSystem) + } + a.allowed = allowed + a.buildCpuDomains() + a.activeCpus = map[domainID]map[string]cpuset.CPUSet{} + a.winnerPrio = map[domainID]int{} + a.pushInitialClassDefinitions() + return nil +} + +// IsKnownClass reports whether the given class name is known to the +// allocator's CPUClasses configuration. +func (a *Allocator) IsKnownClass(name string) bool { + _, ok := a.classByName[name] + return ok +} + +// resolveClassName logs an error for unknown names and returns the +// name unchanged so the caller sees what was requested. +func (a *Allocator) resolveClassName(name string) string { + if name == "" { + return "" + } + if a.IsKnownClass(name) { + return name + } + log.Errorf("unknown CPU class %q", name) + return name +} + +// UseClass marks the given CPUs as active under className, +// recalculates the turbo winner of every affected turbo domain, then +// publishes per-CPU assignments to the sink. CPUs outside the +// configured Allowed set are silently dropped. 
+func (a *Allocator) UseClass(className string, cpus cpuset.CPUSet) error { + if a.allowed.Size() > 0 { + cpus = cpus.Intersection(a.allowed) + } + if cpus.IsEmpty() { + return nil + } + className = a.resolveClassName(className) + a.removeCpusFromAllClasses(cpus) + byDomain := a.cpusByDomain(cpus) + if className != "" { + for d, dc := range byDomain { + if a.activeCpus[d] == nil { + a.activeCpus[d] = map[string]cpuset.CPUSet{} + } + a.activeCpus[d][className] = a.activeCpus[d][className].Union(dc) + } + } + for d := range byDomain { + a.recalculateTurbo(d) + } + for d, dc := range byDomain { + syn := a.syntheticName(className, d) + a.sink.AssignCPUs(syn, dc.UnsortedList()) + } + return nil +} + +// removeCpusFromAllClasses removes the given CPUs from every active +// class set, in every turbo domain. +func (a *Allocator) removeCpusFromAllClasses(cpus cpuset.CPUSet) { + for d, perClass := range a.activeCpus { + for name, set := range perClass { + newSet := set.Difference(cpus) + if newSet.IsEmpty() { + delete(perClass, name) + } else { + perClass[name] = newSet + } + } + if len(perClass) == 0 { + delete(a.activeCpus, d) + } + } +} + +func (a *Allocator) cpusByDomain(cpus cpuset.CPUSet) map[domainID]cpuset.CPUSet { + out := map[domainID]cpuset.CPUSet{} + for _, cpu := range cpus.UnsortedList() { + d, ok := a.cpuDomain[cpu] + if !ok { + d = systemDomainID + } + out[d] = out[d].Union(cpuset.New(cpu)) + } + return out +} + +func (a *Allocator) buildCpuDomains() { + a.cpuDomain = map[int]domainID{} + seen := map[domainID]bool{} + mode := a.turboDomain + if mode == "" { + mode = turboDomainPackage + } + for _, cpuID := range a.sys.CPUIDs() { + if a.allowed.Size() > 0 && !a.allowed.Contains(int(cpuID)) { + continue + } + c := a.sys.CPU(cpuID) + if c == nil { + continue + } + var d domainID + switch mode { + case turboDomainSystem: + d = systemDomainID + default: + d = domainID(c.PackageID()) + } + a.cpuDomain[int(cpuID)] = d + seen[d] = true + } + a.domains = a.domains[:0] 
+ for d := range seen { + a.domains = append(a.domains, d) + } + sort.Slice(a.domains, func(i, j int) bool { return a.domains[i] < a.domains[j] }) + for _, d := range a.domains { + a.winnerPrio[d] = -1 + } + log.Debugf("turbo domains (mode=%s): %v (cpu->domain: %v)", mode, a.domains, a.cpuDomain) +} + +// syntheticName returns the per-domain internal name used to track a +// user-facing class in a turbo domain. Empty class names pass +// through unchanged. +func (a *Allocator) syntheticName(name string, d domainID) string { + if name == "" { + return "" + } + if _, ok := a.classByName[name]; !ok { + return name + } + return fmt.Sprintf("%s@d%d", name, d) +} + +// pushInitialClassDefinitions resolves symbolic frequencies in every +// CPUClass and publishes the resulting types.ClassDef to the sink, +// once per (class, turbo domain) pair. +func (a *Allocator) pushInitialClassDefinitions() { + if len(a.domains) == 0 { + return + } + for _, cc := range a.classes { + def := classDefFromCPUClass(cc, a.turboInfo, 0) + for _, d := range a.domains { + a.sink.SetClassDef(a.syntheticName(cc.Name, d), def) + } + log.Infof("cpuClass %q configured: minFreq=%s(%d) maxFreq=%s(%d) disabledCstates=%v", + cc.Name, cc.MinFreq, def.MinFreq, cc.MaxFreq, def.MaxFreq, cc.DisabledCstates) + } +} + +// recalculateTurbo resolves exclusive turbo frequency access in the +// given turbo domain based on TurboPriority across all CPU classes +// that currently have active CPUs in that domain. See the in-tree +// design notes for the algorithm. 
+func (a *Allocator) recalculateTurbo(d domainID) { + if len(a.classes) == 0 { + return + } + newPrio := 0 + if perClass, ok := a.activeCpus[d]; ok { + for _, cc := range a.classes { + if cc.TurboPriority <= newPrio { + continue + } + if set, ok := perClass[cc.Name]; ok && !set.IsEmpty() { + newPrio = cc.TurboPriority + } + } + } + if prev, ok := a.winnerPrio[d]; ok && prev == newPrio { + return + } + a.winnerPrio[d] = newPrio + if a.turboInfo == nil { + log.Warnf("turbo recalculation skipped (domain %d): no platform turbo info", d) + return + } + for _, cc := range a.classes { + effectiveTurboKHz := a.turboInfo.baseFreqKHz + if newPrio == 0 || cc.TurboPriority >= newPrio { + effectiveTurboKHz = a.turboInfo.maxTurboFreqKHz + } + def := classDefFromCPUClass(cc, a.turboInfo, effectiveTurboKHz) + a.sink.SetClassDef(a.syntheticName(cc.Name, d), def) + log.Infof("turbo: domain=%d class %q (prio=%d, winner=%v): minFreq=%d maxFreq=%d", + d, cc.Name, cc.TurboPriority, + newPrio == 0 || cc.TurboPriority >= newPrio, + def.MinFreq, def.MaxFreq) + } +} + +// classDefFromCPUClass converts a user-facing CPUClass to a +// resolved ClassDef. When info is nil, symbolic frequencies resolve +// to 0; when info is non-nil they resolve to the corresponding +// platform value (with effectiveTurboKHz overriding the turbo +// sentinel if non-zero). 
+func classDefFromCPUClass(cc *policyapi.CPUClass, info *platformTurboInfo, effectiveTurboKHz uint) types.ClassDef { + resolve := func(f policyapi.Frequency) uint { + if info != nil { + turboKHz := info.maxTurboFreqKHz + if effectiveTurboKHz > 0 { + turboKHz = effectiveTurboKHz + } + return f.Resolve(info.minFreqKHz, info.baseFreqKHz, turboKHz) + } + if f.IsSymbolic() { + return 0 + } + return f.KHz() + } + return types.ClassDef{ + MinFreq: resolve(cc.MinFreq), + MaxFreq: resolve(cc.MaxFreq), + EnergyPerformancePreference: cc.EnergyPerformancePreference, + UncoreMinFreq: resolve(cc.UncoreMinFreq), + UncoreMaxFreq: resolve(cc.UncoreMaxFreq), + FreqGovernor: cc.FreqGovernor, + DisabledCstates: cc.DisabledCstates, + } +} diff --git a/pkg/resmgr/cpuclass/internal/cpufreq/platform.go b/pkg/resmgr/cpuclass/internal/cpufreq/platform.go new file mode 100644 index 000000000..18ce2e177 --- /dev/null +++ b/pkg/resmgr/cpuclass/internal/cpufreq/platform.go @@ -0,0 +1,71 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cpufreq + +import ( + "fmt" + + "github.com/containers/nri-plugins/pkg/sysfs" +) + +// platformTurboInfo holds platform-level turbo frequency capabilities +// discovered from sysfs. +type platformTurboInfo struct { + baseFreqKHz uint + maxTurboFreqKHz uint + minFreqKHz uint +} + +// discoverPlatformInfo populates a.turboInfo from sysfs. 
Failure is +// non-fatal: symbolic frequencies then resolve to 0. +func (a *Allocator) discoverPlatformInfo() { + info, err := discoverTurboInfo(a.sys) + if err != nil { + log.Warnf("cpufreq: cannot discover platform turbo info: %v", err) + return + } + a.turboInfo = info +} + +// discoverTurboInfo reads platform turbo capabilities from sysfs. It +// uses the first online CPU's frequency range as representative. +func discoverTurboInfo(sys sysfs.System) (*platformTurboInfo, error) { + cpuIDs := sys.CPUIDs() + if len(cpuIDs) == 0 { + return nil, fmt.Errorf("no CPUs found in system topology") + } + for _, id := range cpuIDs { + cpu := sys.CPU(id) + if cpu == nil || !cpu.Online() { + continue + } + freq := cpu.FrequencyRange() + baseFreq := cpu.BaseFrequency() + if freq.Min == 0 && freq.Max == 0 { + log.Warnf("cannot detect cpu%d frequency range, skipping platform turbo info", id) + continue + } + if baseFreq == 0 { + log.Warnf("cannot detect cpu%d base frequency, default to max", id) + baseFreq = freq.Max + } + return &platformTurboInfo{ + baseFreqKHz: uint(baseFreq), + maxTurboFreqKHz: uint(freq.Max), + minFreqKHz: uint(freq.Min), + }, nil + } + return nil, fmt.Errorf("no online CPU with valid frequency information found") +} diff --git a/pkg/resmgr/cpuclass/internal/cpufreq/sysfs.go b/pkg/resmgr/cpuclass/internal/cpufreq/sysfs.go new file mode 100644 index 000000000..cf91491be --- /dev/null +++ b/pkg/resmgr/cpuclass/internal/cpufreq/sysfs.go @@ -0,0 +1,153 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cpufreq + +import ( + "github.com/intel/goresctrl/pkg/utils" + + "github.com/containers/nri-plugins/pkg/resmgr/cpuclass/internal/types" +) + +// Hooks lets tests intercept per-CPU writes without touching real +// sysfs. Production use leaves all hooks nil; the writer then talks +// to the platform via goresctrl. +type Hooks struct { + SetMin func(cpu, kHz int) error + SetMax func(cpu, kHz int) error + SetGov func(cpu int, governor string) error +} + +// cpufreqWritten records the last successfully written values on a +// single CPU. Used for write deduplication. +type cpufreqWritten struct { + min uint + max uint + governor string + hasMin bool + hasMax bool + hasGov bool +} + +// Writer is the direct per-CPU cpufreq sysfs writer. Properties are +// written only when the desired value differs from the last +// successfully written one. Failures on individual CPUs/properties +// are logged but do not stop processing of the remaining ones; the +// first error encountered is returned. +type Writer struct { + hooks Hooks + lastWritten map[int]cpufreqWritten +} + +// NewWriter returns a Writer wired to the given hooks. Pass a +// zero-valued Hooks to use real sysfs via goresctrl. +func NewWriter(hooks Hooks) *Writer { + return &Writer{ + hooks: hooks, + lastWritten: make(map[int]cpufreqWritten), + } +} + +// Reset clears the per-CPU lastWritten cache so the next Enforce +// pass re-writes every desired value. Called by the handler when +// class definitions or the allowed set change. 
+func (w *Writer) Reset() { + w.lastWritten = make(map[int]cpufreqWritten) +} + +// Forget drops the lastWritten cache entries for the given CPUs. +func (w *Writer) Forget(cpus ...int) { + for _, c := range cpus { + delete(w.lastWritten, c) + } +} + +// Enforce writes min/max/governor to sysfs for every CPU in cpus, +// skipping properties whose desired value matches the last written +// one. A zero min or max means "don't enforce". An empty governor +// means "don't enforce". The first error encountered is returned. +func (w *Writer) Enforce(class string, def types.ClassDef, cpus []int) error { + if len(cpus) == 0 { + return nil + } + min := def.MinFreq + max := def.MaxFreq + governor := def.FreqGovernor + + var firstErr error + for _, cpu := range cpus { + state := w.lastWritten[cpu] + + if min > 0 && (!state.hasMin || state.min != min) { + log.Debugf("enforcing cpu frequency min %d from class %q on cpu %d", min, class, cpu) + if err := w.callSetMin(cpu, int(min)); err != nil { + log.Errorf("cpufreq: cpu%d: cannot set min=%d: %v", cpu, min, err) + if firstErr == nil { + firstErr = err + } + } + state.min = min + state.hasMin = true + } + + if max > 0 && (!state.hasMax || state.max != max) { + log.Debugf("enforcing cpu frequency max %d from class %q on cpu %d", max, class, cpu) + if err := w.callSetMax(cpu, int(max)); err != nil { + log.Errorf("cpufreq: cpu%d: cannot set max=%d: %v", cpu, max, err) + if firstErr == nil { + firstErr = err + } + } + state.max = max + state.hasMax = true + } + + if governor != "" && (!state.hasGov || state.governor != governor) { + log.Debugf("enforcing cpu frequency governor %q from class %q on cpu %d", governor, class, cpu) + if err := w.callSetGov(cpu, governor); err != nil { + log.Errorf("cpufreq: cpu%d: cannot set governor=%q: %v", cpu, governor, err) + if firstErr == nil { + firstErr = err + } + } + state.governor = governor + state.hasGov = true + } + + w.lastWritten[cpu] = state + } + + return firstErr +} + +func (w *Writer) 
callSetMin(cpu, freq int) error { + if w.hooks.SetMin != nil { + return w.hooks.SetMin(cpu, freq) + } + return utils.SetCPUScalingMinFreq(utils.ID(cpu), freq) +} + +func (w *Writer) callSetMax(cpu, freq int) error { + if w.hooks.SetMax != nil { + return w.hooks.SetMax(cpu, freq) + } + return utils.SetCPUScalingMaxFreq(utils.ID(cpu), freq) +} + +func (w *Writer) callSetGov(cpu int, governor string) error { + if w.hooks.SetGov != nil { + return w.hooks.SetGov(cpu, governor) + } + return utils.SetCPUScalingGovernor(utils.ID(cpu), governor) +} diff --git a/pkg/resmgr/cpuclass/internal/cpuidle/cpuidle.go b/pkg/resmgr/cpuclass/internal/cpuidle/cpuidle.go new file mode 100644 index 000000000..a208ff757 --- /dev/null +++ b/pkg/resmgr/cpuclass/internal/cpuidle/cpuidle.go @@ -0,0 +1,122 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package cpuidle is the C-state writer used by the cpuclass +// handler. It wraps the goresctrl cstates library, exposing a +// uniform Hooks-injectable interface that matches the cpufreq and +// uncorefreq writers. +package cpuidle + +import ( + "fmt" + + "github.com/intel/goresctrl/pkg/cstates" + + logger "github.com/containers/nri-plugins/pkg/log" +) + +var log = logger.NewLogger("cpuclass") + +// Hooks lets tests intercept the cstate apply operations without +// touching real sysfs. 
Production use leaves all hooks nil; the +// writer then talks to the platform via goresctrl. The two hooks +// mirror the two Apply calls performed per enforce(): enable and +// disable. +type Hooks struct { + Apply func(cpus []int, enabled, disabled []string) error +} + +// Writer enforces per-class enable/disable bits across the cstate +// names exposed by the platform. The cstates handle is created +// lazily on first enforce() call that has any disabled cstates; +// hosts and tests that never request a cstate change therefore +// never touch the cpuidle sysfs. +type Writer struct { + hooks Hooks + cs *cstates.Cstates +} + +// NewWriter returns a Writer wired to the given hooks. Pass a +// zero-valued Hooks to use real sysfs via goresctrl. +func NewWriter(hooks Hooks) *Writer { + return &Writer{hooks: hooks} +} + +// Enforce applies the class-specific C-state enable/disable mask on +// the given CPUs. An empty disabledCstates leaves the writer +// untouched as long as the cstates handle has never been +// initialized. Returns the first error encountered. 
+func (w *Writer) Enforce(class string, disabledCstates []string, cpus []int) error { + if len(cpus) == 0 { + return nil + } + if len(disabledCstates) == 0 && w.cs == nil && w.hooks.Apply == nil && cstatesEnvOverridesJson == "" { + return nil + } + if w.hooks.Apply != nil { + return w.hooks.Apply(cpus, nil, disabledCstates) + } + if err := w.ensureHandle(); err != nil { + return err + } + enabledCstates := []string{} + for _, name := range w.cs.Names() { + enabled := true + for _, d := range disabledCstates { + if name == d { + enabled = false + break + } + } + if enabled { + enabledCstates = append(enabledCstates, name) + } + } + cpuCstates := w.cs.Copy(cstates.NewBasicFilter().SetCPUs(cpus...)) + enCpuCstates := cpuCstates.Copy(cstates.NewBasicFilter().SetCstateNames(enabledCstates...)) + disCpuCstates := cpuCstates.Copy(cstates.NewBasicFilter().SetCstateNames(disabledCstates...)) + enCpuCstates.SetAttrs(cstates.AttrDisable, "0") + disCpuCstates.SetAttrs(cstates.AttrDisable, "1") + log.Debugf("cstates: class %q on cpus %v: enable=%v disable=%v", class, cpus, enabledCstates, disabledCstates) + if err := enCpuCstates.Apply(); err != nil { + return fmt.Errorf("cannot enable cstates %v on cpus %v: %w", enabledCstates, cpus, err) + } + if err := disCpuCstates.Apply(); err != nil { + return fmt.Errorf("cannot disable cstates %v on cpus %v: %w", disabledCstates, cpus, err) + } + return nil +} + +// ensureHandle lazily creates the cstates handle, picking the +// in-memory override fs when OVERRIDE_SYS_CSTATES is set. 
+func (w *Writer) ensureHandle() error { + if w.cs != nil { + return nil + } + filter := cstates.NewBasicFilter().SetAttributes(cstates.AttrDisable) + var ( + cs *cstates.Cstates + err error + ) + if cstatesEnvOverridesJson != "" { + cs, err = newCstatesFromOverride(filter) + } else { + cs, err = cstates.NewCstatesFromSysfs(filter) + } + if err != nil { + return fmt.Errorf("failed to read C-states: %w", err) + } + w.cs = cs + return nil +} diff --git a/pkg/resmgr/cpuclass/internal/cpuidle/overridefs.go b/pkg/resmgr/cpuclass/internal/cpuidle/overridefs.go new file mode 100644 index 000000000..0a8dec47d --- /dev/null +++ b/pkg/resmgr/cpuclass/internal/cpuidle/overridefs.go @@ -0,0 +1,166 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package cpuidle + +import ( + "encoding/json" + "fmt" + "maps" + "os" + "slices" + "strconv" + "strings" + + "github.com/intel/goresctrl/pkg/cstates" + "github.com/intel/goresctrl/pkg/utils" +) + +// cstatesEnvOverridesJson lets e2e tests inject a simulated cstates +// sysfs through the OVERRIDE_SYS_CSTATES environment variable. The +// variable is read once at process start. Production deployments +// leave it unset and use real sysfs. 
+var ( + cstatesEnvOverridesVar = "OVERRIDE_SYS_CSTATES" + cstatesEnvOverridesJson = os.Getenv(cstatesEnvOverridesVar) +) + +type cstatesOverrides []cstatesOverride +type cstatesOverride struct { + Cpus string `json:"cpus"` + Names []string `json:"names"` + Files map[string]string `json:"files"` +} + +type cstatesOverrideFs struct { + overrides cstatesOverrides + stateName map[int]string + nameState map[string]int + cpuStateFile map[utils.ID]map[int]map[string]string +} + +// newCstatesFromOverride builds a *cstates.Cstates backed by an +// in-memory override fs constructed from the OVERRIDE_SYS_CSTATES +// JSON. Used only when that environment variable is set. +func newCstatesFromOverride(filter cstates.Filter) (*cstates.Cstates, error) { + cs := cstates.NewCstates() + ofs, err := newCstatesOverrideFs() + if err != nil { + return nil, fmt.Errorf("failed to create override fs from %s: %v", cstatesEnvOverridesVar, err) + } + cs.SetFs(ofs) + if err := cs.Read(filter); err != nil { + return nil, fmt.Errorf("failed to refresh cstates from %s overrides: %v", cstatesEnvOverridesVar, err) + } + return cs, nil +} + +func newCstatesOverrideFs() (*cstatesOverrideFs, error) { + ofs := &cstatesOverrideFs{ + stateName: make(map[int]string), + nameState: make(map[string]int), + cpuStateFile: make(map[utils.ID]map[int]map[string]string), + } + if err := json.Unmarshal([]byte(cstatesEnvOverridesJson), &ofs.overrides); err != nil { + return nil, err + } + if len(ofs.overrides) == 0 { + return nil, fmt.Errorf("no overrides found in %s", cstatesEnvOverridesVar) + } + names := make(map[string]bool) + for _, o := range ofs.overrides { + for _, name := range o.Names { + names[name] = true + } + } + orderedNames := make([]string, 0, len(names)) + for name := range names { + orderedNames = append(orderedNames, name) + } + slices.Sort(orderedNames) + for state, name := range orderedNames { + ofs.stateName[state] = name + ofs.nameState[name] = state + } + + for _, o := range ofs.overrides { 
+ cpus, err := utils.NewIDSetFromString(o.Cpus) + if err != nil { + return nil, fmt.Errorf("invalid CPU list %q in %s: %v", o.Cpus, cstatesEnvOverridesVar, err) + } + for cpu := range cpus { + cpuid := utils.ID(cpu) + if _, ok := ofs.cpuStateFile[cpuid]; !ok { + ofs.cpuStateFile[cpuid] = make(map[int]map[string]string) + } + for _, name := range o.Names { + state := ofs.nameState[name] + if _, ok := ofs.cpuStateFile[cpuid][state]; !ok { + ofs.cpuStateFile[cpuid][state] = make(map[string]string) + } + maps.Copy(ofs.cpuStateFile[cpuid][state], o.Files) + ofs.cpuStateFile[cpuid][state]["name"] = name + } + } + } + log.Debugf("cstates override fs: loaded overrides for %d CPUs C-states: %s", len(ofs.cpuStateFile), strings.Join(orderedNames, ", ")) + return ofs, nil +} + +func (fs *cstatesOverrideFs) PossibleCpus() (string, error) { + maxCpu := utils.ID(-1) + for cpu := range fs.cpuStateFile { + if cpu > maxCpu { + maxCpu = cpu + } + } + if maxCpu < 0 { + return "", nil + } + return "0-" + strconv.Itoa(maxCpu), nil +} + +func (fs *cstatesOverrideFs) CpuidleStates(cpuID utils.ID) ([]int, error) { + states := []int{} + for state := range fs.stateName { + states = append(states, state) + } + slices.Sort(states) + return states, nil +} + +func (fs *cstatesOverrideFs) CpuidleStateAttrRead(cpu utils.ID, state int, attribute string) (string, error) { + if stateFiles, ok := fs.cpuStateFile[cpu]; ok { + if files, ok := stateFiles[state]; ok { + if val, ok := files[attribute]; ok { + log.Debugf("cstates override fs: read cpu%d cstate=%s %s=%q", cpu, fs.stateName[state], attribute, val) + return val, nil + } + } + } + log.Errorf("cstates override fs: cannot read cpu%d cstate=%s attribute %q", cpu, fs.stateName[state], attribute) + return "", os.ErrNotExist +} + +func (fs *cstatesOverrideFs) CpuidleStateAttrWrite(cpu utils.ID, state int, attribute string, value string) error { + if stateFiles, ok := fs.cpuStateFile[cpu]; ok { + if files, ok := stateFiles[state]; ok { + 
files[attribute] = value + log.Debugf("cstates override fs: wrote cpu%d cstate=%s %s=%q", cpu, fs.stateName[state], attribute, value) + return nil + } + } + log.Errorf("cstates override fs: write to non-existing cpu%d cstate=%d %s=%q ignored", cpu, state, attribute, value) + return nil +} diff --git a/pkg/resmgr/cpuclass/internal/pct/pct.go b/pkg/resmgr/cpuclass/internal/pct/pct.go new file mode 100644 index 000000000..1e4f06ec5 --- /dev/null +++ b/pkg/resmgr/cpuclass/internal/pct/pct.go @@ -0,0 +1,974 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pct + +import ( + "fmt" + "sort" + + idset "github.com/intel/goresctrl/pkg/utils" + + policyapi "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/policy" + logger "github.com/containers/nri-plugins/pkg/log" + "github.com/containers/nri-plugins/pkg/resmgr/cpuclass/internal/types" + "github.com/containers/nri-plugins/pkg/sysfs" + "github.com/containers/nri-plugins/pkg/utils/cpuset" +) + +var log = logger.NewLogger("cpuclass") + +const ( + // pctDefaultHpClos / pctDefaultLpClos are the conventional + // CLOS slots used in managed mode when the user does not pin + // PctClosID explicitly. See the PCT Technical Article example. + pctDefaultHpClos = 0 + pctDefaultLpClos = 3 +) + +// pctMode is the operating mode of the PCT allocator. 
+type pctMode int + +const ( + pctModeDisabled pctMode = iota + pctModeManaged // nri-plugin owns SoC-wide SST + CLOS configs + pctModeAssocOnly // operator/BIOS owns CLOSes; we only associate CPUs +) + +// pctClassPlan records the CLOS that should be used for one PCT +// cpuClass and the freq bounds to program in managed mode. +type pctClassPlan struct { + ClosID int + MinFreq uint // kHz, 0 = leave alone + MaxFreq uint // kHz, 0 = leave alone +} + +// Sys is the subset of sysfs.System that Allocator depends +// on. Defined here so tests can substitute a fake without +// implementing the full sysfs.System surface. +type Sys interface { + PackageIDs() []idset.ID + Package(id idset.ID) sysfs.CPUPackage + CPU(id idset.ID) sysfs.CPU + CPUIDs() []idset.ID +} + +// Allocator manages Intel Priority Core Turbo CLOS associations +// driven by cpuClass definitions. +type Allocator struct { + sys Sys + sst sst + mode pctMode + classByName map[string]*policyapi.CPUClass + classPlan map[string]*pctClassPlan // class name -> CLOS plan (PCT classes only) + // fallbackClos is the hardware CLOS used for CPUs whose class + // is not a PCT class. After SST reset CLOS 0 is the default, + // so we use it here too. This is a hardware-level concept, + // not a user-visible "idle". + fallbackClos int + allowed cpuset.CPUSet + // hpClasses holds the names of cpuClasses currently + // classified as high priority. In managed mode this is every + // class with pctPriority=high. In assoc-only mode it is + // populated from GetClosConfig at Configure(): the CLOS with + // the largest programmed MaxFreq is HP; classes targeting + // that CLOS are HP. Tie-break (equal MaxFreq) goes to the + // smaller CLOS id, matching SST-CP ordered-priority + // convention. Empty when no HP class can be determined. + hpClasses map[string]bool + // punits is the per-punit topology cached from sst.Punits() + // at Configure() time, with each punit's CPUs already + // intersected with allowed. 
+ punits []pctPunit + // punitByCpu maps each allowed CPU to its index in punits. + // CPUs outside any known punit are absent from the map; the + // allocator treats them as "no HP knowledge". + punitByCpu map[int]int + // hpUsed[i] is the set of CPUs currently held by HP-class + // workloads on punits[i]. + hpUsed map[int]cpuset.CPUSet + // hpEligiblePunit[i] reports whether punits[i] can actually + // host HP-class CPUs at top turbo. Populated at Configure(). + // In managed mode every punit becomes eligible (the plugin + // enables SST-TF itself). In assoc-only mode a punit is + // eligible only when SST-TF is currently enabled on it + // (operator's responsibility); otherwise its standard + // turbo-ratio bucket caps HP frequency and the punit must + // not contribute to scheduler-visible HP capacity. Missing + // entries are treated as not eligible. + hpEligiblePunit map[int]bool +} + +// NewAllocator returns a new PCT allocator in the disabled mode. +func NewAllocator(sys Sys) (*Allocator, error) { + s, err := newSst() + if err != nil { + return nil, err + } + return &Allocator{ + sys: sys, + sst: s, + mode: pctModeDisabled, + }, nil +} + +// configure selects the PCT operating mode from the given cpuClass +// definitions and, in managed mode, programs the corresponding SST +// CLOSes. Honors `allowed` as the boundary of CPUs the allocator may +// touch. +// +// - classes: cpuClass definitions to inspect for PCT fields. +// - allowed: CPUs the allocator may configure. 
+func (a *Allocator) Configure(classes []*policyapi.CPUClass, allowed cpuset.CPUSet) error { + a.classByName = make(map[string]*policyapi.CPUClass, len(classes)) + for _, cc := range classes { + a.classByName[cc.Name] = cc + } + a.fallbackClos = pctDefaultHpClos // CLOS 0 == default-after-reset + a.allowed = allowed + a.hpUsed = map[int]cpuset.CPUSet{} + a.hpClasses = map[string]bool{} + a.hpEligiblePunit = map[int]bool{} + a.punits = nil + a.punitByCpu = nil + + mode, plans, err := a.planClasses(classes) + if err != nil { + return err + } + a.mode = mode + a.classPlan = plans + if mode == pctModeDisabled { + log.Debugf("pct: no cpuClasses request PCT; PCT allocator disabled") + return nil + } + if !a.sst.Supported() { + log.Warnf("pct: SST not supported on this host; ignoring PCT fields in cpuClasses") + a.mode = pctModeDisabled + a.classPlan = nil + return nil + } + + a.snapshotPunits() + log.Infof("pct: mode=%s, %d PCT cpuClass(es), %d punit(s) across %d package(s)", + a.modeString(), len(plans), len(a.punits), len(a.packageIDsFromPunits())) + + if mode == pctModeManaged { + if err := a.sst.PrepareManagedMode(); err != nil { + return fmt.Errorf("pct: failed to prepare managed mode: %w", err) + } + // Managed mode owns SST-TF and enables it on every punit + // (PrepareManagedMode). All snapshotted punits are thus + // HP-eligible. + for idx := range a.punits { + a.hpEligiblePunit[idx] = true + } + // Program every requested CLOS. 
+ closesProgrammed := map[int]bool{} + closIDs := make([]int, 0, len(plans)) + for _, p := range plans { + if closesProgrammed[p.ClosID] { + continue + } + closIDs = append(closIDs, p.ClosID) + closesProgrammed[p.ClosID] = true + } + sort.Ints(closIDs) + for _, closID := range closIDs { + var minF, maxF int + for _, p := range plans { + if p.ClosID == closID { + minF = int(p.MinFreq) + maxF = int(p.MaxFreq) + break + } + } + cfg := pctClosConfig{ClosID: closID, MinFreq: minF, MaxFreq: maxF} + if err := a.sst.ConfigureClos(cfg); err != nil { + return fmt.Errorf("pct: failed to configure CLOS %d: %w", closID, err) + } + log.Infof("pct: programmed CLOS %d min=%d max=%d kHz", closID, minF, maxF) + } + if err := a.sst.EnableCP(); err != nil { + return fmt.Errorf("pct: failed to enable SST-CP: %w", err) + } + // Managed mode: HP classes are exactly those with pctPriority=high. + // LP classes are those with pctPriority=low. + var lpClos *int + for _, cc := range classes { + switch cc.PctPriority { + case "high": + a.hpClasses[cc.Name] = true + log.Infof("pct: cpuClass %q classified HP (managed: pctPriority=high, CLOS %d)", + cc.Name, plans[cc.Name].ClosID) + case "low": + id := plans[cc.Name].ClosID + lpClos = &id + log.Infof("pct: cpuClass %q classified LP (managed: pctPriority=low, CLOS %d)", + cc.Name, plans[cc.Name].ClosID) + } + } + // Idle / non-PCT CPUs must fall back to the LP CLOS (when + // defined). Leaving them on CLOS 0 inflates the SST-TF + // active-HP-core count on every punit and prevents bucket-0 + // turbo selection on punits hosting both an HP and an LP + // balloon. + if lpClos != nil { + a.fallbackClos = *lpClos + log.Infof("pct: fallback CLOS for non-PCT CPUs set to %d (LP)", a.fallbackClos) + } + } else { + // Assoc-only: classify HP/LP from CLOS configs programmed + // by the operator/BIOS. The CLOS with the largest MaxFreq + // among the CLOSes our cpuClasses target is HP. 
+ a.classifyAssocOnlyHP(classes) + a.evaluateAssocOnlyHpEligibility() + } + return nil +} + +// evaluateAssocOnlyHpEligibility populates hpEligiblePunit and +// warns the operator about punits where SST-TF is disabled. In +// assoc-only mode the plugin must not toggle SST-TF (the operator +// owns global SST state). Without SST-TF the standard turbo-ratio +// table caps HP cores at the many-active-cores bucket frequency -- +// a low-CLOS-ID association alone is not enough to exceed it. +// Capacity for HP cpuClasses on such punits must therefore be +// reported as zero, otherwise the scheduler bin-packs HP pods onto +// nodes that cannot actually deliver top turbo. The warning points +// the operator at the intel-speed-select command that enables it. +func (a *Allocator) evaluateAssocOnlyHpEligibility() { + if len(a.punits) == 0 { + return + } + status, err := a.sst.TFStatus() + if err != nil { + log.Warnf("pct: assoc-only: cannot read SST-TF status: %v", err) + // Unknown TF state: leave every punit ineligible. Safer + // to under-publish HP capacity than to over-publish it. + return + } + for idx, pu := range a.punits { + enabled, ok := status[pctPunitID{PkgID: pu.PkgID, PunitID: pu.PunitID}] + if !ok { + // No entry: TF state unknown for this punit. Treat + // as ineligible. + continue + } + if enabled { + a.hpEligiblePunit[idx] = true + continue + } + // Pick one representative CPU from the punit for the + // operator hint -- intel-speed-select needs at least + // one CPU on the target punit. + repCPU := -1 + for _, c := range pu.CPUs.UnsortedList() { + repCPU = c + break + } + log.Warnf("pct: assoc-only: SST-TF disabled on pkg=%d punit=%d; "+ + "HP cores on this punit cannot exceed the standard "+ + "turbo-ratio bucket frequency. 
Enable with: "+ + "intel-speed-select -c %d turbo-freq enable -a", + pu.PkgID, pu.PunitID, repCPU) + } +} + +// snapshotPunits caches the per-punit topology from the sst +// backend, intersecting each punit's CPUs with the allowed set. +// Punits whose intersection with allowed is empty are dropped -- +// they cannot affect placement under this Configure(). The +// resulting punits and punitByCpu indices drive HP accounting and +// hpReserveCpus tier selection. +func (a *Allocator) snapshotPunits() { + raw := a.sst.Punits() + a.punits = make([]pctPunit, 0, len(raw)) + a.punitByCpu = map[int]int{} + for _, pu := range raw { + cpus := pu.CPUs + if a.allowed.Size() > 0 { + cpus = cpus.Intersection(a.allowed) + } + if cpus.IsEmpty() { + continue + } + idx := len(a.punits) + a.punits = append(a.punits, pctPunit{ + PkgID: pu.PkgID, + PunitID: pu.PunitID, + CPUs: cpus, + MaxHpCpus: pu.MaxHpCpus, + GuaranteedHpCpus: pu.GuaranteedHpCpus, + }) + for _, c := range cpus.UnsortedList() { + a.punitByCpu[c] = idx + } + } +} + +// packageIDsFromPunits returns the set of package IDs present in +// the cached punits, in stable sorted order. +func (a *Allocator) packageIDsFromPunits() []int { + seen := map[int]bool{} + ids := []int{} + for _, pu := range a.punits { + if seen[pu.PkgID] { + continue + } + seen[pu.PkgID] = true + ids = append(ids, pu.PkgID) + } + sort.Ints(ids) + return ids +} + +// classifyAssocOnlyHP populates hpClasses by reading the +// programmed MaxFreq of each CLOS referenced by an assoc-only +// cpuClass. The CLOS with the largest MaxFreq is treated as HP; +// ties go to the smaller CLOS id (matching SST-CP ordered-priority +// convention where lower CLOS ids have higher priority). When no +// CLOS reports a programmed MaxFreq, no class is classified as HP +// (HP-specific hints stay quiet for that class set). 
+func (a *Allocator) classifyAssocOnlyHP(classes []*policyapi.CPUClass) { + maxFreqs := map[int]int{} + closIDs := []int{} + for _, p := range a.classPlan { + if _, seen := maxFreqs[p.ClosID]; seen { + continue + } + cfg, ok, err := a.sst.GetClosConfig(p.ClosID) + if err != nil { + log.Warnf("pct: assoc-only: GetClosConfig(%d) failed: %v", p.ClosID, err) + continue + } + if !ok { + log.Infof("pct: assoc-only: CLOS %d not programmed; cannot classify HP/LP", p.ClosID) + continue + } + maxFreqs[p.ClosID] = cfg.MaxFreq + closIDs = append(closIDs, p.ClosID) + log.Infof("pct: assoc-only: CLOS %d programmed min=%d max=%d kHz", p.ClosID, cfg.MinFreq, cfg.MaxFreq) + } + if len(closIDs) == 0 { + return + } + sort.Ints(closIDs) + bestClos := -1 + bestMax := -1 + for _, id := range closIDs { + if maxFreqs[id] > bestMax { + bestMax = maxFreqs[id] + bestClos = id + } + } + if bestClos < 0 || bestMax <= 0 { + log.Infof("pct: assoc-only: no CLOS has a programmed MaxFreq; HP classification skipped") + return + } + for _, cc := range classes { + p, ok := a.classPlan[cc.Name] + if !ok || p.ClosID != bestClos { + continue + } + a.hpClasses[cc.Name] = true + log.Infof("pct: cpuClass %q classified HP (assoc-only: CLOS %d MaxFreq=%d kHz)", cc.Name, bestClos, bestMax) + } +} + +// planClasses returns the PCT operating mode and the per-class +// CLOS plan derived from cpuClasses. 
+func (a *Allocator) planClasses(classes []*policyapi.CPUClass) (pctMode, map[string]*pctClassPlan, error) { + plans := map[string]*pctClassPlan{} + managed, assocOnly := false, false + for _, cc := range classes { + switch { + case cc.PctPriority != "": + managed = true + plan := &pctClassPlan{} + switch cc.PctPriority { + case "high": + plan.ClosID = pctDefaultHpClos + case "low": + plan.ClosID = pctDefaultLpClos + default: + return pctModeDisabled, nil, fmt.Errorf("cpuClass %q: invalid pctPriority %q", cc.Name, cc.PctPriority) + } + minSrc, maxSrc := cc.PctMinFreq, cc.PctMaxFreq + if minSrc == 0 { + minSrc = cc.MinFreq + } + if maxSrc == 0 { + maxSrc = cc.MaxFreq + } + plan.MinFreq = a.resolveHWFreq(minSrc) + plan.MaxFreq = a.resolveHWFreq(maxSrc) + plans[cc.Name] = plan + case cc.PctClosID != nil: + assocOnly = true + plans[cc.Name] = &pctClassPlan{ClosID: *cc.PctClosID} + } + } + switch { + case !managed && !assocOnly: + return pctModeDisabled, nil, nil + case managed && assocOnly: + return pctModeDisabled, nil, fmt.Errorf("pct: cannot mix managed (pctPriority) and assoc-only (pctClosID) modes") + case managed: + return pctModeManaged, plans, nil + default: + return pctModeAssocOnly, plans, nil + } +} + +// resolveHWFreq returns the hardware frequency in kHz that the +// given symbolic policyapi.Frequency refers to. "turbo" resolves to the +// platform's maximum turbo frequency. +func (a *Allocator) resolveHWFreq(f policyapi.Frequency) uint { + if f == 0 { + return 0 + } + info, err := discoverTurboInfo(a.sys) + if err != nil || info == nil { + log.Warnf("pct: cannot discover platform turbo info: %v", err) + return uint(f) + } + return f.Resolve(info.minFreqKHz, info.baseFreqKHz, info.maxTurboFreqKHz) +} + +// active reports whether PCT is in effect (mode != disabled). 
+func (a *Allocator) Active() bool { + return a != nil && a.mode != pctModeDisabled +} + +// freeClassCapacity returns the number of logical CPUs that can +// still be allocated to className, given that 'held' lists CPUs +// already consumed by some balloon on this node (any class). +// +// Same formula in managed and assoc-only modes: +// - HP class: sum over HP-eligible punits of +// min(GuaranteedHpCpus, |pu.CPUs intersect Allowed minus held|). +// HP capacity is bounded by the punit's *guaranteed top-turbo* +// HP count (smallest non-zero SST-TF bucket +// HighPriorityCoreCount, or SST-BF HP CPU count when TF is +// unsupported) -- not by the larger MaxHpCpus the allocator +// uses for steering. The scheduler-visible capacity must +// reflect how many CPUs can *actually* sustain the highest +// turbo frequency this platform exposes; otherwise HP pods +// get scheduled past the guaranteed-turbo headroom and fall +// back to lower-bucket frequencies. +// - non-HP class: |Allowed minus held|. The allocator can +// re-associate any Allowed CPU to any CLOS on demand, so the +// gating set is what the plugin owns, not what currently +// lives on the target CLOS in hardware. +// +// The modes differ in how hpEligiblePunit is populated: +// - Managed mode: every snapshotted punit is HP-eligible (the +// plugin enables SST-TF itself via PrepareManagedMode). +// - Assoc-only mode: a punit is HP-eligible only when SST-TF +// is currently enabled on it (operator's responsibility). +// Punits where TF is disabled cannot exceed the standard +// turbo-ratio bucket and contribute 0 to HP capacity, so the +// scheduler does not bin-pack HP pods onto nodes that cannot +// deliver top turbo. +// +// Returns 0 for classes that have no PCT plan or when PCT is not +// active. Negative intermediate counts are clamped to 0. 
+func (a *Allocator) FreeClassCapacity(className string, held cpuset.CPUSet) int { + if !a.Active() { + return 0 + } + if _, ok := a.classPlan[className]; !ok { + return 0 + } + allowed := a.allowed + free := allowed + if free.Size() > 0 { + free = free.Difference(held) + } + if !a.classIsHighPriority(className) { + return free.Size() + } + total := 0 + for idx, pu := range a.punits { + if !a.hpEligiblePunit[idx] { + continue + } + puCpus := pu.CPUs + if allowed.Size() > 0 { + puCpus = puCpus.Intersection(allowed) + } + puFree := puCpus.Difference(held).Size() + gtdHp := pu.GuaranteedHpCpus + if gtdHp <= 0 { + continue + } + room := gtdHp + if puFree < room { + room = puFree + } + if room < 0 { + room = 0 + } + total += room + } + return total +} + +// useClass associates the given CPUs to the CLOS chosen for className. +// In managed mode, CPUs whose className is not a PCT class are +// associated to the fallback CLOS. In assoc-only mode such CPUs are +// left unchanged. CPUs outside the configured Allowed set are silently +// dropped. +func (a *Allocator) UseClass(className string, cpus cpuset.CPUSet) error { + if !a.Active() { + return nil + } + if a.allowed.Size() > 0 { + cpus = cpus.Intersection(a.allowed) + } + if cpus.IsEmpty() { + return nil + } + a.trackHpUsage(className, cpus) + plan, ok := a.classPlan[className] + if !ok { + if a.mode == pctModeAssocOnly { + return nil + } + return a.associate(cpus, a.fallbackClos) + } + return a.associate(cpus, plan.ClosID) +} + +// trackHpUsage updates per-punit HP CPU bookkeeping so cpus are +// recorded as held by an HP class if className is HP, and removed +// from HP bookkeeping otherwise. CPUs not mapped to any punit +// (e.g. outside Allowed at Configure time) are ignored: they +// cannot affect HP placement and tracking them would only confuse +// hpInUseCpus. 
+func (a *Allocator) trackHpUsage(className string, cpus cpuset.CPUSet) { + if !a.hpHintsActive() { + return + } + a.clearHpUsage(cpus) + if !a.classIsHighPriority(className) { + return + } + perPunit := map[int][]int{} + for _, cpu := range cpus.UnsortedList() { + idx, ok := a.punitByCpu[cpu] + if !ok { + continue + } + perPunit[idx] = append(perPunit[idx], cpu) + } + for idx, list := range perPunit { + set := a.hpUsed[idx] + a.hpUsed[idx] = set.Union(cpuset.New(list...)) + } +} + +// clearHpUsage removes cpus from per-punit HP bookkeeping. +func (a *Allocator) clearHpUsage(cpus cpuset.CPUSet) { + if !a.hpHintsActive() { + return + } + for idx, set := range a.hpUsed { + if remaining := set.Difference(cpus); remaining.Size() != set.Size() { + a.hpUsed[idx] = remaining + } + } +} + +func (a *Allocator) associate(cpus cpuset.CPUSet, clos int) error { + list := cpus.UnsortedList() + sort.Ints(list) + assocs := make([]pctClosAssoc, 0, len(list)) + for _, c := range list { + assocs = append(assocs, pctClosAssoc{CPU: c, ClosID: clos}) + } + if err := a.sst.AssociateCPUs(assocs); err != nil { + return fmt.Errorf("pct: associate cpus %s to CLOS %d: %w", cpus, clos, err) + } + log.Debugf("pct: associated cpus %s to CLOS %d", cpus, clos) + return nil +} + +// Shutdown restores the platform to its default state. Safe to +// call multiple times. +func (a *Allocator) Shutdown() error { + if a == nil || !a.sst.Supported() { + return nil + } + if a.mode != pctModeManaged { + return nil + } + return a.sst.Shutdown() +} + +func (a *Allocator) modeString() string { + switch a.mode { + case pctModeManaged: + return "managed" + case pctModeAssocOnly: + return "assoc-only" + default: + return "disabled" + } +} + +// classIsHighPriority reports whether className is currently +// classified as PCT high priority. In managed mode this comes from +// pctPriority=high; in assoc-only mode it comes from the largest +// programmed CLOS MaxFreq (see classifyAssocOnlyHP). 
The two +// regimes share one map so that hints() can treat HP/non-HP +// classes uniformly. +func (a *Allocator) classIsHighPriority(className string) bool { + if !a.Active() { + return false + } + return a.hpClasses[className] +} + +// hpHintsActive reports whether HP-room reasoning (hpReserveCpus, +// hpInUseCpus, trackHpUsage) is currently meaningful. It requires +// PCT to be active *and* at least one cpuClass to be classified as +// HP. In assoc-only mode without programmed CLOS frequencies this +// is false even though the allocator runs, because we cannot +// distinguish HP from LP CLOSes from the data we have. +func (a *Allocator) hpHintsActive() bool { + return a.Active() && len(a.hpClasses) > 0 +} + +// closCpus returns the subset of Allowed CPUs that are currently +// associated to CLOS closID. +func (a *Allocator) closCpus(closID int) cpuset.CPUSet { + if !a.Active() { + return cpuset.New() + } + out := []int{} + for _, cpu := range a.allowed.UnsortedList() { + id, err := a.sst.GetCPUClosID(cpu) + if err != nil { + continue + } + if id == closID { + out = append(out, cpu) + } + } + return cpuset.New(out...) +} + +// hpInUseCpus returns the union of CPUs of every punit currently +// hosting at least one HP CPU, constrained to Allowed. Expanding +// HP usage to whole-punit (rather than whole-package) granularity +// keeps the Avoid hint for non-HP classes from being unnecessarily +// broad on TPMI-class platforms with multiple punits per package. 
+func (a *Allocator) hpInUseCpus() cpuset.CPUSet {
+	// With HP hints inactive there is nothing to steer away from.
+	if !a.hpHintsActive() {
+		return cpuset.New()
+	}
+	out := cpuset.New()
+	for idx, used := range a.hpUsed {
+		if used.IsEmpty() {
+			continue
+		}
+		// Skip bookkeeping entries that do not map to a known punit.
+		if idx < 0 || idx >= len(a.punits) {
+			continue
+		}
+		// A single HP CPU in use marks the whole punit as in use.
+		out = out.Union(a.punits[idx].CPUs)
+	}
+	// An empty allowed set is treated as "no restriction".
+	if a.allowed.Size() > 0 {
+		out = out.Intersection(a.allowed)
+	}
+	return out
+}
+
+// hpReserveCpus returns the CPU set the upcoming HP allocation
+// should prefer, computed with punit-granular HP-room accounting:
+//
+//	room(punit) = MaxHpCpus(punit) - len(hpUsed[punit] \ excludeBln)
+//
+// Negative room is clamped to 0. Selection follows a strict tier
+// order:
+//
+//   - Tier A (single-punit win): among punits whose room and whose
+//     free-CPU count both cover the request, the punit with the
+//     largest room (ties broken by most free CPUs, then lowest
+//     index). Returns the free CPUs of that punit.
+//   - Tier B (same-package union): when no single punit can host
+//     `requested` HP CPUs but some package's punits jointly can,
+//     return the union of free CPUs across that package's punits.
+//     The picked package is the one with the largest aggregate
+//     room; ties broken by largest aggregate free-CPU count.
+//   - Tier C (cross-package): never. Steering HP work across
+//     sockets defeats the turbo gains it would obtain, because
+//     cross-socket data traffic typically dominates per-core
+//     frequency benefits.
+//
+// When `requested` is 0 the function falls back to Tier A only --
+// pick the punit with the most HP room and at least one free CPU.
+// Returns the empty set when HP hints are inactive, when no punit
+// has a known (positive) MaxHpCpus, when no punit/package satisfies
+// any tier, or when no free CPUs remain after Allowed-intersection;
+// the caller then falls back to topology-only placement.
+//
+//   - free: free CPUs to consider for placement.
+//   - excludeBln: CPUs to exclude from HP-room accounting (the
+//     caller's current CPU set, e.g. when expanding an existing
+//     allocation, so its current HP usage is not double-counted).
+//   - requested: number of CPUs the upcoming allocation wants.
+//     0 means "unknown" (initial priming before the count is
+//     known); Tier A is used.
+func (a *Allocator) hpReserveCpus(free cpuset.CPUSet, excludeBln cpuset.CPUSet, requested int) cpuset.CPUSet {
+	if !a.hpHintsActive() {
+		return cpuset.New()
+	}
+	// An empty allowed set is treated as "no restriction".
+	if a.allowed.Size() > 0 {
+		free = free.Intersection(a.allowed)
+	}
+	if free.IsEmpty() {
+		return cpuset.New()
+	}
+
+	// Per-punit snapshot: free CPUs on the punit and remaining HP room.
+	type punitState struct {
+		free cpuset.CPUSet
+		room int
+	}
+	states := make([]punitState, len(a.punits))
+	anyKnown := false
+	for i, pu := range a.punits {
+		states[i].free = pu.CPUs.Intersection(free)
+		if pu.MaxHpCpus <= 0 {
+			// Unknown capacity for this punit: do not let it
+			// influence HP steering. Leave room=0 so it never
+			// wins Tier A; package-aggregate Tier B still
+			// uses only known-capacity punits.
+			continue
+		}
+		anyKnown = true
+		used := a.hpUsed[i]
+		if excludeBln.Size() > 0 {
+			used = used.Difference(excludeBln)
+		}
+		room := pu.MaxHpCpus - used.Size()
+		if room < 0 {
+			room = 0
+		}
+		states[i].room = room
+	}
+	if !anyKnown {
+		return cpuset.New()
+	}
+
+	// Tier A: best single punit that satisfies the request.
+	// An unknown request size (0) is treated as a request for one CPU.
+	need := requested
+	if need < 1 {
+		need = 1
+	}
+	bestIdx := -1
+	bestRoom := 0
+	bestFree := -1
+	for i := range a.punits {
+		s := states[i]
+		if s.free.IsEmpty() || s.room <= 0 {
+			continue
+		}
+		// Both the punit's free CPUs and its remaining HP
+		// room must be able to host the entire request.
+		if s.free.Size() < need || s.room < need {
+			continue
+		}
+		// Strict comparisons keep the lowest-index punit on a full tie.
+		if s.room > bestRoom || (s.room == bestRoom && s.free.Size() > bestFree) {
+			bestIdx = i
+			bestRoom = s.room
+			bestFree = s.free.Size()
+		}
+	}
+	if bestIdx >= 0 {
+		log.Debugf("pct: hpReserveCpus tier=A punit=%d/%d room=%d free=%s",
+			a.punits[bestIdx].PkgID, a.punits[bestIdx].PunitID, bestRoom, states[bestIdx].free)
+		return states[bestIdx].free
+	}
+
+	// Tier B: aggregate per package; pick the package whose
+	// punits together have the most room (and free CPUs).
+	if requested > 0 {
+		type pkgAgg struct {
+			room  int
+			free  cpuset.CPUSet
+			freeN int
+		}
+		agg := map[int]*pkgAgg{}
+		for i, pu := range a.punits {
+			if states[i].room <= 0 || states[i].free.IsEmpty() {
+				continue
+			}
+			e, ok := agg[pu.PkgID]
+			if !ok {
+				e = &pkgAgg{free: cpuset.New()}
+				agg[pu.PkgID] = e
+			}
+			e.room += states[i].room
+			e.free = e.free.Union(states[i].free)
+		}
+		pkgIDs := make([]int, 0, len(agg))
+		for id, e := range agg {
+			e.freeN = e.free.Size()
+			pkgIDs = append(pkgIDs, id)
+		}
+		sort.Ints(pkgIDs) // deterministic tie-break order
+		bestPkg := -1
+		bestPkgRoom := 0
+		bestPkgFree := -1
+		for _, id := range pkgIDs {
+			e := agg[id]
+			// NOTE(review): aggregate room and aggregate free CPUs
+			// are checked independently. A package whose room sits
+			// on one punit and whose free CPUs sit on another passes
+			// both checks although the sum over punits of
+			// min(room, free) may be below `requested` -- confirm
+			// this over-approximation is intended for a hint.
+			if e.room < requested {
+				continue
+			}
+			if e.freeN < requested {
+				continue
+			}
+			if e.room > bestPkgRoom || (e.room == bestPkgRoom && e.freeN > bestPkgFree) {
+				bestPkg = id
+				bestPkgRoom = e.room
+				bestPkgFree = e.freeN
+			}
+		}
+		if bestPkg >= 0 {
+			log.Debugf("pct: hpReserveCpus tier=B pkg=%d room=%d free=%s",
+				bestPkg, bestPkgRoom, agg[bestPkg].free)
+			return agg[bestPkg].free
+		}
+	}
+
+	// Tier C is never taken: do not hint across packages.
+	log.Debugf("pct: hpReserveCpus tier=none (no punit or package has %d HP room with %d free CPUs)",
+		requested, free.Size())
+	return cpuset.New()
+}
+
+// classClosID returns the CLOS ID that the named cpuClass maps to,
+// or (-1, false) if the allocator is not active or the class has
+// no PCT plan.
+func (a *Allocator) classClosID(className string) (int, bool) {
+	if !a.Active() {
+		return -1, false
+	}
+	p, ok := a.classPlan[className]
+	if !ok {
+		return -1, false
+	}
+	return p.ClosID, true
+}
+
+// virtDevSstHpReserveHint and virtDevSstHpInUseHint are the
+// human-readable hint names returned in types.CpuPreference.Name for the
+// dynamic PCT placement preferences.
+const (
+	virtDevSstHpReserveHint = "sst-hp-reserve"
+	virtDevSstHpInUseHint   = "sst-hp-in-use"
+)
+
+// virtDevSstClosHint returns the human-readable hint name for the
+// CLOS-membership preference of the given CLOS ID.
+func virtDevSstClosHint(closID int) string {
+	return fmt.Sprintf("sst-clos-%d", closID)
+}
+
+// Hints returns prefer/avoid CPU sets that PCT would like an upcoming
+// allocation under intent.ClassName to honor. A nil or inactive
+// allocator returns empty hints. The HP reserve and HP in-use sets
+// are already restricted to the allocator's allowed CPUs (when that
+// set is non-empty) by hpReserveCpus/hpInUseCpus; the handler
+// intersects every returned types.CpuPreference with Allowed.
+//
+// Behavior:
+//   - Class has an explicit CLOS plan (assoc-only or managed): Prefer
+//     CLOS-member CPUs.
+//   - Class is currently classified HP: Prefer hpReserveCpus
+//     (best-fit punit; same-package union as fallback), and also
+//     CLOS-member CPUs. No cross-package hint is ever emitted, and
+//     no Avoid set is produced for HP classes.
+//   - Class is not HP and at least one HP class exists: Avoid
+//     hpInUseCpus (punits currently hosting HP work).
+func (a *Allocator) Hints(intent types.AllocationIntent) types.AllocationHints {
+	if a == nil || !a.Active() {
+		return types.AllocationHints{}
+	}
+	out := types.AllocationHints{}
+
+	// CLOS membership comes first in Prefer, ahead of the HP reserve.
+	if closID, ok := a.classClosID(intent.ClassName); ok {
+		closCpus := a.closCpus(closID)
+		if !closCpus.IsEmpty() {
+			out.Prefer = append(out.Prefer, types.CpuPreference{
+				Name: virtDevSstClosHint(closID),
+				Cpus: closCpus,
+			})
+		}
+	}
+
+	if a.classIsHighPriority(intent.ClassName) {
+		reserve := a.hpReserveCpus(intent.FreeCpus, intent.CurrentCpus, intent.RequestedCount)
+		if !reserve.IsEmpty() {
+			out.Prefer = append(out.Prefer, types.CpuPreference{
+				Name: virtDevSstHpReserveHint,
+				Cpus: reserve,
+			})
+		}
+		// HP classes never get an Avoid hint.
+		return out
+	}
+
+	if a.hpHintsActive() {
+		inUse := a.hpInUseCpus()
+		if !inUse.IsEmpty() {
+			out.Avoid = append(out.Avoid, types.CpuPreference{
+				Name: virtDevSstHpInUseHint,
+				Cpus: inUse,
+			})
+		}
+	}
+	return out
+}
+
+// turboInfo holds the platform frequency reference used by the PCT
+// allocator to resolve symbolic min/base/turbo frequencies.
+type turboInfo struct {
+	baseFreqKHz     uint
+	maxTurboFreqKHz uint
+	minFreqKHz      uint
+}
+
+// discoverTurboInfo reads platform turbo capabilities from sysfs via
+// the first online CPU that reports a non-zero frequency range. When
+// that CPU reports no base frequency, the maximum frequency is used
+// as the base. Returns an error if the topology has no CPUs or no
+// online CPU exposes valid frequency data.
+func discoverTurboInfo(sys Sys) (*turboInfo, error) {
+	cpuIDs := sys.CPUIDs()
+	if len(cpuIDs) == 0 {
+		return nil, fmt.Errorf("no CPUs found in system topology")
+	}
+	for _, id := range cpuIDs {
+		cpu := sys.CPU(id)
+		if cpu == nil || !cpu.Online() {
+			continue
+		}
+		freq := cpu.FrequencyRange()
+		baseFreq := cpu.BaseFrequency()
+		// A CPU with an all-zero range has no usable cpufreq data.
+		if freq.Min == 0 && freq.Max == 0 {
+			continue
+		}
+		if baseFreq == 0 {
+			baseFreq = freq.Max
+		}
+		// NOTE(review): values are stored unconverted; presumably the
+		// sysfs layer already reports kHz -- confirm.
+		return &turboInfo{
+			baseFreqKHz:     uint(baseFreq),
+			maxTurboFreqKHz: uint(freq.Max),
+			minFreqKHz:      uint(freq.Min),
+		}, nil
+	}
+	return nil, fmt.Errorf("no online CPU with valid frequency information found")
+}
diff --git a/pkg/resmgr/cpuclass/internal/pct/pct_sst.go b/pkg/resmgr/cpuclass/internal/pct/pct_sst.go
new file mode 100644
index 000000000..30ed1a97b
--- /dev/null
+++ b/pkg/resmgr/cpuclass/internal/pct/pct_sst.go
@@ -0,0 +1,142 @@
+// Copyright The NRI Plugins Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pct
+
+import (
+	"os"
+
+	"github.com/containers/nri-plugins/pkg/utils/cpuset"
+)
+
+// pctClosConfig describes one CLOS configuration that the
+// Allocator wants to program.
+type pctClosConfig struct {
+	ClosID  int
+	MinFreq int // kHz
+	MaxFreq int // kHz
+}
+
+// pctClosAssoc records the desired CLOS association for a CPU.
+type pctClosAssoc struct {
+	CPU    int // logical CPU ID
+	ClosID int // CLOS the CPU should be associated with
+}
+
+// pctPunit describes one SST power domain (punit) exposed by the
+// platform. PkgID and PunitID together uniquely identify it; CPUs
+// is the set of logical CPUs in this punit; MaxHpCpus is the
+// maximum number of CPUs this punit can sustain at the elevated
+// PCT high-priority frequency (largest SST-TF bucket core count,
+// or SST-BF HP CPU count when TF is unsupported). MaxHpCpus == 0
+// means the platform does not expose HP capacity for this punit;
+// the allocator excludes such punits from HP steering.
+type pctPunit struct {
+	PkgID     int
+	PunitID   int
+	CPUs      cpuset.CPUSet
+	MaxHpCpus int
+	// GuaranteedHpCpus is the count of HP CPUs on this punit that
+	// can simultaneously sustain the highest turbo frequency the
+	// platform exposes: the smallest non-zero SST-TF bucket's
+	// HighPriorityCoreCount (smaller buckets unlock higher
+	// frequencies), or len(SST-BF HighPriorityCPUs) when TF is
+	// unsupported. 0 if neither feature exposes HP capacity.
+	// Used to publish scheduler-visible HP capacity that reflects
+	// "guaranteed top-turbo headroom" rather than the worst-case
+	// MaxHpCpus.
+	GuaranteedHpCpus int
+}
+
+// pctClosCfg carries the frequency bounds programmed for one CLOS,
+// in kHz. Zero stands for "not specified / leave alone". It is the
+// read-back counterpart of pctClosConfig, without the CLOS ID.
+type pctClosCfg struct {
+	MinFreq int
+	MaxFreq int
+}
+
+// pctPunitID identifies one power domain by (package, punit) ID.
+// It is comparable and used as the key of the TFStatus map.
+type pctPunitID struct {
+	PkgID   int
+	PunitID int
+}
+
+// sst is the subset of Intel SST functionality used by the
+// cpuclass code. Implementations: sstGoresctrl for real
+// hardware via goresctrl/pkg/sst, and sstMock for an
+// in-memory fake seeded from OVERRIDE_SST.
+type sst interface {
+	// Supported reports whether SST is available.
+	Supported() bool
+
+	// ClosCount returns the number of CLOSes supported.
+	ClosCount() int
+
+	// PackageIDs returns the IDs of all packages, in ascending
+	// order.
+	PackageIDs() []int
+
+	// CPUsOfPackage returns the CPUs of the given package, in
+	// ascending order.
+	CPUsOfPackage(pkgID int) []int
+
+	// Punits returns the per-punit topology and HP capacity of
+	// every package the platform exposes. Order is stable.
+	Punits() []pctPunit
+
+	// GetClosConfig returns the frequency bounds (kHz) currently
+	// programmed for closID. The second return value is false
+	// when no information is available (e.g. closID not in
+	// range, or the platform does not expose per-CLOS
+	// configuration). Used in assoc-only mode to classify a CLOS
+	// as HP or LP from its programmed MaxFreq.
+	GetClosConfig(closID int) (pctClosCfg, bool, error)
+
+	// PrepareManagedMode resets SST-CP and enables SST-TF on
+	// every package and selects ordered priority arbitration.
+	PrepareManagedMode() error
+
+	// ConfigureClos programs CLOS frequency bounds (given in
+	// kHz) on every package.
+	ConfigureClos(cfg pctClosConfig) error
+
+	// EnableCP enables SST-CP on every package.
+	EnableCP() error
+
+	// AssociateCPUs binds each CPU to the indicated CLOS.
+	AssociateCPUs(assocs []pctClosAssoc) error
+
+	// TFStatus returns the current SST-TF enabled state per
+	// power domain. The map is empty when SST is unsupported.
+	// The status is read at call time (SST-TF can be toggled
+	// out-of-band by the operator). Used in assoc-only mode to
+	// warn at configure time when SST-TF is disabled on a punit
+	// hosting PCT-managed CPUs -- without SST-TF, HP cores on
+	// that punit cannot exceed the standard turbo-ratio bucket
+	// limit even if associated to a low-CLOS-ID (HP) CLOS.
+	TFStatus() (map[pctPunitID]bool, error)
+
+	// GetCPUClosID returns the current CLOS association of a CPU.
+	GetCPUClosID(cpu int) (int, error)
+
+	// Shutdown restores managed-mode platform state to defaults.
+	Shutdown() error
+}
+
+// newSst returns an SST implementation: the in-memory mock when
+// OVERRIDE_SST is set, otherwise the goresctrl-backed one.
+func newSst() (sst, error) {
+	if v := os.Getenv(sstOverrideEnvVar); v != "" {
+		return newSstMock(v)
+	}
+	return newSstGoresctrl()
+}
diff --git a/pkg/resmgr/cpuclass/internal/pct/pct_sst_goresctrl.go b/pkg/resmgr/cpuclass/internal/pct/pct_sst_goresctrl.go
new file mode 100644
index 000000000..b00809a5a
--- /dev/null
+++ b/pkg/resmgr/cpuclass/internal/pct/pct_sst_goresctrl.go
@@ -0,0 +1,380 @@
+// Copyright The NRI Plugins Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pct
+
+import (
+	"fmt"
+	"sort"
+
+	gosst "github.com/intel/goresctrl/pkg/sst"
+	"github.com/intel/goresctrl/pkg/utils"
+
+	"github.com/containers/nri-plugins/pkg/utils/cpuset"
+)
+
+// sstGoresctrl is the real-hardware sst backed by
+// goresctrl/pkg/sst. Per-(pkg, punit) topology and HP capacity
+// are snapshotted at Init() time -- the goresctrl Platform itself
+// snapshots CPU topology at Init(), so refreshing here would not
+// pick up CPU hotplug either.
+type sstGoresctrl struct {
+	plat *gosst.Platform
+	// punits is the cached per-punit topology + HP capacity in
+	// stable order (sorted by PkgID, then PunitID).
+	punits []pctPunit
+}
+
+// newSstGoresctrl returns the goresctrl-backed sst. When SST is not
+// supported on the host it returns a backend whose Supported()
+// reports false instead of an error; an error is returned only when
+// SST is supported but initialization fails.
+func newSstGoresctrl() (sst, error) {
+	b := &sstGoresctrl{}
+	if !gosst.SstSupported() {
+		return b, nil
+	}
+	plat, err := gosst.Init()
+	if err != nil {
+		return nil, fmt.Errorf("SST init failed: %w", err)
+	}
+	b.plat = plat
+	b.punits = discoverPunits(plat)
+	return b, nil
+}
+
+// discoverPunits snapshots per-punit topology and HP capacity for
+// every package the platform exposes. The PP level is the current
+// level of the lowest-numbered punit of each package, mirroring the
+// approach of goresctrl's "sst info" CLI. Logged at INFO so
+// operators can correlate placement decisions with the platform
+// state observed at startup. A failure on one package does not
+// abort discovery for the others. The result is sorted by PkgID,
+// then PunitID.
+func discoverPunits(plat *gosst.Platform) []pctPunit {
+	out := []pctPunit{}
+	if plat == nil {
+		return out
+	}
+	for _, pkg := range plat.Packages() {
+		pkgID := pkg.ID()
+		st, err := pkg.GetStatus()
+		if err != nil {
+			log.Warnf("pct: SST status unavailable for package %d: %v", pkgID, err)
+			continue
+		}
+		// Stable per-punit iteration. The sorted IDs are used both
+		// to pick the PP level and to emit punits, so neither
+		// depends on the iteration order of st.Punits.
+		punitIDs := make([]int, 0, len(st.Punits))
+		for id := range st.Punits {
+			punitIDs = append(punitIDs, int(id))
+		}
+		sort.Ints(punitIDs)
+		// Pick the current PP level from the lowest-numbered punit
+		// (they share a level on every platform we have seen); warn
+		// on divergence and stick with the first one.
+		level := -1
+		for _, pid := range punitIDs {
+			cur := st.Punits[utils.ID(pid)].PP.CurrentLevel
+			if level < 0 {
+				level = cur
+				continue
+			}
+			if cur != level {
+				log.Warnf("pct: package %d punits report differing PP levels; using level %d", pkgID, level)
+				break
+			}
+		}
+		if level < 0 {
+			log.Warnf("pct: package %d has no punits, skipping discovery", pkgID)
+			continue
+		}
+		info, err := pkg.GetPerfLevelInfo(level)
+		if err != nil {
+			log.Warnf("pct: SST PerfLevelInfo unavailable for package %d level %d: %v", pkgID, level, err)
+			continue
+		}
+		for _, pid := range punitIDs {
+			pu := st.Punits[utils.ID(pid)]
+			cpus := cpuset.New(pu.CPUs.Members()...)
+			maxHp := 0
+			gtdHp := 0
+			if pi, ok := info[utils.ID(pid)]; ok {
+				maxHp = punitMaxHpCpus(pi)
+				gtdHp = punitGuaranteedHpCpus(pi)
+				log.Infof("pct: SST discovered: pkg=%d punit=%d level=%d cpus=%s maxHpCpus=%d guaranteedHpCpus=%d (tf=%v bf=%v)",
+					pkgID, pid, level, cpus, maxHp, gtdHp, pi.TF.Supported, pi.BF.Supported)
+			} else {
+				log.Infof("pct: SST discovered: pkg=%d punit=%d level=%d cpus=%s maxHpCpus=0 (no PerfLevelInfo)",
+					pkgID, pid, level, cpus)
+			}
+			out = append(out, pctPunit{
+				PkgID:            pkgID,
+				PunitID:          pid,
+				CPUs:             cpus,
+				MaxHpCpus:        maxHp,
+				GuaranteedHpCpus: gtdHp,
+			})
+		}
+	}
+	// Enforce the documented (PkgID, PunitID) order regardless of the
+	// order in which the platform enumerates its packages.
+	sort.SliceStable(out, func(i, j int) bool {
+		if out[i].PkgID != out[j].PkgID {
+			return out[i].PkgID < out[j].PkgID
+		}
+		return out[i].PunitID < out[j].PunitID
+	})
+	return out
+}
+
+// punitMaxHpCpus returns the maximum number of CPUs that can be
+// promoted to high priority on this punit at the queried PP
+// level. SST-TF takes precedence: the largest bucket's
+// HighPriorityCoreCount sets the upper bound (smaller buckets
+// allow higher turbo but admit fewer HP cores -- the allocator
+// only needs to know the cap). When TF is unsupported or all
+// buckets are empty, fall back to len(BF.HighPriorityCPUs); BF
+// guarantees those CPUs run at an elevated *base* frequency, so
+// the count is exact. Returns 0 only when neither feature
+// exposes any HP CPUs.
+func punitMaxHpCpus(pi *gosst.PerfLevelInfo) int {
+	if pi == nil {
+		return 0
+	}
+	maxHp := 0
+	if pi.TF.Supported {
+		for _, b := range pi.TF.Buckets {
+			if b.HighPriorityCoreCount > maxHp {
+				maxHp = b.HighPriorityCoreCount
+			}
+		}
+	}
+	if maxHp == 0 && pi.BF.Supported {
+		maxHp = len(pi.BF.HighPriorityCPUs)
+	}
+	return maxHp
+}
+
+// punitGuaranteedHpCpus returns the count of HP CPUs that can
+// simultaneously reach the platform's highest exposed turbo
+// frequency on this punit. With SST-TF, smaller buckets unlock
+// higher turbo frequencies, so the smallest non-zero
+// HighPriorityCoreCount across buckets is the figure of merit:
+// staying at or below it lets every HP CPU sustain the top-bucket
+// frequency. When TF is unsupported, fall back to
+// len(BF.HighPriorityCPUs) -- BF guarantees those CPUs run at the
+// elevated base frequency, and there is no further headroom to
+// reserve. Returns 0 when neither feature exposes HP capacity.
+func punitGuaranteedHpCpus(pi *gosst.PerfLevelInfo) int {
+	if pi == nil {
+		return 0
+	}
+	if pi.TF.Supported {
+		minHp := 0
+		for _, b := range pi.TF.Buckets {
+			if b.HighPriorityCoreCount <= 0 {
+				continue
+			}
+			if minHp == 0 || b.HighPriorityCoreCount < minHp {
+				minHp = b.HighPriorityCoreCount
+			}
+		}
+		if minHp > 0 {
+			return minHp
+		}
+	}
+	if pi.BF.Supported {
+		return len(pi.BF.HighPriorityCPUs)
+	}
+	return 0
+}
+
+// Supported reports whether SST was detected and initialized.
+func (b *sstGoresctrl) Supported() bool { return b.plat != nil }
+
+// ClosCount returns the platform's CLOS count, or 0 without SST.
+func (b *sstGoresctrl) ClosCount() int {
+	if b.plat == nil {
+		return 0
+	}
+	return b.plat.ClosCount()
+}
+
+// PackageIDs returns the sorted IDs of the packages that have at
+// least one discovered punit, or nil without SST.
+func (b *sstGoresctrl) PackageIDs() []int {
+	if b.plat == nil {
+		return nil
+	}
+	seen := map[int]bool{}
+	ids := []int{}
+	for _, pu := range b.punits {
+		if seen[pu.PkgID] {
+			continue
+		}
+		seen[pu.PkgID] = true
+		ids = append(ids, pu.PkgID)
+	}
+	sort.Ints(ids)
+	return ids
+}
+
+// CPUsOfPackage returns the sorted CPUs of all discovered punits of
+// pkgID, or nil without SST.
+func (b *sstGoresctrl) CPUsOfPackage(pkgID int) []int {
+	if b.plat == nil {
+		return nil
+	}
+	out := []int{}
+	for _, pu := range b.punits {
+		if pu.PkgID != pkgID {
+			continue
+		}
+		out = append(out, pu.CPUs.UnsortedList()...)
+	}
+	sort.Ints(out)
+	return out
+}
+
+// Punits returns the cached per-punit topology and HP capacity.
+func (b *sstGoresctrl) Punits() []pctPunit {
+	if b.plat == nil {
+		return nil
+	}
+	// Return a defensive copy so callers cannot mutate cached state.
+	out := make([]pctPunit, len(b.punits))
+	copy(out, b.punits)
+	return out
+}
+
+// PrepareManagedMode runs CPReset, TFEnable and
+// CPSetPriorityType(Ordered) on every package, stopping at the
+// first failure.
+func (b *sstGoresctrl) PrepareManagedMode() error {
+	if b.plat == nil {
+		return fmt.Errorf("SST not supported on this host")
+	}
+	for _, pkg := range b.plat.Packages() {
+		if err := pkg.CPReset(); err != nil {
+			return fmt.Errorf("CPReset on package %d: %w", pkg.ID(), err)
+		}
+		if err := pkg.TFEnable(); err != nil {
+			return fmt.Errorf("TFEnable on package %d: %w", pkg.ID(), err)
+		}
+		if err := pkg.CPSetPriorityType(gosst.Ordered); err != nil {
+			return fmt.Errorf("CPSetPriorityType on package %d: %w", pkg.ID(), err)
+		}
+	}
+	return nil
+}
+
+// ConfigureClos programs the frequency bounds of cfg.ClosID on
+// every package. Bounds are truncated from kHz to whole MHz.
+func (b *sstGoresctrl) ConfigureClos(cfg pctClosConfig) error {
+	if b.plat == nil {
+		return fmt.Errorf("SST not supported on this host")
+	}
+	// pctClosConfig stores frequencies in kHz; goresctrl ClosConfig
+	// uses MHz (max ratio-encoded 25500 MHz on mbox platforms).
+	cc := gosst.ClosConfig{MinFreq: cfg.MinFreq / 1000, MaxFreq: cfg.MaxFreq / 1000}
+	for _, pkg := range b.plat.Packages() {
+		if err := pkg.ClosConfigure(cfg.ClosID, cc); err != nil {
+			return fmt.Errorf("ClosConfigure(%d) on package %d: %w", cfg.ClosID, pkg.ID(), err)
+		}
+	}
+	return nil
+}
+
+// EnableCP enables SST-CP on every package, stopping at the first
+// failure.
+func (b *sstGoresctrl) EnableCP() error {
+	if b.plat == nil {
+		return fmt.Errorf("SST not supported on this host")
+	}
+	for _, pkg := range b.plat.Packages() {
+		if err := pkg.CPEnable(); err != nil {
+			return fmt.Errorf("CPEnable on package %d: %w", pkg.ID(), err)
+		}
+	}
+	return nil
+}
+
+// AssociateCPUs groups the requested associations by CLOS and
+// programs one CLOS at a time, in ascending CLOS ID order so that
+// the programming sequence and the reported error are reproducible.
+// It stops at the first failure.
+func (b *sstGoresctrl) AssociateCPUs(assocs []pctClosAssoc) error {
+	if b.plat == nil {
+		return fmt.Errorf("SST not supported on this host")
+	}
+	byClos := map[int]utils.IDSet{}
+	for _, a := range assocs {
+		if _, ok := byClos[a.ClosID]; !ok {
+			byClos[a.ClosID] = utils.NewIDSet()
+		}
+		byClos[a.ClosID].Add(utils.ID(a.CPU))
+	}
+	closIDs := make([]int, 0, len(byClos))
+	for clos := range byClos {
+		closIDs = append(closIDs, clos)
+	}
+	sort.Ints(closIDs)
+	for _, clos := range closIDs {
+		cpus := byClos[clos]
+		if err := b.plat.ClosAssociate(clos, cpus); err != nil {
+			return fmt.Errorf("ClosAssociate(%d) for cpus %s: %w", clos, cpus, err)
+		}
+	}
+	return nil
+}
+
+// GetCPUClosID returns the CLOS the CPU is currently associated to.
+func (b *sstGoresctrl) GetCPUClosID(cpu int) (int, error) {
+	if b.plat == nil {
+		return 0, fmt.Errorf("SST not supported on this host")
+	}
+	return b.plat.GetCPUClosID(utils.ID(cpu))
+}
+
+// TFStatus reads the SST-TF enabled flag of every punit of every
+// package at call time. The map is empty without SST.
+func (b *sstGoresctrl) TFStatus() (map[pctPunitID]bool, error) {
+	out := map[pctPunitID]bool{}
+	if b.plat == nil {
+		return out, nil
+	}
+	for _, pkg := range b.plat.Packages() {
+		st, err := pkg.GetStatus()
+		if err != nil {
+			return nil, fmt.Errorf("TFStatus: package %d status: %w", pkg.ID(), err)
+		}
+		for pid, pu := range st.Punits {
+			out[pctPunitID{PkgID: pkg.ID(), PunitID: int(pid)}] = pu.TF.Enabled
+		}
+	}
+	return out, nil
+}
+
+// GetClosConfig returns the frequency bounds programmed on CLOS
+// closID, queried from the lowest-numbered punit of the first
+// package (CLOS programming is applied identically to every
+// package by ConfigureClos). Reading a fixed punit keeps the
+// answer reproducible in assoc-only mode, where the CLOS
+// configuration is programmed out-of-band and punits may differ.
+// The second return value is false when SST is unsupported, there
+// is no package or punit, or closID is out of range; a failure to
+// read the package status is returned as an error.
+func (b *sstGoresctrl) GetClosConfig(closID int) (pctClosCfg, bool, error) {
+	if b.plat == nil {
+		return pctClosCfg{}, false, nil
+	}
+	pkgs := b.plat.Packages()
+	if len(pkgs) == 0 {
+		return pctClosCfg{}, false, nil
+	}
+	st, err := pkgs[0].GetStatus()
+	if err != nil {
+		return pctClosCfg{}, false, fmt.Errorf("GetClosConfig: package %d status: %w", pkgs[0].ID(), err)
+	}
+	punitIDs := make([]int, 0, len(st.Punits))
+	for id := range st.Punits {
+		punitIDs = append(punitIDs, int(id))
+	}
+	if len(punitIDs) == 0 {
+		return pctClosCfg{}, false, nil
+	}
+	sort.Ints(punitIDs)
+	pu := st.Punits[utils.ID(punitIDs[0])]
+	if closID < 0 || closID >= len(pu.Clos) {
+		return pctClosCfg{}, false, nil
+	}
+	// goresctrl reports CLOS Config.Min/MaxFreq in MHz; convert to
+	// kHz so callers always see the same unit as they passed to
+	// ConfigureClos.
+	return pctClosCfg{
+		MinFreq: pu.Clos[closID].Config.MinFreq * 1000,
+		MaxFreq: pu.Clos[closID].Config.MaxFreq * 1000,
+	}, true, nil
+}
+
+// MaxHpCpus method removed in favor of Punits().
+
+// Shutdown restores managed-mode platform state to defaults by
+// running CPReset, TFDisable and CPDisable on every package. It is
+// best-effort: a failing step does not stop the remaining steps or
+// packages from being restored. The first failure is returned and
+// any further failures are logged. Without SST it is a no-op.
+func (b *sstGoresctrl) Shutdown() error {
+	if b.plat == nil {
+		return nil
+	}
+	var firstErr error
+	for _, pkg := range b.plat.Packages() {
+		steps := []struct {
+			name string
+			run  func() error
+		}{
+			{"CPReset", pkg.CPReset},
+			{"TFDisable", pkg.TFDisable},
+			{"CPDisable", pkg.CPDisable},
+		}
+		for _, step := range steps {
+			err := step.run()
+			if err == nil {
+				continue
+			}
+			err = fmt.Errorf("%s on package %d: %w", step.name, pkg.ID(), err)
+			if firstErr == nil {
+				firstErr = err
+				continue
+			}
+			log.Warnf("pct: SST shutdown: %v", err)
+		}
+	}
+	return firstErr
+}
diff --git a/pkg/resmgr/cpuclass/internal/pct/pct_sst_mock.go b/pkg/resmgr/cpuclass/internal/pct/pct_sst_mock.go
new file mode 100644
index 000000000..77bb62777
--- /dev/null
+++ b/pkg/resmgr/cpuclass/internal/pct/pct_sst_mock.go
@@ -0,0 +1,452 @@
+// Copyright The NRI Plugins Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pct
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+	"sort"
+
+	"github.com/containers/nri-plugins/pkg/utils/cpuset"
+)
+
+// sstOverrideEnvVar holds JSON seeding the in-memory SST mock.
+// Follows the existing OVERRIDE_SYS_CACHES / OVERRIDE_SYS_CPUFREQ
+// convention in pkg/sysfs/system.go. sstOverrideStateDirVar names
+// the environment variable selecting the directory the mock
+// persists its state into, and sstOverrideStateFile is the name of
+// the state file written there.
+const (
+	sstOverrideEnvVar      = "OVERRIDE_SST"
+	sstOverrideStateDirVar = "OVERRIDE_SST_STATE_DIR"
+	sstOverrideStateFile   = "state.json"
+)
+
+// sstMockClos seeds the per-CLOS state of one package.
+type sstMockClos struct {
+	ID int `json:"id"`
+	// MinFreq and MaxFreq are stored exactly as passed to
+	// ConfigureClos and returned unconverted by GetClosConfig
+	// (presumably kHz, like pctClosConfig -- confirm for seeds).
+	MinFreq int    `json:"min_freq"`
+	MaxFreq int    `json:"max_freq"`
+	CPUs    string `json:"cpus,omitempty"` // listset like "0-15"
+}
+
+// sstMockPunit seeds one punit's CPUs and HP capacity.
+type sstMockPunit struct {
+	ID        int    `json:"id"`
+	CPUs      string `json:"cpus"` // listset
+	MaxHpCpus int    `json:"max_hp_cpus,omitempty"`
+	// GuaranteedHpCpus falls back to MaxHpCpus when left at 0.
+	GuaranteedHpCpus int `json:"guaranteed_hp_cpus,omitempty"`
+}
+
+// sstMockPackage seeds one package's worth of SST state.
+type sstMockPackage struct {
+	ID          int    `json:"id"`
+	CPUs        string `json:"cpus"` // listset of all CPUs in the package
+	TFSupported bool   `json:"tf_supported"`
+	TFEnabled   bool   `json:"tf_enabled"`
+	CPSupported bool   `json:"cp_supported"`
+	CPEnabled   bool   `json:"cp_enabled"`
+	CPPriority  string `json:"cp_priority,omitempty"` // "ordered" or "proportional"
+	// MaxHpCpus seeds a per-package HP CPU count for the
+	// back-compat case where Punits is not specified -- one
+	// synthetic punit is created containing every package CPU
+	// and this MaxHpCpus value.
+	MaxHpCpus int             `json:"max_hp_cpus,omitempty"`
+	Punits    []*sstMockPunit `json:"punits,omitempty"`
+	Clos      []*sstMockClos  `json:"clos,omitempty"`
+}
+
+// sstMockDoc is the full JSON document accepted in OVERRIDE_SST.
+type sstMockDoc struct {
+	Supported bool `json:"supported"`
+	// ClosCount defaults to 4 when the seed leaves it at 0.
+	ClosCount int               `json:"clos_count"`
+	Packages  []*sstMockPackage `json:"packages"`
+}
+
+// sstMock is an in-memory sst implementation. Seed
+// state comes from OVERRIDE_SST; mutations from policy calls are
+// recorded into the in-memory doc and persisted to a state file
+// after every operation so e2e tests can inspect the result.
+type sstMock struct {
+	doc      *sstMockDoc
+	cpuPkg   map[int]*sstMockPackage // cpu -> package
+	cpuClos  map[int]int             // cpu -> currently-associated CLOS id
+	stateDir string
+}
+
+// newSstMock builds the mock from the OVERRIDE_SST JSON document.
+// Every CPU list in the seed (package, punit and per-CLOS) is
+// validated here, so the accessors that re-parse those lists later
+// can safely ignore the parse error. ClosCount defaults to 4 and
+// the state directory to /tmp/nri-pct-mock. A failure to write the
+// initial state file is logged, not returned.
+func newSstMock(jsonData string) (sst, error) {
+	doc := &sstMockDoc{}
+	if err := json.Unmarshal([]byte(jsonData), doc); err != nil {
+		return nil, fmt.Errorf("failed to parse %s JSON: %w", sstOverrideEnvVar, err)
+	}
+	if doc.ClosCount == 0 {
+		doc.ClosCount = 4
+	}
+	b := &sstMock{
+		doc:      doc,
+		cpuPkg:   map[int]*sstMockPackage{},
+		cpuClos:  map[int]int{},
+		stateDir: os.Getenv(sstOverrideStateDirVar),
+	}
+	if b.stateDir == "" {
+		b.stateDir = "/tmp/nri-pct-mock"
+	}
+	for _, pkg := range doc.Packages {
+		cpus, err := parseCPUList(pkg.CPUs)
+		if err != nil {
+			return nil, fmt.Errorf("%s: invalid cpus %q in package %d: %w", sstOverrideEnvVar, pkg.CPUs, pkg.ID, err)
+		}
+		for _, c := range cpus {
+			b.cpuPkg[c] = pkg
+			b.cpuClos[c] = 0
+		}
+		// Reject malformed punit CPU lists up front; Punits() would
+		// otherwise silently report such a punit with no CPUs.
+		for _, pu := range pkg.Punits {
+			if _, err := parseCPUList(pu.CPUs); err != nil {
+				return nil, fmt.Errorf("%s: invalid cpus %q in package %d punit %d: %w", sstOverrideEnvVar, pu.CPUs, pkg.ID, pu.ID, err)
+			}
+		}
+		// If seed pre-associates CPUs to non-zero CLOSes, honor that.
+		for _, cl := range pkg.Clos {
+			if cl.CPUs == "" {
+				continue
+			}
+			clCpus, err := parseCPUList(cl.CPUs)
+			if err != nil {
+				return nil, fmt.Errorf("%s: invalid clos.cpus %q: %w", sstOverrideEnvVar, cl.CPUs, err)
+			}
+			for _, c := range clCpus {
+				b.cpuClos[c] = cl.ID
+			}
+		}
+	}
+	if err := b.persist(); err != nil {
+		log.Warnf("pct mock: failed to write initial state file: %v", err)
+	}
+	log.Infof("pct mock: seeded with %d package(s), supported=%v, closCount=%d, stateDir=%q",
+		len(doc.Packages), doc.Supported, doc.ClosCount, b.stateDir)
+	return b, nil
+}
+
+// Supported returns the seeded "supported" flag.
+func (b *sstMock) Supported() bool { return b.doc.Supported }
+
+// ClosCount returns the seeded (or defaulted) CLOS count.
+func (b *sstMock) ClosCount() int { return b.doc.ClosCount }
+
+// PackageIDs returns the sorted IDs of all seeded packages.
+func (b *sstMock) PackageIDs() []int {
+	ids := make([]int, 0, len(b.doc.Packages))
+	for _, p := range b.doc.Packages {
+		ids = append(ids, p.ID)
+	}
+	sort.Ints(ids)
+	return ids
+}
+
+// CPUsOfPackage returns the sorted CPUs of the first seeded package
+// with the given ID, or nil if there is none.
+func (b *sstMock) CPUsOfPackage(pkgID int) []int {
+	for _, p := range b.doc.Packages {
+		if p.ID == pkgID {
+			// Validated by newSstMock; the error cannot occur here.
+			cpus, _ := parseCPUList(p.CPUs)
+			return cpus
+		}
+	}
+	return nil
+}
+
+// pkgEnsureClos returns the CLOS entry with the given ID on pkg,
+// creating it (and keeping pkg.Clos sorted by ID) when missing.
+func (b *sstMock) pkgEnsureClos(pkg *sstMockPackage, clos int) *sstMockClos {
+	for _, c := range pkg.Clos {
+		if c.ID == clos {
+			return c
+		}
+	}
+	c := &sstMockClos{ID: clos}
+	pkg.Clos = append(pkg.Clos, c)
+	sort.Slice(pkg.Clos, func(i, j int) bool { return pkg.Clos[i].ID < pkg.Clos[j].ID })
+	return c
+}
+
+// PrepareManagedMode emulates CPReset + TFEnable + ordered priority
+// on every package and persists the result.
+func (b *sstMock) PrepareManagedMode() error {
+	for _, pkg := range b.doc.Packages {
+		// CPReset: clear CLOS configs, associate all CPUs to CLOS 0.
+		pkg.Clos = nil
+		cpus, _ := parseCPUList(pkg.CPUs)
+		for _, c := range cpus {
+			b.cpuClos[c] = 0
+		}
+		pkg.TFEnabled = true
+		pkg.CPPriority = "ordered"
+	}
+	log.Debugf("pct mock: PrepareManagedMode done (CPReset+TFEnable+CPSetPriorityType=ordered)")
+	return b.persist()
+}
+
+// ConfigureClos records the frequency bounds of cfg.ClosID on every
+// package, unconverted, and persists the result.
+func (b *sstMock) ConfigureClos(cfg pctClosConfig) error {
+	for _, pkg := range b.doc.Packages {
+		c := b.pkgEnsureClos(pkg, cfg.ClosID)
+		c.MinFreq = cfg.MinFreq
+		c.MaxFreq = cfg.MaxFreq
+	}
+	log.Debugf("pct mock: ConfigureClos %+v", cfg)
+	return b.persist()
+}
+
+// EnableCP marks SST-CP enabled on every package and persists.
+func (b *sstMock) EnableCP() error {
+	for _, pkg := range b.doc.Packages {
+		pkg.CPEnabled = true
+	}
+	log.Debugf("pct mock: EnableCP done")
+	return b.persist()
+}
+
+// AssociateCPUs binds each CPU to the indicated CLOS, refreshes the
+// per-CLOS CPU lists of every package and persists the result. The
+// request is applied atomically: if any CPU is unknown, an error is
+// returned and no association is changed.
+func (b *sstMock) AssociateCPUs(assocs []pctClosAssoc) error {
+	// Validate the whole request before mutating anything, so a bad
+	// entry cannot leave a partially applied, unpersisted state.
+	for _, a := range assocs {
+		if _, ok := b.cpuPkg[a.CPU]; !ok {
+			return fmt.Errorf("pct mock: CPU %d not present in any seeded package", a.CPU)
+		}
+	}
+	for _, a := range assocs {
+		b.cpuClos[a.CPU] = a.ClosID
+	}
+	// Refresh per-CLOS CPU lists on each package for readable state.
+	for _, pkg := range b.doc.Packages {
+		clos2cpus := map[int][]int{}
+		cpus, _ := parseCPUList(pkg.CPUs)
+		for _, c := range cpus {
+			cl := b.cpuClos[c]
+			clos2cpus[cl] = append(clos2cpus[cl], c)
+		}
+		for _, cl := range pkg.Clos {
+			cl.CPUs = formatCPUList(clos2cpus[cl.ID])
+			delete(clos2cpus, cl.ID)
+		}
+		// Whatever is left belongs to CLOSes without an entry yet.
+		for clID, list := range clos2cpus {
+			c := b.pkgEnsureClos(pkg, clID)
+			c.CPUs = formatCPUList(list)
+		}
+	}
+	log.Debugf("pct mock: AssociateCPUs %+v", assocs)
+	return b.persist()
+}
+
+// GetCPUClosID returns the CLOS currently recorded for the CPU, or
+// an error for a CPU the mock does not know.
+func (b *sstMock) GetCPUClosID(cpu int) (int, error) {
+	cl, ok := b.cpuClos[cpu]
+	if !ok {
+		return 0, fmt.Errorf("pct mock: CPU %d not present in any seeded package", cpu)
+	}
+	return cl, nil
+}
+
+// Punits returns the per-punit topology of every seeded package.
+// If a package's seed omits the Punits list, a single synthetic
+// punit (ID 0) is returned spanning every CPU of the package,
+// carrying the package-level MaxHpCpus for back-compat with the
+// pre-punit OVERRIDE_SST schema.
+func (b *sstMock) Punits() []pctPunit {
+	out := []pctPunit{}
+	// Stable order: sort packages by ID, punits by ID.
+	pkgIDs := make([]int, 0, len(b.doc.Packages))
+	pkgByID := map[int]*sstMockPackage{}
+	for _, p := range b.doc.Packages {
+		pkgIDs = append(pkgIDs, p.ID)
+		pkgByID[p.ID] = p
+	}
+	sort.Ints(pkgIDs)
+	for _, pid := range pkgIDs {
+		pkg := pkgByID[pid]
+		if len(pkg.Punits) == 0 {
+			cpus, _ := parseCPUList(pkg.CPUs)
+			out = append(out, pctPunit{
+				PkgID:            pkg.ID,
+				PunitID:          0,
+				CPUs:             cpuset.New(cpus...),
+				MaxHpCpus:        pkg.MaxHpCpus,
+				GuaranteedHpCpus: pkg.MaxHpCpus,
+			})
+			continue
+		}
+		// Sort a copy so the seeded document order is left untouched.
+		punits := append([]*sstMockPunit(nil), pkg.Punits...)
+		sort.Slice(punits, func(i, j int) bool { return punits[i].ID < punits[j].ID })
+		for _, pu := range punits {
+			// Validated by newSstMock; the error cannot occur here.
+			cpus, _ := parseCPUList(pu.CPUs)
+			gtd := pu.GuaranteedHpCpus
+			if gtd == 0 {
+				gtd = pu.MaxHpCpus
+			}
+			out = append(out, pctPunit{
+				PkgID:            pkg.ID,
+				PunitID:          pu.ID,
+				CPUs:             cpuset.New(cpus...),
+				MaxHpCpus:        pu.MaxHpCpus,
+				GuaranteedHpCpus: gtd,
+			})
+		}
+	}
+	return out
+}
+
+// GetClosConfig returns the frequency bounds currently programmed
+// for closID. The mock's CLOS state is shared across packages by
+// construction (ConfigureClos writes it to all packages); we
+// return the first package's entry. The second return value is
+// false when no package is seeded or the first package has no
+// entry for closID.
+func (b *sstMock) GetClosConfig(closID int) (pctClosCfg, bool, error) {
+	if len(b.doc.Packages) == 0 {
+		return pctClosCfg{}, false, nil
+	}
+	for _, cl := range b.doc.Packages[0].Clos {
+		if cl.ID == closID {
+			return pctClosCfg{MinFreq: cl.MinFreq, MaxFreq: cl.MaxFreq}, true, nil
+		}
+	}
+	return pctClosCfg{}, false, nil
+}
+
+// TFStatus mirrors the per-package TFEnabled flag onto each of
+// the package's punits (the mock's TF state is per-package).
+func (b *sstMock) TFStatus() (map[pctPunitID]bool, error) {
+	out := map[pctPunitID]bool{}
+	for _, pkg := range b.doc.Packages {
+		// Packages seeded without punits expose one synthetic punit 0.
+		if len(pkg.Punits) == 0 {
+			out[pctPunitID{PkgID: pkg.ID, PunitID: 0}] = pkg.TFEnabled
+			continue
+		}
+		for _, pu := range pkg.Punits {
+			out[pctPunitID{PkgID: pkg.ID, PunitID: pu.ID}] = pkg.TFEnabled
+		}
+	}
+	return out, nil
+}
+
+// Shutdown re-associates every CPU to CLOS 0, drops all CLOS
+// entries, disables TF and CP on every package and persists.
+func (b *sstMock) Shutdown() error {
+	for cpu := range b.cpuClos {
+		b.cpuClos[cpu] = 0
+	}
+	for _, pkg := range b.doc.Packages {
+		pkg.Clos = nil
+		pkg.TFEnabled = false
+		pkg.CPEnabled = false
+	}
+	log.Debugf("pct mock: Shutdown done")
+	return b.persist()
+}
+
+// persist writes the current mock document as indented JSON to the
+// state file inside stateDir, creating the directory if needed.
+func (b *sstMock) persist() error {
+	if err := os.MkdirAll(b.stateDir, 0o755); err != nil {
+		return err
+	}
+	data, err := json.MarshalIndent(b.doc, "", " ")
+	if err != nil {
+		return err
+	}
+	return os.WriteFile(filepath.Join(b.stateDir, sstOverrideStateFile), data, 0o644)
+}
+
+// parseCPUList parses a listset string like "0-3,8,10-12" into a
+// sorted slice of CPU IDs. The empty string yields (nil, nil) and
+// empty elements ("0,,1") are skipped. Malformed numbers, numbers
+// followed by other text, and reversed ranges are errors.
+func parseCPUList(s string) ([]int, error) {
+	if s == "" {
+		return nil, nil
+	}
+	out := []int{}
+	for _, part := range splitComma(s) {
+		if part == "" {
+			continue
+		}
+		lo, hi, err := parseRange(part)
+		if err != nil {
+			return nil, err
+		}
+		for i := lo; i <= hi; i++ {
+			out = append(out, i)
+		}
+	}
+	sort.Ints(out)
+	return out, nil
+}
+
+// formatCPUList formats an int slice as a listset like "0-3,8,10-12".
+// The input is not modified; an empty input yields "".
+func formatCPUList(ids []int) string {
+	if len(ids) == 0 {
+		return ""
+	}
+	sorted := append([]int(nil), ids...)
+	sort.Ints(sorted)
+	var parts []string
+	lo := sorted[0]
+	prev := lo
+	// flush emits the run [lo, prev] collected so far.
+	flush := func() {
+		if lo == prev {
+			parts = append(parts, fmt.Sprintf("%d", lo))
+		} else {
+			parts = append(parts, fmt.Sprintf("%d-%d", lo, prev))
+		}
+	}
+	for _, v := range sorted[1:] {
+		if v == prev+1 {
+			prev = v
+			continue
+		}
+		flush()
+		lo, prev = v, v
+	}
+	flush()
+	return joinComma(parts)
+}
+
+// splitComma splits s at every comma. Unlike strings.Split it
+// drops a trailing empty element ("1," yields ["1"]).
+func splitComma(s string) []string {
+	out := []string{}
+	cur := ""
+	for _, r := range s {
+		if r == ',' {
+			out = append(out, cur)
+			cur = ""
+			continue
+		}
+		cur += string(r)
+	}
+	if cur != "" {
+		out = append(out, cur)
+	}
+	return out
+}
+
+// joinComma joins parts with "," as the separator.
+func joinComma(parts []string) string {
+	out := ""
+	for i, p := range parts {
+		if i > 0 {
+			out += ","
+		}
+		out += p
+	}
+	return out
+}
+
+// parseRange parses "N" or "LO-HI" and returns the inclusive
+// bounds. The string is split at the first '-', so negative
+// numbers are not supported. A range whose end is below its start
+// is rejected instead of silently expanding to nothing.
+func parseRange(s string) (int, int, error) {
+	for i, r := range s {
+		if r == '-' {
+			lo, err := atoi(s[:i])
+			if err != nil {
+				return 0, 0, err
+			}
+			hi, err := atoi(s[i+1:])
+			if err != nil {
+				return 0, 0, err
+			}
+			if hi < lo {
+				return 0, 0, fmt.Errorf("invalid range %q: end %d is below start %d", s, hi, lo)
+			}
+			return lo, hi, nil
+		}
+	}
+	v, err := atoi(s)
+	if err != nil {
+		return 0, 0, err
+	}
+	return v, v, nil
+}
+
+// atoi parses s as a decimal integer. Surrounding whitespace is
+// tolerated, but any other text after the number is an error:
+// a bare "%d" scan would accept "3abc" as 3 and "2-3" as 2.
+func atoi(s string) (int, error) {
+	var (
+		v    int
+		rest string
+	)
+	// The extra %s verb only matches when something other than
+	// whitespace follows the number.
+	n, err := fmt.Sscanf(s, "%d%s", &v, &rest)
+	switch {
+	case n == 0:
+		return 0, fmt.Errorf("invalid integer %q: %w", s, err)
+	case n > 1:
+		return 0, fmt.Errorf("invalid integer %q: unexpected trailing %q", s, rest)
+	}
+	return v, nil
+}
diff --git a/pkg/resmgr/cpuclass/internal/pct/pct_test.go b/pkg/resmgr/cpuclass/internal/pct/pct_test.go
new file mode 100644
index 000000000..61df8bd37
--- /dev/null
+++ b/pkg/resmgr/cpuclass/internal/pct/pct_test.go
@@ -0,0 +1,1070 @@
+// Copyright The NRI Plugins Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pct
+
+import (
+	"errors"
+	"sort"
+	"testing"
+
+	gosst "github.com/intel/goresctrl/pkg/sst"
+	idset "github.com/intel/goresctrl/pkg/utils"
+
+	policyapi "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/policy"
+	"github.com/containers/nri-plugins/pkg/resmgr/cpuclass/internal/types"
+	"github.com/containers/nri-plugins/pkg/sysfs"
+	"github.com/containers/nri-plugins/pkg/utils/cpuset"
+)
+
+// errFakeSstNoClos is the sentinel error used by the fake SST
+// backend for CPUs that have no CLOS recorded.
+var errFakeSstNoClos = errors.New("fakeSst: no CLOS for CPU")
+
+// --- minimal sysfs.System / CPUPackage / CPU fakes ------------------
+
+// fakePackage implements sysfs.CPUPackage via an embedded nil
+// interface. Methods not overridden here panic if called, which is
+// the desired guardrail in unit tests.
+type fakePackage struct {
+	sysfs.CPUPackage
+	id   idset.ID      // value returned by ID()
+	cpus cpuset.CPUSet // value returned by CPUSet()
+}
+
+// ID and CPUSet are the only CPUPackage methods the fake overrides.
+func (p *fakePackage) ID() idset.ID { return p.id }
+func (p *fakePackage) CPUSet() cpuset.CPUSet { return p.cpus }
+
+// fakeCPU implements sysfs.CPU likewise.
+type fakeCPU struct {
+	sysfs.CPU
+	id  idset.ID // value returned by ID()
+	pkg idset.ID // value returned by PackageID()
+}
+
+// ID and PackageID are the only CPU methods the fake overrides.
+func (c *fakeCPU) ID() idset.ID { return c.id }
+func (c *fakeCPU) PackageID() idset.ID { return c.pkg }
+
+// fakeSys is a minimal Sys implementation built from package
+// CPU maps.
+type fakeSys struct { + packageCpus map[idset.ID]cpuset.CPUSet // pkgID -> cpus + cpuPkg map[int]idset.ID // cpu -> pkgID +} + +func (s *fakeSys) PackageIDs() []idset.ID { + ids := make([]idset.ID, 0, len(s.packageCpus)) + for id := range s.packageCpus { + ids = append(ids, id) + } + return ids +} + +func (s *fakeSys) Package(id idset.ID) sysfs.CPUPackage { + cpus, ok := s.packageCpus[id] + if !ok { + return nil + } + return &fakePackage{id: id, cpus: cpus} +} + +func (s *fakeSys) CPU(id idset.ID) sysfs.CPU { + pkg, ok := s.cpuPkg[int(id)] + if !ok { + return nil + } + return &fakeCPU{id: id, pkg: pkg} +} + +func (s *fakeSys) CPUIDs() []idset.ID { return nil } + +// newTwoPackageFakeSys returns a fakeSys with two packages of 4 CPUs +// each: pkg0=0..3, pkg1=4..7. +func newTwoPackageFakeSys() *fakeSys { + return &fakeSys{ + packageCpus: map[idset.ID]cpuset.CPUSet{ + 0: cpuset.MustParse("0-3"), + 1: cpuset.MustParse("4-7"), + }, + cpuPkg: map[int]idset.ID{ + 0: 0, 1: 0, 2: 0, 3: 0, + 4: 1, 5: 1, 6: 1, 7: 1, + }, + } +} + +// --- minimal sst fake ------------------------------------------------ + +// fakeSst implements just the methods that Allocator.hints (and +// closCpus) actually call. +type fakeSst struct { + supported bool + cpuClos map[int]int // cpu -> CLOS id + maxHp map[int]int // pkgID -> max HP CPUs (missing = "unknown") + pkgCpus map[int]cpuset.CPUSet + // punits, when non-nil, overrides the synthesized one-punit-per-package + // Punits() output. Use to exercise multi-punit-per-package layouts. + punits []pctPunit + // closCfg, when non-nil, drives GetClosConfig() responses. 
+ closCfg map[int]pctClosCfg +} + +func (s *fakeSst) Supported() bool { return s.supported } +func (s *fakeSst) ClosCount() int { return 4 } +func (s *fakeSst) PackageIDs() []int { return nil } +func (s *fakeSst) CPUsOfPackage(int) []int { return nil } +func (s *fakeSst) PrepareManagedMode() error { return nil } +func (s *fakeSst) ConfigureClos(pctClosConfig) error { return nil } +func (s *fakeSst) EnableCP() error { return nil } +func (s *fakeSst) AssociateCPUs([]pctClosAssoc) error { return nil } +func (s *fakeSst) GetCPUClosID(cpu int) (int, error) { + if clos, ok := s.cpuClos[cpu]; ok { + return clos, nil + } + // Return an error so closCpus skips this CPU rather than + // treating it as "associated to CLOS 0 by default". + return -1, errFakeSstNoClos +} + +// Punits synthesizes one punit per package whose CPUs come from +// pkgCpus (or maxHp keys if pkgCpus is nil) with MaxHpCpus set +// from the maxHp map. PunitID is always 0 (single punit per pkg +// preserves the legacy per-package test semantics). +func (s *fakeSst) Punits() []pctPunit { + if s.punits != nil { + out := make([]pctPunit, len(s.punits)) + copy(out, s.punits) + return out + } + pkgIDs := map[int]struct{}{} + for id := range s.pkgCpus { + pkgIDs[id] = struct{}{} + } + for id := range s.maxHp { + pkgIDs[id] = struct{}{} + } + out := make([]pctPunit, 0, len(pkgIDs)) + for id := range pkgIDs { + cpus, ok := s.pkgCpus[id] + if !ok { + // Derive a default cpu range matching newTwoPackageFakeSys layout. 
+ switch id { + case 0: + cpus = cpuset.MustParse("0-3") + case 1: + cpus = cpuset.MustParse("4-7") + } + } + out = append(out, pctPunit{ + PkgID: id, + PunitID: 0, + CPUs: cpus, + MaxHpCpus: s.maxHp[id], + }) + } + sort.Slice(out, func(i, j int) bool { + if out[i].PkgID != out[j].PkgID { + return out[i].PkgID < out[j].PkgID + } + return out[i].PunitID < out[j].PunitID + }) + return out +} + +func (s *fakeSst) GetClosConfig(closID int) (pctClosCfg, bool, error) { + if c, ok := s.closCfg[closID]; ok { + return c, true, nil + } + return pctClosCfg{}, false, nil +} + +func (s *fakeSst) Shutdown() error { return nil } + +func (s *fakeSst) TFStatus() (map[pctPunitID]bool, error) { + // Tests do not care about SST-TF; report enabled everywhere. + out := map[pctPunitID]bool{} + for _, pu := range s.Punits() { + out[pctPunitID{PkgID: pu.PkgID, PunitID: pu.PunitID}] = true + } + return out, nil +} + +// --- helpers to construct a hand-wired Allocator ----------------- + +func newManagedPctForTest(t *testing.T, classes []*policyapi.CPUClass, plans map[string]*pctClassPlan, + allowed cpuset.CPUSet, sys *fakeSys, sst *fakeSst) *Allocator { + t.Helper() + a := &Allocator{ + sys: sys, + sst: sst, + mode: pctModeManaged, + classByName: map[string]*policyapi.CPUClass{}, + classPlan: plans, + allowed: allowed, + hpUsed: map[int]cpuset.CPUSet{}, + hpClasses: map[string]bool{}, + } + for _, cc := range classes { + a.classByName[cc.Name] = cc + if cc.PctPriority == "high" { + a.hpClasses[cc.Name] = true + } + } + pctTestWirePunits(a) + return a +} + +// pctTestWirePunits seeds a hand-built Allocator's punit caches +// from its sst's Punits(), intersected with allowed. It is the +// test-time equivalent of snapshotPunits() and lets struct-literal +// fixtures exercise the punit-keyed code paths. 
+func pctTestWirePunits(a *Allocator) { + if a.punitByCpu == nil { + a.punitByCpu = map[int]int{} + } + if a.hpClasses == nil { + a.hpClasses = map[string]bool{} + } + if a.hpEligiblePunit == nil { + a.hpEligiblePunit = map[int]bool{} + } + for name, cc := range a.classByName { + if cc.PctPriority == "high" { + a.hpClasses[name] = true + } + } + pus := a.sst.Punits() + a.punits = a.punits[:0] + for _, pu := range pus { + cpus := pu.CPUs + if a.allowed.Size() > 0 { + cpus = cpus.Intersection(a.allowed) + } + if cpus.IsEmpty() { + continue + } + idx := len(a.punits) + a.punits = append(a.punits, pctPunit{ + PkgID: pu.PkgID, PunitID: pu.PunitID, + CPUs: cpus, MaxHpCpus: pu.MaxHpCpus, + GuaranteedHpCpus: pu.GuaranteedHpCpus, + }) + for _, c := range cpus.UnsortedList() { + a.punitByCpu[c] = idx + } + // Default to HP-eligible so existing tests that don't + // care about TF state keep working. Tests that exercise + // HP-ineligibility set hpEligiblePunit explicitly after + // calling this helper. + a.hpEligiblePunit[idx] = true + } +} + +// --- hints() test suite --------------------------------------------- + +// TestPctHintsNoClassNoOp covers the "no plan and not managed-with-HP" +// branch where hints() must return an empty types.AllocationHints. +func TestPctHintsNoClassNoOp(t *testing.T) { + sys := newTwoPackageFakeSys() + sst := &fakeSst{supported: true} + + // disabled allocator: hints must short-circuit to empty. + a := &Allocator{sys: sys, sst: sst, mode: pctModeDisabled} + got := a.Hints(types.AllocationIntent{ClassName: "anything"}) + if len(got.Prefer) != 0 || len(got.Avoid) != 0 { + t.Errorf("disabled mode: hints=%+v, want empty", got) + } + + // managed mode with no HP class defined and an unknown + // className: no prefer, no avoid. + classes := []*policyapi.CPUClass{{Name: "lp", PctPriority: "low"}} + // "lp" is configured but classIsHighPriority is false; still the + // "anyHighPriorityClassDefined" gate must be false so no Avoid. 
+ a2 := newManagedPctForTest(t, classes, + map[string]*pctClassPlan{"lp": {ClosID: 3}}, + cpuset.MustParse("0-7"), sys, sst) + got = a2.Hints(types.AllocationIntent{ClassName: "unknown-class"}) + if len(got.Avoid) != 0 { + t.Errorf("no HP class: Avoid=%+v, want empty", got.Avoid) + } +} + +// TestPctHintsAssocOnlyPreferClosCpus covers the "explicit CLOS plan" +// branch in assoc-only mode: hints prefer CPUs already associated to +// the class's CLOS, enabling bin packing. +func TestPctHintsAssocOnlyPreferClosCpus(t *testing.T) { + sys := newTwoPackageFakeSys() + sst := &fakeSst{ + supported: true, + // cpus 2 and 3 already on CLOS 1, others on default CLOS 0. + cpuClos: map[int]int{2: 1, 3: 1}, + } + a := &Allocator{ + sys: sys, + sst: sst, + mode: pctModeAssocOnly, + classByName: map[string]*policyapi.CPUClass{"c1": {Name: "c1"}}, + classPlan: map[string]*pctClassPlan{"c1": {ClosID: 1}}, + allowed: cpuset.MustParse("0-7"), + hpUsed: map[int]cpuset.CPUSet{}, + } + pctTestWirePunits(a) + got := a.Hints(types.AllocationIntent{ClassName: "c1"}) + if len(got.Prefer) != 1 { + t.Fatalf("Prefer count = %d, want 1: got=%+v", len(got.Prefer), got) + } + if got.Prefer[0].Name != virtDevSstClosHint(1) { + t.Errorf("Prefer[0].Name = %q, want %q", got.Prefer[0].Name, virtDevSstClosHint(1)) + } + want := cpuset.MustParse("2-3") + if !got.Prefer[0].Cpus.Equals(want) { + t.Errorf("Prefer[0].Cpus = %s, want %s", got.Prefer[0].Cpus, want) + } + if len(got.Avoid) != 0 { + t.Errorf("assoc-only mode must not emit Avoid hints: %+v", got.Avoid) + } +} + +// TestPctHintsHighPriorityReserveAndClosCpus covers the HP class +// branch: hints contain (a) CPUs already on the HP CLOS for bin +// packing and (b) the HP-reserve preference (largest-room package). +func TestPctHintsHighPriorityReserveAndClosCpus(t *testing.T) { + sys := newTwoPackageFakeSys() + sst := &fakeSst{ + supported: true, + // cpu 0 already on CLOS 0 (HP). 
+ cpuClos: map[int]int{0: 0}, + // max_hp_cpus = 2 per package on both packages. + maxHp: map[int]int{0: 2, 1: 2}, + } + a := &Allocator{ + sys: sys, + sst: sst, + mode: pctModeManaged, + classByName: map[string]*policyapi.CPUClass{ + "hp": {Name: "hp", PctPriority: "high"}, + }, + classPlan: map[string]*pctClassPlan{"hp": {ClosID: 0}}, + allowed: cpuset.MustParse("0-7"), + // pkg0 has 1 HP cpu already used (cpu 0). + hpUsed: map[int]cpuset.CPUSet{0: cpuset.MustParse("0")}, + } + pctTestWirePunits(a) + + // Free pool excludes the already-used cpu 0. + free := cpuset.MustParse("1-7") + got := a.Hints(types.AllocationIntent{ + ClassName: "hp", + CurrentCpus: cpuset.New(), + FreeCpus: free, + RequestedCount: 1, + }) + + // Expect two Prefer hints: CLOS 0 members (cpu 0) and HP reserve + // (the package with more HP room - pkg1, since pkg0 has 2-1=1 + // room left and pkg1 has 2-0=2 room left). + if len(got.Prefer) != 2 { + t.Fatalf("Prefer count = %d, want 2: got=%+v", len(got.Prefer), got.Prefer) + } + if got.Prefer[0].Name != virtDevSstClosHint(0) { + t.Errorf("Prefer[0].Name = %q, want %q", got.Prefer[0].Name, virtDevSstClosHint(0)) + } + if got.Prefer[1].Name != virtDevSstHpReserveHint { + t.Errorf("Prefer[1].Name = %q, want %q", got.Prefer[1].Name, virtDevSstHpReserveHint) + } + wantReserve := cpuset.MustParse("4-7") + if !got.Prefer[1].Cpus.Equals(wantReserve) { + t.Errorf("HP reserve = %s, want %s (largest-room package)", got.Prefer[1].Cpus, wantReserve) + } + // HP-class hints must NOT carry an Avoid (HP picks first). + if len(got.Avoid) != 0 { + t.Errorf("HP class: Avoid=%+v, want empty", got.Avoid) + } +} + +// TestPctHintsManagedNonHpAvoidsHpInUse covers the managed-mode +// non-HP-class branch: hints must Avoid CPUs on packages currently +// hosting HP-class CPUs, so non-HP classes do not steal HP turbo +// budget. THIS BRANCH IS NOT COVERED IN test19 e2e. 
+func TestPctHintsManagedNonHpAvoidsHpInUse(t *testing.T) {
+	sys := newTwoPackageFakeSys()
+	// The empty cpuClos map means the fake has no CLOS recorded for
+	// any CPU.
+	sst := &fakeSst{
+		supported: true,
+		cpuClos:   map[int]int{},
+		maxHp:     map[int]int{0: 2, 1: 2},
+	}
+	a := &Allocator{
+		sys:  sys,
+		sst:  sst,
+		mode: pctModeManaged,
+		classByName: map[string]*policyapi.CPUClass{
+			"hp": {Name: "hp", PctPriority: "high"},
+			"lp": {Name: "lp", PctPriority: "low"},
+		},
+		classPlan: map[string]*pctClassPlan{
+			"hp": {ClosID: 0},
+			"lp": {ClosID: 3},
+		},
+		allowed: cpuset.MustParse("0-7"),
+		// pkg0 hosts HP cpu 1.
+		hpUsed: map[int]cpuset.CPUSet{0: cpuset.MustParse("1")},
+	}
+	pctTestWirePunits(a)
+	// Ask for hints as the low-priority class; only ClassName and
+	// FreeCpus are supplied in the intent.
+	got := a.Hints(types.AllocationIntent{
+		ClassName: "lp",
+		FreeCpus:  cpuset.MustParse("2-7"),
+	})
+
+	// LP has a CLOS plan, so Prefer must include CLOS 3 (empty in
+	// our setup) - but only if any CPU is currently on CLOS 3. With
+	// none, classClosID still matches but closCpus returns empty
+	// and the Prefer entry is skipped. So len(Prefer) == 0.
+	if len(got.Prefer) != 0 {
+		t.Errorf("Prefer = %+v, want empty (no LP CPUs currently on CLOS 3)", got.Prefer)
+	}
+	// Avoid must list pkg0's full CPU set (where HP is in use).
+	// Fatalf rather than Errorf: the checks below index Avoid[0].
+	if len(got.Avoid) != 1 {
+		t.Fatalf("Avoid count = %d, want 1: got=%+v", len(got.Avoid), got.Avoid)
+	}
+	if got.Avoid[0].Name != virtDevSstHpInUseHint {
+		t.Errorf("Avoid[0].Name = %q, want %q", got.Avoid[0].Name, virtDevSstHpInUseHint)
+	}
+	wantAvoid := cpuset.MustParse("0-3") // entire pkg0
+	if !got.Avoid[0].Cpus.Equals(wantAvoid) {
+		t.Errorf("Avoid[0].Cpus = %s, want %s (pkg0 == HP-in-use package)", got.Avoid[0].Cpus, wantAvoid)
+	}
+}
+
+// TestPctHintsAllowedBoundsResults ensures that even with sst /
+// hpUsed pointing at CPUs outside the allowed set, hints honor
+// Allowed (via the handler-level intersectHints + pct-internal
+// allowed intersections).
+func TestPctHintsAllowedBoundsResults(t *testing.T) {
+	sys := newTwoPackageFakeSys()
+	sst := &fakeSst{
+		supported: true,
+		cpuClos:   map[int]int{0: 0, 4: 0}, // HP cpus on both packages
+		maxHp:     map[int]int{0: 2, 1: 2},
+	}
+	a := &Allocator{
+		sys:  sys,
+		sst:  sst,
+		mode: pctModeManaged,
+		classByName: map[string]*policyapi.CPUClass{
+			"hp": {Name: "hp", PctPriority: "high"},
+		},
+		classPlan: map[string]*pctClassPlan{"hp": {ClosID: 0}},
+		// allowed restricts to pkg0 only.
+		allowed: cpuset.MustParse("0-3"),
+		// NOTE(review): pctTestWirePunits skips punits whose CPUs are
+		// all outside allowed, so with allowed=0-3 there appears to be
+		// no wired punit at index 1 -- confirm the second entry is
+		// actually consulted by Hints.
+		hpUsed: map[int]cpuset.CPUSet{
+			0: cpuset.MustParse("0"),
+			1: cpuset.MustParse("4"), // outside allowed
+		},
+	}
+	pctTestWirePunits(a)
+	got := a.Hints(types.AllocationIntent{
+		ClassName:      "hp",
+		FreeCpus:       cpuset.MustParse("1-3"),
+		RequestedCount: 1,
+	})
+	// closCpus walks a.allowed, so cpu 4 is excluded automatically.
+	// Prefer[0] (closCpus) must contain only cpu 0.
+	if len(got.Prefer) == 0 {
+		t.Fatalf("Prefer empty, want at least closCpus hint")
+	}
+	if !got.Prefer[0].Cpus.Equals(cpuset.MustParse("0")) {
+		t.Errorf("Prefer[0].Cpus = %s, want {0} (cpu 4 outside allowed)", got.Prefer[0].Cpus)
+	}
+	// HP reserve must come from a package whose free CPUs are
+	// inside allowed; only pkg0 qualifies.
+	// NOTE(review): this check is conditional -- when only one Prefer
+	// hint is returned the HP-reserve expectation is not asserted.
+	if len(got.Prefer) >= 2 {
+		want := cpuset.MustParse("1-3")
+		if !got.Prefer[1].Cpus.Equals(want) {
+			t.Errorf("HP reserve = %s, want %s (pkg0 free cpus inside allowed)", got.Prefer[1].Cpus, want)
+		}
+	}
+}
+
+// --- Tier A/B/C reservation tests ----------------------------------
+
+// newTwoPunitFakeSys returns a fakeSys whose package layout matches
+// the standard two-punit-per-package fixture below: pkg0 = 0..7
+// (punit-0 = 0..3, punit-1 = 4..7), pkg1 = 8..15 (punit-2 = 8..11,
+// punit-3 = 12..15). The synthesis function does not know about
+// punits, only packages.
+func newTwoPunitFakeSys() *fakeSys { + return &fakeSys{ + packageCpus: map[idset.ID]cpuset.CPUSet{ + 0: cpuset.MustParse("0-7"), + 1: cpuset.MustParse("8-15"), + }, + cpuPkg: map[int]idset.ID{ + 0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0, + 8: 1, 9: 1, 10: 1, 11: 1, 12: 1, 13: 1, 14: 1, 15: 1, + }, + } +} + +// makeTwoPunitsPerPkg returns four punits laid out as in +// newTwoPunitFakeSys, with the given MaxHpCpus per punit. +func makeTwoPunitsPerPkg(hp0, hp1, hp2, hp3 int) []pctPunit { + return []pctPunit{ + {PkgID: 0, PunitID: 0, CPUs: cpuset.MustParse("0-3"), MaxHpCpus: hp0}, + {PkgID: 0, PunitID: 1, CPUs: cpuset.MustParse("4-7"), MaxHpCpus: hp1}, + {PkgID: 1, PunitID: 2, CPUs: cpuset.MustParse("8-11"), MaxHpCpus: hp2}, + {PkgID: 1, PunitID: 3, CPUs: cpuset.MustParse("12-15"), MaxHpCpus: hp3}, + } +} + +// TestPctHints_HpRoomTierAPunitWins: punit-0 is fully occupied by +// HP work, punit-1 in the same package has full HP room. A request +// for 1 HP CPU must steer to punit-1 (Tier A), not to pkg1. +func TestPctHints_HpRoomTierAPunitWins(t *testing.T) { + sys := newTwoPunitFakeSys() + sst := &fakeSst{ + supported: true, + punits: makeTwoPunitsPerPkg(2, 2, 2, 2), + } + a := &Allocator{ + sys: sys, + sst: sst, + mode: pctModeManaged, + classByName: map[string]*policyapi.CPUClass{"hp": {Name: "hp", PctPriority: "high"}}, + classPlan: map[string]*pctClassPlan{"hp": {ClosID: 0}}, + allowed: cpuset.MustParse("0-15"), + // Punit-0 fully booked with HP (cpus 0,1 take both HP slots). + hpUsed: map[int]cpuset.CPUSet{0: cpuset.MustParse("0-1")}, + } + pctTestWirePunits(a) + + got := a.Hints(types.AllocationIntent{ + ClassName: "hp", + FreeCpus: cpuset.MustParse("2-15"), + RequestedCount: 1, + }) + + // Find HP reserve hint. 
+ var reserve cpuset.CPUSet + for _, p := range got.Prefer { + if p.Name == virtDevSstHpReserveHint { + reserve = p.Cpus + } + } + if reserve.IsEmpty() { + t.Fatalf("expected HP reserve hint, got Prefer=%+v", got.Prefer) + } + // Tier A: punit-1 (room=2) beats punit-0 (room=0) and the + // equal-room punits in pkg1 because punit-0/punit-1 both belong + // to pkg0 -- here we pick by largest room. + // Actually both punit-1 (room=2), punit-2 (room=2), punit-3 + // (room=2) tie; tie-break by free-CPU count (all 4) and then + // by iteration order (slice index 1 first). So expect punit-1. + want := cpuset.MustParse("4-7") + if !reserve.Equals(want) { + t.Errorf("Tier A HP reserve = %s, want %s (punit-1)", reserve, want) + } +} + +// TestPctHints_HpRoomTierBSamePackage: punit-0 and punit-1 in pkg0 +// each have only 1 HP slot left, but together they offer 2 slots -- +// enough for the request. Pkg1 has only 1 HP slot in total. The +// Tier-B aggregate must steer to pkg0 (free CPUs of both punits). +func TestPctHints_HpRoomTierBSamePackage(t *testing.T) { + sys := newTwoPunitFakeSys() + sst := &fakeSst{ + supported: true, + punits: makeTwoPunitsPerPkg(2, 2, 1, 0), + } + a := &Allocator{ + sys: sys, + sst: sst, + mode: pctModeManaged, + classByName: map[string]*policyapi.CPUClass{"hp": {Name: "hp", PctPriority: "high"}}, + classPlan: map[string]*pctClassPlan{"hp": {ClosID: 0}}, + allowed: cpuset.MustParse("0-15"), + // Both pkg0 punits already host 1 HP CPU each, leaving room=1 in each. 
+ hpUsed: map[int]cpuset.CPUSet{ + 0: cpuset.MustParse("0"), // punit-0 idx 0 + 1: cpuset.MustParse("4"), // punit-1 idx 1 + }, + } + pctTestWirePunits(a) + + got := a.Hints(types.AllocationIntent{ + ClassName: "hp", + FreeCpus: cpuset.MustParse("1-3,5-15"), + RequestedCount: 2, + }) + var reserve cpuset.CPUSet + for _, p := range got.Prefer { + if p.Name == virtDevSstHpReserveHint { + reserve = p.Cpus + } + } + if reserve.IsEmpty() { + t.Fatalf("expected HP reserve hint, got Prefer=%+v", got.Prefer) + } + // Tier A is impossible (no single punit has room>=2 in pkg0, + // and pkg1 punit-2 has 1 cpu only). Tier B: pkg0 sum-room=2 + // >= 2, pkg1 sum-room=1 < 2. Reserve = pkg0 free CPUs. + want := cpuset.MustParse("1-3,5-7") + if !reserve.Equals(want) { + t.Errorf("Tier B HP reserve = %s, want %s (pkg0 union)", reserve, want) + } +} + +// TestPctHints_HpRoomTierCNoCrossPackage: request exceeds the HP +// room of every single package. Tier C is never taken - the +// allocator must return no HP-reserve hint so the caller falls back +// to topology-only placement on the same socket. +func TestPctHints_HpRoomTierCNoCrossPackage(t *testing.T) { + sys := newTwoPunitFakeSys() + sst := &fakeSst{ + supported: true, + // pkg0 has 2 HP CPUs total, pkg1 has 2 HP CPUs total. 
+ punits: makeTwoPunitsPerPkg(1, 1, 1, 1), + } + a := &Allocator{ + sys: sys, + sst: sst, + mode: pctModeManaged, + classByName: map[string]*policyapi.CPUClass{"hp": {Name: "hp", PctPriority: "high"}}, + classPlan: map[string]*pctClassPlan{"hp": {ClosID: 0}}, + allowed: cpuset.MustParse("0-15"), + hpUsed: map[int]cpuset.CPUSet{}, + } + pctTestWirePunits(a) + + got := a.Hints(types.AllocationIntent{ + ClassName: "hp", + FreeCpus: cpuset.MustParse("0-15"), + RequestedCount: 3, // > any single package's HP capacity (2) + }) + for _, p := range got.Prefer { + if p.Name == virtDevSstHpReserveHint { + t.Errorf("Tier C must not emit HP reserve hint; got %+v", p) + } + } +} + +// TestPctHints_HpInUseIsPunitGranular: managed-mode non-HP class +// must Avoid only the punits currently hosting HP work, not the +// entire package. This is a regression guard for the punit-keyed +// rewrite of hpInUseCpus. +func TestPctHints_HpInUseIsPunitGranular(t *testing.T) { + sys := newTwoPunitFakeSys() + sst := &fakeSst{ + supported: true, + punits: makeTwoPunitsPerPkg(2, 2, 2, 2), + } + a := &Allocator{ + sys: sys, + sst: sst, + mode: pctModeManaged, + classByName: map[string]*policyapi.CPUClass{ + "hp": {Name: "hp", PctPriority: "high"}, + "lp": {Name: "lp", PctPriority: "low"}, + }, + classPlan: map[string]*pctClassPlan{ + "hp": {ClosID: 0}, + "lp": {ClosID: 3}, + }, + allowed: cpuset.MustParse("0-15"), + // HP work on punit-0 only (pkg0). + hpUsed: map[int]cpuset.CPUSet{0: cpuset.MustParse("0")}, + } + pctTestWirePunits(a) + + got := a.Hints(types.AllocationIntent{ + ClassName: "lp", + FreeCpus: cpuset.MustParse("1-15"), + }) + if len(got.Avoid) != 1 { + t.Fatalf("Avoid count = %d, want 1: got=%+v", len(got.Avoid), got.Avoid) + } + // Must be punit-0 (cpus 0-3) ONLY, not all of pkg0 (0-7). 
+ want := cpuset.MustParse("0-3") + if !got.Avoid[0].Cpus.Equals(want) { + t.Errorf("Avoid = %s, want %s (punit-0 only, not full pkg0)", got.Avoid[0].Cpus, want) + } +} + +// --- classifyAssocOnlyHP tests ------------------------------------- + +// TestPctClassifyAssocOnlyHP_MaxFreqWins: of two referenced CLOSes +// with programmed MaxFreq, the larger MaxFreq is the HP class. +func TestPctClassifyAssocOnlyHP_MaxFreqWins(t *testing.T) { + a := &Allocator{ + sst: &fakeSst{ + supported: true, + closCfg: map[int]pctClosCfg{ + 1: {MinFreq: 1000000, MaxFreq: 3000000}, // base-ish + 2: {MinFreq: 2000000, MaxFreq: 3800000}, // turbo + }, + }, + classPlan: map[string]*pctClassPlan{ + "c-base": {ClosID: 1}, + "c-turbo": {ClosID: 2}, + }, + hpClasses: map[string]bool{}, + } + classes := []*policyapi.CPUClass{ + {Name: "c-base"}, + {Name: "c-turbo"}, + } + a.classifyAssocOnlyHP(classes) + if a.hpClasses["c-base"] { + t.Errorf("c-base must NOT be classified HP (lower MaxFreq)") + } + if !a.hpClasses["c-turbo"] { + t.Errorf("c-turbo must be classified HP (higher MaxFreq)") + } +} + +// TestPctClassifyAssocOnlyHP_TieBreakSmallerClos: when two CLOSes +// share the highest MaxFreq, the smaller CLOS id wins (SST-CP +// ordered-priority convention). +func TestPctClassifyAssocOnlyHP_TieBreakSmallerClos(t *testing.T) { + a := &Allocator{ + sst: &fakeSst{ + supported: true, + closCfg: map[int]pctClosCfg{ + 1: {MaxFreq: 3800000}, + 2: {MaxFreq: 3800000}, // tie + }, + }, + classPlan: map[string]*pctClassPlan{ + "c1": {ClosID: 1}, + "c2": {ClosID: 2}, + }, + hpClasses: map[string]bool{}, + } + a.classifyAssocOnlyHP([]*policyapi.CPUClass{{Name: "c1"}, {Name: "c2"}}) + if !a.hpClasses["c1"] { + t.Errorf("c1 must win tie (smaller CLOS id)") + } + if a.hpClasses["c2"] { + t.Errorf("c2 must NOT be HP (lost tie)") + } +} + +// TestPctClassifyAssocOnlyHP_NoProgrammedFreq: when no CLOS has a +// programmed MaxFreq, no class is classified HP -- HP-specific +// hints stay quiet. 
+func TestPctClassifyAssocOnlyHP_NoProgrammedFreq(t *testing.T) {
+	a := &Allocator{
+		// Empty closCfg: the fake has no configuration for any CLOS.
+		sst: &fakeSst{supported: true, closCfg: map[int]pctClosCfg{}},
+		classPlan: map[string]*pctClassPlan{
+			"c1": {ClosID: 1},
+		},
+		hpClasses: map[string]bool{},
+	}
+	a.classifyAssocOnlyHP([]*policyapi.CPUClass{{Name: "c1"}})
+	// The map must stay empty, not merely hold false entries.
+	if len(a.hpClasses) != 0 {
+		t.Errorf("hpClasses=%v, want empty when no CLOS has programmed MaxFreq", a.hpClasses)
+	}
+}
+
+// TestPctClassifyAssocOnlyHP_ZeroMaxFreqIgnored: a CLOS that
+// returns (cfg, true, nil) but with MaxFreq==0 must not be
+// classified HP (zero is "not specified").
+func TestPctClassifyAssocOnlyHP_ZeroMaxFreqIgnored(t *testing.T) {
+	a := &Allocator{
+		sst: &fakeSst{
+			supported: true,
+			closCfg:   map[int]pctClosCfg{1: {MinFreq: 1000000}}, // MaxFreq=0
+		},
+		classPlan: map[string]*pctClassPlan{"c1": {ClosID: 1}},
+		hpClasses: map[string]bool{},
+	}
+	a.classifyAssocOnlyHP([]*policyapi.CPUClass{{Name: "c1"}})
+	if a.hpClasses["c1"] {
+		t.Errorf("c1 must NOT be HP when MaxFreq=0")
+	}
+}
+
+// --- BF fallback test ----------------------------------------------
+
+// TestPctPunitMaxHpCpus_BfFallback: punit with TF unsupported but
+// BF-supported high-priority CPU set must report MaxHpCpus equal
+// to len(BF.HighPriorityCPUs).
+func TestPctPunitMaxHpCpus_BfFallback(t *testing.T) {
+	pi := &gosst.PerfLevelInfo{
+		BF: gosst.BFInfo{
+			Supported: true,
+			// Four high-priority CPUs, matching the expected value 4.
+			HighPriorityCPUs: idset.NewIDSet(0, 1, 2, 3),
+		},
+		TF: gosst.TFInfo{Supported: false},
+	}
+	if got := punitMaxHpCpus(pi); got != 4 {
+		t.Errorf("punitMaxHpCpus = %d, want 4 (BF fallback)", got)
+	}
+}
+
+// TestPctPunitMaxHpCpus_TfWins: when both TF and BF are present,
+// TF takes precedence (largest bucket HighPriorityCoreCount sets
+// the cap).
+func TestPctPunitMaxHpCpus_TfWins(t *testing.T) {
+	// The BF set has 2 CPUs while the largest TF bucket has 4, so the
+	// expected value 4 can only come from the TF buckets.
+	pi := &gosst.PerfLevelInfo{
+		BF: gosst.BFInfo{
+			Supported:        true,
+			HighPriorityCPUs: idset.NewIDSet(0, 1), // 2
+		},
+		TF: gosst.TFInfo{
+			Supported: true,
+			Buckets: []gosst.TFBucketInfo{
+				{ID: 0, HighPriorityCoreCount: 1},
+				{ID: 1, HighPriorityCoreCount: 4}, // max
+				{ID: 2, HighPriorityCoreCount: 2},
+			},
+		},
+	}
+	if got := punitMaxHpCpus(pi); got != 4 {
+		t.Errorf("punitMaxHpCpus = %d, want 4 (largest TF bucket)", got)
+	}
+}
+
+// TestPctPunitMaxHpCpus_NeitherSupported: with neither TF nor BF
+// supported, MaxHpCpus is 0 (the allocator excludes such punits
+// from HP steering).
+func TestPctPunitMaxHpCpus_NeitherSupported(t *testing.T) {
+	// Zero-value info: no BF or TF field is set.
+	pi := &gosst.PerfLevelInfo{}
+	if got := punitMaxHpCpus(pi); got != 0 {
+		t.Errorf("punitMaxHpCpus = %d, want 0", got)
+	}
+}
+
+// TestPctPunitGuaranteedHpCpus_TfSmallestBucket: with multiple
+// non-zero TF buckets, the guaranteed top-turbo HP CPU count is
+// the smallest HighPriorityCoreCount (smaller buckets unlock
+// higher turbo frequencies).
+func TestPctPunitGuaranteedHpCpus_TfSmallestBucket(t *testing.T) {
+	// NOTE(review): every bucket here is non-zero, so this fixture
+	// does not exercise the handling of a zero-count bucket.
+	pi := &gosst.PerfLevelInfo{
+		TF: gosst.TFInfo{
+			Supported: true,
+			Buckets: []gosst.TFBucketInfo{
+				{ID: 0, HighPriorityCoreCount: 24},
+				{ID: 1, HighPriorityCoreCount: 8}, // smallest non-zero
+				{ID: 2, HighPriorityCoreCount: 16},
+			},
+		},
+	}
+	if got := punitGuaranteedHpCpus(pi); got != 8 {
+		t.Errorf("punitGuaranteedHpCpus = %d, want 8 (smallest TF bucket)", got)
+	}
+}
+
+// TestPctPunitGuaranteedHpCpus_BfFallback: when TF is
+// unsupported, fall back to len(BF.HighPriorityCPUs).
+func TestPctPunitGuaranteedHpCpus_BfFallback(t *testing.T) {
+	// TF is left at its zero value; only BF is populated.
+	pi := &gosst.PerfLevelInfo{
+		BF: gosst.BFInfo{
+			Supported:        true,
+			HighPriorityCPUs: idset.NewIDSet(0, 1, 2, 3),
+		},
+	}
+	if got := punitGuaranteedHpCpus(pi); got != 4 {
+		t.Errorf("punitGuaranteedHpCpus = %d, want 4 (BF fallback)", got)
+	}
+}
+
+// TestPctPunitGuaranteedHpCpus_NeitherSupported: neither TF nor
+// BF -> 0.
+func TestPctPunitGuaranteedHpCpus_NeitherSupported(t *testing.T) {
+	pi := &gosst.PerfLevelInfo{}
+	if got := punitGuaranteedHpCpus(pi); got != 0 {
+		t.Errorf("punitGuaranteedHpCpus = %d, want 0", got)
+	}
+}
+
+// --- FreeClassCapacity test suite -----------------------------------
+
+// newAssocOnlyPctForTest mirrors newManagedPctForTest but configures
+// the allocator in assoc-only mode. classPlan is taken from plans
+// and classByName is filled from classes. hpClasses starts empty
+// here; unlike newManagedPctForTest this helper does not itself
+// mark "high"-priority classes, although pctTestWirePunits still
+// does so for any class whose PctPriority is "high". Every wired
+// punit is marked HP-eligible by pctTestWirePunits. Callers that
+// need a different hpClasses or hpEligiblePunit state adjust it
+// after the helper returns to keep the test intent explicit.
+func newAssocOnlyPctForTest(t *testing.T, classes []*policyapi.CPUClass, plans map[string]*pctClassPlan,
+	allowed cpuset.CPUSet, sys *fakeSys, sst *fakeSst) *Allocator {
+	t.Helper()
+	a := &Allocator{
+		sys:             sys,
+		sst:             sst,
+		mode:            pctModeAssocOnly,
+		classByName:     map[string]*policyapi.CPUClass{},
+		classPlan:       plans,
+		allowed:         allowed,
+		hpUsed:          map[int]cpuset.CPUSet{},
+		hpClasses:       map[string]bool{},
+		hpEligiblePunit: map[int]bool{},
+	}
+	for _, cc := range classes {
+		a.classByName[cc.Name] = cc
+	}
+	// Seed the punit caches from sst.Punits(), restricted to allowed.
+	pctTestWirePunits(a)
+	return a
+}
+
+// TestFreeClassCapacity_AssocOnlyHpFromFallbackCLOS verifies the
+// real-world assoc-only bug fix: every CPU starts on the fallback
+// (LP) CLOS in hardware because the balloons policy associates the
+// idle/default class on Configure, yet HP capacity for an HP class
+// must still report sum_pu min(GuaranteedHpCpus, |pu.CPUs \ held|)
+// -- not zero. (Pre-fix the result was 0 because closCpus(HP CLOS)
+// was empty.)
+func TestFreeClassCapacity_AssocOnlyHpFromFallbackCLOS(t *testing.T) {
+	sys := newTwoPackageFakeSys()
+	sst := &fakeSst{
+		supported: true,
+		// All CPUs are on CLOS 3 (the LP/fallback CLOS). The HP
+		// CLOS 0 has no CPUs associated to it.
+		cpuClos: map[int]int{
+			0: 3, 1: 3, 2: 3, 3: 3,
+			4: 3, 5: 3, 6: 3, 7: 3,
+		},
+		// Two punits (one per package); each guarantees 2 HP CPUs at top turbo.
+		// Both use PunitID 0 and are told apart by PkgID.
+		punits: []pctPunit{
+			{PkgID: 0, PunitID: 0, CPUs: cpuset.MustParse("0-3"), GuaranteedHpCpus: 2},
+			{PkgID: 1, PunitID: 0, CPUs: cpuset.MustParse("4-7"), GuaranteedHpCpus: 2},
+		},
+	}
+	classes := []*policyapi.CPUClass{
+		{Name: "hp"}, // pctPriority not set; HP is decided by classifyAssocOnlyHP at runtime
+		{Name: "lp"},
+	}
+	a := newAssocOnlyPctForTest(t, classes,
+		map[string]*pctClassPlan{"hp": {ClosID: 0}, "lp": {ClosID: 3}},
+		cpuset.MustParse("0-7"), sys, sst)
+	a.hpClasses["hp"] = true // simulate classifyAssocOnlyHP result
+
+	// Held by some non-HP balloon: 2 CPUs (one per punit).
+	held := cpuset.MustParse("3,7")
+
+	// The same held set is used for both class queries below.
+	gotHp := a.FreeClassCapacity("hp", held)
+	wantHp := 2 + 2 // both punits: min(2, |{0,1,2}|=3)=2 and min(2, |{4,5,6}|=3)=2
+	if gotHp != wantHp {
+		t.Errorf("HP capacity (assoc-only, all cpus on fallback CLOS) = %d, want %d",
+			gotHp, wantHp)
+	}
+
+	gotLp := a.FreeClassCapacity("lp", held)
+	wantLp := 8 - 2 // allowed (8) minus held (2)
+	if gotLp != wantLp {
+		t.Errorf("LP capacity (assoc-only) = %d, want %d", gotLp, wantLp)
+	}
+}
+
+// TestFreeClassCapacity_AssocOnlyHpTFDisabledPunitExcluded verifies
+// the eligibility gate: punits where SST-TF is disabled in
+// assoc-only mode contribute zero HP capacity even when their
+// GuaranteedHpCpus is non-zero. Prevents over-publishing HP
+// capacity on nodes that cannot actually deliver top turbo.
+func TestFreeClassCapacity_AssocOnlyHpTFDisabledPunitExcluded(t *testing.T) { + sys := newTwoPackageFakeSys() + sst := &fakeSst{ + supported: true, + punits: []pctPunit{ + {PkgID: 0, PunitID: 0, CPUs: cpuset.MustParse("0-3"), GuaranteedHpCpus: 2}, + {PkgID: 1, PunitID: 0, CPUs: cpuset.MustParse("4-7"), GuaranteedHpCpus: 2}, + }, + } + a := newAssocOnlyPctForTest(t, []*policyapi.CPUClass{{Name: "hp"}}, + map[string]*pctClassPlan{"hp": {ClosID: 0}}, + cpuset.MustParse("0-7"), sys, sst) + a.hpClasses["hp"] = true + // pctTestWirePunits marked both eligible; flip pkg1 punit to + // TF-disabled to model the assoc-only "operator did not enable + // SST-TF on this punit" case. + a.hpEligiblePunit[1] = false + + got := a.FreeClassCapacity("hp", cpuset.New()) + want := 2 // only pkg0 contributes + if got != want { + t.Errorf("HP capacity with one TF-disabled punit = %d, want %d", got, want) + } +} + +// TestFreeClassCapacity_AssocOnlyNoHpClassification: assoc-only +// where no class was classified HP (e.g. no CLOS has a programmed +// MaxFreq) falls through to the non-HP formula |Allowed \ held|. +func TestFreeClassCapacity_AssocOnlyNoHpClassification(t *testing.T) { + sys := newTwoPackageFakeSys() + sst := &fakeSst{ + supported: true, + punits: []pctPunit{ + {PkgID: 0, PunitID: 0, CPUs: cpuset.MustParse("0-3"), GuaranteedHpCpus: 2}, + {PkgID: 1, PunitID: 0, CPUs: cpuset.MustParse("4-7"), GuaranteedHpCpus: 2}, + }, + } + a := newAssocOnlyPctForTest(t, []*policyapi.CPUClass{{Name: "c1"}}, + map[string]*pctClassPlan{"c1": {ClosID: 1}}, + cpuset.MustParse("0-7"), sys, sst) + // Intentionally no entries in a.hpClasses. 
+ + got := a.FreeClassCapacity("c1", cpuset.MustParse("1,5")) + want := 8 - 2 + if got != want { + t.Errorf("non-HP assoc-only capacity = %d, want %d", got, want) + } +} + +// TestFreeClassCapacity_ManagedHpRespectsEligibility keeps the +// existing managed-mode formula intact: every punit is HP-eligible +// (PrepareManagedMode enables SST-TF) and the result is the +// guaranteed-top-turbo sum, capped by per-punit free CPUs. +func TestFreeClassCapacity_ManagedHpRespectsEligibility(t *testing.T) { + sys := newTwoPackageFakeSys() + sst := &fakeSst{ + supported: true, + punits: []pctPunit{ + {PkgID: 0, PunitID: 0, CPUs: cpuset.MustParse("0-3"), GuaranteedHpCpus: 2}, + {PkgID: 1, PunitID: 0, CPUs: cpuset.MustParse("4-7"), GuaranteedHpCpus: 2}, + }, + } + classes := []*policyapi.CPUClass{ + {Name: "hp", PctPriority: "high"}, + {Name: "lp", PctPriority: "low"}, + } + a := newManagedPctForTest(t, classes, + map[string]*pctClassPlan{"hp": {ClosID: 0}, "lp": {ClosID: 3}}, + cpuset.MustParse("0-7"), sys, sst) + + gotHp := a.FreeClassCapacity("hp", cpuset.MustParse("3")) + wantHp := 2 + 2 // pkg0: min(2, 3)=2; pkg1: min(2, 4)=2 + if gotHp != wantHp { + t.Errorf("managed HP capacity = %d, want %d", gotHp, wantHp) + } + gotLp := a.FreeClassCapacity("lp", cpuset.MustParse("3")) + wantLp := 8 - 1 + if gotLp != wantLp { + t.Errorf("managed LP capacity = %d, want %d", gotLp, wantLp) + } + + // Squeeze pkg0: hold 3 of its 4 CPUs => pkg0 contributes min(2,1)=1. + gotHp = a.FreeClassCapacity("hp", cpuset.MustParse("0-2")) + wantHp = 1 + 2 + if gotHp != wantHp { + t.Errorf("managed HP capacity with squeezed pkg0 = %d, want %d", gotHp, wantHp) + } +} + +// TestFreeClassCapacity_UnknownClassReturnsZero: unknown class +// (no PCT plan) yields 0 regardless of mode. 
+func TestFreeClassCapacity_UnknownClassReturnsZero(t *testing.T) { + sys := newTwoPackageFakeSys() + sst := &fakeSst{supported: true} + a := newManagedPctForTest(t, []*policyapi.CPUClass{{Name: "hp", PctPriority: "high"}}, + map[string]*pctClassPlan{"hp": {ClosID: 0}}, + cpuset.MustParse("0-7"), sys, sst) + if got := a.FreeClassCapacity("nope", cpuset.New()); got != 0 { + t.Errorf("unknown class capacity = %d, want 0", got) + } +} diff --git a/pkg/resmgr/cpuclass/internal/types/types.go b/pkg/resmgr/cpuclass/internal/types/types.go new file mode 100644 index 000000000..b554beb63 --- /dev/null +++ b/pkg/resmgr/cpuclass/internal/types/types.go @@ -0,0 +1,91 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package types defines the internal class-definition struct used by +// the cpuclass writers (cpufreq, cpuidle, uncorefreq). It exists as +// a separate package so each writer can depend on the same struct +// without depending on the public cpuclass API or on the +// soon-to-be-deprecated control/cpu config package. +package types + +import ( + "github.com/containers/nri-plugins/pkg/utils/cpuset" +) + +// ClassDef is the resolved, platform-aware definition of a CPU class +// as consumed by the writers. All frequency fields are in kHz; zero +// means "no enforcement". Symbolic frequencies in the user-facing +// configuration are resolved before being placed into a ClassDef. 
+type ClassDef struct { + MinFreq uint + MaxFreq uint + EnergyPerformancePreference uint + UncoreMinFreq uint + UncoreMaxFreq uint + FreqGovernor string + DisabledCstates []string +} + +// Equal reports whether two ClassDef values describe identical +// per-CPU enforcement. Used by the handler to decide whether a +// class-table change actually requires re-programming CPUs. +func (c ClassDef) Equal(other ClassDef) bool { + if c.MinFreq != other.MinFreq || + c.MaxFreq != other.MaxFreq || + c.EnergyPerformancePreference != other.EnergyPerformancePreference || + c.UncoreMinFreq != other.UncoreMinFreq || + c.UncoreMaxFreq != other.UncoreMaxFreq || + c.FreqGovernor != other.FreqGovernor { + return false + } + if len(c.DisabledCstates) != len(other.DisabledCstates) { + return false + } + for i := range c.DisabledCstates { + if c.DisabledCstates[i] != other.DisabledCstates[i] { + return false + } + } + return true +} + +// AllocationIntent describes an upcoming CPU allocation for which +// the caller wants placement preferences. Lives here so internal +// helpers (e.g. pct) can implement Hints without depending on the +// public cpuclass package. +type AllocationIntent struct { + ClassName string + CurrentCpus cpuset.CPUSet + FreeCpus cpuset.CPUSet + RequestedCount int +} + +// CpuPreference is a named CPU set carrying a single placement +// preference (prefer or avoid depending on the slice it appears in). +type CpuPreference struct { + Name string + Cpus cpuset.CPUSet +} + +// AllocationHints carries technology-agnostic placement preferences +// for an upcoming allocation. Both slices are ordered by descending +// priority. +type AllocationHints struct { + Prefer []CpuPreference + Avoid []CpuPreference +} + +// CPUSet aliases cpuset.CPUSet for callers that want to refer to it +// via this package without re-importing pkg/utils/cpuset. 
+type CPUSet = cpuset.CPUSet diff --git a/pkg/resmgr/cpuclass/internal/uncorefreq/uncorefreq.go b/pkg/resmgr/cpuclass/internal/uncorefreq/uncorefreq.go new file mode 100644 index 000000000..33c589571 --- /dev/null +++ b/pkg/resmgr/cpuclass/internal/uncorefreq/uncorefreq.go @@ -0,0 +1,239 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package uncorefreq is the per-die uncore frequency writer used by +// the cpuclass handler. It exposes a Hooks-injectable interface +// matching the cpufreq and cpuidle writers and computes the +// effective per-die min/max as the max-wins reduction over all +// classes that have at least one CPU on the die. +package uncorefreq + +import ( + "fmt" + + "github.com/intel/goresctrl/pkg/utils" + + logger "github.com/containers/nri-plugins/pkg/log" + "github.com/containers/nri-plugins/pkg/resmgr/cpuclass/internal/types" + "github.com/containers/nri-plugins/pkg/sysfs" +) + +var log = logger.NewLogger("cpuclass") + +// DieKey identifies one (package, die) uncore frequency domain. +type DieKey struct { + Pkg int + Die int +} + +// Hooks lets tests intercept per-die uncore writes without touching +// real sysfs. Production use leaves all hooks nil; the writer then +// talks to the platform via goresctrl. Setting any hook also forces +// "available" to true so tests can exercise enforce paths on VMs +// without the intel_uncore_frequency driver. 
+type Hooks struct { + SetMin func(pkg, die, kHz int) error + SetMax func(pkg, die, kHz int) error +} + +// uncoreWritten records the last successfully written min/max kHz +// on a single die. Used for write deduplication. +type uncoreWritten struct { + min uint + max uint + hasMin bool + hasMax bool +} + +// Writer enforces per-die uncore frequency limits. A die with +// effective min=max=0 is left untouched. Failures on individual +// dies are logged; the first error is returned to the caller. +type Writer struct { + hooks Hooks + available bool + lastWritten map[DieKey]uncoreWritten +} + +// NewWriter returns a Writer wired to the given hooks. Pass a +// zero-valued Hooks to use real sysfs via goresctrl. The "available" +// bit is probed once; setting any hook overrides the probe. +func NewWriter(hooks Hooks) *Writer { + available := utils.UncoreFreqAvailable() + if hooks.SetMin != nil || hooks.SetMax != nil { + available = true + } + return &Writer{ + hooks: hooks, + available: available, + lastWritten: make(map[DieKey]uncoreWritten), + } +} + +// Available reports whether the uncore frequency driver was found +// at construction time. Used by the handler to surface a helpful +// configuration error when classes request uncore limits but the +// driver is missing. +func (w *Writer) Available() bool { return w.available } + +// Reset clears the per-die lastWritten cache. Called by the handler +// when class definitions or the allowed set change. +func (w *Writer) Reset() { + w.lastWritten = make(map[DieKey]uncoreWritten) +} + +// RequiresAvailable reports whether any class definition requests +// uncore limits. Used by the handler to fail Configure with a +// helpful error when classes ask for uncore but the driver is not +// loaded. 
+func RequiresAvailable(defs map[string]types.ClassDef) (string, bool) { + for name, c := range defs { + if c.UncoreMinFreq != 0 || c.UncoreMaxFreq != 0 { + return name, true + } + } + return "", false +} + +// UnavailableError formats a configuration error when classes +// request uncore limits but the driver is missing. +func UnavailableError(className string) error { + return fmt.Errorf("uncore limits set in cpu class %q but uncore driver not available; load the intel_uncore_frequency driver", className) +} + +// Enforce recomputes and writes the effective uncore min/max for +// every dirty die. Parameters: +// - sys: narrow topology surface used to enumerate CPUs per die. +// - defs: class name -> definition. +// - cpuClass: cpu id -> class name (current assignments). +// - dirtyDies: set of (pkg, die) keys that need recomputation. +// +// Returns the first error encountered. Skips silently when the +// uncore driver is unavailable. +func (w *Writer) Enforce(sys sysfs.System, defs map[string]types.ClassDef, cpuClass map[int]string, dirtyDies map[DieKey]bool) error { + if !w.available || len(dirtyDies) == 0 { + return nil + } + var firstErr error + for key := range dirtyDies { + min, max, minCls, maxCls := effectiveUncoreFreqs(sys, key, defs, cpuClass) + if min == 0 && max == 0 { + log.Debugf("uncore: pkg/die %d/%d: no limits in effect", key.Pkg, key.Die) + continue + } + log.Debugf("uncore: pkg/die %d/%d: min=%d (class %q) max=%d (class %q)", + key.Pkg, key.Die, min, minCls, max, maxCls) + state := w.lastWritten[key] + if min > 0 && max > 0 && min > max { + log.Warnf("uncore: pkg/die %d/%d: min %d > max %d", key.Pkg, key.Die, min, max) + } + if min > 0 && (!state.hasMin || state.min != min) { + if err := w.callSetMin(key.Pkg, key.Die, int(min)); err != nil { + log.Errorf("uncore: pkg/die %d/%d: cannot set min=%d: %v", key.Pkg, key.Die, min, err) + if firstErr == nil { + firstErr = err + } + } + state.min = min + state.hasMin = true + } + if max > 0 && 
(!state.hasMax || state.max != max) { + if err := w.callSetMax(key.Pkg, key.Die, int(max)); err != nil { + log.Errorf("uncore: pkg/die %d/%d: cannot set max=%d: %v", key.Pkg, key.Die, max, err) + if firstErr == nil { + firstErr = err + } + } + state.max = max + state.hasMax = true + } + w.lastWritten[key] = state + } + return firstErr +} + +// effectiveUncoreFreqs computes the effective uncore min and max for +// a single die. Returns 0,0 when no class with uncore limits is +// active on the die. +func effectiveUncoreFreqs(sys sysfs.System, key DieKey, defs map[string]types.ClassDef, cpuClass map[int]string) (minFreq, maxFreq uint, minCls, maxCls string) { + pkg := sys.Package(utils.ID(key.Pkg)) + if pkg == nil { + return 0, 0, "", "" + } + dieCPUs := pkg.DieCPUSet(utils.ID(key.Die)) + seen := map[string]bool{} + for _, cpu := range dieCPUs.UnsortedList() { + name, ok := cpuClass[cpu] + if !ok || name == "" { + continue + } + if seen[name] { + continue + } + seen[name] = true + def, ok := defs[name] + if !ok { + continue + } + if def.UncoreMinFreq > minFreq { + minFreq = def.UncoreMinFreq + minCls = name + } + if def.UncoreMaxFreq > maxFreq { + maxFreq = def.UncoreMaxFreq + maxCls = name + } + } + return minFreq, maxFreq, minCls, maxCls +} + +// DiesForCpus returns the set of (pkg, die) keys that contain at +// least one cpu from cpus. 
+func DiesForCpus(sys sysfs.System, cpus map[int]bool) map[DieKey]bool { + out := map[DieKey]bool{} + if sys == nil { + return out + } + for cpu := range cpus { + c := sys.CPU(utils.ID(cpu)) + if c == nil { + continue + } + pkgID := int(c.PackageID()) + pkg := sys.Package(utils.ID(pkgID)) + if pkg == nil { + continue + } + for _, die := range pkg.DieIDs() { + if pkg.DieCPUSet(die).Contains(cpu) { + out[DieKey{Pkg: pkgID, Die: int(die)}] = true + break + } + } + } + return out +} + +func (w *Writer) callSetMin(pkg, die, freq int) error { + if w.hooks.SetMin != nil { + return w.hooks.SetMin(pkg, die, freq) + } + return utils.SetUncoreMinFreq(utils.ID(pkg), utils.ID(die), freq) +} + +func (w *Writer) callSetMax(pkg, die, freq int) error { + if w.hooks.SetMax != nil { + return w.hooks.SetMax(pkg, die, freq) + } + return utils.SetUncoreMaxFreq(utils.ID(pkg), utils.ID(die), freq) +} diff --git a/pkg/resmgr/main/main.go b/pkg/resmgr/main/main.go index 89f4bc80a..b756366a6 100644 --- a/pkg/resmgr/main/main.go +++ b/pkg/resmgr/main/main.go @@ -18,6 +18,7 @@ import ( "flag" "fmt" "os" + "os/signal" "strings" "syscall" @@ -67,6 +68,26 @@ func (m *Main) Run() error { } defer m.stopTracing() + // Install a SIGTERM/SIGINT handler that triggers a graceful + // agent shutdown: this lets us clean up node state (e.g., + // extended resources we published) before the kubelet kills + // the container. Closing the agent's stop channel makes its + // event loop return, which unwinds m.mgr.Start() and lets + // Run() exit normally. 
+ sigCh := make(chan os.Signal, 1) + signal.Notify(sigCh, syscall.SIGTERM, syscall.SIGINT) + go func() { + sig, ok := <-sigCh + if !ok { + return + } + log.Infof("received signal %s, shutting down gracefully", sig) + if m.agt != nil { + m.agt.Stop() + } + }() + defer signal.Stop(sigCh) + err := m.mgr.Start() return err } diff --git a/pkg/resmgr/nri.go b/pkg/resmgr/nri.go index 0678fe0bf..efb14ff81 100644 --- a/pkg/resmgr/nri.go +++ b/pkg/resmgr/nri.go @@ -284,6 +284,7 @@ func (p *nriPlugin) Synchronize(ctx context.Context, pods []*api.PodSandbox, con } m.updateTopologyZones() + m.updateNodeExtendedResources() return p.getPendingUpdates(nil), nil } @@ -445,6 +446,7 @@ func (p *nriPlugin) CreateContainer(ctx context.Context, pod *api.PodSandbox, co m.policy.ExportResourceData(c) m.updateTopologyZones() + m.updateNodeExtendedResources() adjust = p.getPendingAdjustment(container) updates = p.getPendingUpdates(container) @@ -596,6 +598,7 @@ func (p *nriPlugin) StopContainer(ctx context.Context, pod *api.PodSandbox, cont c.UpdateState(cache.ContainerStateExited) m.updateTopologyZones() + m.updateNodeExtendedResources() return p.getPendingUpdates(container), nil } diff --git a/pkg/resmgr/policy/policy.go b/pkg/resmgr/policy/policy.go index ceaff7934..a682422f6 100644 --- a/pkg/resmgr/policy/policy.go +++ b/pkg/resmgr/policy/policy.go @@ -117,6 +117,14 @@ type Backend interface { ExportResourceData(cache.Container) map[string]string // GetTopologyZones returns the policy/pool data for 'topology zone' CRDs. GetTopologyZones() []*TopologyZone + // GetExtendedResources returns node-level extended resources + // this policy wishes to publish on the local Node, mapping + // fully-qualified resource name (e.g. + // "cpuclass.balloons.nri.io/hp-pct") to its current capacity + // (logical CPU count). Returning nil or an empty map means + // "publish nothing"; previously-published resources are then + // cleared by the agent. 
+ GetExtendedResources() map[string]int64 } // Policy is the exposed interface for container resource allocations decision making. @@ -143,6 +151,9 @@ type Policy interface { ExportResourceData(cache.Container) // GetTopologyZones returns the policy/pool data for 'topology zone' CRDs. GetTopologyZones() []*TopologyZone + // GetExtendedResources returns node-level extended resources + // the active policy wishes to publish on the local Node. + GetExtendedResources() map[string]int64 } // Metrics is the interface we expect policy-specific metrics to implement. @@ -336,3 +347,9 @@ func (p *policy) ExportResourceData(c cache.Container) { func (p *policy) GetTopologyZones() []*TopologyZone { return p.active.GetTopologyZones() } + +// GetExtendedResources returns node-level extended resources the +// active policy wishes to publish on the local Node. +func (p *policy) GetExtendedResources() map[string]int64 { + return p.active.GetExtendedResources() +} diff --git a/pkg/resmgr/resource-manager.go b/pkg/resmgr/resource-manager.go index 28e5b9434..fe6335a41 100644 --- a/pkg/resmgr/resource-manager.go +++ b/pkg/resmgr/resource-manager.go @@ -160,6 +160,7 @@ func (m *resmgr) updateConfig(newCfg interface{}) (bool, error) { reconfErr := m.reconfigure(cfg) m.updateTopologyZones() + m.updateNodeExtendedResources() return false, reconfErr } @@ -297,6 +298,15 @@ func (m *resmgr) updateTopologyZones() { } } +// updateNodeExtendedResources publishes (or clears) the +// node-level extended resources the active policy advertises. 
+func (m *resmgr) updateNodeExtendedResources() { + resources := m.policy.GetExtendedResources() + if err := m.agent.UpdateNodeExtendedResources(resources); err != nil { + log.Errorf("failed to update node extended resources: %v", err) + } +} + func (m *resmgr) reconfigure(cfg cfgapi.ResmgrConfig) error { apply := func(cfg cfgapi.ResmgrConfig) error { mCfg := cfg.CommonConfig() diff --git a/pkg/sysfs/system.go b/pkg/sysfs/system.go index 394b39d55..16db6ea29 100644 --- a/pkg/sysfs/system.go +++ b/pkg/sysfs/system.go @@ -292,8 +292,10 @@ var ( PerformanceCore: "OVERRIDE_SYS_CORE_CPUS", EfficientCore: "OVERRIDE_SYS_ATOM_CPUS", } - cacheEnvOverridesVar = "OVERRIDE_SYS_CACHES" - cacheEnvOverridesJson = os.Getenv(cacheEnvOverridesVar) + cacheEnvOverridesVar = "OVERRIDE_SYS_CACHES" + cacheEnvOverridesJson = os.Getenv(cacheEnvOverridesVar) + cpufreqEnvOverridesVar = "OVERRIDE_SYS_CPUFREQ" + cpufreqEnvOverridesJson = os.Getenv(cpufreqEnvOverridesVar) ) // MemInfo contains data read from a NUMA node meminfo file. @@ -338,6 +340,16 @@ type cacheOverride struct { var cacheEnvOverrides map[int][]*Cache +// cpufreqOverride specifies frequency values to use instead of reading sysfs. +type cpufreqOverride struct { + Cpus string `json:"cpus"` // CPU ids in list format, e.g. "0-15" + Base uint64 `json:"base"` // base frequency (kHz) + Min uint64 `json:"min"` // minimum frequency (kHz) + Max uint64 `json:"max"` // maximum/turbo frequency (kHz) +} + +var cpufreqEnvOverrides map[int]CPUFreq + // SetSysRoot sets the sys root directory. func SetSysRoot(root string) { if root != "" { @@ -1063,6 +1075,10 @@ func (sys *system) discoverCPU(path string) error { if _, err := readSysfsEntry(path, "cpufreq/cpuinfo_max_freq", &cpu.freq.Max); err != nil { cpu.freq.Max = 0 } + // Apply cpufreq overrides from OVERRIDE_SYS_CPUFREQ if set. 
+ if err := sys.applyCpufreqOverrides(cpu); err != nil { + log.Warnf("failed to apply cpufreq overrides for cpu%d: %v", cpu.id, err) + } if _, err := readSysfsEntry(path, "cpufreq/energy_performance_preference", &cpu.epp); err != nil { cpu.epp = EPPUnknown } @@ -2082,7 +2098,45 @@ func (sys *system) discoverCacheFromOverrides(cpu *cpu) (bool, error) { return false, nil } -// Discover cache associated with the given CPU. +// applyCpufreqOverrides overrides CPU frequency values from OVERRIDE_SYS_CPUFREQ. +func (sys *system) applyCpufreqOverrides(cpu *cpu) error { + if cpufreqEnvOverridesJson == "" { + return nil + } + if cpufreqEnvOverrides == nil { + sys.Debugf("parsing cpufreq overrides from %s=%q", cpufreqEnvOverridesVar, cpufreqEnvOverridesJson) + overrides, err := parseCpufreqOverrides(cpufreqEnvOverridesJson) + if err != nil { + return fmt.Errorf("failed to parse %s: %v", cpufreqEnvOverridesVar, err) + } + cpufreqEnvOverrides = overrides + } + if freq, ok := cpufreqEnvOverrides[cpu.id]; ok { + sys.Debugf("cpufreq override for cpu%d: base=%d min=%d max=%d", cpu.id, freq.Base, freq.Min, freq.Max) + cpu.freq = freq + } + return nil +} + +// parseCpufreqOverrides parses JSON cpufreq overrides into a per-CPU map. 
+func parseCpufreqOverrides(jsonData string) (map[int]CPUFreq, error) { + var overrides []cpufreqOverride + if err := json.Unmarshal([]byte(jsonData), &overrides); err != nil { + return nil, err + } + result := make(map[int]CPUFreq) + for _, o := range overrides { + cpus, err := idset.NewIDSetFromString(o.Cpus) + if err != nil { + return nil, fmt.Errorf("invalid CPU list %q: %v", o.Cpus, err) + } + freq := CPUFreq{Base: o.Base, Min: o.Min, Max: o.Max} + for cpu := range cpus { + result[cpu] = freq + } + } + return result, nil +} func (sys *system) discoverCache(cpu *cpu, path string) error { var id idset.ID diff --git a/test/e2e/policies.test-suite/balloons/balloons-config.yaml.in b/test/e2e/policies.test-suite/balloons/balloons-config.yaml.in index 8ebcfaacb..aa11dcf33 100644 --- a/test/e2e/policies.test-suite/balloons/balloons-config.yaml.in +++ b/test/e2e/policies.test-suite/balloons/balloons-config.yaml.in @@ -61,19 +61,29 @@ spec: debug: - policy - control: - cpu: - classes: - default: - minFreq: ${CPU_DEFAULT_MIN:-800000} - maxFreq: ${CPU_DEFAULT_MAX:-2800000} - classA: - minFreq: ${CPU_CLASSA_MIN:-900000} - maxFreq: ${CPU_CLASSA_MAX:-2900000} - classB: - minFreq: ${CPU_CLASSB_MIN:-1000000} - maxFreq: ${CPU_CLASSB_MAX:-3000000} - classC: - minFreq: ${CPU_CLASSC_MIN:-1100000} - maxFreq: ${CPU_CLASSC_MAX:-3100000} - energyPerformancePreference: ${CPU_CLASSC_EPP:-1} + cpuClasses: + + $([ -n "$CPUCLASS_DEFAULT_SKIP" ] || echo " + - name: default + minFreq: ${CPU_DEFAULT_MIN:-800MHz} + maxFreq: ${CPU_DEFAULT_MAX:-2.8GHz} + ") + + $([ -n "$CPUCLASS_A_SKIP" ] || echo " + - name: classA + minFreq: ${CPU_CLASSA_MIN:-900MHz} + maxFreq: ${CPU_CLASSA_MAX:-2.9GHz} + ") + + $([ -n "$CPUCLASS_B_SKIP" ] || echo " + - name: classB + minFreq: ${CPU_CLASSB_MIN:-1GHz} + maxFreq: ${CPU_CLASSB_MAX:-3GHz} + ") + + $([ -n "$CPUCLASS_C_SKIP" ] || echo " + - name: classC + minFreq: ${CPU_CLASSC_MIN:-1.1GHz} + maxFreq: ${CPU_CLASSC_MAX:-3.1GHz} + energyPerformancePreference: 
${CPU_CLASSC_EPP:-1} + ") diff --git a/test/e2e/policies.test-suite/balloons/n4c16/test17-cstates-scheduling/balloons-cstates.cfg b/test/e2e/policies.test-suite/balloons/n4c16/test17-cstates-scheduling/balloons-cstates.cfg index 215432bf1..03924e9b9 100644 --- a/test/e2e/policies.test-suite/balloons/n4c16/test17-cstates-scheduling/balloons-cstates.cfg +++ b/test/e2e/policies.test-suite/balloons/n4c16/test17-cstates-scheduling/balloons-cstates.cfg @@ -16,13 +16,11 @@ config: cpuClass: lowlatency-class schedulingClass: realtime - control: - cpu: - classes: - lowlatency-class: - disabledCstates: [C4, C6, C8, C10] - default-class: - disabledCstates: [] + cpuClasses: + - name: lowlatency-class + disabledCstates: [C4, C6, C8, C10] + - name: default-class + disabledCstates: [] schedulingClasses: - name: realtime @@ -40,5 +38,6 @@ config: - policy - nri-plugin - cpu + - cpuclass extraEnv: OVERRIDE_SYS_CSTATES: '''[{"cpus": "0-15", "names": ["C1E", "C2", "C4", "C8"], "files": {"disable": "0"}}]''' diff --git a/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/balloons-turbo-defaultclass.cfg b/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/balloons-turbo-defaultclass.cfg new file mode 100644 index 000000000..ecc2d6389 --- /dev/null +++ b/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/balloons-turbo-defaultclass.cfg @@ -0,0 +1,36 @@ +config: + agent: + nodeResourceTopology: true + allocatorTopologyBalancing: false + availableResources: + cpu: cpuset:2-7,10-13 + reservedResources: + cpu: 750m + + pinCPU: true + + # Intentionally no idleCPUClass and no cpuClass on the reserved + # balloon type: both must fall back to the cpuClass named "default". 
+ balloonTypes: + - name: reserved + - name: fast-bln + cpuClass: fast + minCPUs: 1 + maxCPUs: 1 + + cpuClasses: + - name: default + minFreq: "min" + maxFreq: "base" + - name: fast + minFreq: "turbo" + maxFreq: "turbo" + + log: + debug: + - policy + - nri-plugin + - cpu + - cpuclass +extraEnv: + OVERRIDE_SYS_CPUFREQ: '''[{"cpus": "0-15", "base": 2900000, "min": 800000, "max": 3800000}]''' diff --git a/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/balloons-turbo-oldsyntax.cfg b/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/balloons-turbo-oldsyntax.cfg new file mode 100644 index 000000000..90b6a4de2 --- /dev/null +++ b/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/balloons-turbo-oldsyntax.cfg @@ -0,0 +1,37 @@ +config: + agent: + nodeResourceTopology: true + allocatorTopologyBalancing: false + availableResources: + cpu: cpuset:2-7,10-13 + reservedResources: + cpu: 750m + + pinCPU: true + + idleCPUClass: legacy-idle + + balloonTypes: + - name: legacy-bln + cpuClass: legacy-fast + minCPUs: 1 + maxCPUs: 1 + + control: + cpu: + classes: + legacy-idle: + minFreq: 800000 + maxFreq: 2900000 + legacy-fast: + minFreq: 3800000 + maxFreq: 3800000 + + log: + debug: + - policy + - nri-plugin + - cpu + - cpuclass +extraEnv: + OVERRIDE_SYS_CPUFREQ: '''[{"cpus": "0-15", "base": 2900000, "min": 800000, "max": 3800000}]''' diff --git a/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/balloons-turbo.cfg b/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/balloons-turbo.cfg new file mode 100644 index 000000000..6ece6cf3b --- /dev/null +++ b/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/balloons-turbo.cfg @@ -0,0 +1,49 @@ +config: + agent: + nodeResourceTopology: true + allocatorTopologyBalancing: false + availableResources: + cpu: cpuset:2-7,10-13 + reservedResources: + cpu: 750m + + pinCPU: true + + idleCPUClass: default-noturbo + + balloonTypes: + - name: reserved + 
cpuClass: default-turbo + - name: turbo-high-bln + cpuClass: turbo-high + minCPUs: 1 + maxCPUs: 2 + - name: turbo-low-bln + cpuClass: turbo-low + minCPUs: 1 + maxCPUs: 2 + + cpuClasses: + - name: turbo-high + minFreq: "turbo" + maxFreq: "turbo" + turboPriority: 10 + - name: turbo-low + minFreq: "turbo" + maxFreq: "turbo" + turboPriority: 1 + - name: default-turbo + minFreq: "min" + maxFreq: "turbo" + - name: default-noturbo + minFreq: "min" + maxFreq: "base" + + log: + debug: + - policy + - nri-plugin + - cpu + - cpuclass +extraEnv: + OVERRIDE_SYS_CPUFREQ: '''[{"cpus": "0-15", "base": 2900000, "min": 800000, "max": 3800000}]''' diff --git a/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/code.var.sh b/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/code.var.sh new file mode 100644 index 000000000..904b582cb --- /dev/null +++ b/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/code.var.sh @@ -0,0 +1,582 @@ +# Test turbo priority: highest-priority active CPU class gets turbo, +# others get base. When the highest-priority balloon is removed, +# the next highest-priority class regains turbo. +# +# Also verifies CPU frequency write minimality: +# - no duplicate sysfs writes (each (cpu, prop, freq) tuple is logged +# at most once per recorded snapshot window, thanks to the per-CPU +# last-written cache in pkg/resmgr/control/cpu), +# - writes do happen on class transitions (turbo<->base) and when +# idle CPUs need their initial class settings, +# - a no-op event (creating a 2nd container that lands in the +# *same* turbo-low balloon as pod0) does not produce any new +# enforce writes. 
+ +helm-terminate +helm_config=$TEST_DIR/balloons-turbo.cfg helm-launch balloons + +# turbo-log fetches the latest turbo recalculation log lines +turbo-log() { + local last_n=${1:-20} + vm-command "kubectl -n kube-system logs ds/nri-resource-policy-balloons | grep -E 'turbo:|cpuClass' | tail -n $last_n" +} + +# verify-turbo-winner checks that the given class is logged as a turbo winner +# with the expected maxFreq, within the last N turbo log lines. +verify-turbo-winner() { + local class=$1 + local expected_max_freq=$2 + local last_n=${3:-20} + echo "verify turbo winner: class=$class maxFreq=$expected_max_freq" + turbo-log $last_n + grep "class \"$class\"" <<< "$COMMAND_OUTPUT" | grep "winner=true" | tail -n 1 | grep -q "maxFreq=$expected_max_freq" || { + command-error "expected class $class as turbo winner with maxFreq=$expected_max_freq" + } +} + +# verify-turbo-loser checks that the given class is logged as NOT a turbo winner +# (winner=false) with the expected maxFreq (base), within the last N turbo log lines. +verify-turbo-loser() { + local class=$1 + local expected_max_freq=$2 + local last_n=${3:-20} + echo "verify turbo loser: class=$class maxFreq=$expected_max_freq" + turbo-log $last_n + grep "class \"$class\"" <<< "$COMMAND_OUTPUT" | grep "winner=false" | tail -n 1 | grep -q "maxFreq=$expected_max_freq" || { + command-error "expected class $class as turbo loser with maxFreq=$expected_max_freq" + } +} + +ENFORCE_PATTERN='enforcing cpu frequency' + +# enforce-count returns the total number of "enforcing cpu frequency" log lines so far. +enforce-count() { + vm-command "kubectl -n kube-system logs ds/nri-resource-policy-balloons | grep -c '$ENFORCE_PATTERN' || true" >/dev/null + echo "$COMMAND_OUTPUT" | tr -d '[:space:]' +} + +# wait-enforce-grows [timeout=15] +# Polls until the cumulative number of enforce writes is greater than . 
+wait-enforce-grows() { + local baseline=$1 + local timeout=${2:-15} + vm-run-until --timeout "$timeout" \ + "[ \$(kubectl -n kube-system logs ds/nri-resource-policy-balloons 2>/dev/null | grep -c '$ENFORCE_PATTERN') -gt $baseline ]" || { + command-error "expected enforce-count to grow above $baseline within ${timeout}s" + } +} + +# wait-pod-gone [timeout=30] +# Polls until the named pod no longer exists. +wait-pod-gone() { + local pod=$1 + local timeout=${2:-30} + vm-run-until --timeout "$timeout" "! kubectl get pod $pod -o name 2>/dev/null | grep -q ." || { + command-error "pod $pod did not disappear within ${timeout}s" + } +} + +# enforce-lines-since prints the enforce log lines added since the given absolute count. +enforce-lines-since() { + local from=$1 + vm-command "kubectl -n kube-system logs ds/nri-resource-policy-balloons | grep '$ENFORCE_PATTERN' | tail -n +$((from+1))" >/dev/null +} + +# assert-step-writes