From 624fa08d60269020aaa474661bcd97f123c54fa1 Mon Sep 17 00:00:00 2001 From: Antti Kervinen Date: Fri, 5 Jun 2026 14:25:10 +0300 Subject: [PATCH 01/10] go.mod,go.sum: bump goresctrl to v0.13.0 for PCT support Signed-off-by: Antti Kervinen --- go.mod | 36 ++++++++++++++-------------- go.sum | 76 +++++++++++++++++++++++++++++----------------------------- 2 files changed, 56 insertions(+), 56 deletions(-) diff --git a/go.mod b/go.mod index ef04cf5a0..878276984 100644 --- a/go.mod +++ b/go.mod @@ -10,7 +10,7 @@ require ( github.com/containers/nri-plugins/pkg/topology v0.0.0 github.com/coreos/go-systemd/v22 v22.5.0 github.com/fsnotify/fsnotify v1.6.0 - github.com/intel/goresctrl v0.12.0 + github.com/intel/goresctrl v0.13.0 github.com/intel/memtierd v0.1.1 github.com/k8stopologyawareschedwg/noderesourcetopology-api v0.1.2 github.com/onsi/ginkgo/v2 v2.21.0 @@ -19,19 +19,19 @@ require ( github.com/prometheus/client_golang v1.23.0 github.com/sirupsen/logrus v1.9.3 github.com/stretchr/testify v1.11.1 - go.opentelemetry.io/otel v1.42.0 + go.opentelemetry.io/otel v1.43.0 go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.42.0 - go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.42.0 + go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0 go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.19.0 go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.19.0 go.opentelemetry.io/otel/exporters/prometheus v0.60.0 - go.opentelemetry.io/otel/metric v1.42.0 - go.opentelemetry.io/otel/sdk v1.42.0 - go.opentelemetry.io/otel/sdk/metric v1.42.0 - go.opentelemetry.io/otel/trace v1.42.0 - golang.org/x/sys v0.41.0 + go.opentelemetry.io/otel/metric v1.43.0 + go.opentelemetry.io/otel/sdk v1.43.0 + go.opentelemetry.io/otel/sdk/metric v1.43.0 + go.opentelemetry.io/otel/trace v1.43.0 + golang.org/x/sys v0.42.0 golang.org/x/time v0.9.0 - google.golang.org/grpc v1.79.3 + google.golang.org/grpc v1.80.0 k8s.io/api v0.31.2 
k8s.io/apimachinery v0.33.1 k8s.io/client-go v0.31.2 @@ -86,17 +86,17 @@ require ( github.com/x448/float16 v0.8.4 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.19.0 // indirect - go.opentelemetry.io/proto/otlp v1.9.0 // indirect - golang.org/x/mod v0.32.0 // indirect - golang.org/x/net v0.51.0 // indirect + go.opentelemetry.io/proto/otlp v1.10.0 // indirect + golang.org/x/mod v0.33.0 // indirect + golang.org/x/net v0.52.0 // indirect golang.org/x/oauth2 v0.35.0 // indirect - golang.org/x/sync v0.19.0 // indirect - golang.org/x/term v0.40.0 // indirect - golang.org/x/text v0.34.0 // indirect - golang.org/x/tools v0.41.0 // indirect + golang.org/x/sync v0.20.0 // indirect + golang.org/x/term v0.41.0 // indirect + golang.org/x/text v0.35.0 // indirect + golang.org/x/tools v0.42.0 // indirect golang.org/x/tools/go/packages/packagestest v0.1.1-deprecated // indirect - google.golang.org/genproto/googleapis/api v0.0.0-20260209200024-4cfbd4190f57 // indirect - google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57 // indirect + google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 // indirect + google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 // indirect google.golang.org/protobuf v1.36.11 // indirect gopkg.in/evanphx/json-patch.v4 v4.12.0 // indirect gopkg.in/inf.v0 v0.9.1 // indirect diff --git a/go.sum b/go.sum index 7aabb344a..551d203bf 100644 --- a/go.sum +++ b/go.sum @@ -840,8 +840,8 @@ github.com/ianlancetaylor/demangle v0.0.0-20181102032728-5e5cf60278f6/go.mod h1: github.com/ianlancetaylor/demangle v0.0.0-20200824232613-28f6c0f3b639/go.mod h1:aSSvb/t6k1mPoxDqO4vJh6VOCGPwU4O0C2/Eqndh1Sc= github.com/imdario/mergo v0.3.6 h1:xTNEAn+kxVO7dTZGu0CegyqKZmoWFI0rF8UxjlB2d28= github.com/imdario/mergo v0.3.6/go.mod h1:2EnlNZ0deacrJVfApfmtdGgDfMuh/nq6Ok1EcJh5FfA= -github.com/intel/goresctrl v0.12.0 
h1:F44m7jiVgOdqWfTTWaREF+5HTeX3i06qhvpuzpnrBko= -github.com/intel/goresctrl v0.12.0/go.mod h1:5GWtmPY4BWl/a9rU8apGED9Xul5b5WoLtg/qOWaghWU= +github.com/intel/goresctrl v0.13.0 h1:5fhKjNq4V5MYDFHa//6M6x0jP6Iq5EXwZc6/eYxdEtQ= +github.com/intel/goresctrl v0.13.0/go.mod h1:KFHS91JGOmeeuEog+nTQcsGjLC81nRqdsdhcqf69fjU= github.com/intel/memtierd v0.1.1 h1:hGSN0+dzjaUkwgkJrk6B9SU4dntggXLpXgs9Dm+jfz4= github.com/intel/memtierd v0.1.1/go.mod h1:NFDBvjoDS42gBK/c9q/CYCJ2pt/+g7UQwOOBvQli4z0= github.com/josharian/intern v1.0.0 h1:vlS4z54oSdjm0bgjRigI+G1HpF+tI+9rE5LLzOg8HmY= @@ -986,12 +986,12 @@ go.opencensus.io v0.24.0/go.mod h1:vNK8G9p7aAivkbmorf4v+7Hgx+Zs0yY+0fOtgBfjQKo= go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= go.opentelemetry.io/auto/sdk v1.2.1/go.mod h1:KRTj+aOaElaLi+wW1kO/DZRXwkF4C5xPbEe3ZiIhN7Y= go.opentelemetry.io/otel v1.19.0/go.mod h1:i0QyjOq3UPoTzff0PJB2N66fb4S0+rSbSB15/oyH9fY= -go.opentelemetry.io/otel v1.42.0 h1:lSQGzTgVR3+sgJDAU/7/ZMjN9Z+vUip7leaqBKy4sho= -go.opentelemetry.io/otel v1.42.0/go.mod h1:lJNsdRMxCUIWuMlVJWzecSMuNjE7dOYyWlqOXWkdqCc= +go.opentelemetry.io/otel v1.43.0 h1:mYIM03dnh5zfN7HautFE4ieIig9amkNANT+xcVxAj9I= +go.opentelemetry.io/otel v1.43.0/go.mod h1:JuG+u74mvjvcm8vj8pI5XiHy1zDeoCS2LB1spIq7Ay0= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.42.0 h1:MdKucPl/HbzckWWEisiNqMPhRrAOQX8r4jTuGr636gk= go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetricgrpc v1.42.0/go.mod h1:RolT8tWtfHcjajEH5wFIZ4Dgh5jpPdFXYV9pTAk/qjc= -go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.42.0 h1:H7O6RlGOMTizyl3R08Kn5pdM06bnH8oscSj7o11tmLA= -go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.42.0/go.mod h1:mBFWu/WOVDkWWsR7Tx7h6EpQB8wsv7P0Yrh0Pb7othc= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0 h1:w1K+pCJoPpQifuVpsKamUdn9U0zM3xUziVOqsGksUrY= +go.opentelemetry.io/otel/exporters/otlp/otlpmetric/otlpmetrichttp v1.43.0/go.mod 
h1:HBy4BjzgVE8139ieRI75oXm3EcDN+6GhD88JT1Kjvxg= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.19.0 h1:Mne5On7VWdx7omSrSSZvM4Kw7cS7NQkOOmLcgscI51U= go.opentelemetry.io/otel/exporters/otlp/otlptrace v1.19.0/go.mod h1:IPtUMKL4O3tH5y+iXVyAXqpAwMuzC1IrxVS81rummfE= go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracegrpc v1.19.0 h1:3d+S281UTjM+AbF31XSOYn1qXn3BgIdWl8HNEpx08Jk= @@ -1001,21 +1001,21 @@ go.opentelemetry.io/otel/exporters/otlp/otlptrace/otlptracehttp v1.19.0/go.mod h go.opentelemetry.io/otel/exporters/prometheus v0.60.0 h1:cGtQxGvZbnrWdC2GyjZi0PDKVSLWP/Jocix3QWfXtbo= go.opentelemetry.io/otel/exporters/prometheus v0.60.0/go.mod h1:hkd1EekxNo69PTV4OWFGZcKQiIqg0RfuWExcPKFvepk= go.opentelemetry.io/otel/metric v1.19.0/go.mod h1:L5rUsV9kM1IxCj1MmSdS+JQAcVm319EUrDVLrt7jqt8= -go.opentelemetry.io/otel/metric v1.42.0 h1:2jXG+3oZLNXEPfNmnpxKDeZsFI5o4J+nz6xUlaFdF/4= -go.opentelemetry.io/otel/metric v1.42.0/go.mod h1:RlUN/7vTU7Ao/diDkEpQpnz3/92J9ko05BIwxYa2SSI= +go.opentelemetry.io/otel/metric v1.43.0 h1:d7638QeInOnuwOONPp4JAOGfbCEpYb+K6DVWvdxGzgM= +go.opentelemetry.io/otel/metric v1.43.0/go.mod h1:RDnPtIxvqlgO8GRW18W6Z/4P462ldprJtfxHxyKd2PY= go.opentelemetry.io/otel/sdk v1.19.0/go.mod h1:NedEbbS4w3C6zElbLdPJKOpJQOrGUJ+GfzpjUvI0v1A= -go.opentelemetry.io/otel/sdk v1.42.0 h1:LyC8+jqk6UJwdrI/8VydAq/hvkFKNHZVIWuslJXYsDo= -go.opentelemetry.io/otel/sdk v1.42.0/go.mod h1:rGHCAxd9DAph0joO4W6OPwxjNTYWghRWmkHuGbayMts= -go.opentelemetry.io/otel/sdk/metric v1.42.0 h1:D/1QR46Clz6ajyZ3G8SgNlTJKBdGp84q9RKCAZ3YGuA= -go.opentelemetry.io/otel/sdk/metric v1.42.0/go.mod h1:Ua6AAlDKdZ7tdvaQKfSmnFTdHx37+J4ba8MwVCYM5hc= +go.opentelemetry.io/otel/sdk v1.43.0 h1:pi5mE86i5rTeLXqoF/hhiBtUNcrAGHLKQdhg4h4V9Dg= +go.opentelemetry.io/otel/sdk v1.43.0/go.mod h1:P+IkVU3iWukmiit/Yf9AWvpyRDlUeBaRg6Y+C58QHzg= +go.opentelemetry.io/otel/sdk/metric v1.43.0 h1:S88dyqXjJkuBNLeMcVPRFXpRw2fuwdvfCGLEo89fDkw= +go.opentelemetry.io/otel/sdk/metric v1.43.0/go.mod 
h1:C/RJtwSEJ5hzTiUz5pXF1kILHStzb9zFlIEe85bhj6A= go.opentelemetry.io/otel/trace v1.19.0/go.mod h1:mfaSyvGyEJEI0nyV2I4qhNQnbBOUUmYZpYojqMnX2vo= -go.opentelemetry.io/otel/trace v1.42.0 h1:OUCgIPt+mzOnaUTpOQcBiM/PLQ/Op7oq6g4LenLmOYY= -go.opentelemetry.io/otel/trace v1.42.0/go.mod h1:f3K9S+IFqnumBkKhRJMeaZeNk9epyhnCmQh/EysQCdc= +go.opentelemetry.io/otel/trace v1.43.0 h1:BkNrHpup+4k4w+ZZ86CZoHHEkohws8AY+WTX09nk+3A= +go.opentelemetry.io/otel/trace v1.43.0/go.mod h1:/QJhyVBUUswCphDVxq+8mld+AvhXZLhe+8WVFxiFff0= go.opentelemetry.io/proto/otlp v0.7.0/go.mod h1:PqfVotwruBrMGOCsRd/89rSnXhoiJIqeYNgFYFoEGnI= go.opentelemetry.io/proto/otlp v0.15.0/go.mod h1:H7XAot3MsfNsj7EXtrA2q5xSNQ10UqI405h3+duxN4U= go.opentelemetry.io/proto/otlp v0.19.0/go.mod h1:H7XAot3MsfNsj7EXtrA2q5xSNQ10UqI405h3+duxN4U= -go.opentelemetry.io/proto/otlp v1.9.0 h1:l706jCMITVouPOqEnii2fIAuO3IVGBRPV5ICjceRb/A= -go.opentelemetry.io/proto/otlp v1.9.0/go.mod h1:xE+Cx5E/eEHw+ISFkwPLwCZefwVjY+pqKg1qcK03+/4= +go.opentelemetry.io/proto/otlp v1.10.0 h1:IQRWgT5srOCYfiWnpqUYz9CVmbO8bFmKcwYxpuCSL2g= +go.opentelemetry.io/proto/otlp v1.10.0/go.mod h1:/CV4QoCR/S9yaPj8utp3lvQPoqMtxXdzn7ozvvozVqk= go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= @@ -1086,8 +1086,8 @@ golang.org/x/mod v0.6.0-dev.0.20220419223038-86c51ed26bb4/go.mod h1:jJ57K6gSWd91 golang.org/x/mod v0.7.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.8.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= golang.org/x/mod v0.9.0/go.mod h1:iBbtSCu2XBx23ZKBPSOrRkjjQPZFPuis4dIYUhu/chs= -golang.org/x/mod v0.32.0 h1:9F4d3PHLljb6x//jOyokMv3eX+YDeepZSEo3mFJy93c= -golang.org/x/mod v0.32.0/go.mod h1:SgipZ/3h2Ci89DlEtEXWUk/HteuRin+HHhN+WbNhguU= +golang.org/x/mod v0.33.0 h1:tHFzIWbBifEmbwtGz65eaWyGiGZatSrT9prnU8DbVL8= 
+golang.org/x/mod v0.33.0/go.mod h1:swjeQEj+6r7fODbD2cqrnje9PnziFuw4bmLbBZFrQ5w= golang.org/x/net v0.0.0-20180724234803-3673e40ba225/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20180826012351-8a410e7b638d/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= golang.org/x/net v0.0.0-20190108225652-1e06a53dbb7e/go.mod h1:mL1N/T3taQHkDXs73rZJwtUhF3w3ftmwwsq0BUmARs4= @@ -1145,8 +1145,8 @@ golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.7.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= golang.org/x/net v0.9.0/go.mod h1:d48xBJpPfHeWQsugry2m+kC02ZBRGRgulfHnEXEuWns= -golang.org/x/net v0.51.0 h1:94R/GTO7mt3/4wIKpcR5gkGmRLOuE/2hNGeWq/GBIFo= -golang.org/x/net v0.51.0/go.mod h1:aamm+2QF5ogm02fjy5Bb7CQ0WMt1/WVM7FtyaTLlA9Y= +golang.org/x/net v0.52.0 h1:He/TN1l0e4mmR3QqHMT2Xab3Aj3L9qjbhRm78/6jrW0= +golang.org/x/net v0.52.0/go.mod h1:R1MAz7uMZxVMualyPXb+VaqGSa3LIaUqk0eEt3w36Sw= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -1194,8 +1194,8 @@ golang.org/x/sync v0.0.0-20220722155255-886fb9371eb4/go.mod h1:RxMgew5VJxzue5/jJ golang.org/x/sync v0.0.0-20220819030929-7fc1605a5dde/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.0.0-20220929204114-8fcdb60fdcc0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= golang.org/x/sync v0.1.0/go.mod h1:RxMgew5VJxzue5/jJTE5uejpjVlOe/izrB70Jof72aM= -golang.org/x/sync v0.19.0 h1:vV+1eWNmZ5geRlYjzm2adRgW2/mcpevXNg50YZtPCE4= -golang.org/x/sync v0.19.0/go.mod h1:9KTHXmSnoGruLpwFjVSX0lNNA75CykiMECbovNTZqGI= +golang.org/x/sync v0.20.0 
h1:e0PTpb7pjO8GAtTs2dQ6jYa5BWYlMuX047Dco/pItO4= +golang.org/x/sync v0.20.0/go.mod h1:9xrNwdLfx4jkKbNva9FpL6vEN7evnE43NNNJQ2LF3+0= golang.org/x/sys v0.0.0-20180830151530-49385e6e1522/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190215142949-d0b11bdaac8a/go.mod h1:STP8DvDyc/dI5b8T5hshtkjS+E42TnysNCUPdjciGhY= golang.org/x/sys v0.0.0-20190312061237-fead79001313/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= @@ -1279,8 +1279,8 @@ golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.7.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.41.0 h1:Ivj+2Cp/ylzLiEU89QhWblYnOE9zerudt9Ftecq2C6k= -golang.org/x/sys v0.41.0/go.mod h1:OgkHotnGiDImocRcuBABYBEXf8A9a87e/uXjp9XT3ks= +golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo= +golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.2.0/go.mod h1:TVmDHMZPmdnySmBfhjOoOdhjzdE1h4u1VwSiw2l1Nuc= @@ -1289,8 +1289,8 @@ golang.org/x/term v0.4.0/go.mod h1:9P2UbLfCdcvo3p/nzKvsmas4TnlujnuoV9hGgYzW1lQ= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= golang.org/x/term v0.6.0/go.mod h1:m6U89DPEgQRMq3DNkDClhWw02AUbt2daBVO4cn4Hv9U= golang.org/x/term v0.7.0/go.mod h1:P32HKFT3hSsZrRxla30E9HqToFYAQPCMs/zFMBUFqPY= -golang.org/x/term v0.40.0 h1:36e4zGLqU4yhjlmxEaagx2KuYbJq3EwY8K943ZsHcvg= -golang.org/x/term v0.40.0/go.mod h1:w2P8uVp06p2iyKKuvXIm7N/y0UCRt3UfJTfZ7oOpglM= +golang.org/x/term v0.41.0 h1:QCgPso/Q3RTJx2Th4bDLqML4W6iJiaXFq2/ftQF13YU= +golang.org/x/term 
v0.41.0/go.mod h1:3pfBgksrReYfZ5lvYM0kSO0LIkAl4Yl2bXOkKP7Ec2A= golang.org/x/text v0.0.0-20170915032832-14c0d48ead0c/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.0/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= golang.org/x/text v0.3.1-0.20180807135948-17ff2d5776d2/go.mod h1:NqM8EUOU14njkJ3fqMW+pc6Ldnwhi/IjpwHt7yyuwOQ= @@ -1307,8 +1307,8 @@ golang.org/x/text v0.6.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= golang.org/x/text v0.9.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= -golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk= -golang.org/x/text v0.34.0/go.mod h1:homfLqTYRFyVYemLBFl5GgL/DWEiH5wcsQ5gSh1yziA= +golang.org/x/text v0.35.0 h1:JOVx6vVDFokkpaq1AEptVzLTpDe9KGpj5tR4/X+ybL8= +golang.org/x/text v0.35.0/go.mod h1:khi/HExzZJ2pGnjenulevKNX1W67CUy0AsXcNubPGCA= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= @@ -1380,8 +1380,8 @@ golang.org/x/tools v0.1.12/go.mod h1:hNGJHUnrk76NpqgfD5Aqm5Crs+Hm0VOH/i9J2+nxYbc golang.org/x/tools v0.3.0/go.mod h1:/rWhSS2+zyEVwoJf8YAX6L2f0ntZ7Kn/mGgAWcipA5k= golang.org/x/tools v0.6.0/go.mod h1:Xwgl3UAJ/d3gWutnCtw505GrjyAbvKui8lOU390QaIU= golang.org/x/tools v0.7.0/go.mod h1:4pg6aUX35JBAogB10C9AtvVL+qowtN4pT3CGSQex14s= -golang.org/x/tools v0.41.0 h1:a9b8iMweWG+S0OBnlU36rzLp20z1Rp10w+IY2czHTQc= -golang.org/x/tools v0.41.0/go.mod h1:XSY6eDqxVNiYgezAVqqCeihT4j1U2CCsqvH3WhQpnlg= +golang.org/x/tools v0.42.0 h1:uNgphsn75Tdz5Ji2q36v/nsFSfR/9BRFvqhGBaJGd5k= +golang.org/x/tools v0.42.0/go.mod 
h1:Ma6lCIwGZvHK6XtgbswSoWroEkhugApmsXyrUmBhfr0= golang.org/x/tools/go/expect v0.1.0-deprecated h1:jY2C5HGYR5lqex3gEniOQL0r7Dq5+VGVgY1nudX5lXY= golang.org/x/tools/go/expect v0.1.0-deprecated/go.mod h1:eihoPOH+FgIqa3FpoTwguz/bVUSGBlGQU67vpBeOrBY= golang.org/x/tools/go/packages/packagestest v0.1.1-deprecated h1:1h2MnaIAIXISqTFKdENegdpAgUXz6NrPEsbIeWaBRvM= @@ -1398,8 +1398,8 @@ gonum.org/v1/gonum v0.0.0-20180816165407-929014505bf4/go.mod h1:Y+Yx5eoAFn32cQvJ gonum.org/v1/gonum v0.8.2/go.mod h1:oe/vMfY3deqTw+1EZJhuvEW2iwGF1bW9wwu7XCu0+v0= gonum.org/v1/gonum v0.9.3/go.mod h1:TZumC3NeyVQskjXqmyWt4S3bINhy7B4eYwW69EbyX+0= gonum.org/v1/gonum v0.11.0/go.mod h1:fSG4YDCxxUZQJ7rKsQrj0gMOg00Il0Z96/qMA4bVQhA= -gonum.org/v1/gonum v0.16.0 h1:5+ul4Swaf3ESvrOnidPp4GZbzf0mxVQpDCYUQE7OJfk= -gonum.org/v1/gonum v0.16.0/go.mod h1:fef3am4MQ93R2HHpKnLk4/Tbh/s0+wqD5nfa6Pnwy4E= +gonum.org/v1/gonum v0.17.0 h1:VbpOemQlsSMrYmn7T2OUvQ4dqxQXU+ouZFQsZOx50z4= +gonum.org/v1/gonum v0.17.0/go.mod h1:El3tOrEuMpv2UdMrbNlKEh9vd86bmQ6vqIcDwxEOc1E= gonum.org/v1/netlib v0.0.0-20190313105609-8cb42192e0e0/go.mod h1:wa6Ws7BG/ESfp6dHfk7C6KdzKA7wR7u/rKwOGE66zvw= gonum.org/v1/plot v0.0.0-20190515093506-e2840ee46a6b/go.mod h1:Wt8AAjI+ypCyYX3nZBvf6cAIx93T+c/OS2HFAYskSZc= gonum.org/v1/plot v0.9.0/go.mod h1:3Pcqqmp6RHvJI72kgb8fThyUnav364FOsdDo2aGW5lY= @@ -1603,12 +1603,12 @@ google.golang.org/genproto v0.0.0-20230525234025-438c736192d0/go.mod h1:9ExIQyXL google.golang.org/genproto v0.0.0-20230526161137-0005af68ea54/go.mod h1:zqTuNwFlFRsw5zIts5VnzLQxSRqh+CGOTVMlYbY0Eyk= google.golang.org/genproto/googleapis/api v0.0.0-20230525234020-1aefcd67740a/go.mod h1:ts19tUU+Z0ZShN1y3aPyq2+O3d5FUNNgT6FtOzmrNn8= google.golang.org/genproto/googleapis/api v0.0.0-20230525234035-dd9d682886f9/go.mod h1:vHYtlOoi6TsQ3Uk2yxR7NI5z8uoV+3pZtR4jmHIkRig= -google.golang.org/genproto/googleapis/api v0.0.0-20260209200024-4cfbd4190f57 h1:JLQynH/LBHfCTSbDWl+py8C+Rg/k1OVH3xfcaiANuF0= -google.golang.org/genproto/googleapis/api 
v0.0.0-20260209200024-4cfbd4190f57/go.mod h1:kSJwQxqmFXeo79zOmbrALdflXQeAYcUbgS7PbpMknCY= +google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9 h1:VPWxll4HlMw1Vs/qXtN7BvhZqsS9cdAittCNvVENElA= +google.golang.org/genproto/googleapis/api v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:7QBABkRtR8z+TEnmXTqIqwJLlzrZKVfAUm7tY3yGv0M= google.golang.org/genproto/googleapis/rpc v0.0.0-20230525234015-3fc162c6f38a/go.mod h1:xURIpW9ES5+/GZhnV6beoEtxQrnkRGIfP5VQG2tCBLc= google.golang.org/genproto/googleapis/rpc v0.0.0-20230525234030-28d5490b6b19/go.mod h1:66JfowdXAEgad5O9NnYcsNPLCPZJD++2L9X0PCMODrA= -google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57 h1:mWPCjDEyshlQYzBpMNHaEof6UX1PmHcaUODUywQ0uac= -google.golang.org/genproto/googleapis/rpc v0.0.0-20260209200024-4cfbd4190f57/go.mod h1:j9x/tPzZkyxcgEFkiKEEGxfvyumM01BEtsW8xzOahRQ= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9 h1:m8qni9SQFH0tJc1X0vmnpw/0t+AImlSvp30sEupozUg= +google.golang.org/genproto/googleapis/rpc v0.0.0-20260401024825-9d38bb4040a9/go.mod h1:4Hqkh8ycfw05ld/3BWL7rJOSfebL2Q+DVDeRgYgxUU8= google.golang.org/grpc v1.19.0/go.mod h1:mqu4LbDTu4XGKhr4mRzUsmM4RtVoemTSY81AxZiDr8c= google.golang.org/grpc v1.20.1/go.mod h1:10oTOabMzJvdu6/UiuZezV6QK5dSlG84ov/aaiqXj38= google.golang.org/grpc v1.21.1/go.mod h1:oYelfM1adQP15Ek0mdvEgi9Df8B9CZIaU1084ijfRaM= @@ -1650,8 +1650,8 @@ google.golang.org/grpc v1.52.3/go.mod h1:pu6fVzoFb+NBYNAvQL08ic+lvB2IojljRYuun5v google.golang.org/grpc v1.53.0/go.mod h1:OnIrk0ipVdj4N5d9IUoFUx72/VlD7+jUsHwZgwSMQpw= google.golang.org/grpc v1.54.0/go.mod h1:PUSEXI6iWghWaB6lXM4knEgpJNu2qUcKfDtNci3EC2g= google.golang.org/grpc v1.57.0/go.mod h1:Sd+9RMTACXwmub0zcNY2c4arhtrbBYD1AUHI/dt16Mo= -google.golang.org/grpc v1.79.3 h1:sybAEdRIEtvcD68Gx7dmnwjZKlyfuc61Dyo9pGXXkKE= -google.golang.org/grpc v1.79.3/go.mod h1:KmT0Kjez+0dde/v2j9vzwoAScgEPx/Bw1CYChhHLrHQ= +google.golang.org/grpc v1.80.0 h1:Xr6m2WmWZLETvUNvIUmeD5OAagMw3FiKmMlTdViWsHM= 
+google.golang.org/grpc v1.80.0/go.mod h1:ho/dLnxwi3EDJA4Zghp7k2Ec1+c2jqup0bFkw07bwF4=
 google.golang.org/grpc/cmd/protoc-gen-go-grpc v1.1.0/go.mod h1:6Kw0yEErY5E/yWrBtf03jp27GLLJujG4z/JK95pnjjw=
 google.golang.org/protobuf v0.0.0-20200109180630-ec00e32a8dfd/go.mod h1:DFci5gLYBciE7Vtevhsrf46CRTquxDuWsQurQQe4oz8=
 google.golang.org/protobuf v0.0.0-20200221191635-4d8936d0db64/go.mod h1:kwYJMbMJ01Woi6D6+Kah6886xMZcty6N08ah7+eCXa0=
From fb270ddb3b956c9756b9f292da15c5a3d4deb38c Mon Sep 17 00:00:00 2001
From: Antti Kervinen
Date: Fri, 5 Jun 2026 14:25:10 +0300
Subject: [PATCH 02/10] sysfs: add OVERRIDE_SYS_CPUFREQ for faking CPU frequency ranges

Signed-off-by: Antti Kervinen
---
 pkg/sysfs/system.go | 68 ++++++++++++++++++++++++++++++++++++++++++++--
 1 file changed, 66 insertions(+), 2 deletions(-)

diff --git a/pkg/sysfs/system.go b/pkg/sysfs/system.go
index 394b39d55..16db6ea29 100644
--- a/pkg/sysfs/system.go
+++ b/pkg/sysfs/system.go
@@ -292,8 +292,10 @@ var (
 		PerformanceCore: "OVERRIDE_SYS_CORE_CPUS",
 		EfficientCore:   "OVERRIDE_SYS_ATOM_CPUS",
 	}
-	cacheEnvOverridesVar  = "OVERRIDE_SYS_CACHES"
-	cacheEnvOverridesJson = os.Getenv(cacheEnvOverridesVar)
+	cacheEnvOverridesVar    = "OVERRIDE_SYS_CACHES"
+	cacheEnvOverridesJson   = os.Getenv(cacheEnvOverridesVar)
+	cpufreqEnvOverridesVar  = "OVERRIDE_SYS_CPUFREQ"
+	cpufreqEnvOverridesJson = os.Getenv(cpufreqEnvOverridesVar)
 )
 
 // MemInfo contains data read from a NUMA node meminfo file.
@@ -338,6 +340,16 @@ type cacheOverride struct {
 
 var cacheEnvOverrides map[int][]*Cache
 
+// cpufreqOverride specifies frequency values to use instead of reading sysfs.
+type cpufreqOverride struct {
+	Cpus string `json:"cpus"` // CPU ids in list format, e.g. "0-15"
+	Base uint64 `json:"base"` // base frequency (kHz)
+	Min  uint64 `json:"min"`  // minimum frequency (kHz)
+	Max  uint64 `json:"max"`  // maximum/turbo frequency (kHz)
+}
+
+var cpufreqEnvOverrides map[int]CPUFreq
+
 // SetSysRoot sets the sys root directory.
 func SetSysRoot(root string) {
 	if root != "" {
@@ -1063,6 +1075,10 @@ func (sys *system) discoverCPU(path string) error {
 	if _, err := readSysfsEntry(path, "cpufreq/cpuinfo_max_freq", &cpu.freq.Max); err != nil {
 		cpu.freq.Max = 0
 	}
+	// Apply cpufreq overrides from OVERRIDE_SYS_CPUFREQ if set.
+	if err := sys.applyCpufreqOverrides(cpu); err != nil {
+		log.Warnf("failed to apply cpufreq overrides for cpu%d: %v", cpu.id, err)
+	}
 	if _, err := readSysfsEntry(path, "cpufreq/energy_performance_preference", &cpu.epp); err != nil {
 		cpu.epp = EPPUnknown
 	}
@@ -2082,7 +2098,55 @@ func (sys *system) discoverCacheFromOverrides(cpu *cpu) (bool, error) {
 	return false, nil
 }
 
+// applyCpufreqOverrides replaces the frequency range discovered for cpu with
+// the values given for it in the OVERRIDE_SYS_CPUFREQ environment variable,
+// if any. The variable is parsed on first use. If parsing fails, the error is
+// returned once and overrides are disabled, instead of re-parsing and
+// re-reporting the same error for every discovered CPU.
+func (sys *system) applyCpufreqOverrides(cpu *cpu) error {
+	if cpufreqEnvOverridesJson == "" {
+		return nil
+	}
+	if cpufreqEnvOverrides == nil {
+		sys.Debugf("parsing cpufreq overrides from %s=%q", cpufreqEnvOverridesVar, cpufreqEnvOverridesJson)
+		overrides, err := parseCpufreqOverrides(cpufreqEnvOverridesJson)
+		if err != nil {
+			// Remember the failure as "no overrides" to avoid retrying.
+			cpufreqEnvOverrides = map[int]CPUFreq{}
+			return fmt.Errorf("failed to parse %s: %w", cpufreqEnvOverridesVar, err)
+		}
+		cpufreqEnvOverrides = overrides
+	}
+	if freq, ok := cpufreqEnvOverrides[cpu.id]; ok {
+		sys.Debugf("cpufreq override for cpu%d: base=%d min=%d max=%d", cpu.id, freq.Base, freq.Min, freq.Max)
+		cpu.freq = freq
+	}
+	return nil
+}
+
+// parseCpufreqOverrides parses JSON cpufreq overrides into a per-CPU map.
+// The input is a JSON array of cpufreqOverride objects. If a CPU is listed
+// in more than one entry, the last entry wins.
+func parseCpufreqOverrides(jsonData string) (map[int]CPUFreq, error) {
+	var overrides []cpufreqOverride
+	if err := json.Unmarshal([]byte(jsonData), &overrides); err != nil {
+		return nil, err
+	}
+	result := make(map[int]CPUFreq)
+	for _, o := range overrides {
+		cpus, err := idset.NewIDSetFromString(o.Cpus)
+		if err != nil {
+			return nil, fmt.Errorf("invalid CPU list %q: %w", o.Cpus, err)
+		}
+		freq := CPUFreq{Base: o.Base, Min: o.Min, Max: o.Max}
+		for cpu := range cpus {
+			result[cpu] = freq
+		}
+	}
+	return result, nil
+}
+
 // Discover cache associated with the given CPU.
 func (sys *system) discoverCache(cpu *cpu, path string) error {
 	var id idset.ID
 
From c106d151bb41317102ed23d8e41df0e95994e406 Mon Sep 17 00:00:00 2001
From: Antti Kervinen
Date: Fri, 5 Jun 2026 14:25:10 +0300
Subject: [PATCH 03/10] agent: support publishing node extended resources

Signed-off-by: Antti Kervinen
---
 pkg/agent/agent.go                   |   8 +
 pkg/agent/node-extended-resources.go | 309 +++++++++++++++++++++++++++
 2 files changed, 317 insertions(+)
 create mode 100644 pkg/agent/node-extended-resources.go

diff --git a/pkg/agent/agent.go b/pkg/agent/agent.go
index ac0f8a649..93ab82bc8 100644
--- a/pkg/agent/agent.go
+++ b/pkg/agent/agent.go
@@ -258,6 +258,10 @@ func (a *Agent) Stop() {
 	defer a.stopLock.Unlock()
 
 	if a.stopC != nil {
+		// Remove any extended resources we own on this node so
+		// a graceful shutdown does not leave orphan capacity
+		// entries behind.
+		a.ClearNodeExtendedResources()
 		close(a.stopC)
 		<-a.doneC
 		a.stopC = nil
@@ -597,6 +601,10 @@ func (a *Agent) updateGroupConfig(obj runtime.Object) {
 func (a *Agent) updateConfig(cfg metav1.Object) {
 	if cfg == nil {
 		log.Warnf("node (%s) has no effective configuration", a.nodeName)
+		// With no effective configuration there is nothing left to
+		// publish, so drop any extended resources we currently own
+		// on the node.
+		a.ClearNodeExtendedResources()
 		return
 	}
 
diff --git a/pkg/agent/node-extended-resources.go b/pkg/agent/node-extended-resources.go
new file mode 100644
index 000000000..7378c2aef
--- /dev/null
+++ b/pkg/agent/node-extended-resources.go
@@ -0,0 +1,309 @@
+// Copyright The NRI Plugins Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package agent
+
+import (
+	"context"
+	"encoding/json"
+	"fmt"
+	"sort"
+	"strings"
+	"sync"
+	"time"
+
+	"k8s.io/apimachinery/pkg/api/resource"
+	metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
+	"k8s.io/apimachinery/pkg/types"
+)
+
+// This file publishes policy-defined extended resources to
+// Node.status.capacity.
+//
+// The agent owns every resource name that starts with
+// extendedResourceDomain. The owned names expected to be present
+// on the node are tracked in package-level state guarded by
+// extendedResourcesLock. Node status is modified with RFC 6902
+// JSON patches. A JSON patch is applied atomically: it fails as
+// a whole if any operation fails, for example a "remove" whose
+// target key does not exist. The tracked state must therefore
+// contain only names that were really published.
+
+// extendedResourcesLock serializes concurrent node.status PATCHes
+// emitted by the policy on container events. Last writer wins.
+var extendedResourcesLock sync.Mutex
+
+// lastPublishedExtendedResources tracks the resources we currently
+// own on this node, so that we can issue 'remove' patches for
+// resources that the policy stops reporting.
+var lastPublishedExtendedResources = map[string]int64{}
+
+// extendedResourcesSynced is set after the first successful
+// node-status scan. Until then, every publish will first try to
+// seed lastPublishedExtendedResources from the node so that
+// resources left over by a prior plugin process (helm reinstall,
+// pod crash, switch to a different policy, etc.) get pruned by
+// the regular diff logic on the next publish.
+var extendedResourcesSynced bool
+
+// extendedResourceDomain is the per-domain prefix the agent owns.
+// Only resources whose name starts with this prefix are touched
+// by the agent (other extended resources advertised by other
+// controllers are left alone).
+const extendedResourceDomain = "cpuclass.balloons.nri.io/"
+
+// jsonPatchOp is a single RFC 6902 JSON Patch operation.
+type jsonPatchOp struct {
+	Op    string `json:"op"`
+	Path  string `json:"path"`
+	Value any    `json:"value,omitempty"`
+}
+
+// UpdateNodeExtendedResources publishes the given resource map
+// to Node.status.capacity using a JSON patch. Resources previously
+// owned by the agent but absent from 'resources' are removed.
+// The patch is sent from a background goroutine to avoid stalling
+// NRI request paths. Therefore the returned error is always nil,
+// and failures to publish are only logged.
+func (a *Agent) UpdateNodeExtendedResources(resources map[string]int64) error {
+	if a.hasLocalConfig() {
+		return nil
+	}
+	if a.k8sCli == nil || a.nodeName == "" {
+		return nil
+	}
+	// Snapshot inputs and run in the background; node-status
+	// PATCHes can be slow under apiserver load and we never
+	// want NRI hooks to block on them.
+	snapshot := make(map[string]int64, len(resources))
+	for k, v := range resources {
+		snapshot[k] = v
+	}
+	go func() {
+		if err := a.updateNodeExtendedResources(snapshot); err != nil {
+			log.Errorf("failed to publish extended resources: %v", err)
+		}
+	}()
+	return nil
+}
+
+// updateNodeExtendedResources synchronously patches the node
+// status so that the extended resources owned by the agent match
+// 'resources'. Names outside extendedResourceDomain are skipped.
+func (a *Agent) updateNodeExtendedResources(resources map[string]int64) error {
+	extendedResourcesLock.Lock()
+	defer extendedResourcesLock.Unlock()
+
+	// Until a node scan has succeeded, seed our bookkeeping with
+	// the keys already on the node (from a prior plugin process),
+	// so the diff below can prune any that the current policy no
+	// longer publishes. Failure is non-fatal: we fall back to our
+	// in-memory state and retry the scan on the next publish.
+	if !extendedResourcesSynced {
+		if err := a.syncExtendedResourcesFromNode(); err != nil {
+			log.Warnf("extended-resource startup sync failed (orphans from a prior plugin process may persist): %v", err)
+		} else {
+			extendedResourcesSynced = true
+		}
+	}
+
+	// Compute the patch: add/replace keys present in 'resources',
+	// remove keys we owned before but are now gone.
+	published := make(map[string]int64, len(resources))
+	ops := []jsonPatchOp{}
+	for name, qty := range resources {
+		if !strings.HasPrefix(name, extendedResourceDomain) {
+			log.Warnf("refusing to publish resource %q: not in domain %q",
+				name, extendedResourceDomain)
+			continue
+		}
+		q := resource.NewQuantity(qty, resource.DecimalSI)
+		ops = append(ops, jsonPatchOp{
+			Op:    "add",
+			Path:  "/status/capacity/" + escapeJSONPointer(name),
+			Value: q.String(),
+		})
+		published[name] = qty
+	}
+	for name := range lastPublishedExtendedResources {
+		if _, kept := published[name]; kept {
+			continue
+		}
+		ops = append(ops, jsonPatchOp{
+			Op:   "remove",
+			Path: "/status/capacity/" + escapeJSONPointer(name),
+		})
+	}
+
+	if len(ops) == 0 {
+		return nil
+	}
+
+	body, err := json.Marshal(ops)
+	if err != nil {
+		return fmt.Errorf("marshal patch: %w", err)
+	}
+
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+	_, err = a.k8sCli.CoreV1().Nodes().Patch(
+		ctx, a.nodeName, types.JSONPatchType, body,
+		metav1.PatchOptions{}, "status")
+	if err != nil {
+		// The patch may have failed because a key to "remove"
+		// is already gone from the node. Our view of the node
+		// may thus be stale: forget it and rescan the node on
+		// the next publish instead of failing the same way.
+		lastPublishedExtendedResources = map[string]int64{}
+		extendedResourcesSynced = false
+		return fmt.Errorf("patch node %s status: %w", a.nodeName, err)
+	}
+
+	// Record current set for next diff.
+	lastPublishedExtendedResources = published
+
+	if summary := summarizeExtendedResources(published); summary != "" {
+		log.Infof("published node extended resources: %s", summary)
+	}
+	return nil
+}
+
+// escapeJSONPointer escapes '~' and '/' per RFC 6901 so that a
+// resource name containing slashes survives as a single JSON
+// Pointer segment.
+func escapeJSONPointer(s string) string {
+	s = strings.ReplaceAll(s, "~", "~0")
+	s = strings.ReplaceAll(s, "/", "~1")
+	return s
+}
+
+// summarizeExtendedResources formats the map deterministically
+// for logs: "name1=N1, name2=N2, ...", sorted by name.
+func summarizeExtendedResources(m map[string]int64) string {
+	if len(m) == 0 {
+		return ""
+	}
+	keys := make([]string, 0, len(m))
+	for k := range m {
+		keys = append(keys, k)
+	}
+	sort.Strings(keys)
+	parts := make([]string, 0, len(keys))
+	for _, k := range keys {
+		parts = append(parts, fmt.Sprintf("%s=%d", k, m[k]))
+	}
+	return strings.Join(parts, ", ")
+}
+
+// syncExtendedResourcesFromNode reads Node.status.capacity and
+// seeds lastPublishedExtendedResources with every entry whose
+// key carries extendedResourceDomain. Caller must hold
+// extendedResourcesLock.
+func (a *Agent) syncExtendedResourcesFromNode() error {
+	ctx, cancel := context.WithTimeout(context.Background(), 10*time.Second)
+	defer cancel()
+	node, err := a.k8sCli.CoreV1().Nodes().Get(ctx, a.nodeName, metav1.GetOptions{})
+	if err != nil {
+		return fmt.Errorf("get node %s: %w", a.nodeName, err)
+	}
+	owned := map[string]int64{}
+	for name, q := range node.Status.Capacity {
+		key := string(name)
+		if !strings.HasPrefix(key, extendedResourceDomain) {
+			continue
+		}
+		v := q.Value()
+		owned[key] = v
+		if _, ours := lastPublishedExtendedResources[key]; !ours {
+			lastPublishedExtendedResources[key] = v
+		}
+	}
+	if len(owned) > 0 {
+		log.Infof("extended-resource startup sync: found %d existing key(s) on node %s: %s",
+			len(owned), a.nodeName, summarizeExtendedResources(owned))
+	}
+	return nil
+}
+
+// ClearNodeExtendedResources removes every extended resource
+// that carries our domain prefix from the node status, whether
+// or not we are tracking it. Best-effort and synchronous, with
+// short timeouts; used by Agent.Stop() and when the node loses
+// its effective configuration, so that no orphan capacity
+// entries are left behind.
+func (a *Agent) ClearNodeExtendedResources() {
+	if a.hasLocalConfig() {
+		return
+	}
+	if a.k8sCli == nil || a.nodeName == "" {
+		return
+	}
+
+	extendedResourcesLock.Lock()
+	defer extendedResourcesLock.Unlock()
+
+	// Prefer the keys really present on the node: they include
+	// orphans we are not tracking (e.g., startup sync never ran
+	// because no publish happened before Stop), and they exclude
+	// tracked keys that are already gone and whose "remove" op
+	// would fail the whole patch. Fall back to the in-memory
+	// set only if the node cannot be read.
+	var keys []string
+	ctx, cancel := context.WithTimeout(context.Background(), 5*time.Second)
+	node, err := a.k8sCli.CoreV1().Nodes().Get(ctx, a.nodeName, metav1.GetOptions{})
+	cancel()
+	if err == nil {
+		for name := range node.Status.Capacity {
+			if key := string(name); strings.HasPrefix(key, extendedResourceDomain) {
+				keys = append(keys, key)
+			}
+		}
+	} else {
+		for key := range lastPublishedExtendedResources {
+			keys = append(keys, key)
+		}
+	}
+	if len(keys) == 0 {
+		lastPublishedExtendedResources = map[string]int64{}
+		return
+	}
+	sort.Strings(keys) // stable order in the patch and in the log
+
+	ops := make([]jsonPatchOp, 0, len(keys))
+	for _, key := range keys {
+		ops = append(ops, jsonPatchOp{
+			Op:   "remove",
+			Path: "/status/capacity/" + escapeJSONPointer(key),
+		})
+	}
+
+	body, err := json.Marshal(ops)
+	if err != nil {
+		log.Warnf("ClearNodeExtendedResources: marshal patch: %v", err)
+		return
+	}
+
+	pctx, pcancel := context.WithTimeout(context.Background(), 5*time.Second)
+	defer pcancel()
+	_, err = a.k8sCli.CoreV1().Nodes().Patch(
+		pctx, a.nodeName, types.JSONPatchType, body,
+		metav1.PatchOptions{}, "status")
+	if err != nil {
+		log.Warnf("ClearNodeExtendedResources: patch node %s: %v", a.nodeName, err)
+		return
+	}
+
+	log.Infof("cleared node extended resources: %s", strings.Join(keys, ", "))
+	lastPublishedExtendedResources = map[string]int64{}
+}
From 591a474a92ef3f9ac2c009dbf18e4421c06ab456 Mon Sep 17 00:00:00 2001
From: Antti Kervinen
Date: Fri, 5 Jun 2026 14:25:10 +0300
Subject: [PATCH 04/10] balloons: add cpuClasses, turboDomain and PCT options to config

Signed-off-by: Antti Kervinen
---
 .../bases/config.nri_balloonspolicies.yaml | 135 ++++++++++++
 .../crds/config.nri_balloonspolicies.yaml | 135 ++++++++++++
 pkg/apis/config/v1alpha1/balloons-policy.go | 96 +++++++-
 .../config/v1alpha1/balloons-policy_test.go | 109 ++++++++++
 .../v1alpha1/resmgr/policy/balloons/config.go | 17 ++
 .../policy/balloons/zz_generated.deepcopy.go | 11 +
 .../config/v1alpha1/resmgr/policy/cpuclass.go | 98 +++++++++
 .../v1alpha1/resmgr/policy/frequency.go | 205 ++++++++++++++++++
 .../v1alpha1/resmgr/policy/frequency_test.go | 156 +++++++++++++
 .../resmgr/policy/zz_generated.deepcopy.go | 25 +++
 10 files changed, 986 insertions(+), 1 deletion(-)
 create mode 100644 pkg/apis/config/v1alpha1/balloons-policy_test.go
 create mode 100644 pkg/apis/config/v1alpha1/resmgr/policy/cpuclass.go
 create mode 100644 pkg/apis/config/v1alpha1/resmgr/policy/frequency.go
 create mode 100644 pkg/apis/config/v1alpha1/resmgr/policy/frequency_test.go

diff --git a/config/crd/bases/config.nri_balloonspolicies.yaml b/config/crd/bases/config.nri_balloonspolicies.yaml
index 321f3f417..2a6f6402a 100644
--- a/config/crd/bases/config.nri_balloonspolicies.yaml
+++ b/config/crd/bases/config.nri_balloonspolicies.yaml
@@ -733,6 +733,127 @@ spec: type: boolean type: object type: object + cpuClasses: + description: |- + CPUClasses define CPU frequency, C-state, and turbo + attributes for CPU classes referenced by balloon types.
+ Exclusive turbo frequency access is controlled via + turboPriority. + items: + description: |- + CPUClass specifies CPU frequency, C-state, and turbo attributes + for a CPU class. + properties: + disabledCstates: + description: |- + DisabledCstates lists C-states disabled for CPUs in this class. + Example: ["C4", "C6", "C8", "C10"] + items: + type: string + type: array + energyPerformancePreference: + description: EnergyPerformancePreference for CPUs in this class. + minimum: 0 + type: integer + freqGovernor: + description: |- + FreqGovernor is the CPUFreq governor for this class + (e.g., "performance", "powersave", "schedutil"). + type: string + maxFreq: + description: |- + MaxFreq is the maximum CPU frequency for this class. + Same format and symbolic names as MinFreq. + type: string + minFreq: + description: |- + MinFreq is the minimum CPU frequency for this class. + Accepts values with units: "3.2GHz", "2900MHz", "2900000kHz", + or a plain number in kHz. Also accepts symbolic names: "min" + (platform minimum), "base" (CPU base frequency), "turbo" + (maximum turbo frequency), resolved at runtime from sysfs. + When turboPriority is set, "turbo" resolves to actual turbo + only for the highest-priority active class; others get base. + type: string + name: + description: Name of the CPU class. + type: string + pctClosID: + description: |- + PctClosID pins this class to a specific SST-CP CLOS ID + (0..ClosCount-1, typically 0..3) and signals "assoc-only" + mode: nri-plugin will only associate this class's CPUs to + the given CLOS, without touching the SoC-wide SST state + (no CPReset, no TFEnable, no CLOS reconfiguration). Use + this when an operator or the BIOS has pre-configured the + CLOSes. Mutually exclusive with PctPriority. + minimum: 0 + type: integer + pctMaxFreq: + description: |- + PctMaxFreq overrides the CLOS maximum frequency that + nri-plugin programs in managed mode. Defaults to MaxFreq. + Same caveat as PctMinFreq. 
+ type: string + pctMinFreq: + description: |- + PctMinFreq overrides the CLOS minimum frequency that + nri-plugin programs in managed mode. Defaults to MinFreq. + Uses the same format as MinFreq but resolves "turbo" + directly to the hardware maximum turbo frequency, + without participating in the soft turboPriority + arbitration. Ignored in assoc-only mode. + type: string + pctPriority: + description: |- + PctPriority requests Intel Priority Core Turbo (PCT) + hardware support, via SST-CP CLOSes, for CPUs in this + class. "high" associates the CPUs to the high-priority + CLOS (HP cores, typically running at Pmax). "low" + associates them to the low-priority CLOS (LP cores, + typically capped at P1). Unset = PCT is not requested + for this class. Mutually exclusive with PctClosID. + enum: + - high + - low + type: string + publishExtendedResource: + description: |- + PublishExtendedResource opts this CPU class into publishing + a node-level extended resource named + "cpuclass.balloons.nri.io/" whose value reflects + the number of logical CPUs that the balloons policy is + currently able to route into this class on the node. The + scheduler can then bin-pack/spread balloons by adding the + same resource to pod requests, avoiding HP-CPU + over-subscription on a single node. Has effect only when + the class also carries PctPriority or PctClosID. Experimental. + type: boolean + turboPriority: + description: |- + TurboPriority controls exclusive turbo frequency access. + Among CPU classes with active balloons, only the class with + the highest turboPriority gets the symbolic frequency "turbo" + resolved to the actual turbo frequency. All other classes get + "turbo" resolved to the base frequency instead. + If all classes have turboPriority 0 (default), every class + gets actual turbo frequencies -- no competition occurs. + minimum: 0 + type: integer + uncoreMaxFreq: + description: |- + UncoreMaxFreq is the maximum uncore frequency for this class. 
+ Accepts values with units like MinFreq. + type: string + uncoreMinFreq: + description: |- + UncoreMinFreq is the minimum uncore frequency for this class. + Accepts values with units like MinFreq. + type: string + required: + - name + type: object + type: array idleCPUClass: description: |- IdleCpuClass controls how unusded CPUs outside any a @@ -1076,6 +1197,20 @@ spec: options has no effect unless agent:NodeResourceTopology enables basic topology exposure. type: boolean + turboDomain: + default: package + description: |- + TurboDomain selects the scope over which TurboPriority + arbitration happens. The default is "package": every CPU + package independently picks its own TurboPriority winner, + so a low-priority balloon on one socket can keep turbo even + when a higher-priority balloon is running on another + socket. Set to "system" to pick single TurboPriority winner + for the whole system. + enum: + - package + - system + type: string required: - reservedResources type: object diff --git a/deployment/helm/balloons/crds/config.nri_balloonspolicies.yaml b/deployment/helm/balloons/crds/config.nri_balloonspolicies.yaml index 321f3f417..2a6f6402a 100644 --- a/deployment/helm/balloons/crds/config.nri_balloonspolicies.yaml +++ b/deployment/helm/balloons/crds/config.nri_balloonspolicies.yaml @@ -733,6 +733,127 @@ spec: type: boolean type: object type: object + cpuClasses: + description: |- + CPUClasses define CPU frequency, C-state, and turbo + attributes for CPU classes referenced by balloon types. + Exclusive turbo frequency access is controlled via + turboPriority. + items: + description: |- + CPUClass specifies CPU frequency, C-state, and turbo attributes + for a CPU class. + properties: + disabledCstates: + description: |- + DisabledCstates lists C-states disabled for CPUs in this class. + Example: ["C4", "C6", "C8", "C10"] + items: + type: string + type: array + energyPerformancePreference: + description: EnergyPerformancePreference for CPUs in this class. 
+ minimum: 0 + type: integer + freqGovernor: + description: |- + FreqGovernor is the CPUFreq governor for this class + (e.g., "performance", "powersave", "schedutil"). + type: string + maxFreq: + description: |- + MaxFreq is the maximum CPU frequency for this class. + Same format and symbolic names as MinFreq. + type: string + minFreq: + description: |- + MinFreq is the minimum CPU frequency for this class. + Accepts values with units: "3.2GHz", "2900MHz", "2900000kHz", + or a plain number in kHz. Also accepts symbolic names: "min" + (platform minimum), "base" (CPU base frequency), "turbo" + (maximum turbo frequency), resolved at runtime from sysfs. + When turboPriority is set, "turbo" resolves to actual turbo + only for the highest-priority active class; others get base. + type: string + name: + description: Name of the CPU class. + type: string + pctClosID: + description: |- + PctClosID pins this class to a specific SST-CP CLOS ID + (0..ClosCount-1, typically 0..3) and signals "assoc-only" + mode: nri-plugin will only associate this class's CPUs to + the given CLOS, without touching the SoC-wide SST state + (no CPReset, no TFEnable, no CLOS reconfiguration). Use + this when an operator or the BIOS has pre-configured the + CLOSes. Mutually exclusive with PctPriority. + minimum: 0 + type: integer + pctMaxFreq: + description: |- + PctMaxFreq overrides the CLOS maximum frequency that + nri-plugin programs in managed mode. Defaults to MaxFreq. + Same caveat as PctMinFreq. + type: string + pctMinFreq: + description: |- + PctMinFreq overrides the CLOS minimum frequency that + nri-plugin programs in managed mode. Defaults to MinFreq. + Uses the same format as MinFreq but resolves "turbo" + directly to the hardware maximum turbo frequency, + without participating in the soft turboPriority + arbitration. Ignored in assoc-only mode. 
+ type: string + pctPriority: + description: |- + PctPriority requests Intel Priority Core Turbo (PCT) + hardware support, via SST-CP CLOSes, for CPUs in this + class. "high" associates the CPUs to the high-priority + CLOS (HP cores, typically running at Pmax). "low" + associates them to the low-priority CLOS (LP cores, + typically capped at P1). Unset = PCT is not requested + for this class. Mutually exclusive with PctClosID. + enum: + - high + - low + type: string + publishExtendedResource: + description: |- + PublishExtendedResource opts this CPU class into publishing + a node-level extended resource named + "cpuclass.balloons.nri.io/" whose value reflects + the number of logical CPUs that the balloons policy is + currently able to route into this class on the node. The + scheduler can then bin-pack/spread balloons by adding the + same resource to pod requests, avoiding HP-CPU + over-subscription on a single node. Has effect only when + the class also carries PctPriority or PctClosID. Experimental. + type: boolean + turboPriority: + description: |- + TurboPriority controls exclusive turbo frequency access. + Among CPU classes with active balloons, only the class with + the highest turboPriority gets the symbolic frequency "turbo" + resolved to the actual turbo frequency. All other classes get + "turbo" resolved to the base frequency instead. + If all classes have turboPriority 0 (default), every class + gets actual turbo frequencies -- no competition occurs. + minimum: 0 + type: integer + uncoreMaxFreq: + description: |- + UncoreMaxFreq is the maximum uncore frequency for this class. + Accepts values with units like MinFreq. + type: string + uncoreMinFreq: + description: |- + UncoreMinFreq is the minimum uncore frequency for this class. + Accepts values with units like MinFreq. 
+ type: string + required: + - name + type: object + type: array idleCPUClass: description: |- IdleCpuClass controls how unusded CPUs outside any a @@ -1076,6 +1197,20 @@ spec: options has no effect unless agent:NodeResourceTopology enables basic topology exposure. type: boolean + turboDomain: + default: package + description: |- + TurboDomain selects the scope over which TurboPriority + arbitration happens. The default is "package": every CPU + package independently picks its own TurboPriority winner, + so a low-priority balloon on one socket can keep turbo even + when a higher-priority balloon is running on another + socket. Set to "system" to pick single TurboPriority winner + for the whole system. + enum: + - package + - system + type: string required: - reservedResources type: object diff --git a/pkg/apis/config/v1alpha1/balloons-policy.go b/pkg/apis/config/v1alpha1/balloons-policy.go index 259e1afaa..12ab646fa 100644 --- a/pkg/apis/config/v1alpha1/balloons-policy.go +++ b/pkg/apis/config/v1alpha1/balloons-policy.go @@ -14,8 +14,17 @@ package v1alpha1 +import ( + "sort" + + cpucfg "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/control/cpu" + policyapi "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/policy" + logger "github.com/containers/nri-plugins/pkg/log" +) + var ( - _ ResmgrConfig = &BalloonsPolicy{} + _ ResmgrConfig = &BalloonsPolicy{} + bplog = logger.NewLogger("config-v1alpha1") ) func (c *BalloonsPolicy) AgentConfig() *AgentConfig { @@ -39,13 +48,98 @@ func (c *BalloonsPolicy) CommonConfig() *CommonConfig { } } +// PolicyConfig returns the balloons-specific configuration handed to +// the policy. Before returning, any legacy control.cpu.classes +// entries are folded into Spec.Config.CPUClasses (without overriding +// entries with matching names). 
The legacy CPU controller is no +// longer used by the balloons policy; this reverse merge preserves +// backwards compatibility so existing configurations keep working +// while users migrate to the cpuClasses syntax. func (c *BalloonsPolicy) PolicyConfig() interface{} { if c == nil { return nil } + mergeLegacyCpuClasses(&c.Spec) return &c.Spec.Config } +// mergeLegacyCpuClasses appends synthetic CPUClass entries derived +// from spec.Control.CPU.Classes for names that do not already exist +// in spec.Config.CPUClasses. Conflicting names log a single warning +// per name. Idempotent: repeated calls do not add duplicate entries +// and do not warn again for the same conflict. +func mergeLegacyCpuClasses(spec *BalloonsPolicySpec) { + legacy := spec.Control.CPU.Classes + if len(legacy) == 0 { + return + } + existing := map[string]*policyapi.CPUClass{} + for _, cc := range spec.CPUClasses { + existing[cc.Name] = cc + } + // Sort the legacy class names so warning order is deterministic. + names := make([]string, 0, len(legacy)) + for name := range legacy { + names = append(names, name) + } + sort.Strings(names) + added := []string{} + for _, name := range names { + cc := legacy[name] + if prev, ok := existing[name]; ok { + // Skip silently when the explicit entry already + // has the exact values converted from the legacy + // entry. That happens when a prior PolicyConfig() + // call already merged this spec. 
+ if cpuClassMatchesLegacy(prev, cc) { + continue + } + bplog.Warn("control.cpu.classes entry %q overridden by cpuClasses entry; remove the legacy entry to silence this warning", name) + continue + } + synth := &policyapi.CPUClass{ + Name: name, + MinFreq: policyapi.Frequency(cc.MinFreq), + MaxFreq: policyapi.Frequency(cc.MaxFreq), + EnergyPerformancePreference: cc.EnergyPerformancePreference, + UncoreMinFreq: policyapi.Frequency(cc.UncoreMinFreq), + UncoreMaxFreq: policyapi.Frequency(cc.UncoreMaxFreq), + FreqGovernor: cc.FreqGovernor, + DisabledCstates: append([]string(nil), cc.DisabledCstates...), + } + spec.CPUClasses = append(spec.CPUClasses, synth) + existing[name] = synth + added = append(added, name) + } + if len(added) > 0 { + bplog.Warn("control.cpu.classes is deprecated; converted to cpuClasses: %v", added) + } +} + +// cpuClassMatchesLegacy reports whether cc has the exact field +// values that the reverse converter would produce for legacy. Used +// to suppress spurious "override" warnings when the same spec is +// processed more than once. 
+func cpuClassMatchesLegacy(cc *policyapi.CPUClass, legacy cpucfg.Class) bool { + if cc.MinFreq != policyapi.Frequency(legacy.MinFreq) || + cc.MaxFreq != policyapi.Frequency(legacy.MaxFreq) || + cc.EnergyPerformancePreference != legacy.EnergyPerformancePreference || + cc.UncoreMinFreq != policyapi.Frequency(legacy.UncoreMinFreq) || + cc.UncoreMaxFreq != policyapi.Frequency(legacy.UncoreMaxFreq) || + cc.FreqGovernor != legacy.FreqGovernor { + return false + } + if len(cc.DisabledCstates) != len(legacy.DisabledCstates) { + return false + } + for i := range cc.DisabledCstates { + if cc.DisabledCstates[i] != legacy.DisabledCstates[i] { + return false + } + } + return true +} + func (c *BalloonsPolicy) Validate() error { if c == nil { return nil diff --git a/pkg/apis/config/v1alpha1/balloons-policy_test.go b/pkg/apis/config/v1alpha1/balloons-policy_test.go new file mode 100644 index 000000000..fec21c3cc --- /dev/null +++ b/pkg/apis/config/v1alpha1/balloons-policy_test.go @@ -0,0 +1,109 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package v1alpha1 + +import ( + "testing" + + control "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/control" + cpucfg "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/control/cpu" + policyapi "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/policy" + balloonscfg "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/policy/balloons" +) + +// mkSpec builds a BalloonsPolicySpec carrying the given cpuClasses +// list and legacy control.cpu.classes map. Other fields are left at +// zero values. +func mkSpec(cpuClasses []*policyapi.CPUClass, legacy map[string]cpucfg.Class) *BalloonsPolicySpec { + return &BalloonsPolicySpec{ + Config: balloonscfg.Config{ + CPUClasses: cpuClasses, + }, + Control: control.Config{ + CPU: cpucfg.Config{ + Classes: legacy, + }, + }, + } +} + +// TestMergeLegacy_AddsMissingNames verifies that legacy entries +// whose names do not appear in cpuClasses are appended. +func TestMergeLegacy_AddsMissingNames(t *testing.T) { + spec := mkSpec(nil, map[string]cpucfg.Class{ + "old": {MinFreq: 1_000_000, MaxFreq: 2_000_000, FreqGovernor: "performance"}, + }) + mergeLegacyCpuClasses(spec) + if len(spec.CPUClasses) != 1 { + t.Fatalf("want 1 cpuClass after merge, got %d", len(spec.CPUClasses)) + } + cc := spec.CPUClasses[0] + if cc.Name != "old" || cc.MinFreq.KHz() != 1_000_000 || cc.MaxFreq.KHz() != 2_000_000 || cc.FreqGovernor != "performance" { + t.Errorf("merged class wrong: %+v", cc) + } +} + +// TestMergeLegacy_ExplicitWins verifies that explicit cpuClasses +// entries take precedence over legacy entries with the same name. 
+func TestMergeLegacy_ExplicitWins(t *testing.T) { + explicit := &policyapi.CPUClass{ + Name: "hp", + MinFreq: policyapi.FrequencyBase, + MaxFreq: policyapi.FrequencyTurbo, + } + spec := mkSpec( + []*policyapi.CPUClass{explicit}, + map[string]cpucfg.Class{ + "hp": {MinFreq: 800_000, MaxFreq: 1_500_000}, + }, + ) + mergeLegacyCpuClasses(spec) + if len(spec.CPUClasses) != 1 { + t.Fatalf("want 1 cpuClass (explicit unchanged), got %d", len(spec.CPUClasses)) + } + cc := spec.CPUClasses[0] + if cc != explicit { + t.Errorf("explicit entry was replaced") + } + if cc.MinFreq != policyapi.FrequencyBase { + t.Errorf("explicit symbolic MinFreq overwritten, got %v", cc.MinFreq) + } +} + +// TestMergeLegacy_Idempotent verifies that running the merge twice +// does not duplicate appended entries. +func TestMergeLegacy_Idempotent(t *testing.T) { + spec := mkSpec(nil, map[string]cpucfg.Class{ + "a": {MinFreq: 1_000_000}, + "b": {MaxFreq: 2_000_000}, + }) + mergeLegacyCpuClasses(spec) + first := len(spec.CPUClasses) + mergeLegacyCpuClasses(spec) + if len(spec.CPUClasses) != first { + t.Errorf("second merge added entries: first=%d second=%d", first, len(spec.CPUClasses)) + } +} + +// TestMergeLegacy_NoLegacy_NoChange verifies that an empty legacy +// map leaves cpuClasses untouched. 
+func TestMergeLegacy_NoLegacy_NoChange(t *testing.T) { + keep := &policyapi.CPUClass{Name: "x"} + spec := mkSpec([]*policyapi.CPUClass{keep}, nil) + mergeLegacyCpuClasses(spec) + if len(spec.CPUClasses) != 1 || spec.CPUClasses[0] != keep { + t.Errorf("cpuClasses unexpectedly modified: %+v", spec.CPUClasses) + } +} diff --git a/pkg/apis/config/v1alpha1/resmgr/policy/balloons/config.go b/pkg/apis/config/v1alpha1/resmgr/policy/balloons/config.go index 496f851a6..a8d6230d6 100644 --- a/pkg/apis/config/v1alpha1/resmgr/policy/balloons/config.go +++ b/pkg/apis/config/v1alpha1/resmgr/policy/balloons/config.go @@ -32,6 +32,8 @@ type ( CPUTopologyLevel = policy.CPUTopologyLevel ComponentCreationStrategy = policy.ComponentCreationStrategy SchedulingClass = policy.SchedulingClass + CPUClass = policy.CPUClass + Frequency = policy.Frequency ) const ( @@ -135,6 +137,21 @@ type Config struct { // SchedulingClasses specify scheduling classes available in // balloon types. SchedulingClasses []*SchedulingClass `json:"schedulingClasses,omitempty"` + // CPUClasses define CPU frequency, C-state, and turbo + // attributes for CPU classes referenced by balloon types. + // Exclusive turbo frequency access is controlled via + // turboPriority. + CPUClasses []*CPUClass `json:"cpuClasses,omitempty"` + // TurboDomain selects the scope over which TurboPriority + // arbitration happens. The default is "package": every CPU + // package independently picks its own TurboPriority winner, + // so a low-priority balloon on one socket can keep turbo even + // when a higher-priority balloon is running on another + // socket. Set to "system" to pick single TurboPriority winner + // for the whole system. + // +kubebuilder:validation:Enum=package;system + // +kubebuilder:default=package + TurboDomain string `json:"turboDomain,omitempty"` } // BalloonDef contains a balloon definition. 
diff --git a/pkg/apis/config/v1alpha1/resmgr/policy/balloons/zz_generated.deepcopy.go b/pkg/apis/config/v1alpha1/resmgr/policy/balloons/zz_generated.deepcopy.go index 74276ce1d..e4b9ceae1 100644 --- a/pkg/apis/config/v1alpha1/resmgr/policy/balloons/zz_generated.deepcopy.go +++ b/pkg/apis/config/v1alpha1/resmgr/policy/balloons/zz_generated.deepcopy.go @@ -185,6 +185,17 @@ func (in *Config) DeepCopyInto(out *Config) { } } } + if in.CPUClasses != nil { + in, out := &in.CPUClasses, &out.CPUClasses + *out = make([]*CPUClass, len(*in)) + for i := range *in { + if (*in)[i] != nil { + in, out := &(*in)[i], &(*out)[i] + *out = new(CPUClass) + (*in).DeepCopyInto(*out) + } + } + } } // DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Config. diff --git a/pkg/apis/config/v1alpha1/resmgr/policy/cpuclass.go b/pkg/apis/config/v1alpha1/resmgr/policy/cpuclass.go new file mode 100644 index 000000000..aab89472c --- /dev/null +++ b/pkg/apis/config/v1alpha1/resmgr/policy/cpuclass.go @@ -0,0 +1,98 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package policy + +// CPUClass specifies CPU frequency, C-state, and turbo attributes +// for a CPU class. +// +k8s:deepcopy-gen=true +type CPUClass struct { + // Name of the CPU class. + // +kubebuilder:validation:Required + Name string `json:"name"` + // MinFreq is the minimum CPU frequency for this class. 
+ // Accepts values with units: "3.2GHz", "2900MHz", "2900000kHz", + // or a plain number in kHz. Also accepts symbolic names: "min" + // (platform minimum), "base" (CPU base frequency), "turbo" + // (maximum turbo frequency), resolved at runtime from sysfs. + // When turboPriority is set, "turbo" resolves to actual turbo + // only for the highest-priority active class; others get base. + MinFreq Frequency `json:"minFreq,omitempty"` + // MaxFreq is the maximum CPU frequency for this class. + // Same format and symbolic names as MinFreq. + MaxFreq Frequency `json:"maxFreq,omitempty"` + // EnergyPerformancePreference for CPUs in this class. + // +kubebuilder:validation:Minimum=0 + EnergyPerformancePreference uint `json:"energyPerformancePreference,omitempty"` + // UncoreMinFreq is the minimum uncore frequency for this class. + // Accepts values with units like MinFreq. + UncoreMinFreq Frequency `json:"uncoreMinFreq,omitempty"` + // UncoreMaxFreq is the maximum uncore frequency for this class. + // Accepts values with units like MinFreq. + UncoreMaxFreq Frequency `json:"uncoreMaxFreq,omitempty"` + // FreqGovernor is the CPUFreq governor for this class + // (e.g., "performance", "powersave", "schedutil"). + FreqGovernor string `json:"freqGovernor,omitempty"` + // DisabledCstates lists C-states disabled for CPUs in this class. + // Example: ["C4", "C6", "C8", "C10"] + DisabledCstates []string `json:"disabledCstates,omitempty"` + // TurboPriority controls exclusive turbo frequency access. + // Among CPU classes with active balloons, only the class with + // the highest turboPriority gets the symbolic frequency "turbo" + // resolved to the actual turbo frequency. All other classes get + // "turbo" resolved to the base frequency instead. + // If all classes have turboPriority 0 (default), every class + // gets actual turbo frequencies -- no competition occurs. 
+ // +kubebuilder:validation:Minimum=0 + TurboPriority int `json:"turboPriority,omitempty"` + // PctPriority requests Intel Priority Core Turbo (PCT) + // hardware support, via SST-CP CLOSes, for CPUs in this + // class. "high" associates the CPUs to the high-priority + // CLOS (HP cores, typically running at Pmax). "low" + // associates them to the low-priority CLOS (LP cores, + // typically capped at P1). Unset = PCT is not requested + // for this class. Mutually exclusive with PctClosID. + // +kubebuilder:validation:Enum=high;low + PctPriority string `json:"pctPriority,omitempty"` + // PctClosID pins this class to a specific SST-CP CLOS ID + // (0..ClosCount-1, typically 0..3) and signals "assoc-only" + // mode: nri-plugin will only associate this class's CPUs to + // the given CLOS, without touching the SoC-wide SST state + // (no CPReset, no TFEnable, no CLOS reconfiguration). Use + // this when an operator or the BIOS has pre-configured the + // CLOSes. Mutually exclusive with PctPriority. + // +kubebuilder:validation:Minimum=0 + PctClosID *int `json:"pctClosID,omitempty"` + // PctMinFreq overrides the CLOS minimum frequency that + // nri-plugin programs in managed mode. Defaults to MinFreq. + // Uses the same format as MinFreq but resolves "turbo" + // directly to the hardware maximum turbo frequency, + // without participating in the soft turboPriority + // arbitration. Ignored in assoc-only mode. + PctMinFreq Frequency `json:"pctMinFreq,omitempty"` + // PctMaxFreq overrides the CLOS maximum frequency that + // nri-plugin programs in managed mode. Defaults to MaxFreq. + // Same caveat as PctMinFreq. + PctMaxFreq Frequency `json:"pctMaxFreq,omitempty"` + // PublishExtendedResource opts this CPU class into publishing + // a node-level extended resource named + // "cpuclass.balloons.nri.io/" whose value reflects + // the number of logical CPUs that the balloons policy is + // currently able to route into this class on the node. 
The + // scheduler can then bin-pack/spread balloons by adding the + // same resource to pod requests, avoiding HP-CPU + // over-subscription on a single node. Has effect only when + // the class also carries PctPriority or PctClosID. Experimental. + PublishExtendedResource bool `json:"publishExtendedResource,omitempty"` +} diff --git a/pkg/apis/config/v1alpha1/resmgr/policy/frequency.go b/pkg/apis/config/v1alpha1/resmgr/policy/frequency.go new file mode 100644 index 000000000..0095117ec --- /dev/null +++ b/pkg/apis/config/v1alpha1/resmgr/policy/frequency.go @@ -0,0 +1,205 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package policy + +import ( + "encoding/json" + "fmt" + "math" + "regexp" + "strconv" + "strings" +) + +// Frequency represents a CPU frequency value that can be specified +// with human-readable units in YAML/JSON configuration. 
// Supported formats:
//   - "3.2G" or "3.2GHz" = 3200000 (kHz)
//   - "2900M" or "2900MHz" = 2900000 (kHz)
//   - "2900000k" or "2900000kHz" = 2900000 (kHz)
//   - "2900000" (bare number) = 2900000 (kHz, backwards compatible)
//   - 2900000 (JSON number) = 2900000 (kHz, backwards compatible)
//   - "min" = platform minimum frequency (resolved at runtime)
//   - "base" = CPU base frequency (resolved at runtime)
//   - "turbo" = maximum turbo frequency (resolved at runtime)
//
// The internal representation is always in kHz (the unit used by Linux
// kernel sysfs cpufreq interface). Symbolic values ("min", "base",
// "turbo") are stored as sentinel constants and must be resolved with
// Resolve() before being passed to the CPU controller. Numeric input
// that would reach the sentinel range (or not fit in a uint at all) is
// rejected while parsing, so a parsed number is never mistaken for a
// symbolic value.
// +kubebuilder:validation:Type=string
type Frequency uint

const (
	// FrequencyMin is a sentinel indicating the platform minimum frequency.
	FrequencyMin Frequency = math.MaxUint - 2
	// FrequencyBase is a sentinel indicating the CPU base frequency.
	FrequencyBase Frequency = math.MaxUint - 1
	// FrequencyTurbo is a sentinel indicating the maximum turbo frequency.
	FrequencyTurbo Frequency = math.MaxUint
)

// frequencyRegexp matches an unsigned decimal number followed by an
// optional, case-insensitive unit suffix. Surrounding and separating
// whitespace is tolerated.
var frequencyRegexp = regexp.MustCompile(`(?i)^\s*([0-9]*\.?[0-9]+)\s*(GHz|G|MHz|M|kHz|k)?\s*$`)

// frequencyFromKHz rounds kHz to the nearest integer and converts it to
// a Frequency. ok is false when the rounded value is NaN, negative, or
// at/above FrequencyMin, i.e. when it would collide with a symbolic
// sentinel or overflow uint. The check must happen before the
// conversion: Go leaves the result of converting an out-of-range float
// to an integer type implementation-dependent.
func frequencyFromKHz(kHz float64) (f Frequency, ok bool) {
	rounded := math.Round(kHz)
	if math.IsNaN(rounded) || rounded < 0 || rounded >= float64(FrequencyMin) {
		return 0, false
	}
	return Frequency(uint(rounded)), true
}

// parseFrequency parses a frequency string into kHz.
//
// The empty string (after trimming whitespace) yields 0 without error.
// The names "min", "base" and "turbo" (any letter case) yield the
// corresponding sentinel. Anything else must be an unsigned decimal
// number with an optional GHz/G, MHz/M or kHz/k suffix; a bare number
// is taken as kHz. An error is returned for malformed input, for
// positive values that round to 0 kHz, and for values too large to be
// stored below the sentinel range.
func parseFrequency(s string) (Frequency, error) {
	s = strings.TrimSpace(s)
	if s == "" {
		return 0, nil
	}

	// Check for symbolic frequency names.
	switch strings.ToLower(s) {
	case "min":
		return FrequencyMin, nil
	case "base":
		return FrequencyBase, nil
	case "turbo":
		return FrequencyTurbo, nil
	}

	matches := frequencyRegexp.FindStringSubmatch(s)
	if matches == nil {
		return 0, fmt.Errorf("invalid frequency %q: expected number with optional unit (GHz, MHz, kHz) or symbolic name (min, base, turbo)", s)
	}

	numStr := matches[1]
	unit := strings.ToLower(matches[2])

	val, err := strconv.ParseFloat(numStr, 64)
	if err != nil {
		return 0, fmt.Errorf("invalid frequency %q: %w", s, err)
	}
	// The regexp admits no sign, so this is purely defensive.
	if val < 0 {
		return 0, fmt.Errorf("invalid frequency %q: negative value", s)
	}

	var kHz float64
	switch unit {
	case "ghz", "g":
		kHz = val * 1_000_000
	case "mhz", "m":
		kHz = val * 1_000
	case "khz", "k":
		kHz = val
	case "":
		// Bare number: interpret as kHz for backwards compatibility
		// with the existing uint config fields.
		kHz = val
	}

	result, ok := frequencyFromKHz(kHz)
	if !ok {
		return 0, fmt.Errorf("invalid frequency %q: value out of range", s)
	}
	if result == 0 && val > 0 {
		return 0, fmt.Errorf("invalid frequency %q: value too small to represent in kHz", s)
	}

	return result, nil
}

// UnmarshalJSON implements json.Unmarshaler. Accepts both JSON strings
// with units (e.g., "3.2GHz") and plain JSON numbers (interpreted as kHz).
// Negative numbers and numbers too large to be stored below the
// sentinel range are rejected. On error the receiver is left unchanged.
func (f *Frequency) UnmarshalJSON(data []byte) error {
	// Try string first (quoted value with optional unit).
	var s string
	if err := json.Unmarshal(data, &s); err == nil {
		parsed, err := parseFrequency(s)
		if err != nil {
			return err
		}
		*f = parsed
		return nil
	}

	// Try plain number (backwards compatible with uint kHz).
	var n float64
	if err := json.Unmarshal(data, &n); err == nil {
		if n < 0 {
			return fmt.Errorf("invalid frequency: negative value %v", n)
		}
		parsed, ok := frequencyFromKHz(n)
		if !ok {
			return fmt.Errorf("invalid frequency: value %v out of range", n)
		}
		*f = parsed
		return nil
	}

	return fmt.Errorf("invalid frequency: expected string or number, got %s", string(data))
}

// MarshalJSON implements json.Marshaler. Symbolic frequencies are
// marshaled as their string name; numeric values as plain numbers (kHz)
// for backwards compatibility.
func (f Frequency) MarshalJSON() ([]byte, error) {
	switch f {
	case FrequencyMin:
		return json.Marshal("min")
	case FrequencyBase:
		return json.Marshal("base")
	case FrequencyTurbo:
		return json.Marshal("turbo")
	}
	return json.Marshal(uint(f))
}

// KHz returns the frequency value in kHz. For symbolic frequencies
// (min, base, turbo) this returns the sentinel value; use Resolve()
// first to obtain the actual platform frequency.
func (f Frequency) KHz() uint {
	return uint(f)
}

// IsSymbolic returns true if this frequency is a symbolic name
// (min, base, or turbo) that requires runtime resolution.
func (f Frequency) IsSymbolic() bool {
	return f == FrequencyMin || f == FrequencyBase || f == FrequencyTurbo
}

// Resolve converts a symbolic frequency to its concrete kHz value
// using platform frequency information. For non-symbolic frequencies,
// the value is returned unchanged. The parameters are:
//   - minKHz: platform minimum frequency (cpufreq/cpuinfo_min_freq)
//   - baseKHz: CPU base frequency (cpufreq/base_frequency)
//   - turboKHz: maximum turbo frequency (cpufreq/cpuinfo_max_freq)
func (f Frequency) Resolve(minKHz, baseKHz, turboKHz uint) uint {
	switch f {
	case FrequencyMin:
		return minKHz
	case FrequencyBase:
		return baseKHz
	case FrequencyTurbo:
		return turboKHz
	}
	return uint(f)
}

// String returns a human-readable representation.
+func (f Frequency) String() string { + switch f { + case FrequencyMin: + return "min" + case FrequencyBase: + return "base" + case FrequencyTurbo: + return "turbo" + } + kHz := uint(f) + if kHz == 0 { + return "0" + } + if kHz >= 1_000_000 && kHz%1_000_000 == 0 { + return fmt.Sprintf("%dGHz", kHz/1_000_000) + } + if kHz >= 1_000 && kHz%1_000 == 0 { + return fmt.Sprintf("%dMHz", kHz/1_000) + } + return fmt.Sprintf("%dkHz", kHz) +} diff --git a/pkg/apis/config/v1alpha1/resmgr/policy/frequency_test.go b/pkg/apis/config/v1alpha1/resmgr/policy/frequency_test.go new file mode 100644 index 000000000..53d8d1cc4 --- /dev/null +++ b/pkg/apis/config/v1alpha1/resmgr/policy/frequency_test.go @@ -0,0 +1,156 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package policy + +import ( + "encoding/json" + "testing" +) + +func TestFrequencyResolve(t *testing.T) { + const ( + minHz uint = 800000 + baseHz uint = 2400000 + turboHz uint = 3800000 + ) + cases := []struct { + name string + f Frequency + want uint + }{ + {"min sentinel", FrequencyMin, minHz}, + {"base sentinel", FrequencyBase, baseHz}, + {"turbo sentinel", FrequencyTurbo, turboHz}, + {"concrete value passed through", Frequency(1500000), 1500000}, + {"zero stays zero", Frequency(0), 0}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got := tc.f.Resolve(minHz, baseHz, turboHz) + if got != tc.want { + t.Errorf("Resolve = %d, want %d", got, tc.want) + } + }) + } +} + +func TestFrequencyIsSymbolic(t *testing.T) { + if !FrequencyMin.IsSymbolic() { + t.Errorf("FrequencyMin.IsSymbolic = false, want true") + } + if !FrequencyBase.IsSymbolic() { + t.Errorf("FrequencyBase.IsSymbolic = false, want true") + } + if !FrequencyTurbo.IsSymbolic() { + t.Errorf("FrequencyTurbo.IsSymbolic = false, want true") + } + if Frequency(3000000).IsSymbolic() { + t.Errorf("concrete frequency must not be IsSymbolic") + } + if Frequency(0).IsSymbolic() { + t.Errorf("zero must not be IsSymbolic") + } +} + +func TestFrequencyUnmarshalJSON(t *testing.T) { + cases := []struct { + name string + input string + want Frequency + wantErr bool + }{ + {"symbolic min", `"min"`, FrequencyMin, false}, + {"symbolic base", `"base"`, FrequencyBase, false}, + {"symbolic turbo", `"turbo"`, FrequencyTurbo, false}, + {"symbolic uppercase", `"TURBO"`, FrequencyTurbo, false}, + {"GHz fractional", `"3.2GHz"`, Frequency(3200000), false}, + {"GHz short", `"2G"`, Frequency(2000000), false}, + {"MHz", `"2900MHz"`, Frequency(2900000), false}, + {"MHz short", `"2900M"`, Frequency(2900000), false}, + {"kHz explicit", `"2900000kHz"`, Frequency(2900000), false}, + {"kHz short", `"2900000k"`, Frequency(2900000), false}, + {"bare number as kHz", `"2900000"`, Frequency(2900000), false}, + {"json 
number as kHz", `2900000`, Frequency(2900000), false}, + {"empty string", `""`, Frequency(0), false}, + {"invalid unit", `"3GBz"`, 0, true}, + {"negative number", `-1000`, 0, true}, + {"garbage", `"abc"`, 0, true}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + var f Frequency + err := json.Unmarshal([]byte(tc.input), &f) + if tc.wantErr { + if err == nil { + t.Errorf("Unmarshal(%s) = nil err, want error", tc.input) + } + return + } + if err != nil { + t.Fatalf("Unmarshal(%s) unexpected err: %v", tc.input, err) + } + if f != tc.want { + t.Errorf("Unmarshal(%s) = %d, want %d", tc.input, uint(f), uint(tc.want)) + } + }) + } +} + +func TestFrequencyMarshalJSON(t *testing.T) { + cases := []struct { + name string + f Frequency + want string + }{ + {"min", FrequencyMin, `"min"`}, + {"base", FrequencyBase, `"base"`}, + {"turbo", FrequencyTurbo, `"turbo"`}, + {"concrete", Frequency(2900000), `2900000`}, + {"zero", Frequency(0), `0`}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + b, err := json.Marshal(tc.f) + if err != nil { + t.Fatalf("Marshal err: %v", err) + } + if string(b) != tc.want { + t.Errorf("Marshal = %s, want %s", string(b), tc.want) + } + }) + } +} + +func TestFrequencyRoundTrip(t *testing.T) { + cases := []Frequency{ + FrequencyMin, FrequencyBase, FrequencyTurbo, + Frequency(0), Frequency(2900000), Frequency(3800000), + } + for _, f := range cases { + t.Run(f.String(), func(t *testing.T) { + b, err := json.Marshal(f) + if err != nil { + t.Fatalf("Marshal err: %v", err) + } + var got Frequency + if err := json.Unmarshal(b, &got); err != nil { + t.Fatalf("Unmarshal err: %v", err) + } + if got != f { + t.Errorf("round-trip: got %d, want %d", uint(got), uint(f)) + } + }) + } +} diff --git a/pkg/apis/config/v1alpha1/resmgr/policy/zz_generated.deepcopy.go b/pkg/apis/config/v1alpha1/resmgr/policy/zz_generated.deepcopy.go index 3bef85a34..dc92ae349 100644 --- 
a/pkg/apis/config/v1alpha1/resmgr/policy/zz_generated.deepcopy.go +++ b/pkg/apis/config/v1alpha1/resmgr/policy/zz_generated.deepcopy.go @@ -20,6 +20,31 @@ package policy import () +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *CPUClass) DeepCopyInto(out *CPUClass) { + *out = *in + if in.DisabledCstates != nil { + in, out := &in.DisabledCstates, &out.DisabledCstates + *out = make([]string, len(*in)) + copy(*out, *in) + } + if in.PctClosID != nil { + in, out := &in.PctClosID, &out.PctClosID + *out = new(int) + **out = **in + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new CPUClass. +func (in *CPUClass) DeepCopy() *CPUClass { + if in == nil { + return nil + } + out := new(CPUClass) + in.DeepCopyInto(out) + return out +} + // DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. func (in *SchedulingClass) DeepCopyInto(out *SchedulingClass) { *out = *in From 557c06952f4505c6def548477d06d84905efe262 Mon Sep 17 00:00:00 2001 From: Antti Kervinen Date: Fri, 5 Jun 2026 14:25:10 +0300 Subject: [PATCH 05/10] cpuclass: introduce stand-alone CPU class package with PCT support Signed-off-by: Antti Kervinen --- pkg/resmgr/cpuclass/cpuclass.go | 286 +++++ pkg/resmgr/cpuclass/handler_commit_test.go | 347 ++++++ .../cpuclass/internal/cpufreq/cpufreq.go | 338 ++++++ .../cpuclass/internal/cpufreq/platform.go | 71 ++ pkg/resmgr/cpuclass/internal/cpufreq/sysfs.go | 153 +++ .../cpuclass/internal/cpuidle/cpuidle.go | 122 ++ .../cpuclass/internal/cpuidle/overridefs.go | 166 +++ pkg/resmgr/cpuclass/internal/pct/pct.go | 974 +++++++++++++++ pkg/resmgr/cpuclass/internal/pct/pct_sst.go | 142 +++ .../internal/pct/pct_sst_goresctrl.go | 380 ++++++ .../cpuclass/internal/pct/pct_sst_mock.go | 452 +++++++ pkg/resmgr/cpuclass/internal/pct/pct_test.go | 1070 +++++++++++++++++ 
pkg/resmgr/cpuclass/internal/types/types.go | 91 ++ .../internal/uncorefreq/uncorefreq.go | 239 ++++ 14 files changed, 4831 insertions(+) create mode 100644 pkg/resmgr/cpuclass/cpuclass.go create mode 100644 pkg/resmgr/cpuclass/handler_commit_test.go create mode 100644 pkg/resmgr/cpuclass/internal/cpufreq/cpufreq.go create mode 100644 pkg/resmgr/cpuclass/internal/cpufreq/platform.go create mode 100644 pkg/resmgr/cpuclass/internal/cpufreq/sysfs.go create mode 100644 pkg/resmgr/cpuclass/internal/cpuidle/cpuidle.go create mode 100644 pkg/resmgr/cpuclass/internal/cpuidle/overridefs.go create mode 100644 pkg/resmgr/cpuclass/internal/pct/pct.go create mode 100644 pkg/resmgr/cpuclass/internal/pct/pct_sst.go create mode 100644 pkg/resmgr/cpuclass/internal/pct/pct_sst_goresctrl.go create mode 100644 pkg/resmgr/cpuclass/internal/pct/pct_sst_mock.go create mode 100644 pkg/resmgr/cpuclass/internal/pct/pct_test.go create mode 100644 pkg/resmgr/cpuclass/internal/types/types.go create mode 100644 pkg/resmgr/cpuclass/internal/uncorefreq/uncorefreq.go diff --git a/pkg/resmgr/cpuclass/cpuclass.go b/pkg/resmgr/cpuclass/cpuclass.go new file mode 100644 index 000000000..21da445aa --- /dev/null +++ b/pkg/resmgr/cpuclass/cpuclass.go @@ -0,0 +1,286 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package cpuclass is the resource-manager-wide CPU class handler. 
+// It owns the per-CPU frequency, c-state, uncore-frequency and +// Intel Priority Core Turbo state implied by a list of user-facing +// CPU class definitions. +// +// Policies talk to a single *Handler, constructed with New(sys). +// Configure(spec) installs (or replaces) the class set; UseClass +// pins given CPUs to a named class; Commit() flushes deferred +// per-CPU sysfs writes; Hints() returns placement preferences a +// policy can use when picking new CPUs for an allocation. +package cpuclass + +import ( + "fmt" + "sort" + + policyapi "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/policy" + logger "github.com/containers/nri-plugins/pkg/log" + "github.com/containers/nri-plugins/pkg/resmgr/cpuclass/internal/cpufreq" + "github.com/containers/nri-plugins/pkg/resmgr/cpuclass/internal/cpuidle" + "github.com/containers/nri-plugins/pkg/resmgr/cpuclass/internal/pct" + "github.com/containers/nri-plugins/pkg/resmgr/cpuclass/internal/types" + "github.com/containers/nri-plugins/pkg/resmgr/cpuclass/internal/uncorefreq" + "github.com/containers/nri-plugins/pkg/sysfs" + "github.com/containers/nri-plugins/pkg/utils/cpuset" +) + +var log = logger.NewLogger("cpuclass") + +// AllocationIntent describes an upcoming CPU allocation for which +// the caller wants placement preferences. +type AllocationIntent = types.AllocationIntent + +// AllocationHints carries technology-agnostic placement preferences +// returned by Handler.Hints. +type AllocationHints = types.AllocationHints + +// CpuPreference is a named CPU set carrying a single placement +// preference (prefer or avoid). +type CpuPreference = types.CpuPreference + +// ConfigSpec carries cpuclass configuration applied via +// Handler.Configure. Idleness is intentionally absent: the caller +// decides which class name (if any) means "idle" and applies it via +// UseClass. +type ConfigSpec struct { + // Classes is the user-facing list of CPU classes. 
+ Classes []*policyapi.CPUClass + // TurboDomain selects the per-domain turbo arbitration scope. + // Empty resolves to "package". + TurboDomain string + // Allowed bounds every cpuclass operation. CPUs outside this + // set are silently dropped by Configure, UseClass and Hints. + Allowed cpuset.CPUSet +} + +// Handler is the sole cpuclass entry point for policy code. It owns +// construction and configuration of the per-technology allocators +// (cpufreq, pct) and writers (cpufreq, cpuidle, uncorefreq). +type Handler struct { + sys sysfs.System + allowed cpuset.CPUSet + + cpufreq *cpufreq.Allocator + pct *pct.Allocator + + // defs maps synthetic class name -> resolved class definition. + // Populated by SetClassDef calls from the cpufreq allocator. + defs map[string]types.ClassDef + // cpuClass maps cpu id -> synthetic class name. Value "" means + // "explicitly assigned to no class". Absent CPUs are unmanaged. + cpuClass map[int]string + // dirtyCPUs tracks CPUs whose class assignment or whose class + // definition changed since the last Commit(). + dirtyCPUs map[int]bool + + freqWriter *cpufreq.Writer + idleWriter *cpuidle.Writer + uncoreWriter *uncorefreq.Writer +} + +// New constructs a Handler with both internal allocators (cpufreq +// and pct) ready in a "no configuration applied" state. Configure +// must be called before the handler is usable. 
+func New(sys sysfs.System) (*Handler, error) { + h := &Handler{ + sys: sys, + defs: map[string]types.ClassDef{}, + cpuClass: map[int]string{}, + dirtyCPUs: map[int]bool{}, + freqWriter: cpufreq.NewWriter(cpufreq.Hooks{}), + idleWriter: cpuidle.NewWriter(cpuidle.Hooks{}), + uncoreWriter: uncorefreq.NewWriter(uncorefreq.Hooks{}), + } + freq, err := cpufreq.New(sys, h) + if err != nil { + return nil, fmt.Errorf("cpuclass: failed to create cpufreq allocator: %w", err) + } + pctA, err := pct.NewAllocator(sys) + if err != nil { + return nil, fmt.Errorf("cpuclass: failed to create pct allocator: %w", err) + } + h.cpufreq = freq + h.pct = pctA + return h, nil +} + +// PctFreeClassCapacity returns the number of logical CPUs that the +// PCT allocator can still route into the named cpuClass on this +// node, given that 'held' lists CPUs already consumed by some +// balloon belonging to any other cpuClass. Returns 0 if PCT is +// inactive or the class has no PCT plan. +func (h *Handler) PctFreeClassCapacity(className string, held cpuset.CPUSet) int { + if h == nil || h.pct == nil { + return 0 + } + return h.pct.FreeClassCapacity(className, held) +} + +// PctActive reports whether PCT is in effect on this node. +func (h *Handler) PctActive() bool { + return h != nil && h.pct != nil && h.pct.Active() +} + +// Configure (re)applies a configuration spec. Idempotent: may be +// called repeatedly with changed classes, turbo-domain mode, or +// allowed set. 
+func (h *Handler) Configure(spec ConfigSpec) error { + h.allowed = spec.Allowed + h.defs = map[string]types.ClassDef{} + h.cpuClass = map[int]string{} + h.dirtyCPUs = map[int]bool{} + h.freqWriter.Reset() + h.uncoreWriter.Reset() + if err := h.cpufreq.Configure(spec.Classes, spec.TurboDomain, spec.Allowed); err != nil { + return fmt.Errorf("cpuclass: cpufreq configure: %w", err) + } + if name, needs := uncorefreq.RequiresAvailable(h.defs); needs && !h.uncoreWriter.Available() { + return uncorefreq.UnavailableError(name) + } + if err := h.pct.Configure(spec.Classes, spec.Allowed); err != nil { + return fmt.Errorf("cpuclass: pct configure: %w", err) + } + return nil +} + +// SetClassDef records a class definition keyed by its synthetic +// name. If the definition materially changes, every CPU currently +// assigned to that synthetic class is marked dirty. Implements the +// cpufreq.Sink interface. +func (h *Handler) SetClassDef(name string, def types.ClassDef) { + if name == "" { + return + } + prev, had := h.defs[name] + h.defs[name] = def + if had && prev.Equal(def) { + return + } + for cpu, cls := range h.cpuClass { + if cls == name { + h.dirtyCPUs[cpu] = true + } + } +} + +// AssignCPUs updates the (cpu -> synthetic class) map for the given +// CPUs. CPUs whose class changes are added to the dirty set. An +// empty class name means "no class". Implements the cpufreq.Sink +// interface. +func (h *Handler) AssignCPUs(name string, cpus []int) { + for _, cpu := range cpus { + prev, had := h.cpuClass[cpu] + if had && prev == name { + continue + } + h.cpuClass[cpu] = name + h.dirtyCPUs[cpu] = true + } +} + +// Commit flushes pending cpufreq, cpuidle and uncore changes to +// sysfs. Per-property writes are deduplicated against the writers' +// lastWritten caches. 
+func (h *Handler) Commit() error { + if h == nil || len(h.dirtyCPUs) == 0 { + return nil + } + perClass := map[string][]int{} + for cpu := range h.dirtyCPUs { + name, ok := h.cpuClass[cpu] + if !ok || name == "" { + continue + } + perClass[name] = append(perClass[name], cpu) + } + var firstErr error + for name, cpus := range perClass { + sort.Ints(cpus) + def, ok := h.defs[name] + if !ok { + log.Debugf("cpuclass: Commit: no definition for class %q; skipping cpus %v", name, cpus) + continue + } + if err := h.freqWriter.Enforce(name, def, cpus); err != nil && firstErr == nil { + firstErr = err + } + if err := h.idleWriter.Enforce(name, def.DisabledCstates, cpus); err != nil && firstErr == nil { + firstErr = err + } + } + dirtyDies := uncorefreq.DiesForCpus(h.sys, h.dirtyCPUs) + if err := h.uncoreWriter.Enforce(h.sys, h.defs, h.cpuClass, dirtyDies); err != nil && firstErr == nil { + firstErr = err + } + h.dirtyCPUs = map[int]bool{} + return firstErr +} + +// UseClass applies className to the given CPUs across every internal +// allocator. An empty className means "no class". CPUs outside the +// configured Allowed set are silently dropped. +func (h *Handler) UseClass(className string, cpus cpuset.CPUSet) error { + if err := h.cpufreq.UseClass(className, cpus); err != nil { + log.Warnf("cpuclass: cpufreq failed to apply class %q on CPUs %s: %v", className, cpus, err) + } + if err := h.pct.UseClass(className, cpus); err != nil { + log.Warnf("cpuclass: pct failed to apply class %q on CPUs %s: %v", className, cpus, err) + } + return nil +} + +// Hints returns technology-agnostic placement preferences for an +// upcoming CPU allocation. The returned CpuPreference sets are +// always subsets of the configured Allowed set. 
+func (h *Handler) Hints(intent AllocationIntent) AllocationHints { + hints := h.pct.Hints(intent) + if h.allowed.Size() > 0 { + hints = intersectHints(hints, h.allowed) + } + return hints +} + +// Shutdown releases any platform-level resources owned by the +// handler. Safe to call multiple times. +func (h *Handler) Shutdown() error { + if h == nil || h.pct == nil { + return nil + } + return h.pct.Shutdown() +} + +// intersectHints returns a copy of hints with every CpuPreference +// constrained to the given bound. Empty preferences are dropped. +func intersectHints(hints AllocationHints, bound cpuset.CPUSet) AllocationHints { + out := AllocationHints{} + for _, p := range hints.Prefer { + s := p.Cpus.Intersection(bound) + if s.IsEmpty() { + continue + } + out.Prefer = append(out.Prefer, CpuPreference{Name: p.Name, Cpus: s}) + } + for _, p := range hints.Avoid { + s := p.Cpus.Intersection(bound) + if s.IsEmpty() { + continue + } + out.Avoid = append(out.Avoid, CpuPreference{Name: p.Name, Cpus: s}) + } + return out +} diff --git a/pkg/resmgr/cpuclass/handler_commit_test.go b/pkg/resmgr/cpuclass/handler_commit_test.go new file mode 100644 index 000000000..b2b4d37d7 --- /dev/null +++ b/pkg/resmgr/cpuclass/handler_commit_test.go @@ -0,0 +1,347 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +package cpuclass + +import ( + "sync" + "testing" + + idset "github.com/intel/goresctrl/pkg/utils" + + "github.com/containers/nri-plugins/pkg/resmgr/cpuclass/internal/cpufreq" + "github.com/containers/nri-plugins/pkg/resmgr/cpuclass/internal/cpuidle" + "github.com/containers/nri-plugins/pkg/resmgr/cpuclass/internal/types" + "github.com/containers/nri-plugins/pkg/resmgr/cpuclass/internal/uncorefreq" + "github.com/containers/nri-plugins/pkg/sysfs" + "github.com/containers/nri-plugins/pkg/utils/cpuset" +) + +// dieFakePackage extends the package fake with die support so the +// uncore writer can enumerate (pkg, die) tuples. +type dieFakePackage struct { + sysfs.CPUPackage + id idset.ID + cpus cpuset.CPUSet + dies []idset.ID + dieCpus map[idset.ID]cpuset.CPUSet +} + +func (p *dieFakePackage) ID() idset.ID { return p.id } +func (p *dieFakePackage) CPUSet() cpuset.CPUSet { return p.cpus } +func (p *dieFakePackage) DieIDs() []idset.ID { return p.dies } +func (p *dieFakePackage) DieCPUSet(d idset.ID) cpuset.CPUSet { return p.dieCpus[d] } + +// dieFakeCPU augments the cpu fake with package id. +type dieFakeCPU struct { + sysfs.CPU + id idset.ID + pkg idset.ID +} + +func (c *dieFakeCPU) ID() idset.ID { return c.id } +func (c *dieFakeCPU) PackageID() idset.ID { return c.pkg } + +// dieFakeSys is the minimum sysfs.System surface used by the +// uncore writer (Package, CPU, PackageIDs, DieIDs, DieCPUSet). +// Unimplemented methods panic via the embedded nil interface. 
+type dieFakeSys struct { + sysfs.System + packages map[idset.ID]*dieFakePackage + cpuPkg map[int]idset.ID +} + +func (s *dieFakeSys) PackageIDs() []idset.ID { + ids := make([]idset.ID, 0, len(s.packages)) + for id := range s.packages { + ids = append(ids, id) + } + return ids +} + +func (s *dieFakeSys) Package(id idset.ID) sysfs.CPUPackage { + if p, ok := s.packages[id]; ok { + return p + } + return nil +} + +func (s *dieFakeSys) CPU(id idset.ID) sysfs.CPU { + pkg, ok := s.cpuPkg[int(id)] + if !ok { + return nil + } + return &dieFakeCPU{id: id, pkg: pkg} +} + +// dieFakeCpu specifies the (pkg, die) location of a single CPU when +// building a dieFakeSys. +type dieFakeCpu struct { + pkg int + die int +} + +// newDieFakeSys builds a dieFakeSys from a map cpu -> (pkg, die). +func newDieFakeSys(cpus map[int]dieFakeCpu) *dieFakeSys { + pkgs := map[idset.ID]*dieFakePackage{} + cpuPkg := map[int]idset.ID{} + type pkgDieKey struct{ pkg, die int } + dieCpus := map[pkgDieKey]cpuset.CPUSet{} + pkgCpus := map[int]cpuset.CPUSet{} + pkgDies := map[int]map[int]bool{} + for cpu, loc := range cpus { + cpuPkg[cpu] = idset.ID(loc.pkg) + pkgCpus[loc.pkg] = pkgCpus[loc.pkg].Union(cpuset.New(cpu)) + k := pkgDieKey(loc) + dieCpus[k] = dieCpus[k].Union(cpuset.New(cpu)) + if pkgDies[loc.pkg] == nil { + pkgDies[loc.pkg] = map[int]bool{} + } + pkgDies[loc.pkg][loc.die] = true + } + for pkg, dies := range pkgDies { + dList := make([]idset.ID, 0, len(dies)) + for d := range dies { + dList = append(dList, idset.ID(d)) + } + dc := map[idset.ID]cpuset.CPUSet{} + for d := range dies { + dc[idset.ID(d)] = dieCpus[pkgDieKey{pkg, d}] + } + pkgs[idset.ID(pkg)] = &dieFakePackage{ + id: idset.ID(pkg), + cpus: pkgCpus[pkg], + dies: dList, + dieCpus: dc, + } + } + return &dieFakeSys{packages: pkgs, cpuPkg: cpuPkg} +} + +// recordingWriters captures the per-CPU and per-die writes issued by +// Commit() so tests can assert exactly what was programmed. 
+type recordingWriters struct { + mu sync.Mutex + minF map[int]int + maxF map[int]int + gov map[int]string + minU map[uncorefreq.DieKey]int + maxU map[uncorefreq.DieKey]int + minCnt int + maxCnt int + govCnt int + uMinCnt int + uMaxCnt int +} + +func newRecordingWriters() *recordingWriters { + return &recordingWriters{ + minF: map[int]int{}, + maxF: map[int]int{}, + gov: map[int]string{}, + minU: map[uncorefreq.DieKey]int{}, + maxU: map[uncorefreq.DieKey]int{}, + } +} + +// installOn replaces the cpufreq and uncore writers of h with +// in-memory recorders. The cpuidle writer is replaced by a no-op so +// tests do not need a real cstates handle. +func (r *recordingWriters) installOn(h *Handler) { + h.freqWriter = cpufreq.NewWriter(cpufreq.Hooks{ + SetMin: func(cpu, freq int) error { + r.mu.Lock() + defer r.mu.Unlock() + r.minF[cpu] = freq + r.minCnt++ + return nil + }, + SetMax: func(cpu, freq int) error { + r.mu.Lock() + defer r.mu.Unlock() + r.maxF[cpu] = freq + r.maxCnt++ + return nil + }, + SetGov: func(cpu int, g string) error { + r.mu.Lock() + defer r.mu.Unlock() + r.gov[cpu] = g + r.govCnt++ + return nil + }, + }) + h.uncoreWriter = uncorefreq.NewWriter(uncorefreq.Hooks{ + SetMin: func(pkg, die, freq int) error { + r.mu.Lock() + defer r.mu.Unlock() + r.minU[uncorefreq.DieKey{Pkg: pkg, Die: die}] = freq + r.uMinCnt++ + return nil + }, + SetMax: func(pkg, die, freq int) error { + r.mu.Lock() + defer r.mu.Unlock() + r.maxU[uncorefreq.DieKey{Pkg: pkg, Die: die}] = freq + r.uMaxCnt++ + return nil + }, + }) + h.idleWriter = cpuidle.NewWriter(cpuidle.Hooks{}) +} + +// newBareHandler returns a Handler with empty state, no sysfs +// topology (callers may set h.sys), and the recording writers +// installed. The cpuidle writer is left in a state where Enforce +// will return early because no class has DisabledCstates. 
+func newBareHandler() (*Handler, *recordingWriters) { + h := &Handler{ + defs: map[string]types.ClassDef{}, + cpuClass: map[int]string{}, + dirtyCPUs: map[int]bool{}, + } + r := newRecordingWriters() + r.installOn(h) + return h, r +} + +// TestCommitIdempotentCpufreq verifies that a second Commit() with +// no state change re-issues zero sysfs writes. +func TestCommitIdempotentCpufreq(t *testing.T) { + h, r := newBareHandler() + h.SetClassDef("hp@d0", types.ClassDef{MinFreq: 800_000, MaxFreq: 4_600_000, FreqGovernor: "performance"}) + h.AssignCPUs("hp@d0", []int{0, 1}) + if err := h.Commit(); err != nil { + t.Fatalf("first Commit: %v", err) + } + if r.minCnt != 2 || r.maxCnt != 2 || r.govCnt != 2 { + t.Fatalf("expected 2 of each write, got min=%d max=%d gov=%d", r.minCnt, r.maxCnt, r.govCnt) + } + if err := h.Commit(); err != nil { + t.Fatalf("second Commit: %v", err) + } + if r.minCnt != 2 || r.maxCnt != 2 || r.govCnt != 2 { + t.Fatalf("second Commit should be no-op, got min=%d max=%d gov=%d", r.minCnt, r.maxCnt, r.govCnt) + } +} + +// TestClassDefChangeDirtiesAssignedCpus verifies that updating a +// class definition reprograms the CPUs already assigned to that +// class on the next Commit, without requiring a re-assign. +func TestClassDefChangeDirtiesAssignedCpus(t *testing.T) { + h, r := newBareHandler() + h.SetClassDef("hp@d0", types.ClassDef{MinFreq: 800_000, MaxFreq: 4_000_000}) + h.AssignCPUs("hp@d0", []int{0, 1}) + if err := h.Commit(); err != nil { + t.Fatalf("Commit#1: %v", err) + } + h.SetClassDef("hp@d0", types.ClassDef{MinFreq: 800_000, MaxFreq: 4_600_000}) + if err := h.Commit(); err != nil { + t.Fatalf("Commit#2: %v", err) + } + for _, cpu := range []int{0, 1} { + if r.maxF[cpu] != 4_600_000 { + t.Errorf("cpu%d max=%d, want 4_600_000", cpu, r.maxF[cpu]) + } + } +} + +// TestAssignToEmptyClassDoesNotWriteCpufreq verifies that moving a +// CPU to the empty class leaves the writers untouched. 
+func TestAssignToEmptyClassDoesNotWriteCpufreq(t *testing.T) { + h, r := newBareHandler() + h.SetClassDef("hp@d0", types.ClassDef{MinFreq: 800_000, MaxFreq: 4_000_000, FreqGovernor: "performance"}) + h.AssignCPUs("hp@d0", []int{0}) + if err := h.Commit(); err != nil { + t.Fatalf("Commit#1: %v", err) + } + r.maxCnt, r.minCnt, r.govCnt = 0, 0, 0 + h.AssignCPUs("", []int{0}) + if err := h.Commit(); err != nil { + t.Fatalf("Commit#2: %v", err) + } + if r.minCnt+r.maxCnt+r.govCnt != 0 { + t.Errorf("empty class should not write to cpufreq, got min=%d max=%d gov=%d", r.minCnt, r.maxCnt, r.govCnt) + } +} + +// TestUncoreSkipBothZero verifies that a die with effective min=0 +// and max=0 produces no uncore writes. +func TestUncoreSkipBothZero(t *testing.T) { + sys := newDieFakeSys(map[int]dieFakeCpu{ + 0: {pkg: 0, die: 0}, + 1: {pkg: 0, die: 0}, + }) + h, r := newBareHandler() + h.sys = sys + h.SetClassDef("idle@d0", types.ClassDef{MinFreq: 800_000}) + h.AssignCPUs("idle@d0", []int{0, 1}) + if err := h.Commit(); err != nil { + t.Fatalf("Commit: %v", err) + } + if r.uMinCnt != 0 || r.uMaxCnt != 0 { + t.Errorf("uncore should not be written when both limits are 0, got min=%d max=%d", r.uMinCnt, r.uMaxCnt) + } +} + +// TestUncoreMaxWinsAcrossClasses verifies the per-die max-wins +// reduction when multiple classes are active on the same die. 
+func TestUncoreMaxWinsAcrossClasses(t *testing.T) { + sys := newDieFakeSys(map[int]dieFakeCpu{ + 0: {pkg: 0, die: 0}, + 1: {pkg: 0, die: 0}, + }) + h, r := newBareHandler() + h.sys = sys + h.SetClassDef("lo@d0", types.ClassDef{UncoreMinFreq: 800_000, UncoreMaxFreq: 1_500_000}) + h.SetClassDef("hi@d0", types.ClassDef{UncoreMinFreq: 1_200_000, UncoreMaxFreq: 2_400_000}) + h.AssignCPUs("lo@d0", []int{0}) + h.AssignCPUs("hi@d0", []int{1}) + if err := h.Commit(); err != nil { + t.Fatalf("Commit: %v", err) + } + key := uncorefreq.DieKey{Pkg: 0, Die: 0} + if got := r.maxU[key]; got != 2_400_000 { + t.Errorf("uncore max = %d, want 2_400_000 (hi class wins)", got) + } + if got := r.minU[key]; got != 1_200_000 { + t.Errorf("uncore min = %d, want 1_200_000 (hi class wins)", got) + } +} + +// TestUncoreRecomputesOnAssignmentChange verifies that removing the +// winner class from a die triggers a fresh write with the loser's +// (lower) values. +func TestUncoreRecomputesOnAssignmentChange(t *testing.T) { + sys := newDieFakeSys(map[int]dieFakeCpu{ + 0: {pkg: 0, die: 0}, + 1: {pkg: 0, die: 0}, + }) + h, r := newBareHandler() + h.sys = sys + h.SetClassDef("lo@d0", types.ClassDef{UncoreMaxFreq: 1_500_000}) + h.SetClassDef("hi@d0", types.ClassDef{UncoreMaxFreq: 2_400_000}) + h.AssignCPUs("lo@d0", []int{0}) + h.AssignCPUs("hi@d0", []int{1}) + if err := h.Commit(); err != nil { + t.Fatalf("Commit#1: %v", err) + } + h.AssignCPUs("lo@d0", []int{1}) + if err := h.Commit(); err != nil { + t.Fatalf("Commit#2: %v", err) + } + if got := r.maxU[uncorefreq.DieKey{Pkg: 0, Die: 0}]; got != 1_500_000 { + t.Errorf("uncore max after hi removed = %d, want 1_500_000", got) + } +} diff --git a/pkg/resmgr/cpuclass/internal/cpufreq/cpufreq.go b/pkg/resmgr/cpuclass/internal/cpufreq/cpufreq.go new file mode 100644 index 000000000..aee4fbe82 --- /dev/null +++ b/pkg/resmgr/cpuclass/internal/cpufreq/cpufreq.go @@ -0,0 +1,338 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. 
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package cpufreq owns the cpufreq-side CPU-class lifecycle:
+// resolution of symbolic frequencies (min/base/turbo), turbo-priority
+// winner selection per turbo domain, and publishing the resulting
+// class definitions and per-CPU assignments to the cpuclass handler.
+// The package exposes no behavior to user-facing code.
+package cpufreq
+
+import (
+	"fmt"
+	"sort"
+
+	policyapi "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/policy"
+	logger "github.com/containers/nri-plugins/pkg/log"
+	"github.com/containers/nri-plugins/pkg/resmgr/cpuclass/internal/types"
+	"github.com/containers/nri-plugins/pkg/sysfs"
+	"github.com/containers/nri-plugins/pkg/utils/cpuset"
+)
+
+var log = logger.NewLogger("cpuclass")
+
+// Sink is the back-channel through which the allocator publishes
+// resolved class definitions and per-CPU class assignments to its
+// owner (the cpuclass handler). The handler turns these into per-CPU
+// dirty bits and sysfs writes performed by its Commit().
+type Sink interface {
+	SetClassDef(name string, def types.ClassDef) // publish the resolved definition stored under name
+	AssignCPUs(name string, cpus []int)          // assign cpus to name; name is "" when UseClass got an empty class
+}
+
+// Allocator owns the per-turbo-domain class state for cpufreq.
+type Allocator struct {
+	sys         sysfs.System                   // topology source: CPU ids, package ids, frequency ranges
+	sink        Sink                           // receives class definitions and per-CPU assignments
+	classes     []*policyapi.CPUClass          // configured classes, in Configure() order
+	classByName map[string]*policyapi.CPUClass // index of classes by Name
+	turboDomain string                         // "", "package" or "system"; "" is treated as "package"
+	turboInfo   *platformTurboInfo             // nil when platform discovery failed
+	allowed     cpuset.CPUSet                  // empty set means "no restriction"
+
+	cpuDomain map[int]domainID // allowed CPU id -> turbo domain, built by buildCpuDomains
+	domains   []domainID       // sorted turbo domains that contain at least one allowed CPU
+
+	// activeCpus[d][className] is the set of CPUs in turbo domain d
+	// currently assigned to className.
+	activeCpus map[domainID]map[string]cpuset.CPUSet
+
+	// winnerPrio[d] is the highest TurboPriority among classes that
+	// had any active CPUs in domain d the last time
+	// recalculateTurbo(d) ran. A missing or -1 entry forces recalculation.
+	winnerPrio map[domainID]int
+}
+
+// domainID identifies one turbo arbitration domain.
+type domainID int
+
+const systemDomainID domainID = 0 // the only domain in "system" mode; also used for CPUs with no known domain
+
+const (
+	turboDomainPackage = "package"
+	turboDomainSystem  = "system"
+)
+
+// New returns an Allocator that publishes class definitions and
+// per-CPU assignments to sink. The constructor does not push any
+// class definitions; the caller follows up with Configure().
+func New(sys sysfs.System, sink Sink) (*Allocator, error) {
+	if sys == nil {
+		return nil, fmt.Errorf("cpufreq: missing required argument sys")
+	}
+	if sink == nil {
+		return nil, fmt.Errorf("cpufreq: missing required argument sink")
+	}
+	a := &Allocator{
+		sys:        sys,
+		sink:       sink,
+		activeCpus: map[domainID]map[string]cpuset.CPUSet{},
+		winnerPrio: map[domainID]int{},
+	}
+	a.discoverPlatformInfo() // best effort: a failure is logged and leaves turboInfo nil
+	return a, nil
+}
+
+// Configure replaces the CPU class set, turbo domain mode and the
+// set of allowed CPUs. Resets per-domain turbo winners and
+// re-publishes class definitions to the sink.
+func (a *Allocator) Configure(classes []*policyapi.CPUClass, turboDomain string, allowed cpuset.CPUSet) error {
+	switch turboDomain { // validate first: a rejected value must leave the previous configuration intact
+	case "", turboDomainPackage, turboDomainSystem:
+	default:
+		return fmt.Errorf("cpufreq: unsupported turboDomain %q (expected %q or %q)",
+			turboDomain, turboDomainPackage, turboDomainSystem)
+	}
+	a.classes = classes
+	a.classByName = make(map[string]*policyapi.CPUClass, len(classes))
+	for _, cc := range classes {
+		a.classByName[cc.Name] = cc
+	}
+	a.turboDomain = turboDomain
+	a.allowed = allowed
+	a.activeCpus = map[domainID]map[string]cpuset.CPUSet{}
+	a.winnerPrio = map[domainID]int{} // reset before buildCpuDomains so its -1 sentinels are kept
+	a.buildCpuDomains()
+	a.pushInitialClassDefinitions()
+	return nil
+}
+
+// IsKnownClass reports whether the given class name is known to the
+// allocator's CPUClasses configuration.
+func (a *Allocator) IsKnownClass(name string) bool {
+	_, ok := a.classByName[name]
+	return ok
+}
+
+// resolveClassName logs an error for unknown names and returns the
+// name unchanged so the caller sees what was requested.
+func (a *Allocator) resolveClassName(name string) string {
+	if name == "" { // the empty name means "no class" and is never reported as unknown
+		return ""
+	}
+	if a.IsKnownClass(name) {
+		return name
+	}
+	log.Errorf("unknown CPU class %q", name)
+	return name
+}
+
+// UseClass marks the given CPUs as active under className,
+// recalculates the turbo winner of every affected turbo domain, then
+// publishes per-CPU assignments to the sink. CPUs outside the
+// configured Allowed set are silently dropped.
+func (a *Allocator) UseClass(className string, cpus cpuset.CPUSet) error {
+	if a.allowed.Size() > 0 { // an empty allowed set means "no restriction"
+		cpus = cpus.Intersection(a.allowed)
+	}
+	if cpus.IsEmpty() {
+		return nil
+	}
+	className = a.resolveClassName(className)
+	a.removeCpusFromAllClasses(cpus) // a CPU is active in at most one class
+	byDomain := a.cpusByDomain(cpus)
+	if className != "" {
+		for d, dc := range byDomain {
+			if a.activeCpus[d] == nil {
+				a.activeCpus[d] = map[string]cpuset.CPUSet{}
+			}
+			a.activeCpus[d][className] = a.activeCpus[d][className].Union(dc)
+		}
+	}
+	for d := range byDomain { // refresh class definitions before publishing assignments
+		a.recalculateTurbo(d)
+	}
+	for d, dc := range byDomain {
+		syn := a.syntheticName(className, d)
+		a.sink.AssignCPUs(syn, dc.UnsortedList())
+	}
+	return nil
+}
+
+// removeCpusFromAllClasses removes the given CPUs from every active
+// class set, in every turbo domain.
+func (a *Allocator) removeCpusFromAllClasses(cpus cpuset.CPUSet) {
+	for d, perClass := range a.activeCpus {
+		for name, set := range perClass {
+			newSet := set.Difference(cpus)
+			if newSet.IsEmpty() {
+				delete(perClass, name) // drop classes left without CPUs
+			} else {
+				perClass[name] = newSet
+			}
+		}
+		if len(perClass) == 0 {
+			delete(a.activeCpus, d) // drop domains left without classes
+		}
+	}
+}
+
+func (a *Allocator) cpusByDomain(cpus cpuset.CPUSet) map[domainID]cpuset.CPUSet { // groups cpus by turbo domain
+	out := map[domainID]cpuset.CPUSet{}
+	for _, cpu := range cpus.UnsortedList() {
+		d, ok := a.cpuDomain[cpu]
+		if !ok {
+			d = systemDomainID // CPUs with no recorded domain are grouped under the system domain
+		}
+		out[d] = out[d].Union(cpuset.New(cpu))
+	}
+	return out
+}
+
+func (a *Allocator) buildCpuDomains() { // rebuilds cpuDomain and domains from sys, turboDomain and allowed
+	a.cpuDomain = map[int]domainID{}
+	seen := map[domainID]bool{}
+	mode := a.turboDomain
+	if mode == "" {
+		mode = turboDomainPackage // default mode
+	}
+	for _, cpuID := range a.sys.CPUIDs() {
+		if a.allowed.Size() > 0 && !a.allowed.Contains(int(cpuID)) {
+			continue
+		}
+		c := a.sys.CPU(cpuID)
+		if c == nil {
+			continue
+		}
+		var d domainID
+		switch mode {
+		case turboDomainSystem:
+			d = systemDomainID
+		default:
+			d = domainID(c.PackageID()) // package mode: one domain per package id
+		}
+		a.cpuDomain[int(cpuID)] = d
+		seen[d] = true
+	}
+	a.domains = a.domains[:0]
+	for d := range seen {
+		a.domains = append(a.domains, d)
+	}
+	sort.Slice(a.domains, func(i, j int) bool { return a.domains[i] < a.domains[j] })
+	for _, d := range a.domains {
+		a.winnerPrio[d] = -1 // sentinel: no winner computed yet for this domain
+	}
+	log.Debugf("turbo domains (mode=%s): %v (cpu->domain: %v)", mode, a.domains, a.cpuDomain)
+}
+
+// syntheticName returns the per-domain internal name used to track a
+// user-facing class in a turbo domain. Empty and unknown class names
+// pass through unchanged.
+func (a *Allocator) syntheticName(name string, d domainID) string {
+	if name == "" {
+		return ""
+	}
+	if _, ok := a.classByName[name]; !ok {
+		return name
+	}
+	return fmt.Sprintf("%s@d%d", name, d)
+}
+
+// pushInitialClassDefinitions resolves symbolic frequencies in every
+// CPUClass and publishes the resulting types.ClassDef to the sink,
+// once per (class, turbo domain) pair.
+func (a *Allocator) pushInitialClassDefinitions() {
+	if len(a.domains) == 0 {
+		return
+	}
+	for _, cc := range a.classes {
+		def := classDefFromCPUClass(cc, a.turboInfo, 0) // 0: no turbo override yet
+		for _, d := range a.domains {
+			a.sink.SetClassDef(a.syntheticName(cc.Name, d), def)
+		}
+		log.Infof("cpuClass %q configured: minFreq=%s(%d) maxFreq=%s(%d) disabledCstates=%v",
+			cc.Name, cc.MinFreq, def.MinFreq, cc.MaxFreq, def.MaxFreq, cc.DisabledCstates)
+	}
+}
+
+// recalculateTurbo resolves exclusive turbo frequency access in the
+// given turbo domain based on TurboPriority across all CPU classes
+// that currently have active CPUs in that domain. See the in-tree
+// design notes for the algorithm.
+func (a *Allocator) recalculateTurbo(d domainID) {
+	if len(a.classes) == 0 {
+		return
+	}
+	// Highest TurboPriority among classes with active CPUs in d; stays 0
+	// when the domain has none (indexing a missing map entry is safe).
+	top := 0
+	active := a.activeCpus[d]
+	for _, class := range a.classes {
+		cpus, found := active[class.Name]
+		if found && !cpus.IsEmpty() && class.TurboPriority > top {
+			top = class.TurboPriority
+		}
+	}
+	if last, known := a.winnerPrio[d]; known && last == top {
+		return
+	}
+	a.winnerPrio[d] = top
+	info := a.turboInfo
+	if info == nil {
+		log.Warnf("turbo recalculation skipped (domain %d): no platform turbo info", d)
+		return
+	}
+	for _, class := range a.classes {
+		// With no prioritized class active (top == 0) every class keeps turbo.
+		wins := top == 0 || class.TurboPriority >= top
+		turboKHz := info.baseFreqKHz
+		if wins {
+			turboKHz = info.maxTurboFreqKHz
+		}
+		def := classDefFromCPUClass(class, info, turboKHz)
+		a.sink.SetClassDef(a.syntheticName(class.Name, d), def)
+		log.Infof("turbo: domain=%d class %q (prio=%d, winner=%v): minFreq=%d maxFreq=%d",
+			d, class.Name, class.TurboPriority, wins, def.MinFreq, def.MaxFreq)
+	}
+}
+
+// classDefFromCPUClass converts a user-facing CPUClass to a
+// resolved ClassDef. When info is nil, symbolic frequencies resolve
+// to 0; when info is non-nil they resolve to the corresponding
+// platform value (with effectiveTurboKHz overriding the turbo
+// sentinel if non-zero).
+func classDefFromCPUClass(cc *policyapi.CPUClass, info *platformTurboInfo, effectiveTurboKHz uint) types.ClassDef {
+	resolve := func(f policyapi.Frequency) uint {
+		if info != nil {
+			turboKHz := info.maxTurboFreqKHz
+			if effectiveTurboKHz > 0 {
+				turboKHz = effectiveTurboKHz
+			}
+			return f.Resolve(info.minFreqKHz, info.baseFreqKHz, turboKHz)
+		}
+		if f.IsSymbolic() { // no platform info: a symbolic value cannot be resolved
+			return 0
+		}
+		return f.KHz()
+	}
+	return types.ClassDef{
+		MinFreq:                     resolve(cc.MinFreq),
+		MaxFreq:                     resolve(cc.MaxFreq),
+		EnergyPerformancePreference: cc.EnergyPerformancePreference,
+		UncoreMinFreq:               resolve(cc.UncoreMinFreq), // NOTE(review): resolved against the CPU frequency range in info, not an uncore range; confirm intended
+		UncoreMaxFreq:               resolve(cc.UncoreMaxFreq),
+		FreqGovernor:                cc.FreqGovernor,
+		DisabledCstates:             cc.DisabledCstates,
+	}
+}
diff --git a/pkg/resmgr/cpuclass/internal/cpufreq/platform.go b/pkg/resmgr/cpuclass/internal/cpufreq/platform.go
new file mode 100644
index 000000000..18ce2e177
--- /dev/null
+++ b/pkg/resmgr/cpuclass/internal/cpufreq/platform.go
@@ -0,0 +1,71 @@
+// Copyright The NRI Plugins Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cpufreq
+
+import (
+	"fmt"
+
+	"github.com/containers/nri-plugins/pkg/sysfs"
+)
+
+// platformTurboInfo holds platform-level turbo frequency capabilities
+// discovered from sysfs.
+type platformTurboInfo struct {
+	baseFreqKHz     uint // falls back to the range maximum when the base frequency is unknown
+	maxTurboFreqKHz uint // upper end of the sampled CPU's frequency range
+	minFreqKHz      uint // lower end of the sampled CPU's frequency range
+}
+
+// discoverPlatformInfo populates a.turboInfo from sysfs. Failure is
+// non-fatal: symbolic frequencies then resolve to 0.
+func (a *Allocator) discoverPlatformInfo() {
+	info, err := discoverTurboInfo(a.sys)
+	if err != nil {
+		log.Warnf("cpufreq: cannot discover platform turbo info: %v", err)
+		return
+	}
+	a.turboInfo = info
+}
+
+// discoverTurboInfo reads platform turbo capabilities from sysfs. It uses
+// the first online CPU that reports a frequency range as representative.
+func discoverTurboInfo(sys sysfs.System) (*platformTurboInfo, error) {
+	cpuIDs := sys.CPUIDs()
+	if len(cpuIDs) == 0 {
+		return nil, fmt.Errorf("no CPUs found in system topology")
+	}
+	for _, id := range cpuIDs {
+		cpu := sys.CPU(id)
+		if cpu == nil || !cpu.Online() {
+			continue
+		}
+		freq := cpu.FrequencyRange()
+		baseFreq := cpu.BaseFrequency()
+		if freq.Min == 0 && freq.Max == 0 { // no usable range on this CPU: try the next one
+			log.Warnf("cannot detect cpu%d frequency range, skipping platform turbo info", id)
+			continue
+		}
+		if baseFreq == 0 {
+			log.Warnf("cannot detect cpu%d base frequency, default to max", id)
+			baseFreq = freq.Max
+		}
+		return &platformTurboInfo{
+			baseFreqKHz:     uint(baseFreq),
+			maxTurboFreqKHz: uint(freq.Max),
+			minFreqKHz:      uint(freq.Min),
+		}, nil
+	}
+	return nil, fmt.Errorf("no online CPU with valid frequency information found")
+}
diff --git a/pkg/resmgr/cpuclass/internal/cpufreq/sysfs.go b/pkg/resmgr/cpuclass/internal/cpufreq/sysfs.go
new file mode 100644
index 000000000..cf91491be
--- /dev/null
+++ b/pkg/resmgr/cpuclass/internal/cpufreq/sysfs.go
@@ -0,0 +1,153 @@
+// Copyright The NRI Plugins Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cpufreq
+
+import (
+	"github.com/intel/goresctrl/pkg/utils"
+
+	"github.com/containers/nri-plugins/pkg/resmgr/cpuclass/internal/types"
+)
+
+// Hooks lets tests intercept per-CPU writes without touching real
+// sysfs. Production use leaves all hooks nil; the writer then talks
+// to the platform via goresctrl.
+type Hooks struct {
+	SetMin func(cpu, kHz int) error             // replaces the scaling min frequency write
+	SetMax func(cpu, kHz int) error             // replaces the scaling max frequency write
+	SetGov func(cpu int, governor string) error // replaces the scaling governor write
+}
+
+// cpufreqWritten records the last successfully written values on a
+// single CPU. Used for write deduplication.
+type cpufreqWritten struct {
+	min      uint
+	max      uint
+	governor string
+	hasMin   bool // min holds a recorded value
+	hasMax   bool // max holds a recorded value
+	hasGov   bool // governor holds a recorded value
+}
+
+// Writer is the direct per-CPU cpufreq sysfs writer. Properties are
+// written only when the desired value differs from the last
+// successfully written one. Failures on individual CPUs/properties
+// are logged but do not stop processing of the remaining ones; the
+// first error encountered is returned.
+type Writer struct {
+	hooks       Hooks
+	lastWritten map[int]cpufreqWritten // keyed by CPU id
+}
+
+// NewWriter returns a Writer wired to the given hooks. Pass a
+// zero-valued Hooks to use real sysfs via goresctrl.
+func NewWriter(hooks Hooks) *Writer {
+	return &Writer{
+		hooks:       hooks,
+		lastWritten: make(map[int]cpufreqWritten),
+	}
+}
+
+// Reset clears the per-CPU lastWritten cache so the next Enforce
+// pass re-writes every desired value. Called by the handler when
+// class definitions or the allowed set change.
+func (w *Writer) Reset() {
+	w.lastWritten = make(map[int]cpufreqWritten)
+}
+
+// Forget drops the lastWritten cache entries for the given CPUs.
+func (w *Writer) Forget(cpus ...int) {
+	for _, c := range cpus {
+		delete(w.lastWritten, c)
+	}
+}
+
+// Enforce writes min/max/governor to sysfs for every CPU in cpus,
+// skipping properties whose desired value matches the last successfully
+// written one; a failed write is not cached, so it is retried next time.
+// Zero min/max or an empty governor means "don't enforce". Returns the first error.
+func (w *Writer) Enforce(class string, def types.ClassDef, cpus []int) error {
+	if len(cpus) == 0 {
+		return nil
+	}
+	minKHz := def.MinFreq
+	maxKHz := def.MaxFreq
+	governor := def.FreqGovernor
+
+	var firstErr error
+	for _, cpu := range cpus {
+		state := w.lastWritten[cpu]
+
+		if minKHz > 0 && (!state.hasMin || state.min != minKHz) {
+			log.Debugf("enforcing cpu frequency min %d from class %q on cpu %d", minKHz, class, cpu)
+			if err := w.callSetMin(cpu, int(minKHz)); err != nil {
+				log.Errorf("cpufreq: cpu%d: cannot set min=%d: %v", cpu, minKHz, err)
+				if firstErr == nil {
+					firstErr = err
+				}
+			} else {
+				state.min, state.hasMin = minKHz, true
+			}
+		}
+
+		if maxKHz > 0 && (!state.hasMax || state.max != maxKHz) {
+			log.Debugf("enforcing cpu frequency max %d from class %q on cpu %d", maxKHz, class, cpu)
+			if err := w.callSetMax(cpu, int(maxKHz)); err != nil {
+				log.Errorf("cpufreq: cpu%d: cannot set max=%d: %v", cpu, maxKHz, err)
+				if firstErr == nil {
+					firstErr = err
+				}
+			} else {
+				state.max, state.hasMax = maxKHz, true
+			}
+		}
+
+		if governor != "" && (!state.hasGov || state.governor != governor) {
+			log.Debugf("enforcing cpu frequency governor %q from class %q on cpu %d", governor, class, cpu)
+			if err := w.callSetGov(cpu, governor); err != nil {
+				log.Errorf("cpufreq: cpu%d: cannot set governor=%q: %v", cpu, governor, err)
+				if firstErr == nil {
+					firstErr = err
+				}
+			} else {
+				state.governor, state.hasGov = governor, true
+			}
+		}
+
+		w.lastWritten[cpu] = state
+	}
+
+	return firstErr
+}
+
+func (w *Writer) callSetMin(cpu, freq int) error { // uses the SetMin hook when set, goresctrl otherwise
+	if w.hooks.SetMin != nil {
+		return w.hooks.SetMin(cpu, freq)
+	}
+	return utils.SetCPUScalingMinFreq(utils.ID(cpu), freq)
+}
+
+func (w *Writer) callSetMax(cpu, freq int) error { // uses the SetMax hook when set, goresctrl otherwise
+	if w.hooks.SetMax != nil {
+		return w.hooks.SetMax(cpu, freq)
+	}
+	return utils.SetCPUScalingMaxFreq(utils.ID(cpu), freq)
+}
+
+func (w *Writer) callSetGov(cpu int, governor string) error { // uses the SetGov hook when set, goresctrl otherwise
+	if w.hooks.SetGov != nil {
+		return w.hooks.SetGov(cpu, governor)
+	}
+	return utils.SetCPUScalingGovernor(utils.ID(cpu), governor)
+}
diff --git a/pkg/resmgr/cpuclass/internal/cpuidle/cpuidle.go b/pkg/resmgr/cpuclass/internal/cpuidle/cpuidle.go
new file mode 100644
index 000000000..a208ff757
--- /dev/null
+++ b/pkg/resmgr/cpuclass/internal/cpuidle/cpuidle.go
@@ -0,0 +1,122 @@
+// Copyright The NRI Plugins Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package cpuidle is the C-state writer used by the cpuclass
+// handler. It wraps the goresctrl cstates library, exposing a
+// uniform Hooks-injectable interface that matches the cpufreq and
+// uncorefreq writers.
+package cpuidle
+
+import (
+	"fmt"
+
+	"github.com/intel/goresctrl/pkg/cstates"
+
+	logger "github.com/containers/nri-plugins/pkg/log"
+)
+
+var log = logger.NewLogger("cpuclass")
+
+// Hooks lets tests intercept the cstate apply operations without
+// touching real sysfs. Production use leaves all hooks nil; the
+// writer then talks to the platform via goresctrl. The single Apply
+// hook replaces both Apply calls performed per Enforce() (enable and
+// disable); Enforce passes it a nil enabled list.
+type Hooks struct {
+	Apply func(cpus []int, enabled, disabled []string) error
+}
+
+// Writer enforces per-class enable/disable bits across the cstate
+// names exposed by the platform. The cstates handle is created
+// lazily on first Enforce() call that has any disabled cstates;
+// hosts and tests that never request a cstate change therefore
+// never touch the cpuidle sysfs.
+type Writer struct {
+	hooks Hooks
+	cs    *cstates.Cstates
+}
+
+// NewWriter returns a Writer wired to the given hooks. Pass a
+// zero-valued Hooks to use real sysfs via goresctrl.
+func NewWriter(hooks Hooks) *Writer {
+	return &Writer{hooks: hooks}
+}
+
+// Enforce applies the class-specific C-state enable/disable mask on
+// the given CPUs. An empty disabledCstates leaves the writer
+// untouched as long as the cstates handle has never been
+// initialized. Returns the first error encountered.
+func (w *Writer) Enforce(class string, disabledCstates []string, cpus []int) error {
+	if len(cpus) == 0 {
+		return nil
+	}
+	hook := w.hooks.Apply
+	// Nothing requested and nothing initialized: stay away from cpuidle sysfs.
+	if len(disabledCstates) == 0 && w.cs == nil && hook == nil && cstatesEnvOverridesJson == "" {
+		return nil
+	}
+	if hook != nil {
+		return hook(cpus, nil, disabledCstates)
+	}
+	if err := w.ensureHandle(); err != nil {
+		return err
+	}
+	// Every known cstate that is not listed as disabled gets re-enabled.
+	off := make(map[string]bool, len(disabledCstates))
+	for _, name := range disabledCstates {
+		off[name] = true
+	}
+	enabledCstates := []string{}
+	for _, name := range w.cs.Names() {
+		if !off[name] {
+			enabledCstates = append(enabledCstates, name)
+		}
+	}
+	onCpus := w.cs.Copy(cstates.NewBasicFilter().SetCPUs(cpus...))
+	toEnable := onCpus.Copy(cstates.NewBasicFilter().SetCstateNames(enabledCstates...))
+	toDisable := onCpus.Copy(cstates.NewBasicFilter().SetCstateNames(disabledCstates...))
+	toEnable.SetAttrs(cstates.AttrDisable, "0")
+	toDisable.SetAttrs(cstates.AttrDisable, "1")
+	log.Debugf("cstates: class %q on cpus %v: enable=%v disable=%v", class, cpus, enabledCstates, disabledCstates)
+	if err := toEnable.Apply(); err != nil {
+		return fmt.Errorf("cannot enable cstates %v on cpus %v: %w", enabledCstates, cpus, err)
+	}
+	if err := toDisable.Apply(); err != nil {
+		return fmt.Errorf("cannot disable cstates %v on cpus %v: %w", disabledCstates, cpus, err)
+	}
+	return nil
+}
+
+// ensureHandle lazily creates the cstates handle, picking the
+// in-memory override fs when OVERRIDE_SYS_CSTATES is set.
+func (w *Writer) ensureHandle() error {
+	if w.cs != nil { // already created by an earlier call
+		return nil
+	}
+	filter := cstates.NewBasicFilter().SetAttributes(cstates.AttrDisable)
+	var (
+		cs  *cstates.Cstates
+		err error
+	)
+	if cstatesEnvOverridesJson != "" {
+		cs, err = newCstatesFromOverride(filter)
+	} else {
+		cs, err = cstates.NewCstatesFromSysfs(filter)
+	}
+	if err != nil {
+		return fmt.Errorf("failed to read C-states: %w", err)
+	}
+	w.cs = cs // cached only on success, so a failed read is retried on the next call
+	return nil
+}
diff --git a/pkg/resmgr/cpuclass/internal/cpuidle/overridefs.go b/pkg/resmgr/cpuclass/internal/cpuidle/overridefs.go
new file mode 100644
index 000000000..0a8dec47d
--- /dev/null
+++ b/pkg/resmgr/cpuclass/internal/cpuidle/overridefs.go
@@ -0,0 +1,166 @@
+// Copyright The NRI Plugins Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package cpuidle
+
+import (
+	"encoding/json"
+	"fmt"
+	"maps"
+	"os"
+	"slices"
+	"strconv"
+	"strings"
+
+	"github.com/intel/goresctrl/pkg/cstates"
+	"github.com/intel/goresctrl/pkg/utils"
+)
+
+// cstatesEnvOverridesJson lets e2e tests inject a simulated cstates
+// sysfs through the OVERRIDE_SYS_CSTATES environment variable. The
+// variable is read once at process start. Production deployments
+// leave it unset and use real sysfs.
+var (
+	cstatesEnvOverridesVar  = "OVERRIDE_SYS_CSTATES"
+	cstatesEnvOverridesJson = os.Getenv(cstatesEnvOverridesVar)
+)
+
+type cstatesOverrides []cstatesOverride
+type cstatesOverride struct {
+	Cpus  string            `json:"cpus"`  // CPU list string parsed with utils.NewIDSetFromString
+	Names []string          `json:"names"` // C-state names present on those CPUs
+	Files map[string]string `json:"files"` // attribute name -> initial content, copied to every listed state
+}
+
+type cstatesOverrideFs struct {
+	overrides    cstatesOverrides
+	stateName    map[int]string                         // state index -> C-state name
+	nameState    map[string]int                         // C-state name -> state index
+	cpuStateFile map[utils.ID]map[int]map[string]string // cpu -> state index -> attribute -> value
+}
+
+// newCstatesFromOverride builds a *cstates.Cstates backed by an
+// in-memory override fs constructed from the OVERRIDE_SYS_CSTATES
+// JSON. Used only when that environment variable is set.
+func newCstatesFromOverride(filter cstates.Filter) (*cstates.Cstates, error) {
+	cs := cstates.NewCstates()
+	ofs, err := newCstatesOverrideFs()
+	if err != nil {
+		return nil, fmt.Errorf("failed to create override fs from %s: %w", cstatesEnvOverridesVar, err)
+	}
+	cs.SetFs(ofs)
+	if err := cs.Read(filter); err != nil {
+		return nil, fmt.Errorf("failed to refresh cstates from %s overrides: %w", cstatesEnvOverridesVar, err)
+	}
+	return cs, nil
+}
+
+func newCstatesOverrideFs() (*cstatesOverrideFs, error) { // parses the override JSON into an in-memory cpuidle tree
+	ofs := &cstatesOverrideFs{
+		stateName:    make(map[int]string),
+		nameState:    make(map[string]int),
+		cpuStateFile: make(map[utils.ID]map[int]map[string]string),
+	}
+	if err := json.Unmarshal([]byte(cstatesEnvOverridesJson), &ofs.overrides); err != nil {
+		return nil, err
+	}
+	if len(ofs.overrides) == 0 {
+		return nil, fmt.Errorf("no overrides found in %s", cstatesEnvOverridesVar)
+	}
+	names := make(map[string]bool)
+	for _, o := range ofs.overrides {
+		for _, name := range o.Names {
+			names[name] = true
+		}
+	}
+	orderedNames := make([]string, 0, len(names))
+	for name := range names {
+		orderedNames = append(orderedNames, name)
+	}
+	slices.Sort(orderedNames) // state indices follow the sorted name order, so they are deterministic
+	for state, name := range orderedNames {
+		ofs.stateName[state] = name
+		ofs.nameState[name] = state
+	}
+
+	for _, o := range ofs.overrides {
+		cpus, err := utils.NewIDSetFromString(o.Cpus)
+		if err != nil {
+			return nil, fmt.Errorf("invalid CPU list %q in %s: %w", o.Cpus, cstatesEnvOverridesVar, err)
+		}
+		for cpu := range cpus {
+			cpuid := utils.ID(cpu)
+			if _, ok := ofs.cpuStateFile[cpuid]; !ok {
+				ofs.cpuStateFile[cpuid] = make(map[int]map[string]string)
+			}
+			for _, name := range o.Names {
+				state := ofs.nameState[name]
+				if _, ok := ofs.cpuStateFile[cpuid][state]; !ok {
+					ofs.cpuStateFile[cpuid][state] = make(map[string]string)
+				}
+				maps.Copy(ofs.cpuStateFile[cpuid][state], o.Files)
+				ofs.cpuStateFile[cpuid][state]["name"] = name // the "name" attribute always holds the state name
+			}
+		}
+	}
+	log.Debugf("cstates override fs: loaded overrides for %d CPUs C-states: %s", len(ofs.cpuStateFile), strings.Join(orderedNames, ", "))
+	return ofs, nil
+}
+
+func (fs *cstatesOverrideFs) PossibleCpus() (string, error) { // "0-<highest overridden cpu>", or "" with no CPUs
+	maxCpu := utils.ID(-1)
+	for cpu := range fs.cpuStateFile {
+		if cpu > maxCpu {
+			maxCpu = cpu
+		}
+	}
+	if maxCpu < 0 {
+		return "", nil
+	}
+	return "0-" + strconv.Itoa(maxCpu), nil
+}
+
+func (fs *cstatesOverrideFs) CpuidleStates(cpuID utils.ID) ([]int, error) { // every known state index, sorted; cpuID is not consulted
+	states := []int{}
+	for state := range fs.stateName {
+		states = append(states, state)
+	}
+	slices.Sort(states)
+	return states, nil
+}
+
+func (fs *cstatesOverrideFs) CpuidleStateAttrRead(cpu utils.ID, state int, attribute string) (string, error) { // os.ErrNotExist when absent
+	if stateFiles, ok := fs.cpuStateFile[cpu]; ok {
+		if files, ok := stateFiles[state]; ok {
+			if val, ok := files[attribute]; ok {
+				log.Debugf("cstates override fs: read cpu%d cstate=%s %s=%q", cpu, fs.stateName[state], attribute, val)
+				return val, nil
+			}
+		}
+	}
+	log.Errorf("cstates override fs: cannot read cpu%d cstate=%s attribute %q", cpu, fs.stateName[state], attribute)
+	return "", os.ErrNotExist
+}
+
+func (fs *cstatesOverrideFs) CpuidleStateAttrWrite(cpu utils.ID, state int, attribute string, value string) error { // never fails
+	if stateFiles, ok := fs.cpuStateFile[cpu]; ok {
+		if files, ok := stateFiles[state]; ok {
+			files[attribute] = value
+			log.Debugf("cstates override fs: wrote cpu%d cstate=%s %s=%q", cpu, fs.stateName[state], attribute, value)
+			return nil
+		}
+	}
+	log.Errorf("cstates override fs: write to non-existing cpu%d cstate=%d %s=%q ignored", cpu, state, attribute, value)
+	return nil
+}
diff --git a/pkg/resmgr/cpuclass/internal/pct/pct.go b/pkg/resmgr/cpuclass/internal/pct/pct.go
new file mode 100644
index 000000000..1e4f06ec5
--- /dev/null
+++ b/pkg/resmgr/cpuclass/internal/pct/pct.go
@@ -0,0 +1,974 @@
+// Copyright The NRI Plugins Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+package pct
+
+import (
+	"fmt"
+	"sort"
+
+	idset "github.com/intel/goresctrl/pkg/utils"
+
+	policyapi "github.com/containers/nri-plugins/pkg/apis/config/v1alpha1/resmgr/policy"
+	logger "github.com/containers/nri-plugins/pkg/log"
+	"github.com/containers/nri-plugins/pkg/resmgr/cpuclass/internal/types"
+	"github.com/containers/nri-plugins/pkg/sysfs"
+	"github.com/containers/nri-plugins/pkg/utils/cpuset"
+)
+
+var log = logger.NewLogger("cpuclass")
+
+const (
+	// pctDefaultHpClos / pctDefaultLpClos are the conventional
+	// CLOS slots used in managed mode when the user does not pin
+	// PctClosID explicitly. See the PCT Technical Article example.
+	pctDefaultHpClos = 0
+	pctDefaultLpClos = 3
+)
+
+// pctMode is the operating mode of the PCT allocator.
+type pctMode int
+
+const (
+	pctModeDisabled  pctMode = iota // PCT fields are ignored; the state NewAllocator starts in
+	pctModeManaged                  // nri-plugin owns SoC-wide SST + CLOS configs
+	pctModeAssocOnly                // operator/BIOS owns CLOSes; we only associate CPUs
+)
+
+// pctClassPlan records the CLOS that should be used for one PCT
+// cpuClass and the freq bounds to program in managed mode.
+type pctClassPlan struct {
+	ClosID  int
+	MinFreq uint // kHz, 0 = leave alone
+	MaxFreq uint // kHz, 0 = leave alone
+}
+
+// Sys is the subset of sysfs.System that Allocator depends
+// on. Defined here so tests can substitute a fake without
+// implementing the full sysfs.System surface.
+type Sys interface {
+	PackageIDs() []idset.ID
+	Package(id idset.ID) sysfs.CPUPackage
+	CPU(id idset.ID) sysfs.CPU
+	CPUIDs() []idset.ID
+}
+
+// Allocator manages Intel Priority Core Turbo CLOS associations
+// driven by cpuClass definitions.
+type Allocator struct {
+	sys         Sys
+	sst         sst
+	mode        pctMode
+	classByName map[string]*policyapi.CPUClass
+	classPlan   map[string]*pctClassPlan // class name -> CLOS plan (PCT classes only)
+	// fallbackClos is the hardware CLOS used for CPUs whose class
+	// is not a PCT class. It starts as CLOS 0 (the default after
+	// SST reset); managed mode moves it to the LP CLOS when one is
+	// defined. A hardware-level concept, not a user-visible "idle".
+	fallbackClos int
+	allowed      cpuset.CPUSet
+	// hpClasses holds the names of cpuClasses currently
+	// classified as high priority. In managed mode this is every
+	// class with pctPriority=high. In assoc-only mode it is
+	// populated from GetClosConfig at Configure(): the CLOS with
+	// the largest programmed MaxFreq is HP; classes targeting
+	// that CLOS are HP. Tie-break (equal MaxFreq) goes to the
+	// smaller CLOS id, matching SST-CP ordered-priority
+	// convention. Empty when no HP class can be determined.
+	hpClasses map[string]bool
+	// punits is the per-punit topology cached from sst.Punits()
+	// at Configure() time, with each punit's CPUs already
+	// intersected with allowed.
+	punits []pctPunit
+	// punitByCpu maps each allowed CPU to its index in punits.
+	// CPUs outside any known punit are absent from the map; the
+	// allocator treats them as "no HP knowledge".
+	punitByCpu map[int]int
+	// hpUsed[i] is the set of CPUs currently held by HP-class
+	// workloads on punits[i].
+	hpUsed map[int]cpuset.CPUSet
+	// hpEligiblePunit[i] reports whether punits[i] can actually
+	// host HP-class CPUs at top turbo. Populated at Configure().
+	// In managed mode every punit becomes eligible (the plugin
+	// enables SST-TF itself). In assoc-only mode a punit is
+	// eligible only when SST-TF is currently enabled on it
+	// (operator's responsibility); otherwise its standard
+	// turbo-ratio bucket caps HP frequency and the punit must
+	// not contribute to scheduler-visible HP capacity. Missing
+	// entries are treated as not eligible.
+	hpEligiblePunit map[int]bool
+}
+
+// NewAllocator returns a new PCT allocator in the disabled mode.
+func NewAllocator(sys Sys) (*Allocator, error) {
+	s, err := newSst()
+	if err != nil {
+		return nil, err
+	}
+	return &Allocator{
+		sys:  sys, // NOTE(review): not checked for nil here, unlike cpufreq.New; confirm callers never pass nil
+		sst:  s,
+		mode: pctModeDisabled,
+	}, nil
+}
+
+// Configure selects the PCT operating mode from the given cpuClass
+// definitions and, in managed mode, programs the corresponding SST
+// CLOSes. Honors `allowed` as the boundary of CPUs the allocator may
+// touch.
+//
+//   - classes: cpuClass definitions to inspect for PCT fields.
+//   - allowed: CPUs the allocator may configure.
+func (a *Allocator) Configure(classes []*policyapi.CPUClass, allowed cpuset.CPUSet) error { + a.classByName = make(map[string]*policyapi.CPUClass, len(classes)) + for _, cc := range classes { + a.classByName[cc.Name] = cc + } + a.fallbackClos = pctDefaultHpClos // CLOS 0 == default-after-reset + a.allowed = allowed + a.hpUsed = map[int]cpuset.CPUSet{} + a.hpClasses = map[string]bool{} + a.hpEligiblePunit = map[int]bool{} + a.punits = nil + a.punitByCpu = nil + + mode, plans, err := a.planClasses(classes) + if err != nil { + return err + } + a.mode = mode + a.classPlan = plans + if mode == pctModeDisabled { + log.Debugf("pct: no cpuClasses request PCT; PCT allocator disabled") + return nil + } + if !a.sst.Supported() { + log.Warnf("pct: SST not supported on this host; ignoring PCT fields in cpuClasses") + a.mode = pctModeDisabled + a.classPlan = nil + return nil + } + + a.snapshotPunits() + log.Infof("pct: mode=%s, %d PCT cpuClass(es), %d punit(s) across %d package(s)", + a.modeString(), len(plans), len(a.punits), len(a.packageIDsFromPunits())) + + if mode == pctModeManaged { + if err := a.sst.PrepareManagedMode(); err != nil { + return fmt.Errorf("pct: failed to prepare managed mode: %w", err) + } + // Managed mode owns SST-TF and enables it on every punit + // (PrepareManagedMode). All snapshotted punits are thus + // HP-eligible. + for idx := range a.punits { + a.hpEligiblePunit[idx] = true + } + // Program every requested CLOS. 
+ closesProgrammed := map[int]bool{} + closIDs := make([]int, 0, len(plans)) + for _, p := range plans { + if closesProgrammed[p.ClosID] { + continue + } + closIDs = append(closIDs, p.ClosID) + closesProgrammed[p.ClosID] = true + } + sort.Ints(closIDs) + for _, closID := range closIDs { + var minF, maxF int + for _, p := range plans { + if p.ClosID == closID { + minF = int(p.MinFreq) + maxF = int(p.MaxFreq) + break + } + } + cfg := pctClosConfig{ClosID: closID, MinFreq: minF, MaxFreq: maxF} + if err := a.sst.ConfigureClos(cfg); err != nil { + return fmt.Errorf("pct: failed to configure CLOS %d: %w", closID, err) + } + log.Infof("pct: programmed CLOS %d min=%d max=%d kHz", closID, minF, maxF) + } + if err := a.sst.EnableCP(); err != nil { + return fmt.Errorf("pct: failed to enable SST-CP: %w", err) + } + // Managed mode: HP classes are exactly those with pctPriority=high. + // LP classes are those with pctPriority=low. + var lpClos *int + for _, cc := range classes { + switch cc.PctPriority { + case "high": + a.hpClasses[cc.Name] = true + log.Infof("pct: cpuClass %q classified HP (managed: pctPriority=high, CLOS %d)", + cc.Name, plans[cc.Name].ClosID) + case "low": + id := plans[cc.Name].ClosID + lpClos = &id + log.Infof("pct: cpuClass %q classified LP (managed: pctPriority=low, CLOS %d)", + cc.Name, plans[cc.Name].ClosID) + } + } + // Idle / non-PCT CPUs must fall back to the LP CLOS (when + // defined). Leaving them on CLOS 0 inflates the SST-TF + // active-HP-core count on every punit and prevents bucket-0 + // turbo selection on punits hosting both an HP and an LP + // balloon. + if lpClos != nil { + a.fallbackClos = *lpClos + log.Infof("pct: fallback CLOS for non-PCT CPUs set to %d (LP)", a.fallbackClos) + } + } else { + // Assoc-only: classify HP/LP from CLOS configs programmed + // by the operator/BIOS. The CLOS with the largest MaxFreq + // among the CLOSes our cpuClasses target is HP. 
+ a.classifyAssocOnlyHP(classes) + a.evaluateAssocOnlyHpEligibility() + } + return nil +} + +// evaluateAssocOnlyHpEligibility populates hpEligiblePunit and +// warns the operator about punits where SST-TF is disabled. In +// assoc-only mode the plugin must not toggle SST-TF (the operator +// owns global SST state). Without SST-TF the standard turbo-ratio +// table caps HP cores at the many-active-cores bucket frequency -- +// a low-CLOS-ID association alone is not enough to exceed it. +// Capacity for HP cpuClasses on such punits must therefore be +// reported as zero, otherwise the scheduler bin-packs HP pods onto +// nodes that cannot actually deliver top turbo. The warning points +// the operator at the intel-speed-select command that enables it. +func (a *Allocator) evaluateAssocOnlyHpEligibility() { + if len(a.punits) == 0 { + return + } + status, err := a.sst.TFStatus() + if err != nil { + log.Warnf("pct: assoc-only: cannot read SST-TF status: %v", err) + // Unknown TF state: leave every punit ineligible. Safer + // to under-publish HP capacity than to over-publish it. + return + } + for idx, pu := range a.punits { + enabled, ok := status[pctPunitID{PkgID: pu.PkgID, PunitID: pu.PunitID}] + if !ok { + // No entry: TF state unknown for this punit. Treat + // as ineligible. + continue + } + if enabled { + a.hpEligiblePunit[idx] = true + continue + } + // Pick one representative CPU from the punit for the + // operator hint -- intel-speed-select needs at least + // one CPU on the target punit. + repCPU := -1 + for _, c := range pu.CPUs.UnsortedList() { + repCPU = c + break + } + log.Warnf("pct: assoc-only: SST-TF disabled on pkg=%d punit=%d; "+ + "HP cores on this punit cannot exceed the standard "+ + "turbo-ratio bucket frequency. 
Enable with: "+ + "intel-speed-select -c %d turbo-freq enable -a", + pu.PkgID, pu.PunitID, repCPU) + } +} + +// snapshotPunits caches the per-punit topology from the sst +// backend, intersecting each punit's CPUs with the allowed set. +// Punits whose intersection with allowed is empty are dropped -- +// they cannot affect placement under this Configure(). The +// resulting punits and punitByCpu indices drive HP accounting and +// hpReserveCpus tier selection. +func (a *Allocator) snapshotPunits() { + raw := a.sst.Punits() + a.punits = make([]pctPunit, 0, len(raw)) + a.punitByCpu = map[int]int{} + for _, pu := range raw { + cpus := pu.CPUs + if a.allowed.Size() > 0 { + cpus = cpus.Intersection(a.allowed) + } + if cpus.IsEmpty() { + continue + } + idx := len(a.punits) + a.punits = append(a.punits, pctPunit{ + PkgID: pu.PkgID, + PunitID: pu.PunitID, + CPUs: cpus, + MaxHpCpus: pu.MaxHpCpus, + GuaranteedHpCpus: pu.GuaranteedHpCpus, + }) + for _, c := range cpus.UnsortedList() { + a.punitByCpu[c] = idx + } + } +} + +// packageIDsFromPunits returns the set of package IDs present in +// the cached punits, in stable sorted order. +func (a *Allocator) packageIDsFromPunits() []int { + seen := map[int]bool{} + ids := []int{} + for _, pu := range a.punits { + if seen[pu.PkgID] { + continue + } + seen[pu.PkgID] = true + ids = append(ids, pu.PkgID) + } + sort.Ints(ids) + return ids +} + +// classifyAssocOnlyHP populates hpClasses by reading the +// programmed MaxFreq of each CLOS referenced by an assoc-only +// cpuClass. The CLOS with the largest MaxFreq is treated as HP; +// ties go to the smaller CLOS id (matching SST-CP ordered-priority +// convention where lower CLOS ids have higher priority). When no +// CLOS reports a programmed MaxFreq, no class is classified as HP +// (HP-specific hints stay quiet for that class set). 
+func (a *Allocator) classifyAssocOnlyHP(classes []*policyapi.CPUClass) { + maxFreqs := map[int]int{} + closIDs := []int{} + for _, p := range a.classPlan { + if _, seen := maxFreqs[p.ClosID]; seen { + continue + } + cfg, ok, err := a.sst.GetClosConfig(p.ClosID) + if err != nil { + log.Warnf("pct: assoc-only: GetClosConfig(%d) failed: %v", p.ClosID, err) + continue + } + if !ok { + log.Infof("pct: assoc-only: CLOS %d not programmed; cannot classify HP/LP", p.ClosID) + continue + } + maxFreqs[p.ClosID] = cfg.MaxFreq + closIDs = append(closIDs, p.ClosID) + log.Infof("pct: assoc-only: CLOS %d programmed min=%d max=%d kHz", p.ClosID, cfg.MinFreq, cfg.MaxFreq) + } + if len(closIDs) == 0 { + return + } + sort.Ints(closIDs) + bestClos := -1 + bestMax := -1 + for _, id := range closIDs { + if maxFreqs[id] > bestMax { + bestMax = maxFreqs[id] + bestClos = id + } + } + if bestClos < 0 || bestMax <= 0 { + log.Infof("pct: assoc-only: no CLOS has a programmed MaxFreq; HP classification skipped") + return + } + for _, cc := range classes { + p, ok := a.classPlan[cc.Name] + if !ok || p.ClosID != bestClos { + continue + } + a.hpClasses[cc.Name] = true + log.Infof("pct: cpuClass %q classified HP (assoc-only: CLOS %d MaxFreq=%d kHz)", cc.Name, bestClos, bestMax) + } +} + +// planClasses returns the PCT operating mode and the per-class +// CLOS plan derived from cpuClasses. 
+func (a *Allocator) planClasses(classes []*policyapi.CPUClass) (pctMode, map[string]*pctClassPlan, error) { + plans := map[string]*pctClassPlan{} + managed, assocOnly := false, false + for _, cc := range classes { + switch { + case cc.PctPriority != "": + managed = true + plan := &pctClassPlan{} + switch cc.PctPriority { + case "high": + plan.ClosID = pctDefaultHpClos + case "low": + plan.ClosID = pctDefaultLpClos + default: + return pctModeDisabled, nil, fmt.Errorf("cpuClass %q: invalid pctPriority %q", cc.Name, cc.PctPriority) + } + minSrc, maxSrc := cc.PctMinFreq, cc.PctMaxFreq + if minSrc == 0 { + minSrc = cc.MinFreq + } + if maxSrc == 0 { + maxSrc = cc.MaxFreq + } + plan.MinFreq = a.resolveHWFreq(minSrc) + plan.MaxFreq = a.resolveHWFreq(maxSrc) + plans[cc.Name] = plan + case cc.PctClosID != nil: + assocOnly = true + plans[cc.Name] = &pctClassPlan{ClosID: *cc.PctClosID} + } + } + switch { + case !managed && !assocOnly: + return pctModeDisabled, nil, nil + case managed && assocOnly: + return pctModeDisabled, nil, fmt.Errorf("pct: cannot mix managed (pctPriority) and assoc-only (pctClosID) modes") + case managed: + return pctModeManaged, plans, nil + default: + return pctModeAssocOnly, plans, nil + } +} + +// resolveHWFreq returns the hardware frequency in kHz that the +// given symbolic policyapi.Frequency refers to. "turbo" resolves to the +// platform's maximum turbo frequency. +func (a *Allocator) resolveHWFreq(f policyapi.Frequency) uint { + if f == 0 { + return 0 + } + info, err := discoverTurboInfo(a.sys) + if err != nil || info == nil { + log.Warnf("pct: cannot discover platform turbo info: %v", err) + return uint(f) + } + return f.Resolve(info.minFreqKHz, info.baseFreqKHz, info.maxTurboFreqKHz) +} + +// active reports whether PCT is in effect (mode != disabled). 
+func (a *Allocator) Active() bool { + return a != nil && a.mode != pctModeDisabled +} + +// freeClassCapacity returns the number of logical CPUs that can +// still be allocated to className, given that 'held' lists CPUs +// already consumed by some balloon on this node (any class). +// +// Same formula in managed and assoc-only modes: +// - HP class: sum over HP-eligible punits of +// min(GuaranteedHpCpus, |pu.CPUs intersect Allowed minus held|). +// HP capacity is bounded by the punit's *guaranteed top-turbo* +// HP count (smallest non-zero SST-TF bucket +// HighPriorityCoreCount, or SST-BF HP CPU count when TF is +// unsupported) -- not by the larger MaxHpCpus the allocator +// uses for steering. The scheduler-visible capacity must +// reflect how many CPUs can *actually* sustain the highest +// turbo frequency this platform exposes; otherwise HP pods +// get scheduled past the guaranteed-turbo headroom and fall +// back to lower-bucket frequencies. +// - non-HP class: |Allowed minus held|. The allocator can +// re-associate any Allowed CPU to any CLOS on demand, so the +// gating set is what the plugin owns, not what currently +// lives on the target CLOS in hardware. +// +// The modes differ in how hpEligiblePunit is populated: +// - Managed mode: every snapshotted punit is HP-eligible (the +// plugin enables SST-TF itself via PrepareManagedMode). +// - Assoc-only mode: a punit is HP-eligible only when SST-TF +// is currently enabled on it (operator's responsibility). +// Punits where TF is disabled cannot exceed the standard +// turbo-ratio bucket and contribute 0 to HP capacity, so the +// scheduler does not bin-pack HP pods onto nodes that cannot +// deliver top turbo. +// +// Returns 0 for classes that have no PCT plan or when PCT is not +// active. Negative intermediate counts are clamped to 0. 
+func (a *Allocator) FreeClassCapacity(className string, held cpuset.CPUSet) int { + if !a.Active() { + return 0 + } + if _, ok := a.classPlan[className]; !ok { + return 0 + } + allowed := a.allowed + free := allowed + if free.Size() > 0 { + free = free.Difference(held) + } + if !a.classIsHighPriority(className) { + return free.Size() + } + total := 0 + for idx, pu := range a.punits { + if !a.hpEligiblePunit[idx] { + continue + } + puCpus := pu.CPUs + if allowed.Size() > 0 { + puCpus = puCpus.Intersection(allowed) + } + puFree := puCpus.Difference(held).Size() + gtdHp := pu.GuaranteedHpCpus + if gtdHp <= 0 { + continue + } + room := gtdHp + if puFree < room { + room = puFree + } + if room < 0 { + room = 0 + } + total += room + } + return total +} + +// useClass associates the given CPUs to the CLOS chosen for className. +// In managed mode, CPUs whose className is not a PCT class are +// associated to the fallback CLOS. In assoc-only mode such CPUs are +// left unchanged. CPUs outside the configured Allowed set are silently +// dropped. +func (a *Allocator) UseClass(className string, cpus cpuset.CPUSet) error { + if !a.Active() { + return nil + } + if a.allowed.Size() > 0 { + cpus = cpus.Intersection(a.allowed) + } + if cpus.IsEmpty() { + return nil + } + a.trackHpUsage(className, cpus) + plan, ok := a.classPlan[className] + if !ok { + if a.mode == pctModeAssocOnly { + return nil + } + return a.associate(cpus, a.fallbackClos) + } + return a.associate(cpus, plan.ClosID) +} + +// trackHpUsage updates per-punit HP CPU bookkeeping so cpus are +// recorded as held by an HP class if className is HP, and removed +// from HP bookkeeping otherwise. CPUs not mapped to any punit +// (e.g. outside Allowed at Configure time) are ignored: they +// cannot affect HP placement and tracking them would only confuse +// hpInUseCpus. 
+func (a *Allocator) trackHpUsage(className string, cpus cpuset.CPUSet) { + if !a.hpHintsActive() { + return + } + a.clearHpUsage(cpus) + if !a.classIsHighPriority(className) { + return + } + perPunit := map[int][]int{} + for _, cpu := range cpus.UnsortedList() { + idx, ok := a.punitByCpu[cpu] + if !ok { + continue + } + perPunit[idx] = append(perPunit[idx], cpu) + } + for idx, list := range perPunit { + set := a.hpUsed[idx] + a.hpUsed[idx] = set.Union(cpuset.New(list...)) + } +} + +// clearHpUsage removes cpus from per-punit HP bookkeeping. +func (a *Allocator) clearHpUsage(cpus cpuset.CPUSet) { + if !a.hpHintsActive() { + return + } + for idx, set := range a.hpUsed { + if remaining := set.Difference(cpus); remaining.Size() != set.Size() { + a.hpUsed[idx] = remaining + } + } +} + +func (a *Allocator) associate(cpus cpuset.CPUSet, clos int) error { + list := cpus.UnsortedList() + sort.Ints(list) + assocs := make([]pctClosAssoc, 0, len(list)) + for _, c := range list { + assocs = append(assocs, pctClosAssoc{CPU: c, ClosID: clos}) + } + if err := a.sst.AssociateCPUs(assocs); err != nil { + return fmt.Errorf("pct: associate cpus %s to CLOS %d: %w", cpus, clos, err) + } + log.Debugf("pct: associated cpus %s to CLOS %d", cpus, clos) + return nil +} + +// Shutdown restores the platform to its default state. Safe to +// call multiple times. +func (a *Allocator) Shutdown() error { + if a == nil || !a.sst.Supported() { + return nil + } + if a.mode != pctModeManaged { + return nil + } + return a.sst.Shutdown() +} + +func (a *Allocator) modeString() string { + switch a.mode { + case pctModeManaged: + return "managed" + case pctModeAssocOnly: + return "assoc-only" + default: + return "disabled" + } +} + +// classIsHighPriority reports whether className is currently +// classified as PCT high priority. In managed mode this comes from +// pctPriority=high; in assoc-only mode it comes from the largest +// programmed CLOS MaxFreq (see classifyAssocOnlyHP). 
The two +// regimes share one map so that hints() can treat HP/non-HP +// classes uniformly. +func (a *Allocator) classIsHighPriority(className string) bool { + if !a.Active() { + return false + } + return a.hpClasses[className] +} + +// hpHintsActive reports whether HP-room reasoning (hpReserveCpus, +// hpInUseCpus, trackHpUsage) is currently meaningful. It requires +// PCT to be active *and* at least one cpuClass to be classified as +// HP. In assoc-only mode without programmed CLOS frequencies this +// is false even though the allocator runs, because we cannot +// distinguish HP from LP CLOSes from the data we have. +func (a *Allocator) hpHintsActive() bool { + return a.Active() && len(a.hpClasses) > 0 +} + +// closCpus returns the subset of Allowed CPUs that are currently +// associated to CLOS closID. +func (a *Allocator) closCpus(closID int) cpuset.CPUSet { + if !a.Active() { + return cpuset.New() + } + out := []int{} + for _, cpu := range a.allowed.UnsortedList() { + id, err := a.sst.GetCPUClosID(cpu) + if err != nil { + continue + } + if id == closID { + out = append(out, cpu) + } + } + return cpuset.New(out...) +} + +// hpInUseCpus returns the union of CPUs of every punit currently +// hosting at least one HP CPU, constrained to Allowed. Expanding +// HP usage to whole-punit (rather than whole-package) granularity +// keeps the Avoid hint for non-HP classes from being unnecessarily +// broad on TPMI-class platforms with multiple punits per package. 
+func (a *Allocator) hpInUseCpus() cpuset.CPUSet { + if !a.hpHintsActive() { + return cpuset.New() + } + out := cpuset.New() + for idx, used := range a.hpUsed { + if used.IsEmpty() { + continue + } + if idx < 0 || idx >= len(a.punits) { + continue + } + out = out.Union(a.punits[idx].CPUs) + } + if a.allowed.Size() > 0 { + out = out.Intersection(a.allowed) + } + return out +} + +// hpReserveCpus returns the CPU set the upcoming HP allocation +// should prefer, computed with punit-granular HP-room accounting: +// +// room(punit) = MaxHpCpus(punit) - len(hpUsed[punit] \ excludeBln) +// +// Selection follows a strict tier order: +// +// - Tier A (single-punit win): the punit with the largest +// non-zero room and at least requested free CPUs. Returns the +// free CPUs of that punit. +// - Tier B (same-package union): when no single punit can host +// `requested` HP CPUs but some package's punits jointly can, +// return the union of free CPUs across that package's punits. +// The picked package is the one with the largest aggregate +// room; ties broken by largest aggregate free-CPU count. +// - Tier C (cross-package): never. Steering HP work across +// sockets defeats the turbo gains it would obtain, because +// cross-socket data traffic typically dominates per-core +// frequency benefits. +// +// When `requested` is 0 the function falls back to Tier A only -- +// pick the punit with the most HP room and at least one free CPU. +// Returns the empty set when no punit/package satisfies any tier +// or no free CPUs remain after Allowed-intersection; the caller +// then falls back to topology-only placement. +// +// - free: free CPUs to consider for placement. +// - excludeBln: CPUs to exclude from HP-room accounting (the +// caller's current CPU set, e.g. when expanding an existing +// allocation, so its current HP usage is not double-counted). +// - requested: number of CPUs the upcoming allocation wants. 
//     0 means "unknown" (initial priming before the count is
//     known); Tier A is used.
func (a *Allocator) hpReserveCpus(free cpuset.CPUSet, excludeBln cpuset.CPUSet, requested int) cpuset.CPUSet {
	if !a.hpHintsActive() {
		return cpuset.New()
	}
	// An empty allowed set is treated as "no restriction".
	if a.allowed.Size() > 0 {
		free = free.Intersection(a.allowed)
	}
	if free.IsEmpty() {
		return cpuset.New()
	}

	// punitState is the per-punit view used by both tiers: the
	// free CPUs of the punit and its remaining HP room.
	type punitState struct {
		free cpuset.CPUSet
		room int
	}
	states := make([]punitState, len(a.punits))
	anyKnown := false
	for i, pu := range a.punits {
		states[i].free = pu.CPUs.Intersection(free)
		if pu.MaxHpCpus <= 0 {
			// Unknown capacity for this punit: do not let it
			// influence HP steering. Leave room=0 so it never
			// wins Tier A; package-aggregate Tier B still
			// uses only known-capacity punits.
			continue
		}
		anyKnown = true
		used := a.hpUsed[i]
		// Do not count the caller's own CPUs against the room.
		if excludeBln.Size() > 0 {
			used = used.Difference(excludeBln)
		}
		room := pu.MaxHpCpus - used.Size()
		if room < 0 {
			room = 0
		}
		states[i].room = room
	}
	// No punit exposes HP capacity: no basis for an HP hint.
	if !anyKnown {
		return cpuset.New()
	}

	// Tier A: best single punit that satisfies the request.
	// An unknown request size (0) is treated as a request for one CPU.
	need := requested
	if need < 1 {
		need = 1
	}
	bestIdx := -1
	bestRoom := 0
	bestFree := -1
	for i := range a.punits {
		s := states[i]
		if s.free.IsEmpty() || s.room <= 0 {
			continue
		}
		// Both the punit's free CPUs and its remaining HP
		// room must be able to host the entire request.
		if s.free.Size() < need || s.room < need {
			continue
		}
		// Largest room wins; equal room is decided by the larger
		// free-CPU count, and on a full tie the earlier punit stays.
		if s.room > bestRoom || (s.room == bestRoom && s.free.Size() > bestFree) {
			bestIdx = i
			bestRoom = s.room
			bestFree = s.free.Size()
		}
	}
	if bestIdx >= 0 {
		log.Debugf("pct: hpReserveCpus tier=A punit=%d/%d room=%d free=%s",
			a.punits[bestIdx].PkgID, a.punits[bestIdx].PunitID, bestRoom, states[bestIdx].free)
		return states[bestIdx].free
	}

	// Tier B: aggregate per package; pick the package whose
	// punits together have the most room (and free CPUs).
	if requested > 0 {
		// pkgAgg accumulates room and free CPUs over the punits
		// of one package; freeN caches free.Size().
		type pkgAgg struct {
			room  int
			free  cpuset.CPUSet
			freeN int
		}
		agg := map[int]*pkgAgg{}
		for i, pu := range a.punits {
			// Only punits with both HP room and free CPUs contribute.
			if states[i].room <= 0 || states[i].free.IsEmpty() {
				continue
			}
			e, ok := agg[pu.PkgID]
			if !ok {
				e = &pkgAgg{free: cpuset.New()}
				agg[pu.PkgID] = e
			}
			e.room += states[i].room
			e.free = e.free.Union(states[i].free)
		}
		pkgIDs := make([]int, 0, len(agg))
		for id, e := range agg {
			e.freeN = e.free.Size()
			pkgIDs = append(pkgIDs, id)
		}
		sort.Ints(pkgIDs) // deterministic tie-break order
		bestPkg := -1
		bestPkgRoom := 0
		bestPkgFree := -1
		for _, id := range pkgIDs {
			e := agg[id]
			// The package must cover the request both in HP room
			// and in free CPUs.
			if e.room < requested {
				continue
			}
			if e.freeN < requested {
				continue
			}
			if e.room > bestPkgRoom || (e.room == bestPkgRoom && e.freeN > bestPkgFree) {
				bestPkg = id
				bestPkgRoom = e.room
				bestPkgFree = e.freeN
			}
		}
		if bestPkg >= 0 {
			log.Debugf("pct: hpReserveCpus tier=B pkg=%d room=%d free=%s",
				bestPkg, bestPkgRoom, agg[bestPkg].free)
			return agg[bestPkg].free
		}
	}

	// Tier C is never taken: do not hint across packages.
	log.Debugf("pct: hpReserveCpus tier=none (no punit or package has %d HP room with %d free CPUs)",
		requested, free.Size())
	return cpuset.New()
}

// classClosID returns the CLOS ID that the named cpuClass maps to,
// or (-1, false) if the class has no PCT plan or PCT is not active.
func (a *Allocator) classClosID(className string) (int, bool) {
	if !a.Active() {
		return -1, false
	}
	p, ok := a.classPlan[className]
	if !ok {
		return -1, false
	}
	return p.ClosID, true
}

// virtDevSstHpReserveHint and virtDevSstHpInUseHint are the
// human-readable hint names returned in types.CpuPreference.Name for the
// dynamic PCT placement preferences.
const (
	virtDevSstHpReserveHint = "sst-hp-reserve"
	virtDevSstHpInUseHint   = "sst-hp-in-use"
)

// virtDevSstClosHint returns the human-readable hint name for the
// CLOS-membership preference of the given CLOS ID.
+func virtDevSstClosHint(closID int) string { + return fmt.Sprintf("sst-clos-%d", closID) +} + +// hints returns prefer/avoid CPU sets that PCT would like an upcoming +// allocation under intent.ClassName to honor. Returned types.CpuPreference +// sets are not yet intersected with Allowed; the handler does that. +// +// Behavior: +// - Class has an explicit CLOS plan (assoc-only or managed): Prefer +// CLOS-member CPUs. +// - Class is currently classified HP: Prefer hpReserveCpus +// (best-fit punit; same-package union as fallback), and also +// CLOS-member CPUs. No cross-package hint is ever emitted. +// - Class is not HP and at least one HP class exists: Avoid +// hpInUseCpus (punits currently hosting HP work). +func (a *Allocator) Hints(intent types.AllocationIntent) types.AllocationHints { + if a == nil || !a.Active() { + return types.AllocationHints{} + } + out := types.AllocationHints{} + + if closID, ok := a.classClosID(intent.ClassName); ok { + closCpus := a.closCpus(closID) + if !closCpus.IsEmpty() { + out.Prefer = append(out.Prefer, types.CpuPreference{ + Name: virtDevSstClosHint(closID), + Cpus: closCpus, + }) + } + } + + if a.classIsHighPriority(intent.ClassName) { + reserve := a.hpReserveCpus(intent.FreeCpus, intent.CurrentCpus, intent.RequestedCount) + if !reserve.IsEmpty() { + out.Prefer = append(out.Prefer, types.CpuPreference{ + Name: virtDevSstHpReserveHint, + Cpus: reserve, + }) + } + return out + } + + if a.hpHintsActive() { + inUse := a.hpInUseCpus() + if !inUse.IsEmpty() { + out.Avoid = append(out.Avoid, types.CpuPreference{ + Name: virtDevSstHpInUseHint, + Cpus: inUse, + }) + } + } + return out +} + +// turboInfo holds the platform frequency reference used by the PCT +// allocator to resolve symbolic min/base/turbo frequencies. +type turboInfo struct { + baseFreqKHz uint + maxTurboFreqKHz uint + minFreqKHz uint +} + +// discoverTurboInfo reads platform turbo capabilities from sysfs via +// the first online CPU. 
// Returns a nil turboInfo together with a non-nil error if no online
// CPU exposes valid frequency data.
func discoverTurboInfo(sys Sys) (*turboInfo, error) {
	cpuIDs := sys.CPUIDs()
	if len(cpuIDs) == 0 {
		return nil, fmt.Errorf("no CPUs found in system topology")
	}
	for _, id := range cpuIDs {
		cpu := sys.CPU(id)
		if cpu == nil || !cpu.Online() {
			continue
		}
		freq := cpu.FrequencyRange()
		baseFreq := cpu.BaseFrequency()
		// A CPU reporting neither a min nor a max frequency has
		// no usable data; try the next one.
		if freq.Min == 0 && freq.Max == 0 {
			continue
		}
		// Without a reported base frequency, use the maximum
		// frequency as the base.
		if baseFreq == 0 {
			baseFreq = freq.Max
		}
		return &turboInfo{
			baseFreqKHz:     uint(baseFreq),
			maxTurboFreqKHz: uint(freq.Max),
			minFreqKHz:      uint(freq.Min),
		}, nil
	}
	return nil, fmt.Errorf("no online CPU with valid frequency information found")
}
diff --git a/pkg/resmgr/cpuclass/internal/pct/pct_sst.go b/pkg/resmgr/cpuclass/internal/pct/pct_sst.go
new file mode 100644
index 000000000..30ed1a97b
--- /dev/null
+++ b/pkg/resmgr/cpuclass/internal/pct/pct_sst.go
@@ -0,0 +1,142 @@
// Copyright The NRI Plugins Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

package pct

import (
	"os"

	"github.com/containers/nri-plugins/pkg/utils/cpuset"
)

// pctClosConfig describes one CLOS configuration that the
// Allocator wants to program.
type pctClosConfig struct {
	ClosID  int
	MinFreq int // kHz
	MaxFreq int // kHz
}

// pctClosAssoc records the desired CLOS association for a CPU.
type pctClosAssoc struct {
	CPU    int // CPU to associate
	ClosID int // CLOS the CPU should be bound to
}

// pctPunit describes one SST power domain (punit) exposed by the
// platform. PkgID and PunitID together uniquely identify it; CPUs
// is the set of logical CPUs in this punit; MaxHpCpus is the
// maximum number of CPUs this punit can sustain at the elevated
// PCT high-priority frequency (SST-TF bucket count, or SST-BF HP
// CPU count when TF is unsupported). MaxHpCpus == 0 means the
// platform does not expose HP capacity for this punit; the
// allocator excludes such punits from HP steering.
type pctPunit struct {
	PkgID     int
	PunitID   int
	CPUs      cpuset.CPUSet
	MaxHpCpus int
	// GuaranteedHpCpus is the count of HP CPUs on this punit that
	// can simultaneously sustain the highest turbo frequency the
	// platform exposes: the smallest non-zero SST-TF bucket's
	// HighPriorityCoreCount (smaller buckets unlock higher
	// frequencies), or len(SST-BF HighPriorityCPUs) when TF is
	// unsupported. 0 if neither feature exposes HP capacity.
	// Used to publish scheduler-visible HP capacity that reflects
	// "guaranteed top-turbo headroom" rather than the worst-case
	// MaxHpCpus.
	GuaranteedHpCpus int
}

// pctClosCfg carries the frequency bounds programmed for one CLOS,
// in kHz. Zero stands for "not specified / leave alone". It is the
// value returned by sst.GetClosConfig and, unlike pctClosConfig,
// carries no CLOS id.
type pctClosCfg struct {
	MinFreq int
	MaxFreq int
}

// pctPunitID identifies one power domain by (package, punit) ID.
// It is the key type of the map returned by sst.TFStatus.
type pctPunitID struct {
	PkgID   int
	PunitID int
}

// sst is the subset of Intel SST functionality used by the
// cpuclass code. Implementations: sstGoresctrl for real
// hardware via goresctrl/pkg/sst, and sstMock for an
// in-memory fake seeded from OVERRIDE_SST.
type sst interface {
	// Supported reports whether SST is available.
	Supported() bool

	// ClosCount returns the number of CLOSes supported.
	ClosCount() int

	// PackageIDs returns the IDs of all packages.
	PackageIDs() []int

	// CPUsOfPackage returns the CPUs of the given package.
	CPUsOfPackage(pkgID int) []int

	// Punits returns the per-punit topology and HP capacity of
	// every package the platform exposes. Order is stable.
	Punits() []pctPunit

	// GetClosConfig returns the frequency bounds currently
	// programmed for closID. The second return value is false
	// when no information is available (e.g. closID not in
	// range, or the platform does not expose per-CLOS
	// configuration). Used in assoc-only mode to classify a CLOS
	// as HP or LP from its programmed MaxFreq.
	GetClosConfig(closID int) (pctClosCfg, bool, error)

	// PrepareManagedMode resets and enables SST-TF on every
	// package and selects ordered priority arbitration.
	PrepareManagedMode() error

	// ConfigureClos programs CLOS frequency bounds on every
	// package.
	ConfigureClos(cfg pctClosConfig) error

	// EnableCP enables SST-CP on every package.
	EnableCP() error

	// AssociateCPUs binds each CPU to the indicated CLOS.
	AssociateCPUs(assocs []pctClosAssoc) error

	// TFStatus returns the current SST-TF enabled state per
	// power domain. The map is empty when SST is unsupported.
	// The status is read at call time (SST-TF can be toggled
	// out-of-band by the operator). Used in assoc-only mode to
	// warn at configure time when SST-TF is disabled on a punit
	// hosting PCT-managed CPUs -- without SST-TF, HP cores on
	// that punit cannot exceed the standard turbo-ratio bucket
	// limit even if associated to a low-CLOS-ID (HP) CLOS.
	TFStatus() (map[pctPunitID]bool, error)

	// GetCPUClosID returns the current CLOS association of a CPU.
	GetCPUClosID(cpu int) (int, error)

	// Shutdown restores managed-mode platform state to defaults.
	Shutdown() error
}

// newSst returns an SST implementation: the in-memory mock when
// OVERRIDE_SST is set, otherwise the goresctrl-backed one.
+func newSst() (sst, error) { + if v := os.Getenv(sstOverrideEnvVar); v != "" { + return newSstMock(v) + } + return newSstGoresctrl() +} diff --git a/pkg/resmgr/cpuclass/internal/pct/pct_sst_goresctrl.go b/pkg/resmgr/cpuclass/internal/pct/pct_sst_goresctrl.go new file mode 100644 index 000000000..b00809a5a --- /dev/null +++ b/pkg/resmgr/cpuclass/internal/pct/pct_sst_goresctrl.go @@ -0,0 +1,380 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pct + +import ( + "fmt" + "sort" + + gosst "github.com/intel/goresctrl/pkg/sst" + "github.com/intel/goresctrl/pkg/utils" + + "github.com/containers/nri-plugins/pkg/utils/cpuset" +) + +// sstGoresctrl is the real-hardware sst backed by +// goresctrl/pkg/sst. Per-(pkg, punit) topology and HP capacity +// are snapshotted at Init() time -- the goresctrl Platform itself +// snapshots CPU topology at Init(), so refreshing here would not +// pick up CPU hotplug either. +type sstGoresctrl struct { + plat *gosst.Platform + // punits is the cached per-punit topology + HP capacity in + // stable order (sorted by PkgID, then PunitID). 
+ punits []pctPunit +} + +func newSstGoresctrl() (sst, error) { + b := &sstGoresctrl{} + if !gosst.SstSupported() { + return b, nil + } + plat, err := gosst.Init() + if err != nil { + return nil, fmt.Errorf("SST init failed: %w", err) + } + b.plat = plat + b.punits = discoverPunits(plat) + return b, nil +} + +// discoverPunits snapshots per-punit topology and HP capacity for +// every package the platform exposes. The PP level is the current +// level of the first punit of each package, mirroring the +// approach of goresctrl's "sst info" CLI. Logged at INFO so +// operators can correlate placement decisions with the platform +// state observed at startup. A failure on one package does not +// abort discovery for the others. +func discoverPunits(plat *gosst.Platform) []pctPunit { + out := []pctPunit{} + if plat == nil { + return out + } + for _, pkg := range plat.Packages() { + pkgID := pkg.ID() + st, err := pkg.GetStatus() + if err != nil { + log.Warnf("pct: SST status unavailable for package %d: %v", pkgID, err) + continue + } + // Pick the current PP level from any punit (they share + // a level on every platform we have seen); warn on + // divergence and stick with the first one. + level := -1 + for _, pu := range st.Punits { + if level < 0 { + level = pu.PP.CurrentLevel + continue + } + if pu.PP.CurrentLevel != level { + log.Warnf("pct: package %d punits report differing PP levels; using level %d", pkgID, level) + break + } + } + if level < 0 { + log.Warnf("pct: package %d has no punits, skipping discovery", pkgID) + continue + } + info, err := pkg.GetPerfLevelInfo(level) + if err != nil { + log.Warnf("pct: SST PerfLevelInfo unavailable for package %d level %d: %v", pkgID, level, err) + continue + } + // Stable per-punit iteration. 
+ punitIDs := make([]int, 0, len(st.Punits)) + for id := range st.Punits { + punitIDs = append(punitIDs, int(id)) + } + sort.Ints(punitIDs) + for _, pid := range punitIDs { + pu := st.Punits[utils.ID(pid)] + cpus := cpuset.New(pu.CPUs.Members()...) + max := 0 + gtd := 0 + if pi, ok := info[utils.ID(pid)]; ok { + max = punitMaxHpCpus(pi) + gtd = punitGuaranteedHpCpus(pi) + log.Infof("pct: SST discovered: pkg=%d punit=%d level=%d cpus=%s maxHpCpus=%d guaranteedHpCpus=%d (tf=%v bf=%v)", + pkgID, pid, level, cpus, max, gtd, pi.TF.Supported, pi.BF.Supported) + } else { + log.Infof("pct: SST discovered: pkg=%d punit=%d level=%d cpus=%s maxHpCpus=0 (no PerfLevelInfo)", + pkgID, pid, level, cpus) + } + out = append(out, pctPunit{ + PkgID: pkgID, + PunitID: pid, + CPUs: cpus, + MaxHpCpus: max, + GuaranteedHpCpus: gtd, + }) + } + } + return out +} + +// punitMaxHpCpus returns the maximum number of CPUs that can be +// promoted to high priority on this punit at the queried PP +// level. SST-TF takes precedence: the largest bucket's +// HighPriorityCoreCount sets the upper bound (smaller buckets +// allow higher turbo but admit fewer HP cores -- the allocator +// only needs to know the cap). When TF is unsupported or all +// buckets are empty, fall back to len(BF.HighPriorityCPUs); BF +// guarantees those CPUs run at an elevated *base* frequency, so +// the count is exact. Returns 0 only when neither feature +// exposes any HP CPUs. +func punitMaxHpCpus(pi *gosst.PerfLevelInfo) int { + if pi == nil { + return 0 + } + max := 0 + if pi.TF.Supported { + for _, b := range pi.TF.Buckets { + if b.HighPriorityCoreCount > max { + max = b.HighPriorityCoreCount + } + } + } + if max == 0 && pi.BF.Supported { + max = len(pi.BF.HighPriorityCPUs) + } + return max +} + +// punitGuaranteedHpCpus returns the count of HP CPUs that can +// simultaneously reach the platform's highest exposed turbo +// frequency on this punit. 
With SST-TF, smaller buckets unlock +// higher turbo frequencies, so the smallest non-zero +// HighPriorityCoreCount across buckets is the figure of merit: +// staying at or below it lets every HP CPU sustain the top-bucket +// frequency. When TF is unsupported, fall back to +// len(BF.HighPriorityCPUs) -- BF guarantees those CPUs run at the +// elevated base frequency, and there is no further headroom to +// reserve. Returns 0 when neither feature exposes HP capacity. +func punitGuaranteedHpCpus(pi *gosst.PerfLevelInfo) int { + if pi == nil { + return 0 + } + if pi.TF.Supported { + min := 0 + for _, b := range pi.TF.Buckets { + if b.HighPriorityCoreCount <= 0 { + continue + } + if min == 0 || b.HighPriorityCoreCount < min { + min = b.HighPriorityCoreCount + } + } + if min > 0 { + return min + } + } + if pi.BF.Supported { + return len(pi.BF.HighPriorityCPUs) + } + return 0 +} + +func (b *sstGoresctrl) Supported() bool { return b.plat != nil } + +func (b *sstGoresctrl) ClosCount() int { + if b.plat == nil { + return 0 + } + return b.plat.ClosCount() +} + +func (b *sstGoresctrl) PackageIDs() []int { + if b.plat == nil { + return nil + } + seen := map[int]bool{} + ids := []int{} + for _, pu := range b.punits { + if seen[pu.PkgID] { + continue + } + seen[pu.PkgID] = true + ids = append(ids, pu.PkgID) + } + sort.Ints(ids) + return ids +} + +func (b *sstGoresctrl) CPUsOfPackage(pkgID int) []int { + if b.plat == nil { + return nil + } + out := []int{} + for _, pu := range b.punits { + if pu.PkgID != pkgID { + continue + } + out = append(out, pu.CPUs.UnsortedList()...) + } + sort.Ints(out) + return out +} + +// Punits returns the cached per-punit topology and HP capacity. +func (b *sstGoresctrl) Punits() []pctPunit { + if b.plat == nil { + return nil + } + // Return a defensive copy so callers cannot mutate cached state. 
+ out := make([]pctPunit, len(b.punits)) + copy(out, b.punits) + return out +} + +func (b *sstGoresctrl) PrepareManagedMode() error { + if b.plat == nil { + return fmt.Errorf("SST not supported on this host") + } + for _, pkg := range b.plat.Packages() { + if err := pkg.CPReset(); err != nil { + return fmt.Errorf("CPReset on package %d: %w", pkg.ID(), err) + } + if err := pkg.TFEnable(); err != nil { + return fmt.Errorf("TFEnable on package %d: %w", pkg.ID(), err) + } + if err := pkg.CPSetPriorityType(gosst.Ordered); err != nil { + return fmt.Errorf("CPSetPriorityType on package %d: %w", pkg.ID(), err) + } + } + return nil +} + +func (b *sstGoresctrl) ConfigureClos(cfg pctClosConfig) error { + if b.plat == nil { + return fmt.Errorf("SST not supported on this host") + } + // pctClosConfig stores frequencies in kHz; goresctrl ClosConfig + // uses MHz (max ratio-encoded 25500 MHz on mbox platforms). + cc := gosst.ClosConfig{MinFreq: cfg.MinFreq / 1000, MaxFreq: cfg.MaxFreq / 1000} + for _, pkg := range b.plat.Packages() { + if err := pkg.ClosConfigure(cfg.ClosID, cc); err != nil { + return fmt.Errorf("ClosConfigure(%d) on package %d: %w", cfg.ClosID, pkg.ID(), err) + } + } + return nil +} + +func (b *sstGoresctrl) EnableCP() error { + if b.plat == nil { + return fmt.Errorf("SST not supported on this host") + } + for _, pkg := range b.plat.Packages() { + if err := pkg.CPEnable(); err != nil { + return fmt.Errorf("CPEnable on package %d: %w", pkg.ID(), err) + } + } + return nil +} + +func (b *sstGoresctrl) AssociateCPUs(assocs []pctClosAssoc) error { + if b.plat == nil { + return fmt.Errorf("SST not supported on this host") + } + byClos := map[int]utils.IDSet{} + for _, a := range assocs { + if _, ok := byClos[a.ClosID]; !ok { + byClos[a.ClosID] = utils.NewIDSet() + } + byClos[a.ClosID].Add(utils.ID(a.CPU)) + } + for clos, cpus := range byClos { + if err := b.plat.ClosAssociate(clos, cpus); err != nil { + return fmt.Errorf("ClosAssociate(%d) for cpus %s: %w", clos, 
cpus, err) + } + } + return nil +} + +func (b *sstGoresctrl) GetCPUClosID(cpu int) (int, error) { + if b.plat == nil { + return 0, fmt.Errorf("SST not supported on this host") + } + return b.plat.GetCPUClosID(utils.ID(cpu)) +} + +func (b *sstGoresctrl) TFStatus() (map[pctPunitID]bool, error) { + out := map[pctPunitID]bool{} + if b.plat == nil { + return out, nil + } + for _, pkg := range b.plat.Packages() { + st, err := pkg.GetStatus() + if err != nil { + return nil, fmt.Errorf("TFStatus: package %d status: %w", pkg.ID(), err) + } + for pid, pu := range st.Punits { + out[pctPunitID{PkgID: pkg.ID(), PunitID: int(pid)}] = pu.TF.Enabled + } + } + return out, nil +} + +// GetClosConfig returns the frequency bounds programmed on CLOS +// closID, queried from the first package (CLOS programming is +// applied identically to every package by ConfigureClos). The +// second return value is false when SST is unsupported, the +// package status cannot be read, or closID is out of range. +func (b *sstGoresctrl) GetClosConfig(closID int) (pctClosCfg, bool, error) { + if b.plat == nil { + return pctClosCfg{}, false, nil + } + pkgs := b.plat.Packages() + if len(pkgs) == 0 { + return pctClosCfg{}, false, nil + } + st, err := pkgs[0].GetStatus() + if err != nil { + return pctClosCfg{}, false, fmt.Errorf("GetClosConfig: package %d status: %w", pkgs[0].ID(), err) + } + // Pick any punit -- per-package ConfigureClos programs all + // punits identically. goresctrl reports CLOS Config.Min/MaxFreq + // in MHz; convert to kHz so callers always see the same unit as + // they passed to ConfigureClos. + for _, pu := range st.Punits { + if closID < 0 || closID >= len(pu.Clos) { + return pctClosCfg{}, false, nil + } + return pctClosCfg{ + MinFreq: pu.Clos[closID].Config.MinFreq * 1000, + MaxFreq: pu.Clos[closID].Config.MaxFreq * 1000, + }, true, nil + } + return pctClosCfg{}, false, nil +} + +// MaxHpCpus method removed in favor of Punits(). 
+ +func (b *sstGoresctrl) Shutdown() error { + if b.plat == nil { + return nil + } + for _, pkg := range b.plat.Packages() { + if err := pkg.CPReset(); err != nil { + return fmt.Errorf("CPReset on package %d: %w", pkg.ID(), err) + } + if err := pkg.TFDisable(); err != nil { + return fmt.Errorf("TFDisable on package %d: %w", pkg.ID(), err) + } + if err := pkg.CPDisable(); err != nil { + return fmt.Errorf("CPDisable on package %d: %w", pkg.ID(), err) + } + } + return nil +} diff --git a/pkg/resmgr/cpuclass/internal/pct/pct_sst_mock.go b/pkg/resmgr/cpuclass/internal/pct/pct_sst_mock.go new file mode 100644 index 000000000..77bb62777 --- /dev/null +++ b/pkg/resmgr/cpuclass/internal/pct/pct_sst_mock.go @@ -0,0 +1,452 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package pct + +import ( + "encoding/json" + "fmt" + "os" + "path/filepath" + "sort" + + "github.com/containers/nri-plugins/pkg/utils/cpuset" +) + +// sstOverrideEnvVar holds JSON seeding the in-memory SST mock. +// Follows the existing OVERRIDE_SYS_CACHES / OVERRIDE_SYS_CPUFREQ +// convention in pkg/sysfs/system.go. +const ( + sstOverrideEnvVar = "OVERRIDE_SST" + sstOverrideStateDirVar = "OVERRIDE_SST_STATE_DIR" + sstOverrideStateFile = "state.json" +) + +// sstMockClos seeds the per-CLOS state of one package. 
type sstMockClos struct {
	// ID is the CLOS id this entry describes.
	ID int `json:"id"`
	// MinFreq and MaxFreq are the CLOS frequency bounds. The mock's
	// ConfigureClos copies them verbatim from pctClosConfig, so they
	// share its unit (presumably kHz -- confirm against pctClosConfig).
	MinFreq int `json:"min_freq"`
	MaxFreq int `json:"max_freq"`
	// CPUs lists the CPUs associated to this CLOS. In the seed it
	// pre-associates CPUs; AssociateCPUs rewrites it so the persisted
	// state stays readable.
	CPUs string `json:"cpus,omitempty"` // listset like "0-15"
}
+type sstMock struct { + doc *sstMockDoc + cpuPkg map[int]*sstMockPackage // cpu -> package + cpuClos map[int]int // cpu -> currently-associated CLOS id + stateDir string +} + +func newSstMock(jsonData string) (sst, error) { + doc := &sstMockDoc{} + if err := json.Unmarshal([]byte(jsonData), doc); err != nil { + return nil, fmt.Errorf("failed to parse %s JSON: %w", sstOverrideEnvVar, err) + } + if doc.ClosCount == 0 { + doc.ClosCount = 4 + } + b := &sstMock{ + doc: doc, + cpuPkg: map[int]*sstMockPackage{}, + cpuClos: map[int]int{}, + stateDir: os.Getenv(sstOverrideStateDirVar), + } + if b.stateDir == "" { + b.stateDir = "/tmp/nri-pct-mock" + } + for _, pkg := range doc.Packages { + cpus, err := parseCPUList(pkg.CPUs) + if err != nil { + return nil, fmt.Errorf("%s: invalid cpus %q in package %d: %w", sstOverrideEnvVar, pkg.CPUs, pkg.ID, err) + } + for _, c := range cpus { + b.cpuPkg[c] = pkg + b.cpuClos[c] = 0 + } + // If seed pre-associates CPUs to non-zero CLOSes, honor that. + for _, cl := range pkg.Clos { + if cl.CPUs == "" { + continue + } + clCpus, err := parseCPUList(cl.CPUs) + if err != nil { + return nil, fmt.Errorf("%s: invalid clos.cpus %q: %w", sstOverrideEnvVar, cl.CPUs, err) + } + for _, c := range clCpus { + b.cpuClos[c] = cl.ID + } + } + } + if err := b.persist(); err != nil { + log.Warnf("pct mock: failed to write initial state file: %v", err) + } + log.Infof("pct mock: seeded with %d package(s), supported=%v, closCount=%d, stateDir=%q", + len(doc.Packages), doc.Supported, doc.ClosCount, b.stateDir) + return b, nil +} + +func (b *sstMock) Supported() bool { return b.doc.Supported } + +func (b *sstMock) ClosCount() int { return b.doc.ClosCount } + +func (b *sstMock) PackageIDs() []int { + ids := make([]int, 0, len(b.doc.Packages)) + for _, p := range b.doc.Packages { + ids = append(ids, p.ID) + } + sort.Ints(ids) + return ids +} + +func (b *sstMock) CPUsOfPackage(pkgID int) []int { + for _, p := range b.doc.Packages { + if p.ID == pkgID { + cpus, _ 
:= parseCPUList(p.CPUs) + return cpus + } + } + return nil +} + +func (b *sstMock) pkgEnsureClos(pkg *sstMockPackage, clos int) *sstMockClos { + for _, c := range pkg.Clos { + if c.ID == clos { + return c + } + } + c := &sstMockClos{ID: clos} + pkg.Clos = append(pkg.Clos, c) + sort.Slice(pkg.Clos, func(i, j int) bool { return pkg.Clos[i].ID < pkg.Clos[j].ID }) + return c +} + +func (b *sstMock) PrepareManagedMode() error { + for _, pkg := range b.doc.Packages { + // CPReset: clear CLOS configs, associate all CPUs to CLOS 0. + pkg.Clos = nil + cpus, _ := parseCPUList(pkg.CPUs) + for _, c := range cpus { + b.cpuClos[c] = 0 + } + pkg.TFEnabled = true + pkg.CPPriority = "ordered" + } + log.Debugf("pct mock: PrepareManagedMode done (CPReset+TFEnable+CPSetPriorityType=ordered)") + return b.persist() +} + +func (b *sstMock) ConfigureClos(cfg pctClosConfig) error { + for _, pkg := range b.doc.Packages { + c := b.pkgEnsureClos(pkg, cfg.ClosID) + c.MinFreq = cfg.MinFreq + c.MaxFreq = cfg.MaxFreq + } + log.Debugf("pct mock: ConfigureClos %+v", cfg) + return b.persist() +} + +func (b *sstMock) EnableCP() error { + for _, pkg := range b.doc.Packages { + pkg.CPEnabled = true + } + log.Debugf("pct mock: EnableCP done") + return b.persist() +} + +func (b *sstMock) AssociateCPUs(assocs []pctClosAssoc) error { + for _, a := range assocs { + if _, ok := b.cpuPkg[a.CPU]; !ok { + return fmt.Errorf("pct mock: CPU %d not present in any seeded package", a.CPU) + } + b.cpuClos[a.CPU] = a.ClosID + } + // Refresh per-CLOS CPU lists on each package for readable state. 
+ for _, pkg := range b.doc.Packages { + clos2cpus := map[int][]int{} + cpus, _ := parseCPUList(pkg.CPUs) + for _, c := range cpus { + cl := b.cpuClos[c] + clos2cpus[cl] = append(clos2cpus[cl], c) + } + for _, cl := range pkg.Clos { + cl.CPUs = formatCPUList(clos2cpus[cl.ID]) + delete(clos2cpus, cl.ID) + } + for clID, list := range clos2cpus { + c := b.pkgEnsureClos(pkg, clID) + c.CPUs = formatCPUList(list) + } + } + log.Debugf("pct mock: AssociateCPUs %+v", assocs) + return b.persist() +} + +func (b *sstMock) GetCPUClosID(cpu int) (int, error) { + cl, ok := b.cpuClos[cpu] + if !ok { + return 0, fmt.Errorf("pct mock: CPU %d not present in any seeded package", cpu) + } + return cl, nil +} + +// Punits returns the per-punit topology of every seeded package. +// If a package's seed omits the Punits list, a single synthetic +// punit (ID 0) is returned spanning every CPU of the package, +// carrying the package-level MaxHpCpus for back-compat with the +// pre-punit OVERRIDE_SST schema. +func (b *sstMock) Punits() []pctPunit { + out := []pctPunit{} + // Stable order: sort packages by ID, punits by ID. + pkgIDs := make([]int, 0, len(b.doc.Packages)) + pkgByID := map[int]*sstMockPackage{} + for _, p := range b.doc.Packages { + pkgIDs = append(pkgIDs, p.ID) + pkgByID[p.ID] = p + } + sort.Ints(pkgIDs) + for _, pid := range pkgIDs { + pkg := pkgByID[pid] + if len(pkg.Punits) == 0 { + cpus, _ := parseCPUList(pkg.CPUs) + out = append(out, pctPunit{ + PkgID: pkg.ID, + PunitID: 0, + CPUs: cpuset.New(cpus...), + MaxHpCpus: pkg.MaxHpCpus, + GuaranteedHpCpus: pkg.MaxHpCpus, + }) + continue + } + punits := append([]*sstMockPunit(nil), pkg.Punits...) 
+ sort.Slice(punits, func(i, j int) bool { return punits[i].ID < punits[j].ID }) + for _, pu := range punits { + cpus, _ := parseCPUList(pu.CPUs) + gtd := pu.GuaranteedHpCpus + if gtd == 0 { + gtd = pu.MaxHpCpus + } + out = append(out, pctPunit{ + PkgID: pkg.ID, + PunitID: pu.ID, + CPUs: cpuset.New(cpus...), + MaxHpCpus: pu.MaxHpCpus, + GuaranteedHpCpus: gtd, + }) + } + } + return out +} + +// GetClosConfig returns the frequency bounds currently programmed +// for closID. The mock's CLOS state is shared across packages by +// construction (ConfigureClos writes it to all packages); we +// return the first package's entry. +func (b *sstMock) GetClosConfig(closID int) (pctClosCfg, bool, error) { + for _, pkg := range b.doc.Packages { + for _, cl := range pkg.Clos { + if cl.ID != closID { + continue + } + return pctClosCfg{MinFreq: cl.MinFreq, MaxFreq: cl.MaxFreq}, true, nil + } + // First package checked, no entry for closID. + return pctClosCfg{}, false, nil + } + return pctClosCfg{}, false, nil +} + +// TFStatus mirrors the per-package TFEnabled flag onto each of +// the package's punits (the mock's TF state is per-package). 
// parseCPUList parses a listset string like "0-3,8,10-12" into a
// sorted slice of ids. An empty string yields (nil, nil). Empty
// elements between commas are ignored. Malformed elements -- a
// non-numeric bound, trailing characters after a number, or a range
// whose start is greater than its end -- are reported as errors
// instead of being silently dropped or truncated.
func parseCPUList(s string) ([]int, error) {
	if s == "" {
		return nil, nil
	}
	out := []int{}
	for _, part := range splitComma(s) {
		if part == "" {
			continue
		}
		lo, hi, err := parseRange(part)
		if err != nil {
			return nil, err
		}
		for i := lo; i <= hi; i++ {
			out = append(out, i)
		}
	}
	sort.Ints(out)
	return out, nil
}

// formatCPUList formats an int slice as a listset like "0-3,8,10-12".
// The input is not modified; an empty input yields "".
func formatCPUList(ids []int) string {
	if len(ids) == 0 {
		return ""
	}
	sorted := append([]int(nil), ids...)
	sort.Ints(sorted)
	var parts []string
	lo := sorted[0]
	prev := lo
	// flush emits the run [lo, prev] collected so far.
	flush := func() {
		if lo == prev {
			parts = append(parts, fmt.Sprintf("%d", lo))
		} else {
			parts = append(parts, fmt.Sprintf("%d-%d", lo, prev))
		}
	}
	for _, v := range sorted[1:] {
		if v == prev+1 {
			prev = v
			continue
		}
		flush()
		lo, prev = v, v
	}
	flush()
	return joinComma(parts)
}

// splitComma splits s at every ','. A trailing empty element (after
// a final comma) is not reported; other empty elements are.
func splitComma(s string) []string {
	out := []string{}
	cur := ""
	for _, r := range s {
		if r == ',' {
			out = append(out, cur)
			cur = ""
			continue
		}
		cur += string(r)
	}
	if cur != "" {
		out = append(out, cur)
	}
	return out
}

// joinComma concatenates parts with ',' between consecutive elements.
func joinComma(parts []string) string {
	out := ""
	for i, p := range parts {
		if i > 0 {
			out += ","
		}
		out += p
	}
	return out
}

// parseRange parses one listset element: either a single id "N"
// (returned as N, N) or an inclusive range "LO-HI". A range with
// LO > HI is rejected, since it would otherwise expand to nothing
// and hide a typo in the seed data.
func parseRange(s string) (int, int, error) {
	for i, r := range s {
		if r == '-' {
			lo, err := atoi(s[:i])
			if err != nil {
				return 0, 0, err
			}
			hi, err := atoi(s[i+1:])
			if err != nil {
				return 0, 0, err
			}
			if lo > hi {
				return 0, 0, fmt.Errorf("invalid range %q: start %d is greater than end %d", s, lo, hi)
			}
			return lo, hi, nil
		}
	}
	v, err := atoi(s)
	if err != nil {
		return 0, 0, err
	}
	return v, v, nil
}

// atoi parses s as a decimal integer. Surrounding whitespace is
// tolerated, but any other text after the number is an error.
func atoi(s string) (int, error) {
	var v int
	var rest string
	// The extra %s operand only matches when non-space input follows
	// the number, so the count of scanned items tells the cases
	// apart: 0 = no integer, 1 = integer only, 2 = trailing garbage.
	n, err := fmt.Sscanf(s, "%d%s", &v, &rest)
	switch {
	case n == 0:
		return 0, fmt.Errorf("invalid integer %q: %w", s, err)
	case n > 1:
		return 0, fmt.Errorf("invalid integer %q: unexpected trailing characters %q", s, rest)
	}
	return v, nil
}
// fakePackage implements sysfs.CPUPackage via an embedded nil
// interface. Only ID and CPUSet are overridden; any other method of
// the interface dispatches to the nil embedded value and panics if
// called, which is the desired guardrail in unit tests.
type fakePackage struct {
	sysfs.CPUPackage
	id   idset.ID      // value reported by ID()
	cpus cpuset.CPUSet // value reported by CPUSet()
}

// ID returns the package id the fake was constructed with.
func (p *fakePackage) ID() idset.ID { return p.id }

// CPUSet returns the CPU set the fake was constructed with.
func (p *fakePackage) CPUSet() cpuset.CPUSet { return p.cpus }
+type fakeSys struct { + packageCpus map[idset.ID]cpuset.CPUSet // pkgID -> cpus + cpuPkg map[int]idset.ID // cpu -> pkgID +} + +func (s *fakeSys) PackageIDs() []idset.ID { + ids := make([]idset.ID, 0, len(s.packageCpus)) + for id := range s.packageCpus { + ids = append(ids, id) + } + return ids +} + +func (s *fakeSys) Package(id idset.ID) sysfs.CPUPackage { + cpus, ok := s.packageCpus[id] + if !ok { + return nil + } + return &fakePackage{id: id, cpus: cpus} +} + +func (s *fakeSys) CPU(id idset.ID) sysfs.CPU { + pkg, ok := s.cpuPkg[int(id)] + if !ok { + return nil + } + return &fakeCPU{id: id, pkg: pkg} +} + +func (s *fakeSys) CPUIDs() []idset.ID { return nil } + +// newTwoPackageFakeSys returns a fakeSys with two packages of 4 CPUs +// each: pkg0=0..3, pkg1=4..7. +func newTwoPackageFakeSys() *fakeSys { + return &fakeSys{ + packageCpus: map[idset.ID]cpuset.CPUSet{ + 0: cpuset.MustParse("0-3"), + 1: cpuset.MustParse("4-7"), + }, + cpuPkg: map[int]idset.ID{ + 0: 0, 1: 0, 2: 0, 3: 0, + 4: 1, 5: 1, 6: 1, 7: 1, + }, + } +} + +// --- minimal sst fake ------------------------------------------------ + +// fakeSst implements just the methods that Allocator.hints (and +// closCpus) actually call. +type fakeSst struct { + supported bool + cpuClos map[int]int // cpu -> CLOS id + maxHp map[int]int // pkgID -> max HP CPUs (missing = "unknown") + pkgCpus map[int]cpuset.CPUSet + // punits, when non-nil, overrides the synthesized one-punit-per-package + // Punits() output. Use to exercise multi-punit-per-package layouts. + punits []pctPunit + // closCfg, when non-nil, drives GetClosConfig() responses. 
+ closCfg map[int]pctClosCfg +} + +func (s *fakeSst) Supported() bool { return s.supported } +func (s *fakeSst) ClosCount() int { return 4 } +func (s *fakeSst) PackageIDs() []int { return nil } +func (s *fakeSst) CPUsOfPackage(int) []int { return nil } +func (s *fakeSst) PrepareManagedMode() error { return nil } +func (s *fakeSst) ConfigureClos(pctClosConfig) error { return nil } +func (s *fakeSst) EnableCP() error { return nil } +func (s *fakeSst) AssociateCPUs([]pctClosAssoc) error { return nil } +func (s *fakeSst) GetCPUClosID(cpu int) (int, error) { + if clos, ok := s.cpuClos[cpu]; ok { + return clos, nil + } + // Return an error so closCpus skips this CPU rather than + // treating it as "associated to CLOS 0 by default". + return -1, errFakeSstNoClos +} + +// Punits synthesizes one punit per package whose CPUs come from +// pkgCpus (or maxHp keys if pkgCpus is nil) with MaxHpCpus set +// from the maxHp map. PunitID is always 0 (single punit per pkg +// preserves the legacy per-package test semantics). +func (s *fakeSst) Punits() []pctPunit { + if s.punits != nil { + out := make([]pctPunit, len(s.punits)) + copy(out, s.punits) + return out + } + pkgIDs := map[int]struct{}{} + for id := range s.pkgCpus { + pkgIDs[id] = struct{}{} + } + for id := range s.maxHp { + pkgIDs[id] = struct{}{} + } + out := make([]pctPunit, 0, len(pkgIDs)) + for id := range pkgIDs { + cpus, ok := s.pkgCpus[id] + if !ok { + // Derive a default cpu range matching newTwoPackageFakeSys layout. 
+ switch id { + case 0: + cpus = cpuset.MustParse("0-3") + case 1: + cpus = cpuset.MustParse("4-7") + } + } + out = append(out, pctPunit{ + PkgID: id, + PunitID: 0, + CPUs: cpus, + MaxHpCpus: s.maxHp[id], + }) + } + sort.Slice(out, func(i, j int) bool { + if out[i].PkgID != out[j].PkgID { + return out[i].PkgID < out[j].PkgID + } + return out[i].PunitID < out[j].PunitID + }) + return out +} + +func (s *fakeSst) GetClosConfig(closID int) (pctClosCfg, bool, error) { + if c, ok := s.closCfg[closID]; ok { + return c, true, nil + } + return pctClosCfg{}, false, nil +} + +func (s *fakeSst) Shutdown() error { return nil } + +func (s *fakeSst) TFStatus() (map[pctPunitID]bool, error) { + // Tests do not care about SST-TF; report enabled everywhere. + out := map[pctPunitID]bool{} + for _, pu := range s.Punits() { + out[pctPunitID{PkgID: pu.PkgID, PunitID: pu.PunitID}] = true + } + return out, nil +} + +// --- helpers to construct a hand-wired Allocator ----------------- + +func newManagedPctForTest(t *testing.T, classes []*policyapi.CPUClass, plans map[string]*pctClassPlan, + allowed cpuset.CPUSet, sys *fakeSys, sst *fakeSst) *Allocator { + t.Helper() + a := &Allocator{ + sys: sys, + sst: sst, + mode: pctModeManaged, + classByName: map[string]*policyapi.CPUClass{}, + classPlan: plans, + allowed: allowed, + hpUsed: map[int]cpuset.CPUSet{}, + hpClasses: map[string]bool{}, + } + for _, cc := range classes { + a.classByName[cc.Name] = cc + if cc.PctPriority == "high" { + a.hpClasses[cc.Name] = true + } + } + pctTestWirePunits(a) + return a +} + +// pctTestWirePunits seeds a hand-built Allocator's punit caches +// from its sst's Punits(), intersected with allowed. It is the +// test-time equivalent of snapshotPunits() and lets struct-literal +// fixtures exercise the punit-keyed code paths. 
+func pctTestWirePunits(a *Allocator) { + if a.punitByCpu == nil { + a.punitByCpu = map[int]int{} + } + if a.hpClasses == nil { + a.hpClasses = map[string]bool{} + } + if a.hpEligiblePunit == nil { + a.hpEligiblePunit = map[int]bool{} + } + for name, cc := range a.classByName { + if cc.PctPriority == "high" { + a.hpClasses[name] = true + } + } + pus := a.sst.Punits() + a.punits = a.punits[:0] + for _, pu := range pus { + cpus := pu.CPUs + if a.allowed.Size() > 0 { + cpus = cpus.Intersection(a.allowed) + } + if cpus.IsEmpty() { + continue + } + idx := len(a.punits) + a.punits = append(a.punits, pctPunit{ + PkgID: pu.PkgID, PunitID: pu.PunitID, + CPUs: cpus, MaxHpCpus: pu.MaxHpCpus, + GuaranteedHpCpus: pu.GuaranteedHpCpus, + }) + for _, c := range cpus.UnsortedList() { + a.punitByCpu[c] = idx + } + // Default to HP-eligible so existing tests that don't + // care about TF state keep working. Tests that exercise + // HP-ineligibility set hpEligiblePunit explicitly after + // calling this helper. + a.hpEligiblePunit[idx] = true + } +} + +// --- hints() test suite --------------------------------------------- + +// TestPctHintsNoClassNoOp covers the "no plan and not managed-with-HP" +// branch where hints() must return an empty types.AllocationHints. +func TestPctHintsNoClassNoOp(t *testing.T) { + sys := newTwoPackageFakeSys() + sst := &fakeSst{supported: true} + + // disabled allocator: hints must short-circuit to empty. + a := &Allocator{sys: sys, sst: sst, mode: pctModeDisabled} + got := a.Hints(types.AllocationIntent{ClassName: "anything"}) + if len(got.Prefer) != 0 || len(got.Avoid) != 0 { + t.Errorf("disabled mode: hints=%+v, want empty", got) + } + + // managed mode with no HP class defined and an unknown + // className: no prefer, no avoid. + classes := []*policyapi.CPUClass{{Name: "lp", PctPriority: "low"}} + // "lp" is configured but classIsHighPriority is false; still the + // "anyHighPriorityClassDefined" gate must be false so no Avoid. 
+ a2 := newManagedPctForTest(t, classes, + map[string]*pctClassPlan{"lp": {ClosID: 3}}, + cpuset.MustParse("0-7"), sys, sst) + got = a2.Hints(types.AllocationIntent{ClassName: "unknown-class"}) + if len(got.Avoid) != 0 { + t.Errorf("no HP class: Avoid=%+v, want empty", got.Avoid) + } +} + +// TestPctHintsAssocOnlyPreferClosCpus covers the "explicit CLOS plan" +// branch in assoc-only mode: hints prefer CPUs already associated to +// the class's CLOS, enabling bin packing. +func TestPctHintsAssocOnlyPreferClosCpus(t *testing.T) { + sys := newTwoPackageFakeSys() + sst := &fakeSst{ + supported: true, + // cpus 2 and 3 already on CLOS 1, others on default CLOS 0. + cpuClos: map[int]int{2: 1, 3: 1}, + } + a := &Allocator{ + sys: sys, + sst: sst, + mode: pctModeAssocOnly, + classByName: map[string]*policyapi.CPUClass{"c1": {Name: "c1"}}, + classPlan: map[string]*pctClassPlan{"c1": {ClosID: 1}}, + allowed: cpuset.MustParse("0-7"), + hpUsed: map[int]cpuset.CPUSet{}, + } + pctTestWirePunits(a) + got := a.Hints(types.AllocationIntent{ClassName: "c1"}) + if len(got.Prefer) != 1 { + t.Fatalf("Prefer count = %d, want 1: got=%+v", len(got.Prefer), got) + } + if got.Prefer[0].Name != virtDevSstClosHint(1) { + t.Errorf("Prefer[0].Name = %q, want %q", got.Prefer[0].Name, virtDevSstClosHint(1)) + } + want := cpuset.MustParse("2-3") + if !got.Prefer[0].Cpus.Equals(want) { + t.Errorf("Prefer[0].Cpus = %s, want %s", got.Prefer[0].Cpus, want) + } + if len(got.Avoid) != 0 { + t.Errorf("assoc-only mode must not emit Avoid hints: %+v", got.Avoid) + } +} + +// TestPctHintsHighPriorityReserveAndClosCpus covers the HP class +// branch: hints contain (a) CPUs already on the HP CLOS for bin +// packing and (b) the HP-reserve preference (largest-room package). +func TestPctHintsHighPriorityReserveAndClosCpus(t *testing.T) { + sys := newTwoPackageFakeSys() + sst := &fakeSst{ + supported: true, + // cpu 0 already on CLOS 0 (HP). 
+ cpuClos: map[int]int{0: 0}, + // max_hp_cpus = 2 per package on both packages. + maxHp: map[int]int{0: 2, 1: 2}, + } + a := &Allocator{ + sys: sys, + sst: sst, + mode: pctModeManaged, + classByName: map[string]*policyapi.CPUClass{ + "hp": {Name: "hp", PctPriority: "high"}, + }, + classPlan: map[string]*pctClassPlan{"hp": {ClosID: 0}}, + allowed: cpuset.MustParse("0-7"), + // pkg0 has 1 HP cpu already used (cpu 0). + hpUsed: map[int]cpuset.CPUSet{0: cpuset.MustParse("0")}, + } + pctTestWirePunits(a) + + // Free pool excludes the already-used cpu 0. + free := cpuset.MustParse("1-7") + got := a.Hints(types.AllocationIntent{ + ClassName: "hp", + CurrentCpus: cpuset.New(), + FreeCpus: free, + RequestedCount: 1, + }) + + // Expect two Prefer hints: CLOS 0 members (cpu 0) and HP reserve + // (the package with more HP room - pkg1, since pkg0 has 2-1=1 + // room left and pkg1 has 2-0=2 room left). + if len(got.Prefer) != 2 { + t.Fatalf("Prefer count = %d, want 2: got=%+v", len(got.Prefer), got.Prefer) + } + if got.Prefer[0].Name != virtDevSstClosHint(0) { + t.Errorf("Prefer[0].Name = %q, want %q", got.Prefer[0].Name, virtDevSstClosHint(0)) + } + if got.Prefer[1].Name != virtDevSstHpReserveHint { + t.Errorf("Prefer[1].Name = %q, want %q", got.Prefer[1].Name, virtDevSstHpReserveHint) + } + wantReserve := cpuset.MustParse("4-7") + if !got.Prefer[1].Cpus.Equals(wantReserve) { + t.Errorf("HP reserve = %s, want %s (largest-room package)", got.Prefer[1].Cpus, wantReserve) + } + // HP-class hints must NOT carry an Avoid (HP picks first). + if len(got.Avoid) != 0 { + t.Errorf("HP class: Avoid=%+v, want empty", got.Avoid) + } +} + +// TestPctHintsManagedNonHpAvoidsHpInUse covers the managed-mode +// non-HP-class branch: hints must Avoid CPUs on packages currently +// hosting HP-class CPUs, so non-HP classes do not steal HP turbo +// budget. THIS BRANCH IS NOT COVERED IN test19 e2e. 
+func TestPctHintsManagedNonHpAvoidsHpInUse(t *testing.T) { + sys := newTwoPackageFakeSys() + sst := &fakeSst{ + supported: true, + cpuClos: map[int]int{}, + maxHp: map[int]int{0: 2, 1: 2}, + } + a := &Allocator{ + sys: sys, + sst: sst, + mode: pctModeManaged, + classByName: map[string]*policyapi.CPUClass{ + "hp": {Name: "hp", PctPriority: "high"}, + "lp": {Name: "lp", PctPriority: "low"}, + }, + classPlan: map[string]*pctClassPlan{ + "hp": {ClosID: 0}, + "lp": {ClosID: 3}, + }, + allowed: cpuset.MustParse("0-7"), + // pkg0 hosts HP cpu 1. + hpUsed: map[int]cpuset.CPUSet{0: cpuset.MustParse("1")}, + } + pctTestWirePunits(a) + got := a.Hints(types.AllocationIntent{ + ClassName: "lp", + FreeCpus: cpuset.MustParse("2-7"), + }) + + // LP has a CLOS plan, so Prefer must include CLOS 3 (empty in + // our setup) - but only if any CPU is currently on CLOS 3. With + // none, classClosID still matches but closCpus returns empty + // and the Prefer entry is skipped. So len(Prefer) == 0. + if len(got.Prefer) != 0 { + t.Errorf("Prefer = %+v, want empty (no LP CPUs currently on CLOS 3)", got.Prefer) + } + // Avoid must list pkg0's full CPU set (where HP is in use). + if len(got.Avoid) != 1 { + t.Fatalf("Avoid count = %d, want 1: got=%+v", len(got.Avoid), got.Avoid) + } + if got.Avoid[0].Name != virtDevSstHpInUseHint { + t.Errorf("Avoid[0].Name = %q, want %q", got.Avoid[0].Name, virtDevSstHpInUseHint) + } + wantAvoid := cpuset.MustParse("0-3") // entire pkg0 + if !got.Avoid[0].Cpus.Equals(wantAvoid) { + t.Errorf("Avoid[0].Cpus = %s, want %s (pkg0 == HP-in-use package)", got.Avoid[0].Cpus, wantAvoid) + } +} + +// TestPctHintsAllowedBoundsResults ensures that even with sst / +// hpUsed pointing at CPUs outside the allowed set, hints honor +// Allowed (via the handler-level intersectHints + pct-internal +// allowed intersections). 
+func TestPctHintsAllowedBoundsResults(t *testing.T) { + sys := newTwoPackageFakeSys() + sst := &fakeSst{ + supported: true, + cpuClos: map[int]int{0: 0, 4: 0}, // HP cpus on both packages + maxHp: map[int]int{0: 2, 1: 2}, + } + a := &Allocator{ + sys: sys, + sst: sst, + mode: pctModeManaged, + classByName: map[string]*policyapi.CPUClass{ + "hp": {Name: "hp", PctPriority: "high"}, + }, + classPlan: map[string]*pctClassPlan{"hp": {ClosID: 0}}, + // allowed restricts to pkg0 only. + allowed: cpuset.MustParse("0-3"), + hpUsed: map[int]cpuset.CPUSet{ + 0: cpuset.MustParse("0"), + 1: cpuset.MustParse("4"), // outside allowed + }, + } + pctTestWirePunits(a) + got := a.Hints(types.AllocationIntent{ + ClassName: "hp", + FreeCpus: cpuset.MustParse("1-3"), + RequestedCount: 1, + }) + // closCpus walks a.allowed, so cpu 4 is excluded automatically. + // Prefer[0] (closCpus) must contain only cpu 0. + if len(got.Prefer) == 0 { + t.Fatalf("Prefer empty, want at least closCpus hint") + } + if !got.Prefer[0].Cpus.Equals(cpuset.MustParse("0")) { + t.Errorf("Prefer[0].Cpus = %s, want {0} (cpu 4 outside allowed)", got.Prefer[0].Cpus) + } + // HP reserve must come from a package whose free CPUs are + // inside allowed; only pkg0 qualifies. + if len(got.Prefer) >= 2 { + want := cpuset.MustParse("1-3") + if !got.Prefer[1].Cpus.Equals(want) { + t.Errorf("HP reserve = %s, want %s (pkg0 free cpus inside allowed)", got.Prefer[1].Cpus, want) + } + } +} + +// --- Tier A/B/C reservation tests ---------------------------------- + +// newTwoPunitFakeSys returns a fakeSys whose package layout matches +// the standard two-punit-per-package fixture below: pkg0 = 0..7 +// (punit-0 = 0..3, punit-1 = 4..7), pkg1 = 8..15 (punit-2 = 8..11, +// punit-3 = 12..15). The synthesis function does not know about +// punits, only packages. 
+func newTwoPunitFakeSys() *fakeSys {
+	return &fakeSys{
+		packageCpus: map[idset.ID]cpuset.CPUSet{
+			0: cpuset.MustParse("0-7"),
+			1: cpuset.MustParse("8-15"),
+		},
+		cpuPkg: map[int]idset.ID{
+			0: 0, 1: 0, 2: 0, 3: 0, 4: 0, 5: 0, 6: 0, 7: 0,
+			8: 1, 9: 1, 10: 1, 11: 1, 12: 1, 13: 1, 14: 1, 15: 1,
+		},
+	}
+}
+
+// makeTwoPunitsPerPkg returns four punits laid out as in
+// newTwoPunitFakeSys, with the given MaxHpCpus per punit.
+func makeTwoPunitsPerPkg(hp0, hp1, hp2, hp3 int) []pctPunit {
+	return []pctPunit{
+		{PkgID: 0, PunitID: 0, CPUs: cpuset.MustParse("0-3"), MaxHpCpus: hp0},
+		{PkgID: 0, PunitID: 1, CPUs: cpuset.MustParse("4-7"), MaxHpCpus: hp1},
+		{PkgID: 1, PunitID: 2, CPUs: cpuset.MustParse("8-11"), MaxHpCpus: hp2},
+		{PkgID: 1, PunitID: 3, CPUs: cpuset.MustParse("12-15"), MaxHpCpus: hp3},
+	}
+}
+
+// TestPctHints_HpRoomTierAPunitWins: punit-0 is fully occupied by
+// HP work, punit-1 in the same package has full HP room. A request
+// for 1 HP CPU must steer to punit-1 (Tier A), not to pkg1.
+func TestPctHints_HpRoomTierAPunitWins(t *testing.T) {
+	sys := newTwoPunitFakeSys()
+	sst := &fakeSst{
+		supported: true,
+		punits:    makeTwoPunitsPerPkg(2, 2, 2, 2),
+	}
+	a := &Allocator{
+		sys:         sys,
+		sst:         sst,
+		mode:        pctModeManaged,
+		classByName: map[string]*policyapi.CPUClass{"hp": {Name: "hp", PctPriority: "high"}},
+		classPlan:   map[string]*pctClassPlan{"hp": {ClosID: 0}},
+		allowed:     cpuset.MustParse("0-15"),
+		// Punit-0 fully booked with HP (cpus 0,1 take both HP slots).
+		hpUsed: map[int]cpuset.CPUSet{0: cpuset.MustParse("0-1")},
+	}
+	pctTestWirePunits(a)
+
+	got := a.Hints(types.AllocationIntent{
+		ClassName:      "hp",
+		FreeCpus:       cpuset.MustParse("2-15"),
+		RequestedCount: 1,
+	})
+
+	// Find HP reserve hint.
+	var reserve cpuset.CPUSet
+	for _, p := range got.Prefer {
+		if p.Name == virtDevSstHpReserveHint {
+			reserve = p.Cpus
+		}
+	}
+	if reserve.IsEmpty() {
+		t.Fatalf("expected HP reserve hint, got Prefer=%+v", got.Prefer)
+	}
+	// Tier A: punit-0 is out (room=0). punit-1, punit-2 and punit-3
+	// all have room=2 and 4 free CPUs each, so they tie on largest
+	// room and on the free-CPU count tie-break. The last tie-break
+	// is iteration order over the punit slice, where punit-1 (slice
+	// index 1) comes first. So expect punit-1, i.e. the request
+	// stays in pkg0 instead of moving to pkg1.
+	want := cpuset.MustParse("4-7")
+	if !reserve.Equals(want) {
+		t.Errorf("Tier A HP reserve = %s, want %s (punit-1)", reserve, want)
+	}
+}
+
+// TestPctHints_HpRoomTierBSamePackage: punit-0 and punit-1 in pkg0
+// each have only 1 HP slot left, but together they offer 2 slots --
+// enough for the request. Pkg1 has only 1 HP slot in total. The
+// Tier-B aggregate must steer to pkg0 (free CPUs of both punits).
+func TestPctHints_HpRoomTierBSamePackage(t *testing.T) {
+	sys := newTwoPunitFakeSys()
+	sst := &fakeSst{
+		supported: true,
+		punits:    makeTwoPunitsPerPkg(2, 2, 1, 0),
+	}
+	a := &Allocator{
+		sys:         sys,
+		sst:         sst,
+		mode:        pctModeManaged,
+		classByName: map[string]*policyapi.CPUClass{"hp": {Name: "hp", PctPriority: "high"}},
+		classPlan:   map[string]*pctClassPlan{"hp": {ClosID: 0}},
+		allowed:     cpuset.MustParse("0-15"),
+		// Both pkg0 punits already host 1 HP CPU each, leaving room=1 in each.
+		hpUsed: map[int]cpuset.CPUSet{
+			0: cpuset.MustParse("0"), // punit-0 idx 0
+			1: cpuset.MustParse("4"), // punit-1 idx 1
+		},
+	}
+	pctTestWirePunits(a)
+
+	got := a.Hints(types.AllocationIntent{
+		ClassName:      "hp",
+		FreeCpus:       cpuset.MustParse("1-3,5-15"),
+		RequestedCount: 2,
+	})
+	var reserve cpuset.CPUSet
+	for _, p := range got.Prefer {
+		if p.Name == virtDevSstHpReserveHint {
+			reserve = p.Cpus
+		}
+	}
+	if reserve.IsEmpty() {
+		t.Fatalf("expected HP reserve hint, got Prefer=%+v", got.Prefer)
+	}
+	// Tier A is impossible: no single punit has room>=2 (pkg0
+	// punits have room=1 each, pkg1 punits room=1 and 0). Tier B:
+	// pkg0 sum-room=2 >= 2, pkg1 sum-room=1 < 2. Reserve = pkg0 free CPUs.
+	want := cpuset.MustParse("1-3,5-7")
+	if !reserve.Equals(want) {
+		t.Errorf("Tier B HP reserve = %s, want %s (pkg0 union)", reserve, want)
+	}
+}
+
+// TestPctHints_HpRoomTierCNoCrossPackage: request exceeds the HP
+// room of every single package. Tier C is never taken - the
+// allocator must return no HP-reserve hint so the caller falls back
+// to topology-only placement on the same socket.
+func TestPctHints_HpRoomTierCNoCrossPackage(t *testing.T) {
+	sys := newTwoPunitFakeSys()
+	sst := &fakeSst{
+		supported: true,
+		// pkg0 has 2 HP CPUs total, pkg1 has 2 HP CPUs total.
+		punits: makeTwoPunitsPerPkg(1, 1, 1, 1),
+	}
+	a := &Allocator{
+		sys:         sys,
+		sst:         sst,
+		mode:        pctModeManaged,
+		classByName: map[string]*policyapi.CPUClass{"hp": {Name: "hp", PctPriority: "high"}},
+		classPlan:   map[string]*pctClassPlan{"hp": {ClosID: 0}},
+		allowed:     cpuset.MustParse("0-15"),
+		hpUsed:      map[int]cpuset.CPUSet{},
+	}
+	pctTestWirePunits(a)
+
+	got := a.Hints(types.AllocationIntent{
+		ClassName:      "hp",
+		FreeCpus:       cpuset.MustParse("0-15"),
+		RequestedCount: 3, // > any single package's HP capacity (2)
+	})
+	for _, p := range got.Prefer {
+		if p.Name == virtDevSstHpReserveHint {
+			t.Errorf("Tier C must not emit HP reserve hint; got %+v", p)
+		}
+	}
+}
+
+// TestPctHints_HpInUseIsPunitGranular: managed-mode non-HP class
+// must Avoid only the punits currently hosting HP work, not the
+// entire package. This is a regression guard for the punit-keyed
+// rewrite of hpInUseCpus.
+func TestPctHints_HpInUseIsPunitGranular(t *testing.T) {
+	sys := newTwoPunitFakeSys()
+	sst := &fakeSst{
+		supported: true,
+		punits:    makeTwoPunitsPerPkg(2, 2, 2, 2),
+	}
+	a := &Allocator{
+		sys:  sys,
+		sst:  sst,
+		mode: pctModeManaged,
+		classByName: map[string]*policyapi.CPUClass{
+			"hp": {Name: "hp", PctPriority: "high"},
+			"lp": {Name: "lp", PctPriority: "low"},
+		},
+		classPlan: map[string]*pctClassPlan{
+			"hp": {ClosID: 0},
+			"lp": {ClosID: 3},
+		},
+		allowed: cpuset.MustParse("0-15"),
+		// HP work on punit-0 only (pkg0).
+		hpUsed: map[int]cpuset.CPUSet{0: cpuset.MustParse("0")},
+	}
+	pctTestWirePunits(a)
+
+	got := a.Hints(types.AllocationIntent{
+		ClassName: "lp",
+		FreeCpus:  cpuset.MustParse("1-15"),
+	})
+	if len(got.Avoid) != 1 {
+		t.Fatalf("Avoid count = %d, want 1: got=%+v", len(got.Avoid), got.Avoid)
+	}
+	// Must be punit-0 (cpus 0-3) ONLY, not all of pkg0 (0-7).
+	want := cpuset.MustParse("0-3")
+	if !got.Avoid[0].Cpus.Equals(want) {
+		t.Errorf("Avoid = %s, want %s (punit-0 only, not full pkg0)", got.Avoid[0].Cpus, want)
+	}
+}
+
+// --- classifyAssocOnlyHP tests -------------------------------------
+
+// TestPctClassifyAssocOnlyHP_MaxFreqWins: of two referenced CLOSes
+// with programmed MaxFreq, the larger MaxFreq is the HP class.
+func TestPctClassifyAssocOnlyHP_MaxFreqWins(t *testing.T) {
+	a := &Allocator{
+		sst: &fakeSst{
+			supported: true,
+			closCfg: map[int]pctClosCfg{
+				1: {MinFreq: 1000000, MaxFreq: 3000000}, // base-ish
+				2: {MinFreq: 2000000, MaxFreq: 3800000}, // turbo
+			},
+		},
+		classPlan: map[string]*pctClassPlan{
+			"c-base":  {ClosID: 1},
+			"c-turbo": {ClosID: 2},
+		},
+		hpClasses: map[string]bool{},
+	}
+	classes := []*policyapi.CPUClass{
+		{Name: "c-base"},
+		{Name: "c-turbo"},
+	}
+	a.classifyAssocOnlyHP(classes)
+	if a.hpClasses["c-base"] {
+		t.Errorf("c-base must NOT be classified HP (lower MaxFreq)")
+	}
+	if !a.hpClasses["c-turbo"] {
+		t.Errorf("c-turbo must be classified HP (higher MaxFreq)")
+	}
+}
+
+// TestPctClassifyAssocOnlyHP_TieBreakSmallerClos: when two CLOSes
+// share the highest MaxFreq, the smaller CLOS id wins (SST-CP
+// ordered-priority convention).
+func TestPctClassifyAssocOnlyHP_TieBreakSmallerClos(t *testing.T) {
+	a := &Allocator{
+		sst: &fakeSst{
+			supported: true,
+			closCfg: map[int]pctClosCfg{
+				1: {MaxFreq: 3800000},
+				2: {MaxFreq: 3800000}, // tie
+			},
+		},
+		classPlan: map[string]*pctClassPlan{
+			"c1": {ClosID: 1},
+			"c2": {ClosID: 2},
+		},
+		hpClasses: map[string]bool{},
+	}
+	a.classifyAssocOnlyHP([]*policyapi.CPUClass{{Name: "c1"}, {Name: "c2"}})
+	if !a.hpClasses["c1"] {
+		t.Errorf("c1 must win tie (smaller CLOS id)")
+	}
+	if a.hpClasses["c2"] {
+		t.Errorf("c2 must NOT be HP (lost tie)")
+	}
+}
+
+// TestPctClassifyAssocOnlyHP_NoProgrammedFreq: when no CLOS has a
+// programmed MaxFreq, no class is classified HP -- HP-specific
+// hints stay quiet.
+func TestPctClassifyAssocOnlyHP_NoProgrammedFreq(t *testing.T) { + a := &Allocator{ + sst: &fakeSst{supported: true, closCfg: map[int]pctClosCfg{}}, + classPlan: map[string]*pctClassPlan{ + "c1": {ClosID: 1}, + }, + hpClasses: map[string]bool{}, + } + a.classifyAssocOnlyHP([]*policyapi.CPUClass{{Name: "c1"}}) + if len(a.hpClasses) != 0 { + t.Errorf("hpClasses=%v, want empty when no CLOS has programmed MaxFreq", a.hpClasses) + } +} + +// TestPctClassifyAssocOnlyHP_ZeroMaxFreqIgnored: a CLOS that +// returns (cfg, true, nil) but with MaxFreq==0 must not be +// classified HP (zero is "not specified"). +func TestPctClassifyAssocOnlyHP_ZeroMaxFreqIgnored(t *testing.T) { + a := &Allocator{ + sst: &fakeSst{ + supported: true, + closCfg: map[int]pctClosCfg{1: {MinFreq: 1000000}}, // MaxFreq=0 + }, + classPlan: map[string]*pctClassPlan{"c1": {ClosID: 1}}, + hpClasses: map[string]bool{}, + } + a.classifyAssocOnlyHP([]*policyapi.CPUClass{{Name: "c1"}}) + if a.hpClasses["c1"] { + t.Errorf("c1 must NOT be HP when MaxFreq=0") + } +} + +// --- BF fallback test ---------------------------------------------- + +// TestPctPunitMaxHpCpus_BfFallback: punit with TF unsupported but +// BF-supported high-priority CPU set must report MaxHpCpus equal +// to len(BF.HighPriorityCPUs). +func TestPctPunitMaxHpCpus_BfFallback(t *testing.T) { + pi := &gosst.PerfLevelInfo{ + BF: gosst.BFInfo{ + Supported: true, + HighPriorityCPUs: idset.NewIDSet(0, 1, 2, 3), + }, + TF: gosst.TFInfo{Supported: false}, + } + if got := punitMaxHpCpus(pi); got != 4 { + t.Errorf("punitMaxHpCpus = %d, want 4 (BF fallback)", got) + } +} + +// TestPctPunitMaxHpCpus_TfWins: when both TF and BF are present, +// TF takes precedence (largest bucket HighPriorityCoreCount sets +// the cap). 
+func TestPctPunitMaxHpCpus_TfWins(t *testing.T) { + pi := &gosst.PerfLevelInfo{ + BF: gosst.BFInfo{ + Supported: true, + HighPriorityCPUs: idset.NewIDSet(0, 1), // 2 + }, + TF: gosst.TFInfo{ + Supported: true, + Buckets: []gosst.TFBucketInfo{ + {ID: 0, HighPriorityCoreCount: 1}, + {ID: 1, HighPriorityCoreCount: 4}, // max + {ID: 2, HighPriorityCoreCount: 2}, + }, + }, + } + if got := punitMaxHpCpus(pi); got != 4 { + t.Errorf("punitMaxHpCpus = %d, want 4 (largest TF bucket)", got) + } +} + +// TestPctPunitMaxHpCpus_NeitherSupported: with neither TF nor BF +// supported, MaxHpCpus is 0 (the allocator excludes such punits +// from HP steering). +func TestPctPunitMaxHpCpus_NeitherSupported(t *testing.T) { + pi := &gosst.PerfLevelInfo{} + if got := punitMaxHpCpus(pi); got != 0 { + t.Errorf("punitMaxHpCpus = %d, want 0", got) + } +} + +// TestPctPunitGuaranteedHpCpus_TfSmallestBucket: with multiple +// non-zero TF buckets, the guaranteed top-turbo HP CPU count is +// the smallest HighPriorityCoreCount (smaller buckets unlock +// higher turbo frequencies). +func TestPctPunitGuaranteedHpCpus_TfSmallestBucket(t *testing.T) { + pi := &gosst.PerfLevelInfo{ + TF: gosst.TFInfo{ + Supported: true, + Buckets: []gosst.TFBucketInfo{ + {ID: 0, HighPriorityCoreCount: 24}, + {ID: 1, HighPriorityCoreCount: 8}, // smallest non-zero + {ID: 2, HighPriorityCoreCount: 16}, + }, + }, + } + if got := punitGuaranteedHpCpus(pi); got != 8 { + t.Errorf("punitGuaranteedHpCpus = %d, want 8 (smallest TF bucket)", got) + } +} + +// TestPctPunitGuaranteedHpCpus_BfFallback: when TF is +// unsupported, fall back to len(BF.HighPriorityCPUs). 
+func TestPctPunitGuaranteedHpCpus_BfFallback(t *testing.T) { + pi := &gosst.PerfLevelInfo{ + BF: gosst.BFInfo{ + Supported: true, + HighPriorityCPUs: idset.NewIDSet(0, 1, 2, 3), + }, + } + if got := punitGuaranteedHpCpus(pi); got != 4 { + t.Errorf("punitGuaranteedHpCpus = %d, want 4 (BF fallback)", got) + } +} + +// TestPctPunitGuaranteedHpCpus_NeitherSupported: neither TF nor +// BF -> 0. +func TestPctPunitGuaranteedHpCpus_NeitherSupported(t *testing.T) { + pi := &gosst.PerfLevelInfo{} + if got := punitGuaranteedHpCpus(pi); got != 0 { + t.Errorf("punitGuaranteedHpCpus = %d, want 0", got) + } +} + +// --- FreeClassCapacity test suite ----------------------------------- + +// newAssocOnlyPctForTest mirrors newManagedPctForTest but configures +// the allocator in assoc-only mode. hpClasses, classPlan and +// hpEligiblePunit must be set up by the caller after the helper +// returns to keep the test intent explicit. +func newAssocOnlyPctForTest(t *testing.T, classes []*policyapi.CPUClass, plans map[string]*pctClassPlan, + allowed cpuset.CPUSet, sys *fakeSys, sst *fakeSst) *Allocator { + t.Helper() + a := &Allocator{ + sys: sys, + sst: sst, + mode: pctModeAssocOnly, + classByName: map[string]*policyapi.CPUClass{}, + classPlan: plans, + allowed: allowed, + hpUsed: map[int]cpuset.CPUSet{}, + hpClasses: map[string]bool{}, + hpEligiblePunit: map[int]bool{}, + } + for _, cc := range classes { + a.classByName[cc.Name] = cc + } + pctTestWirePunits(a) + return a +} + +// TestFreeClassCapacity_AssocOnlyHpFromFallbackCLOS verifies the +// real-world assoc-only bug fix: every CPU starts on the fallback +// (LP) CLOS in hardware because the balloons policy associates the +// idle/default class on Configure, yet HP capacity for an HP class +// must still report sum_pu min(GuaranteedHpCpus, |pu.CPUs \ held|) +// -- not zero. (Pre-fix the result was 0 because closCpus(HP CLOS) +// was empty.) 
+func TestFreeClassCapacity_AssocOnlyHpFromFallbackCLOS(t *testing.T) { + sys := newTwoPackageFakeSys() + sst := &fakeSst{ + supported: true, + // All CPUs are on CLOS 3 (the LP/fallback CLOS). The HP + // CLOS 0 has no CPUs associated to it. + cpuClos: map[int]int{ + 0: 3, 1: 3, 2: 3, 3: 3, + 4: 3, 5: 3, 6: 3, 7: 3, + }, + // Two punits (one per package); each guarantees 2 HP CPUs at top turbo. + punits: []pctPunit{ + {PkgID: 0, PunitID: 0, CPUs: cpuset.MustParse("0-3"), GuaranteedHpCpus: 2}, + {PkgID: 1, PunitID: 0, CPUs: cpuset.MustParse("4-7"), GuaranteedHpCpus: 2}, + }, + } + classes := []*policyapi.CPUClass{ + {Name: "hp"}, // pctPriority not set; HP is decided by classifyAssocOnlyHP at runtime + {Name: "lp"}, + } + a := newAssocOnlyPctForTest(t, classes, + map[string]*pctClassPlan{"hp": {ClosID: 0}, "lp": {ClosID: 3}}, + cpuset.MustParse("0-7"), sys, sst) + a.hpClasses["hp"] = true // simulate classifyAssocOnlyHP result + + // Held by some non-HP balloon: 2 CPUs (one per punit). + held := cpuset.MustParse("3,7") + + gotHp := a.FreeClassCapacity("hp", held) + wantHp := 2 + 2 // both punits: min(2, |{0,1,2}|=3)=2 and min(2, |{4,5,6}|=3)=2 + if gotHp != wantHp { + t.Errorf("HP capacity (assoc-only, all cpus on fallback CLOS) = %d, want %d", + gotHp, wantHp) + } + + gotLp := a.FreeClassCapacity("lp", held) + wantLp := 8 - 2 // allowed (8) minus held (2) + if gotLp != wantLp { + t.Errorf("LP capacity (assoc-only) = %d, want %d", gotLp, wantLp) + } +} + +// TestFreeClassCapacity_AssocOnlyHpTFDisabledPunitExcluded verifies +// the eligibility gate: punits where SST-TF is disabled in +// assoc-only mode contribute zero HP capacity even when their +// GuaranteedHpCpus is non-zero. Prevents over-publishing HP +// capacity on nodes that cannot actually deliver top turbo. 
+func TestFreeClassCapacity_AssocOnlyHpTFDisabledPunitExcluded(t *testing.T) { + sys := newTwoPackageFakeSys() + sst := &fakeSst{ + supported: true, + punits: []pctPunit{ + {PkgID: 0, PunitID: 0, CPUs: cpuset.MustParse("0-3"), GuaranteedHpCpus: 2}, + {PkgID: 1, PunitID: 0, CPUs: cpuset.MustParse("4-7"), GuaranteedHpCpus: 2}, + }, + } + a := newAssocOnlyPctForTest(t, []*policyapi.CPUClass{{Name: "hp"}}, + map[string]*pctClassPlan{"hp": {ClosID: 0}}, + cpuset.MustParse("0-7"), sys, sst) + a.hpClasses["hp"] = true + // pctTestWirePunits marked both eligible; flip pkg1 punit to + // TF-disabled to model the assoc-only "operator did not enable + // SST-TF on this punit" case. + a.hpEligiblePunit[1] = false + + got := a.FreeClassCapacity("hp", cpuset.New()) + want := 2 // only pkg0 contributes + if got != want { + t.Errorf("HP capacity with one TF-disabled punit = %d, want %d", got, want) + } +} + +// TestFreeClassCapacity_AssocOnlyNoHpClassification: assoc-only +// where no class was classified HP (e.g. no CLOS has a programmed +// MaxFreq) falls through to the non-HP formula |Allowed \ held|. +func TestFreeClassCapacity_AssocOnlyNoHpClassification(t *testing.T) { + sys := newTwoPackageFakeSys() + sst := &fakeSst{ + supported: true, + punits: []pctPunit{ + {PkgID: 0, PunitID: 0, CPUs: cpuset.MustParse("0-3"), GuaranteedHpCpus: 2}, + {PkgID: 1, PunitID: 0, CPUs: cpuset.MustParse("4-7"), GuaranteedHpCpus: 2}, + }, + } + a := newAssocOnlyPctForTest(t, []*policyapi.CPUClass{{Name: "c1"}}, + map[string]*pctClassPlan{"c1": {ClosID: 1}}, + cpuset.MustParse("0-7"), sys, sst) + // Intentionally no entries in a.hpClasses. 
+ + got := a.FreeClassCapacity("c1", cpuset.MustParse("1,5")) + want := 8 - 2 + if got != want { + t.Errorf("non-HP assoc-only capacity = %d, want %d", got, want) + } +} + +// TestFreeClassCapacity_ManagedHpRespectsEligibility keeps the +// existing managed-mode formula intact: every punit is HP-eligible +// (PrepareManagedMode enables SST-TF) and the result is the +// guaranteed-top-turbo sum, capped by per-punit free CPUs. +func TestFreeClassCapacity_ManagedHpRespectsEligibility(t *testing.T) { + sys := newTwoPackageFakeSys() + sst := &fakeSst{ + supported: true, + punits: []pctPunit{ + {PkgID: 0, PunitID: 0, CPUs: cpuset.MustParse("0-3"), GuaranteedHpCpus: 2}, + {PkgID: 1, PunitID: 0, CPUs: cpuset.MustParse("4-7"), GuaranteedHpCpus: 2}, + }, + } + classes := []*policyapi.CPUClass{ + {Name: "hp", PctPriority: "high"}, + {Name: "lp", PctPriority: "low"}, + } + a := newManagedPctForTest(t, classes, + map[string]*pctClassPlan{"hp": {ClosID: 0}, "lp": {ClosID: 3}}, + cpuset.MustParse("0-7"), sys, sst) + + gotHp := a.FreeClassCapacity("hp", cpuset.MustParse("3")) + wantHp := 2 + 2 // pkg0: min(2, 3)=2; pkg1: min(2, 4)=2 + if gotHp != wantHp { + t.Errorf("managed HP capacity = %d, want %d", gotHp, wantHp) + } + gotLp := a.FreeClassCapacity("lp", cpuset.MustParse("3")) + wantLp := 8 - 1 + if gotLp != wantLp { + t.Errorf("managed LP capacity = %d, want %d", gotLp, wantLp) + } + + // Squeeze pkg0: hold 3 of its 4 CPUs => pkg0 contributes min(2,1)=1. + gotHp = a.FreeClassCapacity("hp", cpuset.MustParse("0-2")) + wantHp = 1 + 2 + if gotHp != wantHp { + t.Errorf("managed HP capacity with squeezed pkg0 = %d, want %d", gotHp, wantHp) + } +} + +// TestFreeClassCapacity_UnknownClassReturnsZero: unknown class +// (no PCT plan) yields 0 regardless of mode. 
+func TestFreeClassCapacity_UnknownClassReturnsZero(t *testing.T) { + sys := newTwoPackageFakeSys() + sst := &fakeSst{supported: true} + a := newManagedPctForTest(t, []*policyapi.CPUClass{{Name: "hp", PctPriority: "high"}}, + map[string]*pctClassPlan{"hp": {ClosID: 0}}, + cpuset.MustParse("0-7"), sys, sst) + if got := a.FreeClassCapacity("nope", cpuset.New()); got != 0 { + t.Errorf("unknown class capacity = %d, want 0", got) + } +} diff --git a/pkg/resmgr/cpuclass/internal/types/types.go b/pkg/resmgr/cpuclass/internal/types/types.go new file mode 100644 index 000000000..b554beb63 --- /dev/null +++ b/pkg/resmgr/cpuclass/internal/types/types.go @@ -0,0 +1,91 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +// Package types defines the internal class-definition struct used by +// the cpuclass writers (cpufreq, cpuidle, uncorefreq). It exists as +// a separate package so each writer can depend on the same struct +// without depending on the public cpuclass API or on the +// soon-to-be-deprecated control/cpu config package. +package types + +import ( + "github.com/containers/nri-plugins/pkg/utils/cpuset" +) + +// ClassDef is the resolved, platform-aware definition of a CPU class +// as consumed by the writers. All frequency fields are in kHz; zero +// means "no enforcement". Symbolic frequencies in the user-facing +// configuration are resolved before being placed into a ClassDef. 
+type ClassDef struct {
+	MinFreq                     uint // kHz; 0 = no enforcement
+	MaxFreq                     uint // kHz; 0 = no enforcement
+	EnergyPerformancePreference uint // NOTE(review): not a frequency; value range is not visible here -- confirm with the cpufreq writer
+	UncoreMinFreq               uint // kHz; 0 = no enforcement
+	UncoreMaxFreq               uint // kHz; 0 = no enforcement
+	FreqGovernor                string
+	DisabledCstates             []string // compared position by position in Equal
+}
+
+// Equal reports whether two ClassDef values describe identical
+// per-CPU enforcement. Used by the handler to decide whether a
+// class-table change actually requires re-programming CPUs.
+func (c ClassDef) Equal(other ClassDef) bool {
+	if c.MinFreq != other.MinFreq ||
+		c.MaxFreq != other.MaxFreq ||
+		c.EnergyPerformancePreference != other.EnergyPerformancePreference ||
+		c.UncoreMinFreq != other.UncoreMinFreq ||
+		c.UncoreMaxFreq != other.UncoreMaxFreq ||
+		c.FreqGovernor != other.FreqGovernor {
+		return false
+	}
+	if len(c.DisabledCstates) != len(other.DisabledCstates) {
+		return false
+	}
+	for i := range c.DisabledCstates { // positional: the same names in a different order compare unequal
+		if c.DisabledCstates[i] != other.DisabledCstates[i] {
+			return false
+		}
+	}
+	return true
+}
+
+// AllocationIntent describes an upcoming CPU allocation for which
+// the caller wants placement preferences. Lives here so internal
+// helpers (e.g. pct) can implement Hints without depending on the
+// public cpuclass package.
+type AllocationIntent struct {
+	ClassName      string        // name of the CPU class the allocation is for
+	CurrentCpus    cpuset.CPUSet // presumably the CPUs the requester already holds -- TODO confirm against callers
+	FreeCpus       cpuset.CPUSet // pool of currently free CPUs the allocation may draw from
+	RequestedCount int           // number of CPUs being requested
+}
+
+// CpuPreference is a named CPU set carrying a single placement
+// preference (prefer or avoid depending on the slice it appears in).
+type CpuPreference struct {
+	Name string        // identifier of the hint that produced this set
+	Cpus cpuset.CPUSet // CPUs the preference applies to
+}
+
+// AllocationHints carries technology-agnostic placement preferences
+// for an upcoming allocation. Both slices are ordered by descending
+// priority.
+type AllocationHints struct {
+	Prefer []CpuPreference // sets the allocator should favor
+	Avoid  []CpuPreference // sets the allocator should stay away from
+}
+
+// CPUSet aliases cpuset.CPUSet for callers that want to refer to it
+// via this package without re-importing pkg/utils/cpuset.
+type CPUSet = cpuset.CPUSet
diff --git a/pkg/resmgr/cpuclass/internal/uncorefreq/uncorefreq.go b/pkg/resmgr/cpuclass/internal/uncorefreq/uncorefreq.go
new file mode 100644
index 000000000..33c589571
--- /dev/null
+++ b/pkg/resmgr/cpuclass/internal/uncorefreq/uncorefreq.go
@@ -0,0 +1,239 @@
+// Copyright The NRI Plugins Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+// Package uncorefreq is the per-die uncore frequency writer used by
+// the cpuclass handler. It exposes a Hooks-injectable interface
+// matching the cpufreq and cpuidle writers and computes the
+// effective per-die min/max as the max-wins reduction over all
+// classes that have at least one CPU on the die.
+package uncorefreq
+
+import (
+	"fmt"
+
+	"github.com/intel/goresctrl/pkg/utils"
+
+	logger "github.com/containers/nri-plugins/pkg/log"
+	"github.com/containers/nri-plugins/pkg/resmgr/cpuclass/internal/types"
+	"github.com/containers/nri-plugins/pkg/sysfs"
+)
+
+var log = logger.NewLogger("cpuclass")
+
+// DieKey identifies one (package, die) uncore frequency domain.
+type DieKey struct {
+	Pkg int
+	Die int
+}
+
+// Hooks lets tests intercept per-die uncore writes without touching
+// real sysfs. Production use leaves all hooks nil; the writer then
+// talks to the platform via goresctrl. Setting any hook also forces
+// "available" to true so tests can exercise enforce paths on VMs
+// without the intel_uncore_frequency driver.
+type Hooks struct {
+	SetMin func(pkg, die, kHz int) error // when non-nil, called instead of utils.SetUncoreMinFreq
+	SetMax func(pkg, die, kHz int) error // when non-nil, called instead of utils.SetUncoreMaxFreq
+}
+
+// uncoreWritten records the last successfully written min/max kHz
+// on a single die. Used for write deduplication.
+type uncoreWritten struct {
+	min    uint
+	max    uint
+	hasMin bool // false until a min value has been recorded for the die
+	hasMax bool // false until a max value has been recorded for the die
+}
+
+// Writer enforces per-die uncore frequency limits. A die with
+// effective min=max=0 is left untouched. Failures on individual
+// dies are logged; the first error is returned to the caller.
+type Writer struct {
+	hooks       Hooks
+	available   bool                     // decided once in NewWriter
+	lastWritten map[DieKey]uncoreWritten // per-die dedup cache, emptied by Reset
+}
+
+// NewWriter returns a Writer wired to the given hooks. Pass a
+// zero-valued Hooks to use real sysfs via goresctrl. The "available"
+// bit is probed once; setting any hook overrides the probe.
+func NewWriter(hooks Hooks) *Writer {
+	available := utils.UncoreFreqAvailable()
+	if hooks.SetMin != nil || hooks.SetMax != nil {
+		available = true // a single injected hook is enough to enable Enforce
+	}
+	return &Writer{
+		hooks:       hooks,
+		available:   available,
+		lastWritten: make(map[DieKey]uncoreWritten),
+	}
+}
+
+// Available reports whether uncore limits can be enforced: the
+// uncore frequency driver was found at construction time, or at
+// least one hook was injected. Used by the handler to surface a
+// configuration error when classes request unavailable uncore limits.
+func (w *Writer) Available() bool { return w.available }
+
+// Reset clears the per-die lastWritten cache. Called by the handler
+// when class definitions or the allowed set change.
+func (w *Writer) Reset() {
+	w.lastWritten = make(map[DieKey]uncoreWritten)
+}
+
+// RequiresAvailable reports whether any class definition requests
+// uncore limits. Used by the handler to fail Configure with a
+// helpful error when classes ask for uncore but the driver is not
+// loaded.
+func RequiresAvailable(defs map[string]types.ClassDef) (name string, found bool) {
+	for n, c := range defs {
+		if (c.UncoreMinFreq != 0 || c.UncoreMaxFreq != 0) && (!found || n < name) {
+			name, found = n, true // smallest name wins: map order is random, the result must not be
+		}
+	}
+	return name, found
+}
+
+// UnavailableError formats a configuration error when classes
+// request uncore limits but the driver is missing.
+func UnavailableError(className string) error {
+	return fmt.Errorf("uncore limits set in cpu class %q but uncore driver not available; load the intel_uncore_frequency driver", className)
+}
+
+// Enforce recomputes and writes the effective uncore min/max for
+// every dirty die. Parameters:
+//   - sys: narrow topology surface used to enumerate CPUs per die.
+//   - defs: class name -> definition.
+//   - cpuClass: cpu id -> class name (current assignments).
+//   - dirtyDies: set of (pkg, die) keys that need recomputation.
+//
+// Returns the first error; a failed write is not cached, so it is
+// retried on the next call. No-op when the driver is unavailable.
+func (w *Writer) Enforce(sys sysfs.System, defs map[string]types.ClassDef, cpuClass map[int]string, dirtyDies map[DieKey]bool) error {
+	if !w.available || len(dirtyDies) == 0 {
+		return nil
+	}
+	var firstErr error
+	for key := range dirtyDies {
+		minFreq, maxFreq, minCls, maxCls := effectiveUncoreFreqs(sys, key, defs, cpuClass)
+		if minFreq == 0 && maxFreq == 0 {
+			log.Debugf("uncore: pkg/die %d/%d: no limits in effect", key.Pkg, key.Die)
+			continue
+		}
+		log.Debugf("uncore: pkg/die %d/%d: min=%d (class %q) max=%d (class %q)",
+			key.Pkg, key.Die, minFreq, minCls, maxFreq, maxCls)
+		state := w.lastWritten[key]
+		if minFreq > 0 && maxFreq > 0 && minFreq > maxFreq {
+			log.Warnf("uncore: pkg/die %d/%d: min %d > max %d", key.Pkg, key.Die, minFreq, maxFreq)
+		}
+		if minFreq > 0 && (!state.hasMin || state.min != minFreq) {
+			if err := w.callSetMin(key.Pkg, key.Die, int(minFreq)); err != nil {
+				log.Errorf("uncore: pkg/die %d/%d: cannot set min=%d: %v", key.Pkg, key.Die, minFreq, err)
+				if firstErr == nil {
+					firstErr = err
+				}
+			} else {
+				state.min, state.hasMin = minFreq, true // record only what was really written
+			}
+		}
+		if maxFreq > 0 && (!state.hasMax || state.max != maxFreq) {
+			if err := w.callSetMax(key.Pkg, key.Die, int(maxFreq)); err != nil {
+				log.Errorf("uncore: pkg/die %d/%d: cannot set max=%d: %v", key.Pkg, key.Die, maxFreq, err)
+				if firstErr == nil {
+					firstErr = err
+				}
+			} else {
+				state.max, state.hasMax = maxFreq, true // record only what was really written
+			}
+		}
+		w.lastWritten[key] = state
+	}
+	return firstErr
+}
+
+// effectiveUncoreFreqs computes the effective uncore min and max for
+// a single die. Returns 0,0 when no class with uncore limits is
+// active on the die.
+func effectiveUncoreFreqs(sys sysfs.System, key DieKey, defs map[string]types.ClassDef, cpuClass map[int]string) (minFreq, maxFreq uint, minCls, maxCls string) {
+	pkg := sys.Package(utils.ID(key.Pkg))
+	if pkg == nil {
+		return 0, 0, "", ""
+	}
+	dieCPUs := pkg.DieCPUSet(utils.ID(key.Die))
+	seen := map[string]bool{} // classes already folded into the result
+	for _, cpu := range dieCPUs.UnsortedList() {
+		name, ok := cpuClass[cpu]
+		if !ok || name == "" {
+			continue
+		}
+		if seen[name] {
+			continue
+		}
+		seen[name] = true
+		def, ok := defs[name]
+		if !ok {
+			continue
+		}
+		if def.UncoreMinFreq > minFreq { // max-wins: the largest requested min applies
+			minFreq = def.UncoreMinFreq
+			minCls = name
+		}
+		if def.UncoreMaxFreq > maxFreq { // max-wins: the largest requested max applies
+			maxFreq = def.UncoreMaxFreq
+			maxCls = name
+		}
+	}
+	return minFreq, maxFreq, minCls, maxCls
+}
+
+// DiesForCpus returns the set of (pkg, die) keys that contain at
+// least one cpu from cpus.
+func DiesForCpus(sys sysfs.System, cpus map[int]bool) map[DieKey]bool { + out := map[DieKey]bool{} + if sys == nil { + return out + } + for cpu := range cpus { + c := sys.CPU(utils.ID(cpu)) + if c == nil { + continue + } + pkgID := int(c.PackageID()) + pkg := sys.Package(utils.ID(pkgID)) + if pkg == nil { + continue + } + for _, die := range pkg.DieIDs() { + if pkg.DieCPUSet(die).Contains(cpu) { + out[DieKey{Pkg: pkgID, Die: int(die)}] = true + break + } + } + } + return out +} + +func (w *Writer) callSetMin(pkg, die, freq int) error { + if w.hooks.SetMin != nil { + return w.hooks.SetMin(pkg, die, freq) + } + return utils.SetUncoreMinFreq(utils.ID(pkg), utils.ID(die), freq) +} + +func (w *Writer) callSetMax(pkg, die, freq int) error { + if w.hooks.SetMax != nil { + return w.hooks.SetMax(pkg, die, freq) + } + return utils.SetUncoreMaxFreq(utils.ID(pkg), utils.ID(die), freq) +} From 8b380b3e6e85b28e6679203f2153285d07f51e20 Mon Sep 17 00:00:00 2001 From: Antti Kervinen Date: Fri, 5 Jun 2026 14:25:10 +0300 Subject: [PATCH 06/10] balloons: implement cpuClasses with turbo priority and PCT Signed-off-by: Antti Kervinen --- cmd/plugins/balloons/Dockerfile | 11 +- .../balloons/policy/balloons-policy.go | 410 +++++++++++++++--- cmd/plugins/balloons/policy/cpuclass_test.go | 175 ++++++++ cmd/plugins/balloons/policy/flags.go | 2 + 4 files changed, 542 insertions(+), 56 deletions(-) create mode 100644 cmd/plugins/balloons/policy/cpuclass_test.go diff --git a/cmd/plugins/balloons/Dockerfile b/cmd/plugins/balloons/Dockerfile index d81fe949c..1c44dc0b2 100644 --- a/cmd/plugins/balloons/Dockerfile +++ b/cmd/plugins/balloons/Dockerfile @@ -19,10 +19,17 @@ RUN --mount=type=cache,target=/go/pkg/mod/ \ GOBIN=/debug-extras/bin go install -tags osusergo,netgo -ldflags "-extldflags=-static" github.com/go-delve/delve/cmd/dlv@latest; \ fi -# Fetch go dependencies in a separate layer for caching +# Fetch go dependencies in a separate layer for caching. 
+# If vendor/ is present in the build context (e.g. when using a +# local replace directive), use vendor mode and skip the download. COPY go.mod go.sum . COPY pkg/topology/ pkg/topology/ -RUN --mount=type=cache,target=/go/pkg/mod/ go mod download +RUN --mount=type=cache,target=/go/pkg/mod/ \ + if grep -q '^replace .* => /' go.mod 2>/dev/null; then \ + echo "go.mod contains local replace; will rely on vendor/"; \ + else \ + go mod download; \ + fi # Build nri-resource-policy COPY . . diff --git a/cmd/plugins/balloons/policy/balloons-policy.go b/cmd/plugins/balloons/policy/balloons-policy.go index a2f31323f..db8e61372 100644 --- a/cmd/plugins/balloons/policy/balloons-policy.go +++ b/cmd/plugins/balloons/policy/balloons-policy.go @@ -28,7 +28,7 @@ import ( "github.com/containers/nri-plugins/pkg/kubernetes" logger "github.com/containers/nri-plugins/pkg/log" "github.com/containers/nri-plugins/pkg/resmgr/cache" - cpucontrol "github.com/containers/nri-plugins/pkg/resmgr/control/cpu" + "github.com/containers/nri-plugins/pkg/resmgr/cpuclass" "github.com/containers/nri-plugins/pkg/resmgr/events" libmem "github.com/containers/nri-plugins/pkg/resmgr/lib/memory" policy "github.com/containers/nri-plugins/pkg/resmgr/policy" @@ -91,6 +91,7 @@ type balloons struct { cpuAllocator cpuallocator.CPUAllocator // CPU allocator used by the policy memAllocator *libmem.Allocator // memory allocator used by the policy + cpuClasses *cpuclass.Handler // CPU class handler (cpufreq + PCT internals) loadVirtDev map[string]*loadClassVirtDev // map LoadClasses to virtual devices } @@ -252,6 +253,7 @@ func (p *balloons) Start() error { func (p *balloons) Sync(add []cache.Container, del []cache.Container) error { p.BlockMeters() defer p.UnblockMeters() + defer p.commitCpuClasses() log.Debugf("synchronizing state...") for _, c := range del { @@ -275,6 +277,7 @@ func (p *balloons) Sync(add []cache.Container, del []cache.Container) error { func (p *balloons) AllocateResources(c cache.Container) error { 
p.BlockMeters() defer p.UnblockMeters() + defer p.commitCpuClasses() if c.PreserveCpuResources() { log.Infof("not handling resources of container %s, preserving CPUs %q and memory %q", c.PrettyName(), c.GetCpusetCpus(), c.GetCpusetMems()) @@ -328,6 +331,7 @@ func (p *balloons) AllocateResources(c cache.Container) error { func (p *balloons) ReleaseResources(c cache.Container) error { p.BlockMeters() defer p.UnblockMeters() + defer p.commitCpuClasses() log.Debugf("releasing container %s...", c.PrettyName()) if bln := p.balloonByContainer(c); bln != nil { @@ -361,6 +365,7 @@ func (p *balloons) ReleaseResources(c cache.Container) error { func (p *balloons) UpdateResources(c cache.Container) error { p.BlockMeters() defer p.UnblockMeters() + defer p.commitCpuClasses() log.Debugf("(not) updating container %s...", c.PrettyName()) return nil @@ -573,6 +578,40 @@ func (p *balloons) GetTopologyZones() []*policy.TopologyZone { return zones } +// GetExtendedResources returns the node-level extended resources +// the balloons policy publishes for the local Node. +func (p *balloons) GetExtendedResources() map[string]int64 { + out := map[string]int64{} + if p.cpuClasses == nil || !p.cpuClasses.PctActive() { + return out + } + if p.bpoptions == nil { + return out + } + for _, cc := range p.bpoptions.CPUClasses { + if cc == nil || !cc.PublishExtendedResource { + continue + } + if cc.PctPriority == "" && cc.PctClosID == nil { + log.Warnf("ignoring publishExtendedResource on non-PCT cpuClass %q", cc.Name) + continue + } + held := cpuset.New() + for _, bln := range p.balloons { + if p.resolveCpuClassName(bln.Def.CpuClass) == cc.Name { + continue + } + held = held.Union(bln.Cpus) + } + free := p.cpuClasses.PctFreeClassCapacity(cc.Name, held) + if free < 0 { + free = 0 + } + out["cpuclass.balloons.nri.io/"+cc.Name] = int64(free) + } + return out +} + // balloonByContainer returns a balloon that contains a container. 
func (p *balloons) balloonByContainer(c cache.Container) *Balloon { podID := c.GetPodID() @@ -788,83 +827,104 @@ func largest(sliceLen int, valueOf func(i int) int) ([]int, int) { return largestIndices, largestValue } +// defaultCpuClassName is the name of the implicit "default" +// CPU class. When IdleCpuClass or a balloon type's CpuClass is left +// unset and a class with this name is configured, that class is used +// as the implicit fallback. This balloons-specific convention is kept +// out of the policy-neutral cpuclass package and applied here. +const defaultCpuClassName = "default" + +// resolveCpuClassName substitutes the configured "default" CPU class +// for an empty name when such a class exists. Non-empty names are +// returned unchanged. +func (p *balloons) resolveCpuClassName(name string) string { + if name != "" { + return name + } + for _, cc := range p.bpoptions.CPUClasses { + if cc.Name == defaultCpuClassName { + return defaultCpuClassName + } + } + return name +} + // resetCpuClass resets CPU configurations globally. All balloons can // be ignored, their CPU configurations will be applied later. func (p *balloons) resetCpuClass() error { - // Usual inputs: - // - p.allowed (cpuset.CPUset): all CPUs available for this - // policy. - // - p.IdleCpuClass (string): CPU class for allowed CPUs. - // - // Other inputs, if needed: - // - p.reserved (cpuset.CPUset): CPUs of ReservedResources - // (typically for kube-system containers). - // - // Note: p.useCpuClass(balloon) will be called before assigning - // containers on the balloon, including the reserved balloon. - // - // TODO: don't depend on cpu controller directly - if err := cpucontrol.Assign(p.cch, p.bpoptions.IdleCpuClass, p.allowed.UnsortedList()...); err != nil { + // p.useCpuClass(balloon) will be called later for every balloon, + // including the reserved balloon, to set the per-balloon CPU + // class. Here we only assign the idle class to all allowed CPUs. 
+ if p.cpuClasses == nil { + return nil + } + idle := p.resolveCpuClassName(p.bpoptions.IdleCpuClass) + if err := p.cpuClasses.UseClass(idle, p.allowed); err != nil { log.Warnf("failed to reset class of available cpus: %v", err) } else { - log.Debugf("reset class of available cpus: %q (reserved: %q)", p.allowed, p.reserved) + log.Debugf("reset class of available cpus: %q to idle class %q (reserved: %q)", + p.allowed, idle, p.reserved) } return nil } -// useCpuClass configures CPUs of a balloon. +// commitCpuClasses flushes any pending cpufreq, cpuidle and uncore +// sysfs writes accumulated by previous UseClass / Configure calls +// since the last commit. Called from the deferred path of the +// public balloons lifecycle entry points so multiple class +// reassignments within one NRI request batch coalesce into a +// minimal set of writes. +func (p *balloons) commitCpuClasses() { + if p.cpuClasses == nil { + return + } + if err := p.cpuClasses.Commit(); err != nil { + log.Warnf("cpu class commit produced an error: %v", err) + } +} + +// useCpuClass configures CPUs of a balloon by delegating to the CPU +// class handler. func (p *balloons) useCpuClass(bln *Balloon) error { - // Usual inputs: - // - CPUs that cpuallocator has reserved for this balloon: - // bln.Cpus (cpuset.CPUSet). - // - User-defined CPU configuration for CPUs of balloon of this type: - // bln.Def.CpuClass (string). - // - Current configuration(?): feel free to add data - // structure for this. For instance policy-global p.cpuConfs, - // or balloon-local bln.cpuConfs. - // - // Other input examples, if needed: - // - Requested CPU resources by all containers in the balloon: - // p.requestedMilliCpus(bln). - // - Free CPU resources in the balloon: p.freeMilliCpus(bln). - // - Number of assigned containers: bln.ContainerCount(). - // - Container details: access p.cch with bln.ContainerIDs(). - // - User-defined CPU AllocatorPriority: bln.Def.AllocatorPriority. 
- // - All existing balloon instances: p.balloons. - // - CPU configurations by user: bln.Def.CpuClass (for bln in p.balloons) if len(bln.components) > 0 { - // If this is a composite balloon, CPU class is - // defined in the component balloons. - log.Debugf("apply CPU class %q on CPUs %s of composite balloon %q", - bln.Def.CpuClass, bln.Cpus, bln.PrettyName()) + // Composite balloon: each component carries its own CpuClass. + log.Debugf("apply CPU classes of components of composite balloon %q on CPUs %s", + bln.PrettyName(), bln.Cpus) for _, compBln := range bln.components { if err := p.useCpuClass(compBln); err != nil { log.Warnf("failed to apply CPU class %q on CPUs %s of %q in composite balloon %q: %v", compBln.Def.CpuClass, compBln.Cpus, compBln.PrettyName(), bln.PrettyName(), err) } - } return nil } - if err := cpucontrol.Assign(p.cch, bln.Def.CpuClass, bln.Cpus.UnsortedList()...); err != nil { - log.Warnf("failed to apply class %q on CPUs %q: %v", bln.Def.CpuClass, bln.Cpus, err) - } else { - log.Debugf("apply CPU class %q on CPUs %q of %q", bln.Def.CpuClass, bln.Cpus, bln.PrettyName()) + if p.cpuClasses == nil { + return nil + } + cpuClass := p.resolveCpuClassName(bln.Def.CpuClass) + log.Debugf("apply CPU class %q on CPUs %q of %q", cpuClass, bln.Cpus, bln.PrettyName()) + if err := p.cpuClasses.UseClass(cpuClass, bln.Cpus); err != nil { + log.Warnf("failed to apply class %q on CPUs %q: %v", cpuClass, bln.Cpus, err) } return nil } // forgetCpuClass is called when CPUs of a balloon are released from duty. +// It reassigns those CPUs to the configured idle class - the handler +// has no separate "forget" concept; every CPU is always in some class. func (p *balloons) forgetCpuClass(bln *Balloon) { - // Use p.IdleCpuClass for bln.Cpus. 
- // Usual inputs: see useCpuClass - if err := cpucontrol.Assign(p.cch, p.bpoptions.IdleCpuClass, bln.Cpus.UnsortedList()...); err != nil { - log.Warnf("failed to forget class %q of cpus %q: %v", bln.Def.CpuClass, bln.Cpus, err) + if p.cpuClasses == nil { + return + } + idle := p.resolveCpuClassName(p.bpoptions.IdleCpuClass) + if err := p.cpuClasses.UseClass(idle, bln.Cpus); err != nil { + log.Warnf("failed to forget class of cpus %q (idle class %q): %v", bln.Cpus, idle, err) } else { if len(bln.components) > 0 { - log.Debugf("forget classes of composite balloon %q cpus %q", bln.Def.Name, bln.Cpus) + log.Debugf("forget classes of composite balloon %q cpus %q (idle class %q)", + bln.Def.Name, bln.Cpus, idle) } else { - log.Debugf("forget class %q of cpus %q", bln.Def.CpuClass, bln.Cpus) + log.Debugf("forget class of cpus %q (idle class %q)", bln.Cpus, idle) } } } @@ -1063,8 +1123,8 @@ func (p *balloons) newBalloon(blnDef *BalloonDef, confCpus bool) (*Balloon, erro allocatorOptions := cpuTreeAllocatorOptions{ topologyBalancing: p.bpoptions.AllocatorTopologyBalancing, preferSpreadOnPhysicalCores: p.bpoptions.PreferSpreadOnPhysicalCores, - preferCloseToDevices: blnDef.PreferCloseToDevices, - preferFarFromDevices: blnDef.PreferFarFromDevices, + preferCloseToDevices: append([]string(nil), blnDef.PreferCloseToDevices...), + preferFarFromDevices: append([]string(nil), blnDef.PreferFarFromDevices...), virtDevCpusets: map[string][]cpuset.CPUSet{ virtDevReservedCpus: {p.reserved}, virtDevIsolatedCpus: {p.options.System.Isolated()}, @@ -1072,6 +1132,7 @@ func (p *balloons) newBalloon(blnDef *BalloonDef, confCpus bool) (*Balloon, erro virtDevPCores: {p.cpuAllocator.GetCPUPriorities()[cpuallocator.PriorityHigh]}, }, } + p.applyCpuClassHints(&allocatorOptions, p.resolveCpuClassName(blnDef.CpuClass), cpuset.New(), 0) if blnDef.AllocatorTopologyBalancing != nil { allocatorOptions.topologyBalancing = *blnDef.AllocatorTopologyBalancing } @@ -1399,14 +1460,28 @@ func 
changesBalloons(opts0, opts1 *BalloonsOptions) bool { } o0 := opts0.DeepCopy() o1 := opts1.DeepCopy() - // Ignore differences in CPU class names. Every other change - // potentially changes balloons or workloads. + // Ignore differences in BalloonsOptions that do not affect + // CPU-to-balloon or container-to-balloon mapping. Such + // differences include: + // + // 1. CPUClass related parameters o0.IdleCpuClass = "" o1.IdleCpuClass = "" + o0.TurboDomain = "" + o1.TurboDomain = "" + o0.CPUClasses = nil + o1.CPUClasses = nil for i := range o0.BalloonDefs { o0.BalloonDefs[i].CpuClass = "" o1.BalloonDefs[i].CpuClass = "" } + // 2. Schedulingpolicy parameters + o0.SchedulingClasses = nil + o1.SchedulingClasses = nil + for i := range o0.BalloonDefs { + o0.BalloonDefs[i].SchedulingClass = "" + o1.BalloonDefs[i].SchedulingClass = "" + } return utils.DumpJSON(o0) != utils.DumpJSON(o1) } @@ -1424,6 +1499,9 @@ func changesCpuClasses(opts0, opts1 *BalloonsOptions) bool { if opts0.IdleCpuClass != opts1.IdleCpuClass { return true } + if opts0.TurboDomain != opts1.TurboDomain { + return true + } if len(opts0.BalloonDefs) != len(opts1.BalloonDefs) { return true } @@ -1432,12 +1510,20 @@ func changesCpuClasses(opts0, opts1 *BalloonsOptions) bool { return true } } + // Detect changes in CPUClasses definitions (turbo attributes, frequencies, etc.) + if len(opts0.CPUClasses) != len(opts1.CPUClasses) { + return true + } + if utils.DumpJSON(opts0.CPUClasses) != utils.DumpJSON(opts1.CPUClasses) { + return true + } return false } func (p *balloons) Reconfigure(newCfg interface{}) error { p.BlockMeters() defer p.UnblockMeters() + defer p.commitCpuClasses() balloonsOptions, ok := newCfg.(*BalloonsOptions) if !ok { @@ -1454,6 +1540,19 @@ func (p *balloons) Reconfigure(newCfg interface{}) error { log.Infof("no configuration changes") } else { log.Infof("configuration changes only on CPU classes") + // Update CPUClasses definitions. 
+ p.bpoptions.CPUClasses = newBalloonsOptions.CPUClasses + p.bpoptions.IdleCpuClass = newBalloonsOptions.IdleCpuClass + p.bpoptions.TurboDomain = newBalloonsOptions.TurboDomain + if p.cpuClasses != nil { + if err := p.cpuClasses.Configure(cpuclass.ConfigSpec{ + Classes: p.bpoptions.CPUClasses, + TurboDomain: p.bpoptions.TurboDomain, + Allowed: p.allowed, + }); err != nil { + log.Warnf("failed to reconfigure CPU class handler: %v", err) + } + } // Update new CPU classes to existing balloon // definitions. The same BalloonDef instances // must be kept in use, because each Balloon @@ -1600,6 +1699,95 @@ func (p *balloons) validateConfig(bpoptions *BalloonsOptions) error { if len(undefinedSchedulingClasses) > 0 { return balloonsError("schedulingClass(es) defined in balloonTypes but missing from schedulingClasses: %v", undefinedSchedulingClasses) } + // Validate CPUClasses. + cpuClassNames := map[string]struct{}{} + pctManaged := map[string]string{} // class name -> "high"/"low" + pctAssocOnly := map[string]int{} // class name -> CLOS id + for _, cc := range bpoptions.CPUClasses { + if cc.Name == "" { + return balloonsError("missing or empty name in a cpuClasses entry") + } + if _, dup := cpuClassNames[cc.Name]; dup { + return balloonsError("duplicate cpuClasses name: %q", cc.Name) + } + cpuClassNames[cc.Name] = struct{}{} + // Validate PCT fields. 
+ if cc.PctPriority != "" && cc.PctClosID != nil { + return balloonsError("cpuClass %q: pctPriority and pctClosID are mutually exclusive", cc.Name) + } + switch cc.PctPriority { + case "", "high", "low": + default: + return balloonsError("cpuClass %q: invalid pctPriority %q (allowed: \"high\", \"low\")", cc.Name, cc.PctPriority) + } + if cc.PctPriority != "" { + pctManaged[cc.Name] = cc.PctPriority + } + if cc.PctClosID != nil { + if *cc.PctClosID < 0 { + return balloonsError("cpuClass %q: pctClosID must be >= 0, got %d", cc.Name, *cc.PctClosID) + } + pctAssocOnly[cc.Name] = *cc.PctClosID + } + // pctMinFreq/pctMaxFreq only take effect in managed + // mode (pctPriority); they program the SST CLOS that + // balloons owns. With pctClosID the CLOS is + // pre-programmed by intel-speed-select/BIOS, and + // without any PCT field the cpuClass is not a PCT + // class at all. In both cases these fields are silent + // no-ops; reject them so users don't tweak values that + // have no effect. + if cc.PctMinFreq != 0 || cc.PctMaxFreq != 0 { + switch { + case cc.PctClosID != nil: + return balloonsError("cpuClass %q: pctMinFreq/pctMaxFreq require pctPriority (managed mode); they are incompatible with pctClosID, where the SST CLOS is pre-programmed by intel-speed-select/BIOS", cc.Name) + case cc.PctPriority == "": + return balloonsError("cpuClass %q: pctMinFreq/pctMaxFreq require pctPriority (managed mode); the cpuClass is currently not a PCT class", cc.Name) + } + } + // publishExtendedResource only makes sense for PCT + // classes -- the agent computes capacity from a PCT + // plan. Reject it on non-PCT classes so users don't + // expect a node-level resource that will never be + // published. 
+ if cc.PublishExtendedResource && cc.PctPriority == "" && cc.PctClosID == nil { + return balloonsError("cpuClass %q: publishExtendedResource requires the cpuClass to be a PCT class (set pctPriority or pctClosID)", cc.Name) + } + } + if len(pctManaged) > 0 && len(pctAssocOnly) > 0 { + return balloonsError("mixing managed (pctPriority) and assoc-only (pctClosID) PCT cpuClasses is not allowed: managed=%v, assocOnly=%v", pctManaged, pctAssocOnly) + } + if len(pctManaged) > 0 { + hpClasses, lpClasses := []string{}, []string{} + for name, prio := range pctManaged { + if prio == "high" { + hpClasses = append(hpClasses, name) + } else { + lpClasses = append(lpClasses, name) + } + } + if len(hpClasses) > 1 { + return balloonsError("at most one managed PCT cpuClass with pctPriority=high allowed, got %d: %v", len(hpClasses), hpClasses) + } + if len(lpClasses) > 1 { + return balloonsError("at most one managed PCT cpuClass with pctPriority=low allowed, got %d: %v", len(lpClasses), lpClasses) + } + } + // Verify that cpuClass references in balloon types are + // defined in cpuClasses. Using the legacy control.cpu.classes + // configuration is discouraged and it is possibly out-of-date + // at this point because resource-manager starts controllers + // only after policies. + for _, blnDef := range bpoptions.BalloonDefs { + if blnDef.CpuClass == "" { + continue + } + _, inCPUClasses := cpuClassNames[blnDef.CpuClass] + if !inCPUClasses { + log.Warnf("cpuClass %q referenced by balloon type %q is not defined in cpuClasses", + blnDef.CpuClass, blnDef.Name) + } + } var circularCheck func(name string, seen map[string]int) error circularCheck = func(name string, seen map[string]int) error { if seen[name] > 0 { @@ -1671,6 +1859,20 @@ func (p *balloons) setConfig(bpoptions *BalloonsOptions) error { setOmittedDefaults(bpoptions) + // Set bpoptions early so the turbo allocator construction below + // has access to CPUClasses. 
+ p.bpoptions = bpoptions + + // Construct the CPU class handler that fronts both cpufreq and + // PCT internals. + if p.cpuClasses == nil { + h, err := cpuclass.New(p.options.System) + if err != nil { + return balloonsError("failed to create CPU class handler: %w", err) + } + p.cpuClasses = h + } + reservedBalloonDef, defaultBalloonDef, err := p.fillBuiltinBalloonDefs(bpoptions) if err != nil { return err @@ -1678,6 +1880,16 @@ func (p *balloons) setConfig(bpoptions *BalloonsOptions) error { if err = p.validateConfig(bpoptions); err != nil { return balloonsError("invalid configuration: %w", err) } + // Configure the CPU class handler. Done after validation so we + // don't program platform state (e.g. SST CLOSes) if the + // user-facing config is malformed. + if err := p.cpuClasses.Configure(cpuclass.ConfigSpec{ + Classes: bpoptions.CPUClasses, + TurboDomain: bpoptions.TurboDomain, + Allowed: p.allowed, + }); err != nil { + return balloonsError("failed to configure CPU class handler: %w", err) + } p.fillLoadVirtDevices(bpoptions.LoadClasses) p.fillCloseToDevices(bpoptions.BalloonDefs) p.fillFarFromDevices(bpoptions.BalloonDefs) @@ -1873,6 +2085,95 @@ func (p *balloons) fillCloseToDevices(blnDefs []*BalloonDef) { } } +// cpuClassHintDevPrefix is the prefix used for synthetic virtual +// device names that carry cpuClass placement hints. All such entries +// are owned exclusively by applyCpuClassHints and are discarded on +// every new allocation round, because hints are only valid for the +// allocation they were requested for. +const cpuClassHintDevPrefix = "__cls_" + +// applyCpuClassHints queries the CPU class handler for placement +// hints for an upcoming allocation under cpuClass and merges them +// into opts as synthetic virtual devices. The names start with the +// reserved cpuClassHintDevPrefix so they cannot collide with +// user-configured device names. 
+// +// Any stale cpuClass hints left in opts from a previous allocation +// round are removed first: hints reflect the cpuClass handler's +// view at one specific moment and must not accumulate across +// resize cycles. +// +// - opts: allocator options to extend in place. +// - cpuClass: the cpuClass that the upcoming allocation will use. +// - currentCpus: CPUs the balloon already owns (excluded from HP +// room accounting in PCT hints). +// - requestedCount: number of CPUs the upcoming allocation wants. +// Pass 0 when unknown (e.g. balloon creation before sizing). +func (p *balloons) applyCpuClassHints(opts *cpuTreeAllocatorOptions, cpuClass string, currentCpus cpuset.CPUSet, requestedCount int) { + if p.cpuClasses == nil || opts == nil { + return + } + mergeCpuClassHints(opts, p.cpuClasses, cpuclass.AllocationIntent{ + ClassName: cpuClass, + CurrentCpus: currentCpus, + FreeCpus: p.freeCpus, + RequestedCount: requestedCount, + }) +} + +// cpuClassHints is the minimum surface of cpuclass.Handler that +// policy code relies on for placement hints. It exists so tests +// can substitute a fake provider. +type cpuClassHints interface { + Hints(cpuclass.AllocationIntent) cpuclass.AllocationHints +} + +// mergeCpuClassHints queries provider for placement hints described +// by intent and merges them into opts. It first removes any cpuClass +// hint entries left in opts from a previous allocation round so +// hints from this round are the only ones in effect. 
+func mergeCpuClassHints(opts *cpuTreeAllocatorOptions, provider cpuClassHints, intent cpuclass.AllocationIntent) { + if opts == nil || provider == nil { + return + } + if opts.virtDevCpusets == nil { + opts.virtDevCpusets = map[string][]cpuset.CPUSet{} + } + opts.preferCloseToDevices = filterOutHintDevs(opts.preferCloseToDevices) + opts.preferFarFromDevices = filterOutHintDevs(opts.preferFarFromDevices) + for name := range opts.virtDevCpusets { + if strings.HasPrefix(name, cpuClassHintDevPrefix) { + delete(opts.virtDevCpusets, name) + } + } + hints := provider.Hints(intent) + for i, pref := range hints.Prefer { + name := fmt.Sprintf("%spref_%d_%s", cpuClassHintDevPrefix, i, pref.Name) + opts.virtDevCpusets[name] = []cpuset.CPUSet{pref.Cpus} + opts.preferCloseToDevices = append(opts.preferCloseToDevices, name) + log.Debugf("cpuclass hint: prefer %q -> %s", name, pref.Cpus) + } + for i, av := range hints.Avoid { + name := fmt.Sprintf("%savoid_%d_%s", cpuClassHintDevPrefix, i, av.Name) + opts.virtDevCpusets[name] = []cpuset.CPUSet{av.Cpus} + opts.preferFarFromDevices = append(opts.preferFarFromDevices, name) + log.Debugf("cpuclass hint: avoid %q -> %s", name, av.Cpus) + } +} + +// filterOutHintDevs returns devs with all cpuClass hint device names +// (those carrying cpuClassHintDevPrefix) removed. The returned slice +// reuses devs' backing array. +func filterOutHintDevs(devs []string) []string { + out := devs[:0] + for _, d := range devs { + if !strings.HasPrefix(d, cpuClassHintDevPrefix) { + out = append(out, d) + } + } + return out +} + // fillFarFromDevices adds BalloonDefs implicit device anti-affinities // towards devices that other BalloonDefs prefer to be close to. 
func (p *balloons) fillFarFromDevices(blnDefs []*BalloonDef) { @@ -2010,6 +2311,7 @@ func (p *balloons) resizeBalloon(bln *Balloon, newMilliCpus int) error { } }() p.updateLoadedVirtDevsInAllocatorOptions(&bln.cpuTreeAlloc.options, bln.Def.Loads) + p.applyCpuClassHints(&bln.cpuTreeAlloc.options, p.resolveCpuClassName(bln.Def.CpuClass), bln.Cpus, cpuCountDelta) if cpuCountDelta > 0 { // Inflate the balloon. addFromCpus, _, err := bln.cpuTreeAlloc.ResizeCpus(bln.Cpus, p.freeCpus, cpuCountDelta) diff --git a/cmd/plugins/balloons/policy/cpuclass_test.go b/cmd/plugins/balloons/policy/cpuclass_test.go new file mode 100644 index 000000000..c2aa63345 --- /dev/null +++ b/cmd/plugins/balloons/policy/cpuclass_test.go @@ -0,0 +1,175 @@ +// Copyright The NRI Plugins Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package balloons + +import ( + "strings" + "testing" + + "github.com/containers/nri-plugins/pkg/resmgr/cpuclass" + "github.com/containers/nri-plugins/pkg/utils/cpuset" +) + +// fakeHintProvider returns a scripted sequence of cpuclass.AllocationHints, +// one per Hints() call. After the script is exhausted it keeps +// returning the last entry. 
+type fakeHintProvider struct { + script []cpuclass.AllocationHints + calls int +} + +func (f *fakeHintProvider) Hints(_ cpuclass.AllocationIntent) cpuclass.AllocationHints { + i := f.calls + if i >= len(f.script) { + i = len(f.script) - 1 + } + f.calls++ + return f.script[i] +} + +func countHintDevs(devs []string) int { + n := 0 + for _, d := range devs { + if strings.HasPrefix(d, cpuClassHintDevPrefix) { + n++ + } + } + return n +} + +func countHintMapKeys(m map[string][]cpuset.CPUSet) int { + n := 0 + for k := range m { + if strings.HasPrefix(k, cpuClassHintDevPrefix) { + n++ + } + } + return n +} + +// TestMergeCpuClassHintsNoAccumulation verifies that repeated +// allocation rounds do not cause cpuClass hint entries to accumulate +// in cpuTreeAllocatorOptions. Each round must leave behind exactly +// the hint count reported by the provider on that round, regardless +// of how many earlier rounds added different hints. +func TestMergeCpuClassHintsNoAccumulation(t *testing.T) { + cpusA := cpuset.MustParse("2-3") + cpusB := cpuset.MustParse("4-5") + cpusC := cpuset.MustParse("6-7") + cpusAvoid := cpuset.MustParse("0-1") + + provider := &fakeHintProvider{ + script: []cpuclass.AllocationHints{ + // Round 1: one prefer (A), one avoid. + { + Prefer: []cpuclass.CpuPreference{{Name: "hp-reserve", Cpus: cpusA}}, + Avoid: []cpuclass.CpuPreference{{Name: "lp-clos", Cpus: cpusAvoid}}, + }, + // Round 2: two prefers (A, B) - different name at index 1 + // so the slot-0 name stays stable, slot-1 is new. + { + Prefer: []cpuclass.CpuPreference{ + {Name: "hp-reserve", Cpus: cpusA}, + {Name: "extra", Cpus: cpusB}, + }, + Avoid: []cpuclass.CpuPreference{{Name: "lp-clos", Cpus: cpusAvoid}}, + }, + // Round 3: name at slot 0 CHANGES to C - without proper + // cleanup the stale "__cls_pref_0_hp-reserve" map key from + // rounds 1+2 would survive into round 3. 
+ { + Prefer: []cpuclass.CpuPreference{{Name: "third", Cpus: cpusC}}, + Avoid: nil, + }, + }, + } + + opts := &cpuTreeAllocatorOptions{ + preferCloseToDevices: []string{"user-dev-A", "user-dev-B"}, + preferFarFromDevices: []string{"user-far"}, + virtDevCpusets: map[string][]cpuset.CPUSet{}, + } + + for round := 1; round <= 3; round++ { + mergeCpuClassHints(opts, provider, cpuclass.AllocationIntent{}) + + gotPrefDevs := countHintDevs(opts.preferCloseToDevices) + gotFarDevs := countHintDevs(opts.preferFarFromDevices) + gotMapKeys := countHintMapKeys(opts.virtDevCpusets) + + var expPref, expFar int + switch round { + case 1: + expPref, expFar = 1, 1 + case 2: + expPref, expFar = 2, 1 + case 3: + expPref, expFar = 1, 0 + } + if gotPrefDevs != expPref { + t.Errorf("round %d: preferCloseToDevices hint count = %d, want %d (slice=%v)", + round, gotPrefDevs, expPref, opts.preferCloseToDevices) + } + if gotFarDevs != expFar { + t.Errorf("round %d: preferFarFromDevices hint count = %d, want %d (slice=%v)", + round, gotFarDevs, expFar, opts.preferFarFromDevices) + } + if gotMapKeys != expPref+expFar { + t.Errorf("round %d: virtDevCpusets hint key count = %d, want %d (keys=%v)", + round, gotMapKeys, expPref+expFar, mapKeys(opts.virtDevCpusets)) + } + } + + // Sanity: user-supplied (non-hint) devices must survive untouched. 
+ if got := userDevs(opts.preferCloseToDevices); len(got) != 2 || + got[0] != "user-dev-A" || got[1] != "user-dev-B" { + t.Errorf("user preferCloseToDevices were modified: got %v", got) + } + if got := userDevs(opts.preferFarFromDevices); len(got) != 1 || got[0] != "user-far" { + t.Errorf("user preferFarFromDevices were modified: got %v", got) + } +} + +func TestFilterOutHintDevs(t *testing.T) { + in := []string{"a", "__cls_pref_0_x", "b", "__cls_avoid_0_y", "c"} + got := filterOutHintDevs(in) + want := []string{"a", "b", "c"} + if len(got) != len(want) { + t.Fatalf("filterOutHintDevs len=%d, want %d: got=%v", len(got), len(want), got) + } + for i := range want { + if got[i] != want[i] { + t.Errorf("filterOutHintDevs[%d] = %q, want %q", i, got[i], want[i]) + } + } +} + +func userDevs(devs []string) []string { + out := []string{} + for _, d := range devs { + if !strings.HasPrefix(d, cpuClassHintDevPrefix) { + out = append(out, d) + } + } + return out +} + +func mapKeys(m map[string][]cpuset.CPUSet) []string { + out := make([]string, 0, len(m)) + for k := range m { + out = append(out, k) + } + return out +} diff --git a/cmd/plugins/balloons/policy/flags.go b/cmd/plugins/balloons/policy/flags.go index 19889ea15..e39809ca3 100644 --- a/cmd/plugins/balloons/policy/flags.go +++ b/cmd/plugins/balloons/policy/flags.go @@ -24,6 +24,8 @@ type ( BalloonDef = cfgapi.BalloonDef LoadClass = cfgapi.LoadClass SchedulingClass = cfgapi.SchedulingClass + CPUClass = cfgapi.CPUClass + Frequency = cfgapi.Frequency CPUTopologyLevel = cfgapi.CPUTopologyLevel ) From ec2b7a62fa303cf130a90cdd5ece07664790e27c Mon Sep 17 00:00:00 2001 From: Antti Kervinen Date: Fri, 5 Jun 2026 14:25:10 +0300 Subject: [PATCH 07/10] resmgr: add policy.GetExtendedResources interface and lifecycle wiring Signed-off-by: Antti Kervinen --- .../template/policy/template-policy.go | 6 ++++++ .../policy/topology-aware-policy.go | 6 ++++++ pkg/resmgr/main/main.go | 21 +++++++++++++++++++ pkg/resmgr/nri.go | 3 +++ 
pkg/resmgr/policy/policy.go | 17 +++++++++++++++ pkg/resmgr/resource-manager.go | 10 +++++++++ 6 files changed, 63 insertions(+) diff --git a/cmd/plugins/template/policy/template-policy.go b/cmd/plugins/template/policy/template-policy.go index 4a31092b9..cb9ede8c1 100644 --- a/cmd/plugins/template/policy/template-policy.go +++ b/cmd/plugins/template/policy/template-policy.go @@ -119,6 +119,12 @@ func (p *policy) GetTopologyZones() []*policyapi.TopologyZone { return nil } +// GetExtendedResources returns the node-level extended resources +// to publish for this policy. The template policy publishes none. +func (p *policy) GetExtendedResources() map[string]int64 { + return nil +} + // ExportResourceData provides resource data to export for the container. func (p *policy) ExportResourceData(c cache.Container) map[string]string { return nil diff --git a/cmd/plugins/topology-aware/policy/topology-aware-policy.go b/cmd/plugins/topology-aware/policy/topology-aware-policy.go index becdac4fb..65616bc62 100644 --- a/cmd/plugins/topology-aware/policy/topology-aware-policy.go +++ b/cmd/plugins/topology-aware/policy/topology-aware-policy.go @@ -395,6 +395,12 @@ func (p *policy) GetTopologyZones() []*policyapi.TopologyZone { return zones } +// GetExtendedResources returns the node-level extended resources +// to publish for this policy. The topology-aware policy publishes none. +func (p *policy) GetExtendedResources() map[string]int64 { + return nil +} + // ExportResourceData provides resource data to export for the container. 
func (p *policy) ExportResourceData(c cache.Container) map[string]string { grant, ok := p.allocations.grants[c.GetID()] diff --git a/pkg/resmgr/main/main.go b/pkg/resmgr/main/main.go index 89f4bc80a..b756366a6 100644 --- a/pkg/resmgr/main/main.go +++ b/pkg/resmgr/main/main.go @@ -18,6 +18,7 @@ import ( "flag" "fmt" "os" + "os/signal" "strings" "syscall" @@ -67,6 +68,26 @@ func (m *Main) Run() error { } defer m.stopTracing() + // Install a SIGTERM/SIGINT handler that triggers a graceful + // agent shutdown: this lets us clean up node state (e.g., + // extended resources we published) before the kubelet kills + // the container. Closing the agent's stop channel makes its + // event loop return, which unwinds m.mgr.Start() and lets + // Run() exit normally. + sigCh := make(chan os.Signal, 1) + signal.Notify(sigCh, syscall.SIGTERM, syscall.SIGINT) + go func() { + sig, ok := <-sigCh + if !ok { + return + } + log.Infof("received signal %s, shutting down gracefully", sig) + if m.agt != nil { + m.agt.Stop() + } + }() + defer signal.Stop(sigCh) + err := m.mgr.Start() return err } diff --git a/pkg/resmgr/nri.go b/pkg/resmgr/nri.go index 0678fe0bf..efb14ff81 100644 --- a/pkg/resmgr/nri.go +++ b/pkg/resmgr/nri.go @@ -284,6 +284,7 @@ func (p *nriPlugin) Synchronize(ctx context.Context, pods []*api.PodSandbox, con } m.updateTopologyZones() + m.updateNodeExtendedResources() return p.getPendingUpdates(nil), nil } @@ -445,6 +446,7 @@ func (p *nriPlugin) CreateContainer(ctx context.Context, pod *api.PodSandbox, co m.policy.ExportResourceData(c) m.updateTopologyZones() + m.updateNodeExtendedResources() adjust = p.getPendingAdjustment(container) updates = p.getPendingUpdates(container) @@ -596,6 +598,7 @@ func (p *nriPlugin) StopContainer(ctx context.Context, pod *api.PodSandbox, cont c.UpdateState(cache.ContainerStateExited) m.updateTopologyZones() + m.updateNodeExtendedResources() return p.getPendingUpdates(container), nil } diff --git a/pkg/resmgr/policy/policy.go 
b/pkg/resmgr/policy/policy.go index ceaff7934..a682422f6 100644 --- a/pkg/resmgr/policy/policy.go +++ b/pkg/resmgr/policy/policy.go @@ -117,6 +117,14 @@ type Backend interface { ExportResourceData(cache.Container) map[string]string // GetTopologyZones returns the policy/pool data for 'topology zone' CRDs. GetTopologyZones() []*TopologyZone + // GetExtendedResources returns node-level extended resources + // this policy wishes to publish on the local Node, mapping + // fully-qualified resource name (e.g. + // "cpuclass.balloons.nri.io/hp-pct") to its current capacity + // (logical CPU count). Returning nil or an empty map means + // "publish nothing"; previously-published resources are then + // cleared by the agent. + GetExtendedResources() map[string]int64 } // Policy is the exposed interface for container resource allocations decision making. @@ -143,6 +151,9 @@ type Policy interface { ExportResourceData(cache.Container) // GetTopologyZones returns the policy/pool data for 'topology zone' CRDs. GetTopologyZones() []*TopologyZone + // GetExtendedResources returns node-level extended resources + // the active policy wishes to publish on the local Node. + GetExtendedResources() map[string]int64 } // Metrics is the interface we expect policy-specific metrics to implement. @@ -336,3 +347,9 @@ func (p *policy) ExportResourceData(c cache.Container) { func (p *policy) GetTopologyZones() []*TopologyZone { return p.active.GetTopologyZones() } + +// GetExtendedResources returns node-level extended resources the +// active policy wishes to publish on the local Node. 
+func (p *policy) GetExtendedResources() map[string]int64 { + return p.active.GetExtendedResources() +} diff --git a/pkg/resmgr/resource-manager.go b/pkg/resmgr/resource-manager.go index 28e5b9434..fe6335a41 100644 --- a/pkg/resmgr/resource-manager.go +++ b/pkg/resmgr/resource-manager.go @@ -160,6 +160,7 @@ func (m *resmgr) updateConfig(newCfg interface{}) (bool, error) { reconfErr := m.reconfigure(cfg) m.updateTopologyZones() + m.updateNodeExtendedResources() return false, reconfErr } @@ -297,6 +298,15 @@ func (m *resmgr) updateTopologyZones() { } } +// updateNodeExtendedResources publishes (or clears) the +// node-level extended resources the active policy advertises. +func (m *resmgr) updateNodeExtendedResources() { + resources := m.policy.GetExtendedResources() + if err := m.agent.UpdateNodeExtendedResources(resources); err != nil { + log.Errorf("failed to update node extended resources: %v", err) + } +} + func (m *resmgr) reconfigure(cfg cfgapi.ResmgrConfig) error { apply := func(cfg cfgapi.ResmgrConfig) error { mCfg := cfg.CommonConfig() From 46f374a1b2d1e89b6c693c8811da85313fe27ed0 Mon Sep 17 00:00:00 2001 From: Antti Kervinen Date: Fri, 5 Jun 2026 14:25:10 +0300 Subject: [PATCH 08/10] helm: grant balloons access to node extended resources Signed-off-by: Antti Kervinen --- .../helm/balloons/templates/clusterrole.yaml | 8 ++++++++ deployment/helm/balloons/templates/daemonset.yaml | 14 ++++++++++++++ deployment/helm/balloons/values.yaml | 8 ++++++++ 3 files changed, 30 insertions(+) diff --git a/deployment/helm/balloons/templates/clusterrole.yaml b/deployment/helm/balloons/templates/clusterrole.yaml index 3c40d3e47..4ff4199b1 100644 --- a/deployment/helm/balloons/templates/clusterrole.yaml +++ b/deployment/helm/balloons/templates/clusterrole.yaml @@ -12,6 +12,14 @@ rules: verbs: - get - watch +- apiGroups: + - "" + resources: + - nodes/status + verbs: + - get + - patch + - update - apiGroups: - topology.node.k8s.io resources: diff --git 
a/deployment/helm/balloons/templates/daemonset.yaml b/deployment/helm/balloons/templates/daemonset.yaml index 2d190fb94..0af965d6e 100644 --- a/deployment/helm/balloons/templates/daemonset.yaml +++ b/deployment/helm/balloons/templates/daemonset.yaml @@ -99,6 +99,9 @@ spec: image: {{ .Values.image.name }}:{{ .Values.image.tag | default .Chart.AppVersion }} imagePullPolicy: {{ .Values.image.pullPolicy }} securityContext: + {{- if .Values.pct }} + privileged: true + {{- else }} allowPrivilegeEscalation: false capabilities: drop: @@ -108,6 +111,7 @@ spec: - SYS_ADMIN - DAC_OVERRIDE {{- end }} + {{- end }} resources: requests: cpu: {{ .Values.resources.cpu }} @@ -124,6 +128,10 @@ spec: - name: pod-resources-socket mountPath: /var/lib/kubelet/pod-resources readOnly: true + {{- if .Values.pct }} + - name: hostdev + mountPath: /host/dev + {{- end }} {{- if .Values.podPriorityClassNodeCritical }} priorityClassName: system-node-critical {{- end }} @@ -147,6 +155,12 @@ spec: hostPath: path: /var/lib/kubelet/pod-resources type: DirectoryOrCreate + {{- if .Values.pct }} + - name: hostdev + hostPath: + path: /dev + type: Directory + {{- end }} {{- if .Values.nri.runtime.patchConfig }} - name: containerd-config hostPath: diff --git a/deployment/helm/balloons/values.yaml b/deployment/helm/balloons/values.yaml index a4e00d23d..e7f9bd595 100644 --- a/deployment/helm/balloons/values.yaml +++ b/deployment/helm/balloons/values.yaml @@ -160,6 +160,14 @@ nodeSelector: [] # nodeSelector: # kubernetes.io/disk: "ssd" +# Enable support for Intel Speed Select Technology (SST), required by +# the Priority Core Turbo (PCT) feature of the balloons policy. When +# true, the plugin pod is granted access to the host SST device by +# running as privileged and mounting /dev from the host at /host/dev. +# Enable this only on nodes where PCT cpuClasses (with pctPriority or +# pctClosID) are used. +pct: false + # NRI plugins should be considered as part of the container runtime. 
# By default we make them part of the system-node-critical priority # class. This should mitigate the potential risk of a plugin getting From 09b39f35bf2db8a576cac0560e2547d3c9758cdc Mon Sep 17 00:00:00 2001 From: Antti Kervinen Date: Fri, 5 Jun 2026 14:25:10 +0300 Subject: [PATCH 09/10] e2e: test balloons cpuClasses, turbo priority and PCT Signed-off-by: Antti Kervinen --- .../balloons/balloons-config.yaml.in | 42 +- .../balloons-cstates.cfg | 13 +- .../balloons-turbo-defaultclass.cfg | 36 ++ .../balloons-turbo-oldsyntax.cfg | 37 ++ .../test18-turbo-priority/balloons-turbo.cfg | 49 ++ .../n4c16/test18-turbo-priority/code.var.sh | 582 ++++++++++++++++++ .../test19-pct/balloons-pct-assoconly.cfg | 40 ++ .../n4c16/test19-pct/balloons-pct-invalid.cfg | 5 + .../n4c16/test19-pct/balloons-pct-managed.cfg | 56 ++ .../balloons/n4c16/test19-pct/code.var.sh | 237 +++++++ 10 files changed, 1074 insertions(+), 23 deletions(-) create mode 100644 test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/balloons-turbo-defaultclass.cfg create mode 100644 test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/balloons-turbo-oldsyntax.cfg create mode 100644 test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/balloons-turbo.cfg create mode 100644 test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/code.var.sh create mode 100644 test/e2e/policies.test-suite/balloons/n4c16/test19-pct/balloons-pct-assoconly.cfg create mode 100644 test/e2e/policies.test-suite/balloons/n4c16/test19-pct/balloons-pct-invalid.cfg create mode 100644 test/e2e/policies.test-suite/balloons/n4c16/test19-pct/balloons-pct-managed.cfg create mode 100644 test/e2e/policies.test-suite/balloons/n4c16/test19-pct/code.var.sh diff --git a/test/e2e/policies.test-suite/balloons/balloons-config.yaml.in b/test/e2e/policies.test-suite/balloons/balloons-config.yaml.in index 8ebcfaacb..aa11dcf33 100644 --- a/test/e2e/policies.test-suite/balloons/balloons-config.yaml.in +++ 
b/test/e2e/policies.test-suite/balloons/balloons-config.yaml.in @@ -61,19 +61,29 @@ spec: debug: - policy - control: - cpu: - classes: - default: - minFreq: ${CPU_DEFAULT_MIN:-800000} - maxFreq: ${CPU_DEFAULT_MAX:-2800000} - classA: - minFreq: ${CPU_CLASSA_MIN:-900000} - maxFreq: ${CPU_CLASSA_MAX:-2900000} - classB: - minFreq: ${CPU_CLASSB_MIN:-1000000} - maxFreq: ${CPU_CLASSB_MAX:-3000000} - classC: - minFreq: ${CPU_CLASSC_MIN:-1100000} - maxFreq: ${CPU_CLASSC_MAX:-3100000} - energyPerformancePreference: ${CPU_CLASSC_EPP:-1} + cpuClasses: + + $([ -n "$CPUCLASS_DEFAULT_SKIP" ] || echo " + - name: default + minFreq: ${CPU_DEFAULT_MIN:-800MHz} + maxFreq: ${CPU_DEFAULT_MAX:-2.8GHz} + ") + + $([ -n "$CPUCLASS_A_SKIP" ] || echo " + - name: classA + minFreq: ${CPU_CLASSA_MIN:-900MHz} + maxFreq: ${CPU_CLASSA_MAX:-2.9GHz} + ") + + $([ -n "$CPUCLASS_B_SKIP" ] || echo " + - name: classB + minFreq: ${CPU_CLASSB_MIN:-1GHz} + maxFreq: ${CPU_CLASSB_MAX:-3GHz} + ") + + $([ -n "$CPUCLASS_C_SKIP" ] || echo " + - name: classC + minFreq: ${CPU_CLASSC_MIN:-1.1GHz} + maxFreq: ${CPU_CLASSC_MAX:-3.1GHz} + energyPerformancePreference: ${CPU_CLASSC_EPP:-1} + ") diff --git a/test/e2e/policies.test-suite/balloons/n4c16/test17-cstates-scheduling/balloons-cstates.cfg b/test/e2e/policies.test-suite/balloons/n4c16/test17-cstates-scheduling/balloons-cstates.cfg index 215432bf1..03924e9b9 100644 --- a/test/e2e/policies.test-suite/balloons/n4c16/test17-cstates-scheduling/balloons-cstates.cfg +++ b/test/e2e/policies.test-suite/balloons/n4c16/test17-cstates-scheduling/balloons-cstates.cfg @@ -16,13 +16,11 @@ config: cpuClass: lowlatency-class schedulingClass: realtime - control: - cpu: - classes: - lowlatency-class: - disabledCstates: [C4, C6, C8, C10] - default-class: - disabledCstates: [] + cpuClasses: + - name: lowlatency-class + disabledCstates: [C4, C6, C8, C10] + - name: default-class + disabledCstates: [] schedulingClasses: - name: realtime @@ -40,5 +38,6 @@ config: - policy - nri-plugin - cpu 
+ - cpuclass extraEnv: OVERRIDE_SYS_CSTATES: '''[{"cpus": "0-15", "names": ["C1E", "C2", "C4", "C8"], "files": {"disable": "0"}}]''' diff --git a/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/balloons-turbo-defaultclass.cfg b/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/balloons-turbo-defaultclass.cfg new file mode 100644 index 000000000..ecc2d6389 --- /dev/null +++ b/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/balloons-turbo-defaultclass.cfg @@ -0,0 +1,36 @@ +config: + agent: + nodeResourceTopology: true + allocatorTopologyBalancing: false + availableResources: + cpu: cpuset:2-7,10-13 + reservedResources: + cpu: 750m + + pinCPU: true + + # Intentionally no idleCPUClass and no cpuClass on the reserved + # balloon type: both must fall back to the cpuClass named "default". + balloonTypes: + - name: reserved + - name: fast-bln + cpuClass: fast + minCPUs: 1 + maxCPUs: 1 + + cpuClasses: + - name: default + minFreq: "min" + maxFreq: "base" + - name: fast + minFreq: "turbo" + maxFreq: "turbo" + + log: + debug: + - policy + - nri-plugin + - cpu + - cpuclass +extraEnv: + OVERRIDE_SYS_CPUFREQ: '''[{"cpus": "0-15", "base": 2900000, "min": 800000, "max": 3800000}]''' diff --git a/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/balloons-turbo-oldsyntax.cfg b/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/balloons-turbo-oldsyntax.cfg new file mode 100644 index 000000000..90b6a4de2 --- /dev/null +++ b/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/balloons-turbo-oldsyntax.cfg @@ -0,0 +1,37 @@ +config: + agent: + nodeResourceTopology: true + allocatorTopologyBalancing: false + availableResources: + cpu: cpuset:2-7,10-13 + reservedResources: + cpu: 750m + + pinCPU: true + + idleCPUClass: legacy-idle + + balloonTypes: + - name: legacy-bln + cpuClass: legacy-fast + minCPUs: 1 + maxCPUs: 1 + + control: + cpu: + classes: + legacy-idle: + minFreq: 800000 + maxFreq: 
2900000 + legacy-fast: + minFreq: 3800000 + maxFreq: 3800000 + + log: + debug: + - policy + - nri-plugin + - cpu + - cpuclass +extraEnv: + OVERRIDE_SYS_CPUFREQ: '''[{"cpus": "0-15", "base": 2900000, "min": 800000, "max": 3800000}]''' diff --git a/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/balloons-turbo.cfg b/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/balloons-turbo.cfg new file mode 100644 index 000000000..6ece6cf3b --- /dev/null +++ b/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/balloons-turbo.cfg @@ -0,0 +1,49 @@ +config: + agent: + nodeResourceTopology: true + allocatorTopologyBalancing: false + availableResources: + cpu: cpuset:2-7,10-13 + reservedResources: + cpu: 750m + + pinCPU: true + + idleCPUClass: default-noturbo + + balloonTypes: + - name: reserved + cpuClass: default-turbo + - name: turbo-high-bln + cpuClass: turbo-high + minCPUs: 1 + maxCPUs: 2 + - name: turbo-low-bln + cpuClass: turbo-low + minCPUs: 1 + maxCPUs: 2 + + cpuClasses: + - name: turbo-high + minFreq: "turbo" + maxFreq: "turbo" + turboPriority: 10 + - name: turbo-low + minFreq: "turbo" + maxFreq: "turbo" + turboPriority: 1 + - name: default-turbo + minFreq: "min" + maxFreq: "turbo" + - name: default-noturbo + minFreq: "min" + maxFreq: "base" + + log: + debug: + - policy + - nri-plugin + - cpu + - cpuclass +extraEnv: + OVERRIDE_SYS_CPUFREQ: '''[{"cpus": "0-15", "base": 2900000, "min": 800000, "max": 3800000}]''' diff --git a/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/code.var.sh b/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/code.var.sh new file mode 100644 index 000000000..904b582cb --- /dev/null +++ b/test/e2e/policies.test-suite/balloons/n4c16/test18-turbo-priority/code.var.sh @@ -0,0 +1,582 @@ +# Test turbo priority: highest-priority active CPU class gets turbo, +# others get base. When the highest-priority balloon is removed, +# the next highest-priority class regains turbo. 
+# +# Also verifies CPU frequency write minimality: +# - no duplicate sysfs writes (each (cpu, prop, freq) tuple is logged +# at most once per recorded snapshot window, thanks to the per-CPU +# last-written cache in pkg/resmgr/control/cpu), +# - writes do happen on class transitions (turbo<->base) and when +# idle CPUs need their initial class settings, +# - a no-op event (creating a 2nd container that lands in the +# *same* turbo-low balloon as pod0) does not produce any new +# enforce writes. + +helm-terminate +helm_config=$TEST_DIR/balloons-turbo.cfg helm-launch balloons + +# turbo-log fetches the latest turbo recalculation log lines +turbo-log() { + local last_n=${1:-20} + vm-command "kubectl -n kube-system logs ds/nri-resource-policy-balloons | grep -E 'turbo:|cpuClass' | tail -n $last_n" +} + +# verify-turbo-winner checks that the given class is logged as a turbo winner +# with the expected maxFreq, within the last N turbo log lines. +verify-turbo-winner() { + local class=$1 + local expected_max_freq=$2 + local last_n=${3:-20} + echo "verify turbo winner: class=$class maxFreq=$expected_max_freq" + turbo-log $last_n + grep "class \"$class\"" <<< "$COMMAND_OUTPUT" | grep "winner=true" | tail -n 1 | grep -q "maxFreq=$expected_max_freq" || { + command-error "expected class $class as turbo winner with maxFreq=$expected_max_freq" + } +} + +# verify-turbo-loser checks that the given class is logged as NOT a turbo winner +# (winner=false) with the expected maxFreq (base), within the last N turbo log lines. 
+verify-turbo-loser() { + local class=$1 + local expected_max_freq=$2 + local last_n=${3:-20} + echo "verify turbo loser: class=$class maxFreq=$expected_max_freq" + turbo-log $last_n + grep "class \"$class\"" <<< "$COMMAND_OUTPUT" | grep "winner=false" | tail -n 1 | grep -q "maxFreq=$expected_max_freq" || { + command-error "expected class $class as turbo loser with maxFreq=$expected_max_freq" + } +} + +ENFORCE_PATTERN='enforcing cpu frequency' + +# enforce-count returns the total number of "enforcing cpu frequency" log lines so far. +enforce-count() { + vm-command "kubectl -n kube-system logs ds/nri-resource-policy-balloons | grep -c '$ENFORCE_PATTERN' || true" >/dev/null + echo "$COMMAND_OUTPUT" | tr -d '[:space:]' +} + +# wait-enforce-grows BASELINE [timeout=15] +# Polls until the cumulative number of enforce writes is greater than BASELINE. +wait-enforce-grows() { + local baseline=$1 + local timeout=${2:-15} + vm-run-until --timeout "$timeout" \ + "[ \$(kubectl -n kube-system logs ds/nri-resource-policy-balloons 2>/dev/null | grep -c '$ENFORCE_PATTERN') -gt $baseline ]" || { + command-error "expected enforce-count to grow above $baseline within ${timeout}s" + } +} + +# wait-pod-gone POD [timeout=30] +# Polls until the named pod no longer exists. +wait-pod-gone() { + local pod=$1 + local timeout=${2:-30} + vm-run-until --timeout "$timeout" "! kubectl get pod $pod -o name 2>/dev/null | grep -q ." || { + command-error "pod $pod did not disappear within ${timeout}s" + } +} + +# enforce-lines-since prints the enforce log lines added since the given absolute count. +enforce-lines-since() { + local from=$1 + vm-command "kubectl -n kube-system logs ds/nri-resource-policy-balloons | grep '$ENFORCE_PATTERN' | tail -n +$((from+1))" >/dev/null +} + +# assert-step-writes