Skip to content

Commit 62c31b1

Browse files
jkarytsorya
authored and committed
NVIDIA-596: pass DPU lease config via env vars on dpu-host/dpu DaemonSets
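Add a configurable DPU node lease renew interval and lease duration, exposed as the OVNKUBE_NODE_LEASE_RENEW_INTERVAL and OVNKUBE_NODE_LEASE_DURATION env vars on ovnkube-controller and gated to the dpu-host and dpu modes. The script-lib turns these env vars into the --dpu-node-lease-renew-interval and --dpu-node-lease-duration CLI flags. The values are read from the hardware-offload-config ConfigMap, with defaults of 10s and 40s. Setting either to 0 disables the health check. The lease namespace is derived via fieldRef. Jira: https://issues.redhat.com/browse/NVIDIA-596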
1 parent 3b5ef2d commit 62c31b1

9 files changed

Lines changed: 398 additions & 93 deletions

File tree

bindata/network/ovn-kubernetes/common/008-script-lib.yaml

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -508,6 +508,9 @@ data:
508508
# Ensure openflow_probe_flag is always defined
509509
openflow_probe_flag=
510510

511+
# Ensure dpu_lease_flags is always defined
512+
dpu_lease_flags=
513+
511514
if [[ $# -ne 3 ]]; then
512515
echo "Expected three arguments but got $#"
513516
exit 1
@@ -550,7 +553,15 @@ data:
550553

551554
# disable init-ovnkube-controller for dpu-host mode as it is not supported
552555
init_ovnkube_controller=""
556+
fi
553557

558+
if [ "${OVN_NODE_MODE}" == "dpu-host" ] || [ "${OVN_NODE_MODE}" == "dpu" ]; then
559+
if [[ -n "${OVNKUBE_NODE_LEASE_RENEW_INTERVAL}" ]]; then
560+
dpu_lease_flags="--dpu-node-lease-renew-interval ${OVNKUBE_NODE_LEASE_RENEW_INTERVAL}"
561+
fi
562+
if [[ -n "${OVNKUBE_NODE_LEASE_DURATION}" ]]; then
563+
dpu_lease_flags="$dpu_lease_flags --dpu-node-lease-duration ${OVNKUBE_NODE_LEASE_DURATION}"
564+
fi
554565
fi
555566

556567
if [ "{{.OVN_GATEWAY_MODE}}" == "shared" ]; then
@@ -702,5 +713,6 @@ data:
702713
${ovn_v4_masquerade_subnet_opt} \
703714
${ovn_v6_masquerade_subnet_opt} \
704715
${ovn_v4_transit_switch_subnet_opt} \
705-
${ovn_v6_transit_switch_subnet_opt}
716+
${ovn_v6_transit_switch_subnet_opt} \
717+
${dpu_lease_flags}
706718
}

bindata/network/ovn-kubernetes/managed/ovnkube-node.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -437,6 +437,12 @@ spec:
437437
- name: OVNKUBE_NODE_MGMT_PORT_DP_RESOURCE_NAME
438438
value: {{ .MgmtPortResourceName }}
439439
{{ end }}
440+
{{ if and (or (eq .OVN_NODE_MODE "dpu-host") (eq .OVN_NODE_MODE "dpu")) (ne .DpuNodeLeaseRenewInterval "0") }}
441+
- name: OVNKUBE_NODE_LEASE_RENEW_INTERVAL
442+
value: "{{.DpuNodeLeaseRenewInterval}}"
443+
- name: OVNKUBE_NODE_LEASE_DURATION
444+
value: "{{.DpuNodeLeaseDuration}}"
445+
{{ end }}
440446
{{ if .HTTP_PROXY }}
441447
- name: "HTTP_PROXY"
442448
value: "{{ .HTTP_PROXY}}"

bindata/network/ovn-kubernetes/self-hosted/ovnkube-node.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -472,6 +472,12 @@ spec:
472472
- name: OVNKUBE_NODE_MGMT_PORT_DP_RESOURCE_NAME
473473
value: {{ .MgmtPortResourceName }}
474474
{{ end }}
475+
{{ if and (or (eq .OVN_NODE_MODE "dpu-host") (eq .OVN_NODE_MODE "dpu")) (ne .DpuNodeLeaseRenewInterval "0") }}
476+
- name: OVNKUBE_NODE_LEASE_RENEW_INTERVAL
477+
value: "{{.DpuNodeLeaseRenewInterval}}"
478+
- name: OVNKUBE_NODE_LEASE_DURATION
479+
value: "{{.DpuNodeLeaseDuration}}"
480+
{{ end }}
475481
- name: K8S_NODE
476482
valueFrom:
477483
fieldRef:

hack/hardware-offload-config.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,3 +12,5 @@ data:
1212
dpu-mode-label: "network.operator.openshift.io/dpu="
1313
smart-nic-mode-label: "network.operator.openshift.io/smart-nic="
1414
mgmt-port-resource-name: "openshift.io/mgmtvf"
15+
dpu-node-lease-renew-interval: "10"
16+
dpu-node-lease-duration: "40"

pkg/bootstrap/types.go

Lines changed: 14 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -30,18 +30,20 @@ type OVNHyperShiftBootstrapResult struct {
3030
}
3131

3232
type OVNConfigBoostrapResult struct {
33-
GatewayMode string
34-
HyperShiftConfig *OVNHyperShiftBootstrapResult
35-
DisableUDPAggregation bool
36-
DpuHostModeLabel string
37-
DpuHostModeNodes []string
38-
DpuHostModeValue string
39-
DpuModeLabel string
40-
DpuModeNodes []string
41-
SmartNicModeLabel string
42-
SmartNicModeNodes []string
43-
SmartNicModeValue string
44-
MgmtPortResourceName string
33+
GatewayMode string
34+
HyperShiftConfig *OVNHyperShiftBootstrapResult
35+
DisableUDPAggregation bool
36+
DpuHostModeLabel string
37+
DpuHostModeNodes []string
38+
DpuHostModeValue string
39+
DpuModeLabel string
40+
DpuModeNodes []string
41+
SmartNicModeLabel string
42+
SmartNicModeNodes []string
43+
SmartNicModeValue string
44+
MgmtPortResourceName string
45+
DpuNodeLeaseRenewInterval int
46+
DpuNodeLeaseDuration int
4547
// ConfigOverrides contains the overrides for the OVN Kubernetes configuration
4648
// This is used to set the hidden OVN Kubernetes configuration in the cluster
4749
// It is a map of key-value pairs where the key is the configuration option and the

pkg/network/kube_proxy_test.go

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -374,10 +374,12 @@ func TestFillKubeProxyDefaults(t *testing.T) {
374374
var FakeKubeProxyBootstrapResult = bootstrap.BootstrapResult{
375375
OVN: bootstrap.OVNBootstrapResult{
376376
OVNKubernetesConfig: &bootstrap.OVNConfigBoostrapResult{
377-
DpuHostModeLabel: OVN_NODE_SELECTOR_DEFAULT_DPU_HOST,
378-
DpuModeLabel: OVN_NODE_SELECTOR_DEFAULT_DPU,
379-
SmartNicModeLabel: OVN_NODE_SELECTOR_DEFAULT_SMART_NIC,
380-
MgmtPortResourceName: "",
377+
DpuHostModeLabel: OVN_NODE_SELECTOR_DEFAULT_DPU_HOST,
378+
DpuModeLabel: OVN_NODE_SELECTOR_DEFAULT_DPU,
379+
SmartNicModeLabel: OVN_NODE_SELECTOR_DEFAULT_SMART_NIC,
380+
MgmtPortResourceName: "",
381+
DpuNodeLeaseRenewInterval: DPU_NODE_LEASE_RENEW_INTERVAL_DEFAULT,
382+
DpuNodeLeaseDuration: DPU_NODE_LEASE_DURATION_DEFAULT,
381383
},
382384
},
383385
}

pkg/network/ovn_kubernetes.go

Lines changed: 40 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -64,6 +64,11 @@ const OVN_NODE_SELECTOR_DEFAULT_DPU = "network.operator.openshift.io/dpu="
6464
const OVN_NODE_SELECTOR_DEFAULT_SMART_NIC = "network.operator.openshift.io/smart-nic="
6565
const OVN_NODE_IDENTITY_CERT_DURATION = "24h"
6666

67+
// Default DPU health check lease configuration.
68+
// Setting renew-interval to 0 disables the health check.
69+
const DPU_NODE_LEASE_RENEW_INTERVAL_DEFAULT = 10
70+
const DPU_NODE_LEASE_DURATION_DEFAULT = 40
71+
6772
// gRPC healthcheck port. See: https://github.com/openshift/enhancements/pull/1209
6873
const OVN_EGRESSIP_HEALTHCHECK_PORT = "9107"
6974

@@ -218,6 +223,8 @@ func renderOVNKubernetes(conf *operv1.NetworkSpec, bootstrapResult *bootstrap.Bo
218223
data.Data["SmartNicModeLabel"] = bootstrapResult.OVN.OVNKubernetesConfig.SmartNicModeLabel
219224
data.Data["SmartNicModeValue"] = bootstrapResult.OVN.OVNKubernetesConfig.SmartNicModeValue
220225
data.Data["MgmtPortResourceName"] = bootstrapResult.OVN.OVNKubernetesConfig.MgmtPortResourceName
226+
data.Data["DpuNodeLeaseRenewInterval"] = strconv.Itoa(bootstrapResult.OVN.OVNKubernetesConfig.DpuNodeLeaseRenewInterval)
227+
data.Data["DpuNodeLeaseDuration"] = strconv.Itoa(bootstrapResult.OVN.OVNKubernetesConfig.DpuNodeLeaseDuration)
221228
data.Data["OVN_CONTROLLER_INACTIVITY_PROBE"] = os.Getenv("OVN_CONTROLLER_INACTIVITY_PROBE")
222229
controller_inactivity_probe := os.Getenv("OVN_CONTROLLER_INACTIVITY_PROBE")
223230
if len(controller_inactivity_probe) == 0 {
@@ -927,10 +934,12 @@ func findCommonNode(nodeLists ...[]string) (bool, string) {
927934
// if it exists, otherwise returns default configuration for OCP clusters using OVN-Kubernetes
928935
func bootstrapOVNConfig(conf *operv1.Network, kubeClient cnoclient.Client, hc *hypershift.HyperShiftConfig, infraStatus *bootstrap.InfraStatus) (*bootstrap.OVNConfigBoostrapResult, error) {
929936
ovnConfigResult := &bootstrap.OVNConfigBoostrapResult{
930-
DpuHostModeLabel: OVN_NODE_SELECTOR_DEFAULT_DPU_HOST,
931-
DpuModeLabel: OVN_NODE_SELECTOR_DEFAULT_DPU,
932-
SmartNicModeLabel: OVN_NODE_SELECTOR_DEFAULT_SMART_NIC,
933-
MgmtPortResourceName: "",
937+
DpuHostModeLabel: OVN_NODE_SELECTOR_DEFAULT_DPU_HOST,
938+
DpuModeLabel: OVN_NODE_SELECTOR_DEFAULT_DPU,
939+
SmartNicModeLabel: OVN_NODE_SELECTOR_DEFAULT_SMART_NIC,
940+
MgmtPortResourceName: "",
941+
DpuNodeLeaseRenewInterval: DPU_NODE_LEASE_RENEW_INTERVAL_DEFAULT,
942+
DpuNodeLeaseDuration: DPU_NODE_LEASE_DURATION_DEFAULT,
934943
}
935944
if conf.Spec.DefaultNetwork.OVNKubernetesConfig.GatewayConfig == nil {
936945
bootstrapOVNGatewayConfig(conf, kubeClient.ClientFor("").CRClient())
@@ -976,6 +985,33 @@ func bootstrapOVNConfig(conf *operv1.Network, kubeClient cnoclient.Client, hc *h
976985
if exists {
977986
ovnConfigResult.MgmtPortResourceName = mgmtPortresourceName
978987
}
988+
989+
if val, exists := cm.Data["dpu-node-lease-renew-interval"]; exists {
990+
parsed, err := strconv.Atoi(val)
991+
if err == nil && parsed >= 0 {
992+
ovnConfigResult.DpuNodeLeaseRenewInterval = parsed
993+
} else {
994+
klog.Warningf("Invalid dpu-node-lease-renew-interval %q, using default %d", val, DPU_NODE_LEASE_RENEW_INTERVAL_DEFAULT)
995+
}
996+
}
997+
if val, exists := cm.Data["dpu-node-lease-duration"]; exists {
998+
parsed, err := strconv.Atoi(val)
999+
if err == nil && parsed >= 0 {
1000+
ovnConfigResult.DpuNodeLeaseDuration = parsed
1001+
} else {
1002+
klog.Warningf("Invalid dpu-node-lease-duration %q, using default %d", val, DPU_NODE_LEASE_DURATION_DEFAULT)
1003+
}
1004+
}
1005+
1006+
// Setting either value to 0 disables the DPU health check.
1007+
// When both are non-zero, duration must be greater than interval.
1008+
if ovnConfigResult.DpuNodeLeaseRenewInterval != 0 && ovnConfigResult.DpuNodeLeaseDuration != 0 &&
1009+
ovnConfigResult.DpuNodeLeaseDuration <= ovnConfigResult.DpuNodeLeaseRenewInterval {
1010+
klog.Warningf("dpu-node-lease-duration (%d) must be greater than dpu-node-lease-renew-interval (%d), using defaults",
1011+
ovnConfigResult.DpuNodeLeaseDuration, ovnConfigResult.DpuNodeLeaseRenewInterval)
1012+
ovnConfigResult.DpuNodeLeaseRenewInterval = DPU_NODE_LEASE_RENEW_INTERVAL_DEFAULT
1013+
ovnConfigResult.DpuNodeLeaseDuration = DPU_NODE_LEASE_DURATION_DEFAULT
1014+
}
9791015
}
9801016

9811017
// We want to see if there are any nodes that are labeled for specific modes such as Full/SmartNIC/DPU Host/DPU

pkg/network/ovn_kubernetes_dpu_host_test.go

Lines changed: 91 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,7 @@
11
package network
22

33
import (
4+
"strconv"
45
"testing"
56

67
"github.com/ghodss/yaml"
@@ -160,6 +161,8 @@ func createTestRenderData(ovnNodeMode string) render.RenderData {
160161
data.Data["SmartNicModeValue"] = ""
161162
data.Data["DpuModeLabel"] = ""
162163
data.Data["MgmtPortResourceName"] = ""
164+
data.Data["DpuNodeLeaseRenewInterval"] = strconv.Itoa(DPU_NODE_LEASE_RENEW_INTERVAL_DEFAULT)
165+
data.Data["DpuNodeLeaseDuration"] = strconv.Itoa(DPU_NODE_LEASE_DURATION_DEFAULT)
163166
data.Data["HTTP_PROXY"] = ""
164167
data.Data["HTTPS_PROXY"] = ""
165168
data.Data["NO_PROXY"] = ""
@@ -210,6 +213,94 @@ func getMatchExpression(g *WithT, ds *appsv1.DaemonSet, label string) (corev1.No
210213
return corev1.NodeSelectorOpDoesNotExist, ""
211214
}
212215

216+
// TestOVNKubernetesLeaseEnvVars tests that DPU lease env vars are set
217+
// for DPU and DPU-host modes but not for full mode
218+
func TestOVNKubernetesLeaseEnvVars(t *testing.T) {
219+
templates := []struct {
220+
name string
221+
templatePath string
222+
}{
223+
{
224+
name: "managed",
225+
templatePath: "../../bindata/network/ovn-kubernetes/managed/ovnkube-node.yaml",
226+
},
227+
{
228+
name: "self-hosted",
229+
templatePath: "../../bindata/network/ovn-kubernetes/self-hosted/ovnkube-node.yaml",
230+
},
231+
}
232+
233+
testCases := []struct {
234+
name string
235+
ovnNodeMode string
236+
expectSet bool
237+
}{
238+
{
239+
name: "full mode should not have lease env vars",
240+
ovnNodeMode: "full",
241+
expectSet: false,
242+
},
243+
{
244+
name: "dpu-host mode should have lease env vars",
245+
ovnNodeMode: "dpu-host",
246+
expectSet: true,
247+
},
248+
{
249+
name: "dpu mode should have lease env vars",
250+
ovnNodeMode: "dpu",
251+
expectSet: true,
252+
},
253+
}
254+
255+
// Env vars with literal values
256+
leaseEnvVars := map[string]string{
257+
"OVNKUBE_NODE_LEASE_RENEW_INTERVAL": strconv.Itoa(DPU_NODE_LEASE_RENEW_INTERVAL_DEFAULT),
258+
"OVNKUBE_NODE_LEASE_DURATION": strconv.Itoa(DPU_NODE_LEASE_DURATION_DEFAULT),
259+
}
260+
for _, template := range templates {
261+
for _, tc := range testCases {
262+
testName := template.name + "_" + tc.name
263+
t.Run(testName, func(t *testing.T) {
264+
g := NewGomegaWithT(t)
265+
266+
data := createTestRenderData(tc.ovnNodeMode)
267+
268+
objs, err := render.RenderTemplate(template.templatePath, &data)
269+
g.Expect(err).NotTo(HaveOccurred())
270+
g.Expect(objs).To(HaveLen(1))
271+
272+
yamlBytes, err := yaml.Marshal(objs[0])
273+
g.Expect(err).NotTo(HaveOccurred())
274+
275+
ds := &appsv1.DaemonSet{}
276+
err = yaml.Unmarshal(yamlBytes, ds)
277+
g.Expect(err).NotTo(HaveOccurred())
278+
279+
for envName, expectedValue := range leaseEnvVars {
280+
found := false
281+
for _, container := range ds.Spec.Template.Spec.Containers {
282+
for _, env := range container.Env {
283+
if env.Name == envName {
284+
found = true
285+
g.Expect(env.Value).To(Equal(expectedValue),
286+
"%s should be set to %s", envName, expectedValue)
287+
}
288+
}
289+
}
290+
291+
if tc.expectSet {
292+
g.Expect(found).To(BeTrue(),
293+
"%s should be set for %s mode", envName, tc.ovnNodeMode)
294+
} else {
295+
g.Expect(found).To(BeFalse(),
296+
"%s should not be set for %s mode", envName, tc.ovnNodeMode)
297+
}
298+
}
299+
})
300+
}
301+
}
302+
}
303+
213304
// TestOVNKubernetesNodeSelectorOperator tests that the node selector operator works correctly with label values of different Full/SmartNIC/DPU modes
214305
func TestOVNKubernetesNodeSelectorOperator(t *testing.T) {
215306
templates := []struct {

0 commit comments

Comments
 (0)