From dd37a7f61b01402f80f647ed5662e89c2bf561fe Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20K=C3=A4stner?= Date: Wed, 27 May 2026 18:51:30 +0200 Subject: [PATCH 1/2] Implement NX-OS reprovisioning via NX-API MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace the defunct GNMI-based reprovisioning with NXAPI JSON-RPC calls. The new implementation issues two requests: 1. boot poap enable + copy running-config startup-config (batched with stop-on-error rollback) 2. reload (separate request that tolerates transport errors because the device goes down before responding) Add nxapi.IsTransportError to distinguish network-level errors (EOF, timeout, connection reset) from logical errors (RPCError, HTTPError), enabling callers of disruptive commands to tolerate expected connection drops. Remove the now unused BootPOAP GNMI type. Signed-off-by: Felix Kästner --- internal/provider/cisco/nxos/provider.go | 45 ++++--- .../provider/cisco/nxos/reprovision_test.go | 122 ++++++++++++++++++ internal/provider/cisco/nxos/system.go | 8 -- internal/transport/nxapi/nxapi.go | 17 +++ internal/transport/nxapi/nxapi_test.go | 81 ++++++++++++ 5 files changed, 247 insertions(+), 26 deletions(-) create mode 100644 internal/provider/cisco/nxos/reprovision_test.go diff --git a/internal/provider/cisco/nxos/provider.go b/internal/provider/cisco/nxos/provider.go index 4a9c1df53..968eef9e3 100644 --- a/internal/provider/cisco/nxos/provider.go +++ b/internal/provider/cisco/nxos/provider.go @@ -70,13 +70,14 @@ type Provider struct { nxapi *nxapi.Client } +// timeout is the default timeout for all HTTP/gRPC requests made by the provider. +const timeout = 30 * time.Second + func NewProvider() provider.Provider { return &Provider{} } func (p *Provider) Connect(ctx context.Context, conn *deviceutil.Connection) (err error) { - // timeout is the default timeout for all HTTP/gRPC requests made by the provider. - const timeout = 30 * time.Second p.conn, err = grpcext.NewClient(conn, grpcext.WithDefaultTimeout(timeout)) if err != nil { return fmt.Errorf("failed to create grpc connection: %w", err) @@ -91,7 +92,7 @@ func (p *Provider) Connect(ctx context.Context, conn *deviceutil.Connection) (er } // NXAPI only uses the address for URI construction. c := *conn - c.Address = netip.MustParseAddrPort(conn.Address).String() + c.Address = netip.MustParseAddrPort(conn.Address).Addr().String() p.nxapi, err = nxapi.NewClient(&c, timeout) if err != nil { return fmt.Errorf("failed to create nxapi client: %w", err) @@ -141,23 +142,31 @@ func (p *Provider) FactoryReset(ctx context.Context, conn *deviceutil.Connection return FactoryReset(ctx, p.conn) } -func (p *Provider) Reprovision(ctx context.Context, conn *deviceutil.Connection) (reterr error) { - if err := p.Connect(ctx, conn); err != nil { - return err +func (p *Provider) Reprovision(ctx context.Context, conn *deviceutil.Connection) error { + c := *conn + c.Address = netip.MustParseAddrPort(conn.Address).Addr().String() + client, err := nxapi.NewClient(&c, timeout) + if err != nil { + return fmt.Errorf("failed to create nxapi client: %w", err) } - defer func() { - if err := p.Disconnect(ctx, conn); err != nil { - reterr = errors.Join(reterr, err) - } - }() - // This is currently defunct on NX-OS, as enabling POAP requires a `copy running-config startup-config` which we - // cannot issue via GNMI - // TODO add once NXAPI client is available - poap := BootPOAP("enable") - if err := p.client.Update(ctx, &poap); err != nil { - return err + + _, err = client.Do(ctx, nxapi.NewRequest( + "boot poap enable", + "copy running-config startup-config", + ).WithRollback(nxapi.Stop)) + if err != nil { + return fmt.Errorf("failed to prepare device for reprovisioning: %w", err) } - return Reboot(ctx, p.conn) + + // Reboot is issued as a separate request because it actually restarts + // the device. The connection will drop before a response is received, + // so transport errors are expected and tolerated. + _, err = client.Do(ctx, nxapi.NewRequest("reload")) + if err != nil && !nxapi.IsTransportError(err) { + return fmt.Errorf("failed to reboot device: %w", err) + } + + return nil } func (p *Provider) ListPorts(ctx context.Context) ([]provider.DevicePort, error) { diff --git a/internal/provider/cisco/nxos/reprovision_test.go b/internal/provider/cisco/nxos/reprovision_test.go new file mode 100644 index 000000000..6c9dba26e --- /dev/null +++ b/internal/provider/cisco/nxos/reprovision_test.go @@ -0,0 +1,122 @@ +// SPDX-FileCopyrightText: 2026 SAP SE or an SAP affiliate company and IronCore contributors +// SPDX-License-Identifier: Apache-2.0 + +package nxos + +import ( + "encoding/json" + "fmt" + "net/http" + "net/http/httptest" + "slices" + "testing" + + "github.com/ironcore-dev/network-operator/internal/deviceutil" +) + +func TestReprovision(t *testing.T) { + t.Run("success with connection drop on reload", func(t *testing.T) { + var requests [][]string + called := 0 + + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + var cmds []struct { + Params struct { + Cmd string `json:"cmd"` + } `json:"params"` + } + if err := json.NewDecoder(r.Body).Decode(&cmds); err != nil { + t.Fatalf("failed to decode request: %v", err) + } + + batch := make([]string, len(cmds)) + for i, c := range cmds { + batch[i] = c.Params.Cmd + } + requests = append(requests, batch) + called++ + + if called == 1 { + w.Header().Set("Content-Type", "application/json-rpc") + fmt.Fprint(w, `[ + {"jsonrpc":"2.0","result":null,"id":1}, + {"jsonrpc":"2.0","result":null,"id":2} + ]`) + return + } + + // Simulate device going down by closing connection abruptly. + hijacker, ok := w.(http.Hijacker) + if !ok { + t.Fatal("server does not support hijacking") + } + conn, _, err := hijacker.Hijack() + if err != nil { + t.Fatalf("hijack failed: %v", err) + } + conn.Close() + })) + defer srv.Close() + + p := &Provider{} + conn := &deviceutil.Connection{Address: srv.Listener.Addr().String(), Username: "admin", Password: "secret"} + + err := p.Reprovision(t.Context(), conn) + if err != nil { + t.Fatalf("Reprovision returned unexpected error: %v", err) + } + + if len(requests) != 2 { + t.Fatalf("expected 2 NXAPI requests, got %d", len(requests)) + } + if !slices.Equal(requests[0], []string{"boot poap enable", "copy running-config startup-config"}) { + t.Errorf("prep batch = %v", requests[0]) + } + if !slices.Equal(requests[1], []string{"reload"}) { + t.Errorf("reload request = %v", requests[1]) + } + }) + + t.Run("prep batch RPC error fails", func(t *testing.T) { + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + w.Header().Set("Content-Type", "application/json-rpc") + w.WriteHeader(http.StatusBadRequest) + fmt.Fprint(w, `[{"jsonrpc":"2.0","error":{"code":-32602,"message":"Invalid command"},"id":1}]`) + })) + defer srv.Close() + + p := &Provider{} + conn := &deviceutil.Connection{Address: srv.Listener.Addr().String(), Username: "admin", Password: "secret"} + + err := p.Reprovision(t.Context(), conn) + if err == nil { + t.Fatal("expected error from prep batch, got nil") + } + }) + + t.Run("reload RPC error fails", func(t *testing.T) { + called := 0 + srv := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) { + called++ + w.Header().Set("Content-Type", "application/json-rpc") + if called == 1 { + fmt.Fprint(w, `[ + {"jsonrpc":"2.0","result":null,"id":1}, + {"jsonrpc":"2.0","result":null,"id":2} + ]`) + return + } + w.WriteHeader(http.StatusBadRequest) + fmt.Fprint(w, `[{"jsonrpc":"2.0","error":{"code":-32602,"message":"Permission denied"},"id":1}]`) + })) + defer srv.Close() + + p := &Provider{} + conn := &deviceutil.Connection{Address: srv.Listener.Addr().String(), Username: "admin", Password: "secret"} + + err := p.Reprovision(t.Context(), conn) + if err == nil { + t.Fatal("expected error from reload RPC failure, got nil") + } + }) +} diff --git a/internal/provider/cisco/nxos/system.go b/internal/provider/cisco/nxos/system.go index 009c22f1d..b1d8999a7 100644 --- a/internal/provider/cisco/nxos/system.go +++ b/internal/provider/cisco/nxos/system.go @@ -68,14 +68,6 @@ func (*FirmwareVersion) XPath() string { return "System/showversion-items/nxosVersion" } -var _ gnmiext.DataElement = (*BootPOAP)(nil) - -type BootPOAP string - -func (*BootPOAP) XPath() string { - return "/System/boot-items/poap" -} - type BootTime UnixTime func (*BootTime) XPath() string { diff --git a/internal/transport/nxapi/nxapi.go b/internal/transport/nxapi/nxapi.go index 5881ecbfd..427e27f9c 100644 --- a/internal/transport/nxapi/nxapi.go +++ b/internal/transport/nxapi/nxapi.go @@ -10,6 +10,7 @@ import ( "errors" "fmt" "io" + "net" "net/http" "net/url" "time" @@ -257,3 +258,19 @@ type HTTPError struct { func (e *HTTPError) Error() string { return fmt.Sprintf("nxapi: non-2xx status code: %d - %s", e.Code, string(e.Body)) } + +// IsTransportError reports whether err is a network-level transport error +// (connection reset, timeout, EOF) as opposed to a logical error returned +// by the NX-API endpoint (RPCError, HTTPError). This is useful for callers +// that issue disruptive commands (e.g. reboot) where the device going down +// mid-request is expected. +func IsTransportError(err error) bool { + if err == nil { + return false + } + if errors.Is(err, io.EOF) || errors.Is(err, io.ErrUnexpectedEOF) { + return true + } + var netErr net.Error + return errors.As(err, &netErr) +} diff --git a/internal/transport/nxapi/nxapi_test.go b/internal/transport/nxapi/nxapi_test.go index 0b6c73a32..53e17f25b 100644 --- a/internal/transport/nxapi/nxapi_test.go +++ b/internal/transport/nxapi/nxapi_test.go @@ -9,8 +9,10 @@ import ( "encoding/json" "errors" "fmt" + "io" "net/http" "net/http/httptest" + "net/url" "strings" "testing" @@ -295,3 +297,82 @@ func TestDo(t *testing.T) { }) } } + +func TestIsTransportError(t *testing.T) { + tests := []struct { + desc string + err error + want bool + }{ + { + desc: "nil is not a transport error", + err: nil, + want: false, + }, + { + desc: "RPCError is not a transport error", + err: &RPCError{Code: -32602, Message: "Invalid params"}, + want: false, + }, + { + desc: "RPCErrors is not a transport error", + err: RPCErrors{&RPCError{Code: -32602, Message: "Invalid params"}}, + want: false, + }, + { + desc: "HTTPError is not a transport error", + err: &HTTPError{Code: 401, Body: []byte("unauthorized")}, + want: false, + }, + { + desc: "generic error is not a transport error", + err: errors.New("some logic error"), + want: false, + }, + { + desc: "io.EOF is a transport error", + err: io.EOF, + want: true, + }, + { + desc: "io.ErrUnexpectedEOF is a transport error", + err: io.ErrUnexpectedEOF, + want: true, + }, + { + desc: "wrapped io.EOF is a transport error", + err: fmt.Errorf("request failed: %w", io.EOF), + want: true, + }, + { + desc: "net.Error is a transport error", + err: &netError{msg: "i/o timeout"}, + want: true, + }, + { + desc: "url.Error wrapping net.Error is a transport error", + err: &url.Error{Op: "Post", URL: "http://x/ins", Err: &netError{msg: "i/o timeout"}}, + want: true, + }, + { + desc: "wrapped net.Error is a transport error", + err: fmt.Errorf("read tcp: %w", &netError{msg: "connection reset by peer"}), + want: true, + }, + } + for _, test := range tests { + t.Run(test.desc, func(t *testing.T) { + got := IsTransportError(test.err) + if got != test.want { + t.Errorf("IsTransportError(%v) = %t, want %t", test.err, got, test.want) + } + }) + } +} + +// netError is a mock net.Error for testing. +type netError struct{ msg string } + +func (e *netError) Error() string { return e.msg } +func (e *netError) Timeout() bool { return false } +func (e *netError) Temporary() bool { return false } From 72080cf31b93b857836d445cb6aa1264d44cd7ef Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Felix=20K=C3=A4stner?= Date: Fri, 19 Jun 2026 14:13:12 +0200 Subject: [PATCH 2/2] Fix maintenance annotation removal on failed operations MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the maintenance annotation deletion to after the switch statement so it is only removed when the action succeeds. This allows the reconciler to retry failed maintenance operations on the next reconciliation instead of requiring the user to re-apply the annotation. Additionally, return a terminal error for unknown maintenance actions since retrying will never succeed. Signed-off-by: Felix Kästner --- internal/controller/core/device_controller.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/internal/controller/core/device_controller.go b/internal/controller/core/device_controller.go index 9e70930a9..eb83ee044 100644 --- a/internal/controller/core/device_controller.go +++ b/internal/controller/core/device_controller.go @@ -425,8 +425,6 @@ func (r *DeviceReconciler) reconcileMaintenance(ctx context.Context, obj *v1alph if !ok { return nil } - delete(obj.Annotations, v1alpha1.DeviceMaintenanceAnnotation) - switch action { case v1alpha1.DeviceMaintenanceReboot: prov, ok := r.Provider().(provider.MaintenanceProvider) @@ -499,9 +497,12 @@ func (r *DeviceReconciler) reconcileMaintenance(ctx context.Context, obj *v1alph default: r.Recorder.Eventf(obj, nil, "Warning", "UnknownMaintenanceAction", "Maintenance", "Unknown maintenance action: %s", action) - return fmt.Errorf("unknown maintenance action: %s", action) + return reconcile.TerminalError(fmt.Errorf("unknown maintenance action: %s", action)) } + // Only remove the annotation after the operation succeeds so that + // failed actions are retried on the next reconciliation. + delete(obj.Annotations, v1alpha1.DeviceMaintenanceAnnotation) return nil }