diff --git a/Makefile b/Makefile index 511bb7ac..c6178de7 100644 --- a/Makefile +++ b/Makefile @@ -126,6 +126,7 @@ REACT_DEV ?= false .PHONY: all help fmt lint test build vulncheck check-deps kubectl-unbounded kubectl-unbounded-build install-tools install-protoc generate kubectl-unbounded forge unbounded-agent machina machina-build machina-oci machina-oci-push machina-manifests machine-ops-controller machine-ops-controller-build machine-ops-controller-oci machine-ops-controller-oci-push machine-ops-manifests metalman metalman-build metalman-oci metalman-oci-push gomod docs-serve unbounded-net-controller unbounded-net-node unbounded-net-routeplan-debug unping unroute notice notice-check .PHONY: net-frontend net-frontend-clean net-build-ebpf net-manifests release-manifests .PHONY: image-machina-local image-machine-ops-controller-local image-metalman-local image-net-controller-local image-net-node-local images-local +.PHONY: image-net-controller-push image-net-node-push images-net-all images-net-all-push .PHONY: unbounded-storage unbounded-storage-build unbounded-storage-test ##@ General @@ -190,7 +191,11 @@ help: ## Show this help @echo " image-machine-ops-controller-local Build machine-ops-controller image" @echo " image-metalman-local Build metalman image" @echo " image-net-controller-local Build unbounded-net-controller image" + @echo " image-net-controller-push Build and push unbounded-net-controller image" @echo " image-net-node-local Build unbounded-net-node image" + @echo " image-net-node-push Build and push unbounded-net-node image" + @echo " images-net-all Build all unbounded-net images" + @echo " images-net-all-push Build and push all unbounded-net images" @echo " images-local Build all local images" @echo " machina-oci-push Build machina image and push" @echo " machine-ops-controller-oci-push Build machine-ops-controller image and push" @@ -690,6 +695,16 @@ image-net-node-local: resources/cni-plugins-linux-$(HOST_GOARCH)-$(CNI_PLUGINS_V -f ./images/net-node/Dockerfile . $(call trivy-maybe,$(NET_NODE_IMAGE)) +image-net-controller-push: image-net-controller-local ## Build and push the unbounded-net-controller image + $(CONTAINER_ENGINE) push $(NET_CONTROLLER_IMAGE) + +image-net-node-push: image-net-node-local ## Build and push the unbounded-net-node image + $(CONTAINER_ENGINE) push $(NET_NODE_IMAGE) + +images-net-all: image-net-controller-local image-net-node-local ## Build all unbounded-net container images locally + +images-net-all-push: image-net-controller-push image-net-node-push ## Build and push all unbounded-net container images + images-local: image-machina-local image-machine-ops-controller-local image-metalman-local image-net-controller-local image-net-node-local ## Build all container images locally ##@ Net Frontend diff --git a/cmd/unbounded-net-controller/dashboard_auth.go b/cmd/unbounded-net-controller/dashboard_auth.go index 5bd54128..beb61ebf 100644 --- a/cmd/unbounded-net-controller/dashboard_auth.go +++ b/cmd/unbounded-net-controller/dashboard_auth.go @@ -176,9 +176,16 @@ func authorizeDashboardRequest(requireDashboardAuth bool, tokenIssuer *authn.Tok return authorizer.authorize(r.Context(), claims.Subject, claims.Groups) } -// authorizeDashboardOrAggregated allows a request if it arrives via the -// aggregated API server (verified front-proxy client certificate) or if it -// passes dashboard authentication and authorization. +// authorizeDashboardOrAggregated allows a request if it passes dashboard +// authentication and authorization (HMAC viewer Bearer token) or if it +// arrives via the aggregated API server (verified front-proxy client +// certificate). +// +// When the request carries a Bearer token, dashboard auth is attempted +// first to avoid logging spurious "no client certificate" rejections from +// the local kubectl-unbounded proxy, which never presents a client cert. +// Aggregator-style client-cert auth is still attempted as a fallback so +// requests proxied through the kube-aggregator continue to work. func authorizeDashboardOrAggregated( requireDashboardAuth bool, tokenIssuer *authn.TokenIssuer, @@ -186,9 +193,27 @@ func authorizeDashboardOrAggregated( webhookServer *webhookpkg.Server, r *http.Request, ) bool { + if hasBearerToken(r) && authorizeDashboardRequest(requireDashboardAuth, tokenIssuer, authorizer, r) { + return true + } + if webhookServer.IsTrustedAggregatedRequest(r) { return true } + // Final attempt: dashboard auth path for requests that did not carry a + // Bearer token (e.g. requireDashboardAuth=false). return authorizeDashboardRequest(requireDashboardAuth, tokenIssuer, authorizer, r) } + +// hasBearerToken reports whether the request has an Authorization: Bearer +// header. +func hasBearerToken(r *http.Request) bool { + if r == nil { + return false + } + + auth := r.Header.Get("Authorization") + + return strings.HasPrefix(auth, "Bearer ") +} diff --git a/cmd/unbounded-net-node/ebpf_geneve_config.go b/cmd/unbounded-net-node/ebpf_geneve_config.go index 72aaa068..46337c8d 100644 --- a/cmd/unbounded-net-node/ebpf_geneve_config.go +++ b/cmd/unbounded-net-node/ebpf_geneve_config.go @@ -8,7 +8,6 @@ import ( "fmt" "net" "os" - "os/exec" "path/filepath" "strings" @@ -230,7 +229,14 @@ func configureEBPFTunnelPeers( // SyncAddresses, SetLinkAddress, EnsureMTU). Some netlink operations // can cause the kernel to reset interface sysctls. disableRPFilter(ifName) - ensureTunnelForwardAccept(ifName) + + if state.forwardManager != nil { + state.forwardManager.EnsureInterface(ifName) + } + + if state.isGatewayNode && state.notrackManager != nil { + state.notrackManager.EnsureInterface(ifName) + } // Get tunnel interface index for BPF map entries (redirect target). geneveIface, err := net.InterfaceByName(ifName) @@ -249,7 +255,13 @@ func configureEBPFTunnelPeers( klog.V(2).Infof("eBPF: failed to remove %s: %v", ifName, err) } - removeTunnelForwardAccept(ifName) + if state.forwardManager != nil { + state.forwardManager.RemoveInterface(ifName) + } + + if state.notrackManager != nil { + state.notrackManager.RemoveInterface(ifName) + } } } @@ -283,7 +295,15 @@ func configureEBPFTunnelPeers( } disableRPFilter(ipipIfName) - ensureTunnelForwardAccept(ipipIfName) + + if state.forwardManager != nil { + state.forwardManager.EnsureInterface(ipipIfName) + } + + if state.isGatewayNode && state.notrackManager != nil { + state.notrackManager.EnsureInterface(ipipIfName) + } + // IPIP is layer 3 and does not support MAC addresses; skip // SetLinkAddress (the kernel returns ENOTSUP). } @@ -297,7 +317,13 @@ func configureEBPFTunnelPeers( klog.V(2).Infof("eBPF: failed to remove %s: %v", ipipIfName, err) } - removeTunnelForwardAccept(ipipIfName) + if state.forwardManager != nil { + state.forwardManager.RemoveInterface(ipipIfName) + } + + if state.notrackManager != nil { + state.notrackManager.RemoveInterface(ipipIfName) + } } } @@ -469,7 +495,14 @@ func configureEBPFVXLANPeers( // SyncAddresses, SetLinkAddress, EnsureMTU). Some netlink operations // can cause the kernel to reset interface sysctls. disableRPFilter(ifName) - ensureTunnelForwardAccept(ifName) + + if state.forwardManager != nil { + state.forwardManager.EnsureInterface(ifName) + } + + if state.isGatewayNode && state.notrackManager != nil { + state.notrackManager.EnsureInterface(ifName) + } iface, err := net.InterfaceByName(ifName) if err != nil { @@ -842,48 +875,9 @@ func disableRPFilter(ifName string) { } } -// ensureTunnelForwardAccept adds an iptables FORWARD chain rule that accepts -// forwarded traffic arriving on the specified tunnel interface. This must be -// before KUBE-FORWARD (which drops ctstate INVALID packets) so that transit -// overlay traffic through gateway nodes is not dropped. -func ensureTunnelForwardAccept(ifName string) { - check := exec.Command("nsenter", "-t", "1", "-n", "--", - "iptables", "-C", "FORWARD", - "-i", ifName, "-j", "ACCEPT", - "-m", "comment", "--comment", "unbounded-net: accept tunnel traffic") - if check.Run() == nil { - return - } - - out, err := exec.Command("nsenter", "-t", "1", "-n", "--", - "iptables", "-I", "FORWARD", "1", - "-i", ifName, "-j", "ACCEPT", - "-m", "comment", "--comment", "unbounded-net: accept tunnel traffic").CombinedOutput() - if err != nil { - klog.Warningf("failed to add FORWARD accept rule for %s: %v (%s)", ifName, err, strings.TrimSpace(string(out))) - } else { - klog.V(2).Infof("added FORWARD accept rule for %s", ifName) - } -} - -// removeTunnelForwardAccept removes the FORWARD chain ACCEPT rule for -// the specified tunnel interface. Called when an interface is deleted. -func removeTunnelForwardAccept(ifName string) { - out, err := exec.Command("nsenter", "-t", "1", "-n", "--", - "iptables", "-D", "FORWARD", - "-i", ifName, "-j", "ACCEPT", - "-m", "comment", "--comment", "unbounded-net: accept tunnel traffic").CombinedOutput() - if err != nil { - // Rule may not exist if the interface was never set up -- ignore. - klog.V(4).Infof("removeTunnelForwardAccept %s: %v (%s)", ifName, err, strings.TrimSpace(string(out))) - } else { - klog.V(2).Infof("removed FORWARD accept rule for %s", ifName) - } -} - // using. This avoids leaving stale geneve0, vxlan0, or ipip0 interfaces // when the tunnel protocol has been changed or when all peers use WireGuard. -func cleanupUnusedTunnelDevices(meshPeers []meshPeerInfo, gatewayPeers, wgGatewayPeers []gatewayPeerInfo) { +func cleanupUnusedTunnelDevices(meshPeers []meshPeerInfo, gatewayPeers, wgGatewayPeers []gatewayPeerInfo, fwdMgr *unboundednetnetlink.ForwardManager, notrackMgr *unboundednetnetlink.NotrackManager) { usesGeneve := false usesVXLAN := false usesIPIP := false @@ -944,7 +938,13 @@ func cleanupUnusedTunnelDevices(meshPeers []meshPeerInfo, gatewayPeers, wgGatewa klog.V(2).Infof("eBPF: failed to remove unused device %s: %v", d.name, err) } - removeTunnelForwardAccept(d.name) + if fwdMgr != nil { + fwdMgr.RemoveInterface(d.name) + } + + if notrackMgr != nil { + notrackMgr.RemoveInterface(d.name) + } } } } @@ -953,7 +953,7 @@ func cleanupUnusedTunnelDevices(meshPeers []meshPeerInfo, gatewayPeers, wgGatewa // are still in use. This must be called after cleanupUnusedTunnelDevices // because deleting interfaces can cause the kernel to reset rp_filter on // remaining interfaces. -func reapplyRPFilterOnActiveTunnels(meshPeers []meshPeerInfo, gatewayPeers, wgGatewayPeers []gatewayPeerInfo) { +func reapplyRPFilterOnActiveTunnels(meshPeers []meshPeerInfo, gatewayPeers, wgGatewayPeers []gatewayPeerInfo, fwdMgr *unboundednetnetlink.ForwardManager, isGateway bool, notrackMgr *unboundednetnetlink.NotrackManager) { usesGeneve := false usesVXLAN := false usesIPIP := false @@ -1002,7 +1002,14 @@ func reapplyRPFilterOnActiveTunnels(meshPeers []meshPeerInfo, gatewayPeers, wgGa } { if d.used { disableRPFilter(d.name) - ensureTunnelForwardAccept(d.name) + + if fwdMgr != nil { + fwdMgr.EnsureInterface(d.name) + } + + if isGateway && notrackMgr != nil { + notrackMgr.EnsureInterface(d.name) + } } } } diff --git a/cmd/unbounded-net-node/geneve_config.go b/cmd/unbounded-net-node/geneve_config.go index ecd10e7d..33a4000d 100644 --- a/cmd/unbounded-net-node/geneve_config.go +++ b/cmd/unbounded-net-node/geneve_config.go @@ -187,7 +187,14 @@ func configureTunnelPeers(ctx context.Context, cfg *config, meshPeers []meshPeer // SyncAddresses, EnsureMTU). Some netlink operations can cause // the kernel to reset interface sysctls. disableRPFilter(ifName) - ensureTunnelForwardAccept(ifName) + + if state.forwardManager != nil { + state.forwardManager.EnsureInterface(ifName) + } + + if state.isGatewayNode && state.notrackManager != nil { + state.notrackManager.EnsureInterface(ifName) + } iface, err := net.InterfaceByName(ifName) if err != nil { @@ -287,7 +294,14 @@ func removeStaleGeneveInterfaces(state *wireGuardState, desired map[string]bool) klog.Infof("Tunnel: removed stale interface %s", ifName) } - removeTunnelForwardAccept(ifName) + if state.forwardManager != nil { + state.forwardManager.RemoveInterface(ifName) + } + + if state.notrackManager != nil { + state.notrackManager.RemoveInterface(ifName) + } + delete(state.geneveInterfaces, ifName) } @@ -331,7 +345,13 @@ func removeStaleGeneveInterfaces(state *wireGuardState, desired map[string]bool) klog.Infof("Tunnel: removed unmanaged interface %s", name) } - removeTunnelForwardAccept(name) + if state.forwardManager != nil { + state.forwardManager.RemoveInterface(name) + } + + if state.notrackManager != nil { + state.notrackManager.RemoveInterface(name) + } } } diff --git a/cmd/unbounded-net-node/main.go b/cmd/unbounded-net-node/main.go index 1528ad54..567a61bf 100644 --- a/cmd/unbounded-net-node/main.go +++ b/cmd/unbounded-net-node/main.go @@ -270,7 +270,7 @@ then annotates the node with the public key.`, // WireGuard configuration flags flags.StringVar(&cfg.WireGuardDir, "wireguard-dir", "/etc/wireguard", "Directory to store WireGuard keys") flags.IntVar(&cfg.WireGuardPort, "wireguard-port", 51820, "WireGuard listen port") - flags.BoolVar(&cfg.EnablePolicyRouting, "enable-policy-routing", false, "Enable policy-based routing on gateway interfaces (deprecated, per-interface FORWARD rules replace PBR)") + flags.BoolVar(&cfg.EnablePolicyRouting, "enable-policy-routing", false, "Enable policy-based routing on gateway interfaces (deprecated, UNBOUNDED-FORWARD chain rules replace PBR)") // GENEVE configuration flags flags.IntVar(&cfg.GenevePort, "geneve-port", 6081, "GENEVE UDP destination port") diff --git a/cmd/unbounded-net-node/node_types.go b/cmd/unbounded-net-node/node_types.go index 69fe2797..7a5b3176 100644 --- a/cmd/unbounded-net-node/node_types.go +++ b/cmd/unbounded-net-node/node_types.go @@ -103,6 +103,12 @@ type wireGuardState struct { // MSS clamp manager for TCP MSS clamping on WireGuard interfaces mssClampManager *unboundednetnetlink.MSSClampManager + // Forward manager for tunnel-to-tunnel iptables FORWARD rules + forwardManager *unboundednetnetlink.ForwardManager + + // Notrack manager for skipping conntrack on transit traffic (gateway nodes only) + notrackManager *unboundednetnetlink.NotrackManager + // GENEVE tunnel managers -- per-peer interfaces with fixed Remote IP geneveInterfaces map[string]*unboundednetnetlink.LinkManager // per-peer GENEVE interface managers keyed by iface name diff --git a/cmd/unbounded-net-node/reconciliation_helpers.go b/cmd/unbounded-net-node/reconciliation_helpers.go index 29a27973..9c5495fc 100644 --- a/cmd/unbounded-net-node/reconciliation_helpers.go +++ b/cmd/unbounded-net-node/reconciliation_helpers.go @@ -639,6 +639,14 @@ func removeUnmanagedWireGuardInterfaces(cfg *config, state *wireGuardState, desi klog.Infof("Removed unmanaged WireGuard interface %s", name) + if state.forwardManager != nil { + state.forwardManager.RemoveInterface(name) + } + + if state.notrackManager != nil { + state.notrackManager.RemoveInterface(name) + } + delete(state.gatewayLinkManagers, name) delete(state.gatewayWireguardManagers, name) delete(state.gatewayHealthEndpoints, name) diff --git a/cmd/unbounded-net-node/site_watch_reconcile.go b/cmd/unbounded-net-node/site_watch_reconcile.go index 46052303..087c592e 100644 --- a/cmd/unbounded-net-node/site_watch_reconcile.go +++ b/cmd/unbounded-net-node/site_watch_reconcile.go @@ -209,6 +209,25 @@ func watchSiteAndConfigureWireGuard(ctx context.Context, clientset kubernetes.In } } + // Initialize forward manager for tunnel-to-tunnel iptables FORWARD rules. + // This replaces per-interface blanket ACCEPT rules with a two-tier chain + // (FORWARD jump + UNBOUNDED-FORWARD accept) so that only tunnel-to-tunnel + // forwarded traffic bypasses KUBE-FORWARD's conntrack checks. + fwdMgr, err := unboundednetnetlink.NewForwardManager() + if err != nil { + klog.Warningf("Failed to create forward manager (tunnel forwarding rules will be disabled): %v", err) + } else { + state.forwardManager = fwdMgr + } + + // Initialize notrack manager for skipping conntrack on gateway transit traffic. + notrackMgr, err := unboundednetnetlink.NewNotrackManager() + if err != nil { + klog.Warningf("Failed to create notrack manager (conntrack bypass will be disabled): %v", err) + } else { + state.notrackManager = notrackMgr + } + // Initialize gateway policy manager to allow cleanup of stale policy rules // even when policy routing is currently disabled. policyManager, err := unboundednetnetlink.NewGatewayPolicyManager(cfg.WireGuardPort) @@ -529,6 +548,8 @@ func cleanupNodeNetworkingOnShutdown(cfg *config, state *wireGuardState) { gatewayPolicyManager := state.gatewayPolicyManager masqueradeManager := state.masqueradeManager mssClampManager := state.mssClampManager + forwardManager := state.forwardManager + notrackManager := state.notrackManager healthCheckMgr := state.healthCheckManager gatewayLinkManagers := make(map[string]*unboundednetnetlink.LinkManager, len(state.gatewayLinkManagers)) @@ -601,6 +622,18 @@ func cleanupNodeNetworkingOnShutdown(cfg *config, state *wireGuardState) { } } + // Forward chain rules (UNBOUNDED-FORWARD) + if forwardManager != nil { + forwardManager.Cleanup() + klog.Info("Cleaned up forward chain rules on shutdown") + } + + // Notrack rules (UNBOUNDED-NOTRACK, gateway nodes only) + if notrackManager != nil { + notrackManager.Cleanup() + klog.Info("Cleaned up notrack rules on shutdown") + } + // WireGuard managers (close wgctrl clients) if mainWGManager != nil { if err := mainWGManager.Close(); err != nil { @@ -637,7 +670,6 @@ func cleanupNodeNetworkingOnShutdown(cfg *config, state *wireGuardState) { continue } - removeTunnelForwardAccept(ifaceName) klog.Infof("Deleted gateway WireGuard interface %s on shutdown", ifaceName) } @@ -652,7 +684,6 @@ func cleanupNodeNetworkingOnShutdown(cfg *config, state *wireGuardState) { continue } - removeTunnelForwardAccept(ifName) klog.Infof("Deleted GENEVE interface %s on shutdown", ifName) } @@ -670,7 +701,6 @@ func cleanupNodeNetworkingOnShutdown(cfg *config, state *wireGuardState) { if err := netlink.LinkDel(link); err != nil { klog.Warningf("Failed to delete managed tunnel interface %s on shutdown: %v", name, err) } else { - removeTunnelForwardAccept(name) klog.Infof("Deleted managed tunnel interface %s on shutdown", name) } } @@ -2050,11 +2080,56 @@ func updateWireGuardFromSlices(ctx context.Context, dynamicClient dynamic.Interf // Remove tunnel devices that no peers are using. This avoids leaving // stale interfaces (geneve0, vxlan0, ipip0) on nodes where the // tunnel protocol has been changed. - cleanupUnusedTunnelDevices(peers, gatewayPeers, wgGatewayPeers) + cleanupUnusedTunnelDevices(peers, gatewayPeers, wgGatewayPeers, state.forwardManager, state.notrackManager) // Re-apply rp_filter=0 on active tunnel interfaces. Deleting // interfaces during cleanup can cause the kernel to reset // rp_filter on remaining interfaces. - reapplyRPFilterOnActiveTunnels(peers, gatewayPeers, wgGatewayPeers) + reapplyRPFilterOnActiveTunnels(peers, gatewayPeers, wgGatewayPeers, state.forwardManager, state.isGatewayNode, state.notrackManager) + } + + // Reconcile notrack rules for conntrack bypass on gateway nodes. + if state.notrackManager != nil { + if isGatewayNode { + // Build the cluster supernet list from all available sources. + // Gateway nodes carry transit traffic for every site they peer + // with, so the notrack set must cover every site's pod-CIDR + // pool -- not just the gateway's own site -- otherwise return + // packets from a remote site can land on a different gateway + // instance than the forward path took, conntrack marks them + // INVALID, and KUBE-FORWARD drops them. + supernetSet := make(map[string]struct{}) + for _, site := range siteMap { + for _, assignment := range site.Spec.PodCidrAssignments { + for _, cidr := range assignment.CidrBlocks { + supernetSet[cidr] = struct{}{} + } + } + for _, cidr := range site.Spec.NodeCidrs { + supernetSet[cidr] = struct{}{} + } + } + + for _, cidr := range sitePodCIDRs { + supernetSet[cidr] = struct{}{} + } + + for _, gw := range gatewayPeers { + for _, cidr := range gw.RoutedCidrs { + supernetSet[cidr] = struct{}{} + } + } + + var supernets []string + for cidr := range supernetSet { + supernets = append(supernets, cidr) + } + + if err := state.notrackManager.ReconcileCIDRs(state.nodePodCIDRs, supernets); err != nil { + klog.Warningf("Failed to reconcile notrack CIDRs: %v", err) + } + } else { + state.notrackManager.Cleanup() + } } // Phase 3: Update state fields that getNodeStatus() reads (brief lock). diff --git a/cmd/unbounded-net-node/vxlan_config.go b/cmd/unbounded-net-node/vxlan_config.go index 356a851b..c733873c 100644 --- a/cmd/unbounded-net-node/vxlan_config.go +++ b/cmd/unbounded-net-node/vxlan_config.go @@ -76,7 +76,14 @@ func configureVXLANPeers( // here with overlay source IPs, but overlay routes point elsewhere. // Both strict and loose rp_filter would drop them. disableRPFilter(vxlanInterfaceName) - ensureTunnelForwardAccept(vxlanInterfaceName) + + if state.forwardManager != nil { + state.forwardManager.EnsureInterface(vxlanInterfaceName) + } + + if state.isGatewayNode && state.notrackManager != nil { + state.notrackManager.EnsureInterface(vxlanInterfaceName) + } // Assign this node's pod CIDR gateway IPs to vxlan0 so that // the kernel can source packets from the overlay addresses. diff --git a/cmd/unbounded-net-node/wireguard_config.go b/cmd/unbounded-net-node/wireguard_config.go index fffb5fdb..8acc7167 100644 --- a/cmd/unbounded-net-node/wireguard_config.go +++ b/cmd/unbounded-net-node/wireguard_config.go @@ -80,6 +80,14 @@ func configureWireGuard(ctx context.Context, cfg *config, privKey string, peers // routes. The informer sync guard in reconcileUpdate ensures we only // reach here after all caches are populated, so an empty peer list is // genuine (not a transient startup state). + // + // We still need to clean up any leftover gateway wg interfaces: + // - Tracked in state (e.g. SiteGatewayPoolAssignment was just deleted + // and the node had no other WG peers because intra-site uses GENEVE). + // - Present in the kernel but not in state (e.g. agent restarted after + // an SGPA was deleted, leaving orphan interfaces from the prior run). + // removeUnmanagedWireGuardInterfaces handles both cases by listing wg* + // links and deleting any that aren't in desiredGatewayIfaces. if len(peers) == 0 && len(gatewayPeers) == 0 { if state.linkManager.Exists() { klog.Infof("No WireGuard peers -- removing mesh interface %s", iface) @@ -88,6 +96,8 @@ func configureWireGuard(ctx context.Context, cfg *config, privKey string, peers klog.V(2).Infof("Failed to remove unused WireGuard interface %s: %v", iface, err) } } + // Tear down any tracked or kernel-resident gateway wg* interfaces. + removeUnmanagedWireGuardInterfaces(cfg, state, nil, iface) // Sync routes (clear WG routes, keep any GENEVE/IPIP additional routes) if state.routeManager != nil { if state.netlinkCache != nil { @@ -255,7 +265,13 @@ func configureWireGuard(ctx context.Context, cfg *config, privKey string, peers } // Accept forwarded traffic arriving on the mesh WG interface. - ensureTunnelForwardAccept(iface) + if state.forwardManager != nil { + state.forwardManager.EnsureInterface(iface) + } + + if state.isGatewayNode && state.notrackManager != nil { + state.notrackManager.EnsureInterface(iface) + } // Build routes for mesh interface via shared route builder. // All routing is per-peer for healthcheck granularity: bootstrap host routes, @@ -456,8 +472,14 @@ func configureWireGuard(ctx context.Context, cfg *config, privKey string, peers continue } - // Set fwmark on gateway WG interface for transit traffic forwarding. - ensureTunnelForwardAccept(gwIfaceName) + // Accept forwarded traffic arriving on the gateway WG interface. + if state.forwardManager != nil { + state.forwardManager.EnsureInterface(gwIfaceName) + } + + if state.isGatewayNode && state.notrackManager != nil { + state.notrackManager.EnsureInterface(gwIfaceName) + } // Configure policy routing for this gateway interface // This ensures return traffic leaves via the same interface it arrived on @@ -560,7 +582,13 @@ func configureWireGuard(ctx context.Context, cfg *config, privKey string, peers klog.Warningf("Failed to delete gateway interface %s: %v", ifaceName, err) } - removeTunnelForwardAccept(ifaceName) + if state.forwardManager != nil { + state.forwardManager.RemoveInterface(ifaceName) + } + + if state.notrackManager != nil { + state.notrackManager.RemoveInterface(ifaceName) + } } if cfg.EnablePolicyRouting && state.gatewayPolicyManager != nil { diff --git a/internal/net/config/runtime_config.go b/internal/net/config/runtime_config.go index d6f0bc40..9e56ae3b 100644 --- a/internal/net/config/runtime_config.go +++ b/internal/net/config/runtime_config.go @@ -66,8 +66,8 @@ type NodeRuntimeConfig struct { WireGuardDir string `yaml:"wireGuardDir"` WireGuardPort *int `yaml:"wireGuardPort"` // Deprecated: EnablePolicyRouting enables connmark/fwmark/ip-rule policy - // routing on gateway interfaces. Replaced by per-interface FORWARD ACCEPT - // rules. Defaults to false; retained for backward compatibility. + // routing on gateway interfaces. Replaced by the UNBOUNDED-FORWARD chain. + // Defaults to false; retained for backward compatibility. EnablePolicyRouting *bool `yaml:"enablePolicyRouting"` MTU *int `yaml:"mtu"` HealthPort *int `yaml:"healthPort"` diff --git a/internal/net/netlink/forward_manager.go b/internal/net/netlink/forward_manager.go new file mode 100644 index 00000000..8a84a727 --- /dev/null +++ b/internal/net/netlink/forward_manager.go @@ -0,0 +1,284 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package netlink + +import ( + "fmt" + "strings" + "sync" + + "github.com/coreos/go-iptables/iptables" + "k8s.io/klog/v2" +) + +const ( + // forwardChain is the custom chain for tunnel-to-tunnel forwarding rules. + forwardChain = "UNBOUNDED-FORWARD" + // forwardComment identifies rules created by this manager. + forwardComment = "unbounded-net: forward between managed tunnels" + // legacyForwardComment identifies old per-interface FORWARD ACCEPT rules + // from previous agent versions that should be cleaned up. + legacyForwardComment = "unbounded-net: accept tunnel traffic" +) + +// ForwardManager manages iptables FORWARD rules that restrict forwarded +// traffic to tunnel-to-tunnel paths. Instead of a blanket ACCEPT on each +// source interface, it installs: +// +// - FORWARD: -i -j UNBOUNDED-FORWARD (per source interface) +// - UNBOUNDED-FORWARD: -o -j ACCEPT (per destination interface) +// +// This gives 2*N rules instead of N^2 and ensures only traffic between +// managed tunnel interfaces bypasses KUBE-FORWARD's conntrack checks. +type ForwardManager struct { + ipt4 *iptables.IPTables + ipt6 *iptables.IPTables + mu sync.Mutex +} + +// NewForwardManager creates a ForwardManager, ensures the UNBOUNDED-FORWARD +// chain exists, and cleans up any legacy per-interface FORWARD ACCEPT rules +// left by previous agent versions. +func NewForwardManager() (*ForwardManager, error) { + ipt4, err := iptables.New() + if err != nil { + return nil, fmt.Errorf("failed to initialize IPv4 iptables: %w", err) + } + + ipt6, err := iptables.NewWithProtocol(iptables.ProtocolIPv6) + if err != nil { + klog.Warningf("Failed to initialize IPv6 iptables (IPv6 forwarding rules will be disabled): %v", err) + + ipt6 = nil + } + + m := &ForwardManager{ipt4: ipt4, ipt6: ipt6} + + if err := m.ensureChain(ipt4, "IPv4"); err != nil { + return nil, fmt.Errorf("failed to create IPv4 forward chain: %w", err) + } + + if ipt6 != nil { + if err := m.ensureChain(ipt6, "IPv6"); err != nil { + klog.Warningf("Failed to create IPv6 forward chain: %v", err) + } + } + + m.cleanupLegacyRules(ipt4, "IPv4") + + if ipt6 != nil { + m.cleanupLegacyRules(ipt6, "IPv6") + } + + return m, nil +} + +// ensureChain creates the UNBOUNDED-FORWARD chain if it does not exist. +// It does NOT insert a jump rule in FORWARD here; jump rules are per-interface +// and managed by EnsureInterface. +func (m *ForwardManager) ensureChain(ipt *iptables.IPTables, family string) error { + exists, err := ipt.ChainExists("filter", forwardChain) + if err != nil { + return fmt.Errorf("failed to check if chain exists: %w", err) + } + + if !exists { + if err := ipt.NewChain("filter", forwardChain); err != nil { + return fmt.Errorf("failed to create chain: %w", err) + } + + klog.V(2).Infof("Created %s chain %s in filter table", family, forwardChain) + } + + return nil +} + +// jumpRule returns the FORWARD chain jump rule spec for a source interface. +func jumpRule(ifName string) []string { + return []string{"-i", ifName, "-m", "comment", "--comment", forwardComment, "-j", forwardChain} +} + +// acceptRule returns the UNBOUNDED-FORWARD chain accept rule spec for a destination interface. +func acceptRule(ifName string) []string { + return []string{"-o", ifName, "-m", "comment", "--comment", forwardComment, "-j", "ACCEPT"} +} + +// EnsureInterface adds the jump rule in FORWARD (-i ifName) and the accept +// rule in UNBOUNDED-FORWARD (-o ifName) if they do not already exist. The +// jump rule is inserted at position 1 so it is evaluated before KUBE-FORWARD. +func (m *ForwardManager) EnsureInterface(ifName string) { + m.mu.Lock() + defer m.mu.Unlock() + + m.ensureInterfaceForFamily(m.ipt4, "IPv4", ifName) + + if m.ipt6 != nil { + m.ensureInterfaceForFamily(m.ipt6, "IPv6", ifName) + } +} + +func (m *ForwardManager) ensureInterfaceForFamily(ipt *iptables.IPTables, family, ifName string) { + // Ensure the chain still exists (self-healing if someone flushed it). + if err := m.ensureChain(ipt, family); err != nil { + klog.Warningf("ForwardManager: failed to ensure %s chain for %s: %v", family, ifName, err) + + return + } + + // Jump rule in FORWARD: -i ifName -j UNBOUNDED-FORWARD + jRule := jumpRule(ifName) + + exists, err := ipt.Exists("filter", "FORWARD", jRule...) + if err != nil { + klog.Warningf("ForwardManager: failed to check %s FORWARD jump for %s: %v", family, ifName, err) + } else if !exists { + if err := ipt.Insert("filter", "FORWARD", 1, jRule...); err != nil { + klog.Warningf("ForwardManager: failed to insert %s FORWARD jump for %s: %v", family, ifName, err) + } else { + klog.V(2).Infof("ForwardManager: added %s FORWARD jump for %s", family, ifName) + } + } + + // Accept rule in UNBOUNDED-FORWARD: -o ifName -j ACCEPT + aRule := acceptRule(ifName) + + exists, err = ipt.Exists("filter", forwardChain, aRule...) + if err != nil { + klog.Warningf("ForwardManager: failed to check %s accept rule for %s: %v", family, ifName, err) + } else if !exists { + if err := ipt.Append("filter", forwardChain, aRule...); err != nil { + klog.Warningf("ForwardManager: failed to add %s accept rule for %s: %v", family, ifName, err) + } else { + klog.V(2).Infof("ForwardManager: added %s accept rule for -o %s", family, ifName) + } + } +} + +// RemoveInterface removes the jump rule from FORWARD and the accept rule +// from UNBOUNDED-FORWARD for the specified interface. +func (m *ForwardManager) RemoveInterface(ifName string) { + m.mu.Lock() + defer m.mu.Unlock() + + m.removeInterfaceForFamily(m.ipt4, "IPv4", ifName) + + if m.ipt6 != nil { + m.removeInterfaceForFamily(m.ipt6, "IPv6", ifName) + } +} + +func (m *ForwardManager) removeInterfaceForFamily(ipt *iptables.IPTables, family, ifName string) { + jRule := jumpRule(ifName) + if err := ipt.DeleteIfExists("filter", "FORWARD", jRule...); err != nil { + klog.V(4).Infof("ForwardManager: failed to remove %s FORWARD jump for %s: %v", family, ifName, err) + } else { + klog.V(2).Infof("ForwardManager: removed %s FORWARD jump for %s", family, ifName) + } + + aRule := acceptRule(ifName) + if err := ipt.DeleteIfExists("filter", forwardChain, aRule...); err != nil { + klog.V(4).Infof("ForwardManager: failed to remove %s accept rule for %s: %v", family, ifName, err) + } else { + klog.V(2).Infof("ForwardManager: removed %s accept rule for -o %s", family, ifName) + } +} + +// Cleanup removes all rules and the UNBOUNDED-FORWARD chain. Called on +// graceful shutdown. +func (m *ForwardManager) Cleanup() { + m.mu.Lock() + defer m.mu.Unlock() + + m.cleanupFamily(m.ipt4, "IPv4") + + if m.ipt6 != nil { + m.cleanupFamily(m.ipt6, "IPv6") + } + + klog.V(2).Info("ForwardManager: cleaned up forward chain and rules") +} + +func (m *ForwardManager) cleanupFamily(ipt *iptables.IPTables, family string) { + // Remove all jump rules from FORWARD that reference our chain. + rules, err := ipt.List("filter", "FORWARD") + if err != nil { + klog.Warningf("ForwardManager: failed to list %s FORWARD rules for cleanup: %v", family, err) + } else { + for _, rule := range rules { + if !strings.Contains(rule, forwardChain) || !strings.Contains(rule, forwardComment) { + continue + } + + // Parse the -i interface name from the rule. + ifName := parseInterfaceFromRule(rule, "-i") + if ifName != "" { + jRule := jumpRule(ifName) + if delErr := ipt.DeleteIfExists("filter", "FORWARD", jRule...); delErr != nil { + klog.V(4).Infof("ForwardManager: failed to remove %s FORWARD jump for %s during cleanup: %v", family, ifName, delErr) + } + } + } + } + + // Flush and delete the chain. + exists, err := ipt.ChainExists("filter", forwardChain) + if err != nil { + klog.Warningf("ForwardManager: failed to check if %s chain exists during cleanup: %v", family, err) + + return + } + + if exists { + if err := ipt.ClearChain("filter", forwardChain); err != nil { + klog.Warningf("ForwardManager: failed to flush %s chain during cleanup: %v", family, err) + } + + if err := ipt.DeleteChain("filter", forwardChain); err != nil { + klog.Warningf("ForwardManager: failed to delete %s chain during cleanup: %v", family, err) + } + } +} + +// cleanupLegacyRules removes old-style per-interface FORWARD ACCEPT rules +// from previous agent versions. These rules have the comment +// "unbounded-net: accept tunnel traffic". +func (m *ForwardManager) cleanupLegacyRules(ipt *iptables.IPTables, family string) { + rules, err := ipt.List("filter", "FORWARD") + if err != nil { + klog.V(4).Infof("ForwardManager: failed to list %s FORWARD rules for legacy cleanup: %v", family, err) + + return + } + + for _, rule := range rules { + if !strings.Contains(rule, legacyForwardComment) { + continue + } + + ifName := parseInterfaceFromRule(rule, "-i") + if ifName == "" { + continue + } + + legacyRule := []string{"-i", ifName, "-j", "ACCEPT", "-m", "comment", "--comment", legacyForwardComment} + if err := ipt.DeleteIfExists("filter", "FORWARD", legacyRule...); err != nil { + klog.V(4).Infof("ForwardManager: failed to remove legacy %s FORWARD rule for %s: %v", family, ifName, err) + } else { + klog.V(2).Infof("ForwardManager: removed legacy %s FORWARD ACCEPT rule for %s", family, ifName) + } + } +} + +// parseInterfaceFromRule extracts the interface name following the given flag +// (e.g. "-i" or "-o") from an iptables rule string. +func parseInterfaceFromRule(rule, flag string) string { + fields := strings.Fields(rule) + for i, f := range fields { + if f == flag && i+1 < len(fields) { + return fields[i+1] + } + } + + return "" +} diff --git a/internal/net/netlink/forward_manager_test.go b/internal/net/netlink/forward_manager_test.go new file mode 100644 index 00000000..1bba3b01 --- /dev/null +++ b/internal/net/netlink/forward_manager_test.go @@ -0,0 +1,95 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package netlink + +import ( + "testing" +) + +func TestParseInterfaceFromRule(t *testing.T) { + t.Parallel() + + tests := []struct { + name string + rule string + flag string + want string + }{ + { + name: "input interface", + rule: "-A FORWARD -i geneve0 -m comment --comment \"unbounded-net: forward between managed tunnels\" -j UNBOUNDED-FORWARD", + flag: "-i", + want: "geneve0", + }, + { + name: "output interface", + rule: "-A UNBOUNDED-FORWARD -o wg0 -m comment --comment \"unbounded-net: forward between managed tunnels\" -j ACCEPT", + flag: "-o", + want: "wg0", + }, + { + name: "flag not present", + rule: "-A FORWARD -j ACCEPT", + flag: "-i", + want: "", + }, + { + name: "flag at end of rule", + rule: "-A FORWARD -i", + flag: "-i", + want: "", + }, + { + name: "empty rule", + rule: "", + flag: "-i", + want: "", + }, + } + + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + t.Parallel() + + got := parseInterfaceFromRule(tc.rule, tc.flag) + if got != tc.want { + t.Errorf("parseInterfaceFromRule(%q, %q) = %q, want %q", tc.rule, tc.flag, got, tc.want) + } + }) + } +} + +func TestJumpRuleSpec(t *testing.T) { + t.Parallel() + + rule := jumpRule("geneve0") + expected := []string{"-i", "geneve0", "-m", "comment", "--comment", forwardComment, "-j", forwardChain} + + if len(rule) != len(expected) { + t.Fatalf("jumpRule length = %d, want %d", len(rule), len(expected)) + } + + for i, v := range rule { + if v != expected[i] { + t.Errorf("jumpRule[%d] = %q, want %q", i, v, expected[i]) + } + } +} + +func TestAcceptRuleSpec(t *testing.T) { + t.Parallel() + + rule := acceptRule("wg51820") + expected := []string{"-o", "wg51820", "-m", "comment", "--comment", forwardComment, "-j", "ACCEPT"} + + if len(rule) != len(expected) { + t.Fatalf("acceptRule length = %d, want %d", len(rule), len(expected)) + } + + for i, v := range rule { + if v != expected[i] { + t.Errorf("acceptRule[%d] = %q, want %q", i, v, expected[i]) + } + } +} diff --git a/internal/net/netlink/gateway_policy_manager.go b/internal/net/netlink/gateway_policy_manager.go index 838f3404..c4c3d254 100644 --- a/internal/net/netlink/gateway_policy_manager.go +++ b/internal/net/netlink/gateway_policy_manager.go @@ -34,8 +34,8 @@ func validateInterfaceName(name string) error { } // Deprecated: GatewayPolicyManager is deprecated. Policy-based routing (PBR) via -// connmark/fwmark/ip-rule is replaced by per-interface iptables FORWARD ACCEPT -// rules. This type is retained for backward compatibility when +// connmark/fwmark/ip-rule is replaced by the UNBOUNDED-FORWARD iptables chain. +// This type is retained for backward compatibility when // enablePolicyRouting is explicitly set to true. // // GatewayPolicyManager manages policy routing rules to ensure return traffic diff --git a/internal/net/netlink/notrack_manager.go b/internal/net/netlink/notrack_manager.go new file mode 100644 index 00000000..04614b09 --- /dev/null +++ b/internal/net/netlink/notrack_manager.go @@ -0,0 +1,387 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package netlink + +import ( + "fmt" + "net" + "sort" + "strings" + "sync" + + "github.com/coreos/go-iptables/iptables" + "k8s.io/klog/v2" +) + +const ( + // notrackChain is the custom chain in the raw table for NOTRACK rules. + notrackChain = "UNBOUNDED-NOTRACK" + // notrackComment identifies rules created by this manager. + notrackComment = "unbounded-net: skip conntrack for tunnel transit" +) + +// NotrackManager installs iptables raw-table rules on gateway nodes that +// skip connection tracking for transit tunnel traffic. This avoids +// unnecessary conntrack overhead and conntrack table exhaustion for packets +// that are simply forwarded between tunnel interfaces. +// +// The UNBOUNDED-NOTRACK chain is structured as: +// +// -m addrtype --dst-type LOCAL -j RETURN (keep conntrack for node itself) +// -d -j RETURN (keep conntrack for local pods) +// -d -j CT --notrack (skip conntrack for transit) +// (implicit RETURN) (anything else: normal conntrack) +// +// Jump rules in raw/PREROUTING are per-interface, matching the ForwardManager +// pattern. +type NotrackManager struct { + ipt4 *iptables.IPTables + ipt6 *iptables.IPTables + mu sync.Mutex + + // ctSupported tracks whether the CT target is available. + ctSupported4 bool + ctSupported6 bool + + // Track current CIDR sets to avoid unnecessary rebuilds. + currentPodCIDRs string + currentSupernets string +} + +// NewNotrackManager creates a NotrackManager with IPv4/IPv6 handles. +// It does NOT install any rules; call EnsureInterface and ReconcileCIDRs +// to install rules on gateway nodes. +func NewNotrackManager() (*NotrackManager, error) { + ipt4, err := iptables.New() + if err != nil { + return nil, fmt.Errorf("failed to initialize IPv4 iptables: %w", err) + } + + ipt6, err := iptables.NewWithProtocol(iptables.ProtocolIPv6) + if err != nil { + klog.Warningf("Failed to initialize IPv6 iptables (IPv6 notrack rules will be disabled): %v", err) + + ipt6 = nil + } + + m := &NotrackManager{ + ipt4: ipt4, + ipt6: ipt6, + ctSupported4: detectCTSupport(ipt4, "IPv4"), + ctSupported6: ipt6 != nil && detectCTSupport(ipt6, "IPv6"), + } + + if !m.ctSupported4 { + klog.Warningf("NotrackManager: CT target not supported for IPv4, notrack rules will be disabled") + } + + return m, nil +} + +// detectCTSupport checks whether the CT target is available by attempting +// to create and immediately delete a test rule. +func detectCTSupport(ipt *iptables.IPTables, family string) bool { + // Ensure the chain exists for the probe. + _ = ipt.NewChain("raw", notrackChain) //nolint:errcheck // best-effort; may already exist + + testRule := []string{"-d", "192.0.2.0/32", "-j", "CT", "--notrack"} + if family == "IPv6" { + testRule = []string{"-d", "2001:db8::/128", "-j", "CT", "--notrack"} + } + + if err := ipt.Append("raw", notrackChain, testRule...); err != nil { + klog.V(4).Infof("NotrackManager: CT target probe failed for %s: %v", family, err) + + // Clean up probe chain if we created it. + _ = ipt.ClearChain("raw", notrackChain) //nolint:errcheck // best-effort cleanup + _ = ipt.DeleteChain("raw", notrackChain) //nolint:errcheck // best-effort cleanup + + return false + } + + // Clean up test rule. + _ = ipt.Delete("raw", notrackChain, testRule...) //nolint:errcheck // best-effort cleanup + // Don't delete the chain - we'll use it. + + return true +} + +// notrackJumpRule returns the PREROUTING jump rule spec for a source interface. +func notrackJumpRule(ifName string) []string { + return []string{"-i", ifName, "-m", "comment", "--comment", notrackComment, "-j", notrackChain} +} + +// EnsureInterface adds a raw/PREROUTING jump rule for the given interface +// so that traffic arriving on it enters the UNBOUNDED-NOTRACK chain. +// The jump rule is inserted at position 1. +func (m *NotrackManager) EnsureInterface(ifName string) { + m.mu.Lock() + defer m.mu.Unlock() + + m.ensureInterfaceForFamily(m.ipt4, "IPv4", m.ctSupported4, ifName) + + if m.ipt6 != nil { + m.ensureInterfaceForFamily(m.ipt6, "IPv6", m.ctSupported6, ifName) + } +} + +func (m *NotrackManager) ensureInterfaceForFamily(ipt *iptables.IPTables, family string, ctSupported bool, ifName string) { + if !ctSupported { + return + } + + // Ensure chain exists (self-healing). + if err := m.ensureChain(ipt, family); err != nil { + klog.Warningf("NotrackManager: failed to ensure %s chain for %s: %v", family, ifName, err) + + return + } + + jRule := notrackJumpRule(ifName) + + exists, err := ipt.Exists("raw", "PREROUTING", jRule...) + if err != nil { + klog.Warningf("NotrackManager: failed to check %s PREROUTING jump for %s: %v", family, ifName, err) + } else if !exists { + if err := ipt.Insert("raw", "PREROUTING", 1, jRule...); err != nil { + klog.Warningf("NotrackManager: failed to insert %s PREROUTING jump for %s: %v", family, ifName, err) + } else { + klog.V(2).Infof("NotrackManager: added %s PREROUTING jump for %s", family, ifName) + } + } +} + +// RemoveInterface removes the raw/PREROUTING jump rule for the given interface. +func (m *NotrackManager) RemoveInterface(ifName string) { + m.mu.Lock() + defer m.mu.Unlock() + + m.removeInterfaceForFamily(m.ipt4, "IPv4", ifName) + + if m.ipt6 != nil { + m.removeInterfaceForFamily(m.ipt6, "IPv6", ifName) + } +} + +func (m *NotrackManager) removeInterfaceForFamily(ipt *iptables.IPTables, family, ifName string) { + jRule := notrackJumpRule(ifName) + if err := ipt.DeleteIfExists("raw", "PREROUTING", jRule...); err != nil { + klog.V(4).Infof("NotrackManager: failed to remove %s PREROUTING jump for %s: %v", family, ifName, err) + } else { + klog.V(2).Infof("NotrackManager: removed %s PREROUTING jump for %s", family, ifName) + } +} + +// ReconcileCIDRs rebuilds the UNBOUNDED-NOTRACK chain contents if the +// podCIDRs or supernets have changed since the last call. It installs: +// - addrtype LOCAL RETURN (always first) +// - per-podCIDR RETURN rules +// - per-supernet CT --notrack rules +// +// CIDRs are automatically sorted into IPv4 and IPv6 buckets. +func (m *NotrackManager) ReconcileCIDRs(podCIDRs, supernets []string) error { + m.mu.Lock() + defer m.mu.Unlock() + + // Deduplicate and sort for stable comparison. + podCIDRs = dedupeAndSort(podCIDRs) + supernets = dedupeAndSort(supernets) + + podKey := strings.Join(podCIDRs, ",") + superKey := strings.Join(supernets, ",") + + if podKey == m.currentPodCIDRs && superKey == m.currentSupernets { + return nil + } + + klog.V(2).Infof("NotrackManager: CIDRs changed, rebuilding chain (podCIDRs=%d, supernets=%d)", len(podCIDRs), len(supernets)) + + v4Pods, v6Pods := splitByFamily(podCIDRs) + v4Supers, v6Supers := splitByFamily(supernets) + + if err := m.reconcileFamily(m.ipt4, "IPv4", m.ctSupported4, v4Pods, v4Supers); err != nil { + return fmt.Errorf("failed to reconcile IPv4 notrack rules: %w", err) + } + + if m.ipt6 != nil { + if err := m.reconcileFamily(m.ipt6, "IPv6", m.ctSupported6, v6Pods, v6Supers); err != nil { + klog.Warningf("NotrackManager: failed to reconcile IPv6 notrack rules: %v", err) + } + } + + m.currentPodCIDRs = podKey + m.currentSupernets = superKey + + return nil +} + +func (m *NotrackManager) reconcileFamily(ipt *iptables.IPTables, family string, ctSupported bool, podCIDRs, supernets []string) error { + if !ctSupported { + return nil + } + + if err := m.ensureChain(ipt, family); err != nil { + return err + } + + // Flush and rebuild. + if err := ipt.ClearChain("raw", notrackChain); err != nil { + return fmt.Errorf("failed to flush %s notrack chain: %w", family, err) + } + + // Rule 1: RETURN for traffic destined to local addresses. + localRule := []string{ + "-m", "addrtype", "--dst-type", "LOCAL", + "-m", "comment", "--comment", notrackComment, "-j", "RETURN", + } + if err := ipt.Append("raw", notrackChain, localRule...); err != nil { + return fmt.Errorf("failed to add %s addrtype LOCAL rule: %w", family, err) + } + + // RETURN for each local podCIDR. + for _, cidr := range podCIDRs { + rule := []string{ + "-d", cidr, + "-m", "comment", "--comment", notrackComment, "-j", "RETURN", + } + if err := ipt.Append("raw", notrackChain, rule...); err != nil { + return fmt.Errorf("failed to add %s podCIDR RETURN for %s: %w", family, cidr, err) + } + } + + // CT --notrack for each supernet. + for _, cidr := range supernets { + rule := []string{ + "-d", cidr, + "-m", "comment", "--comment", notrackComment, "-j", "CT", "--notrack", + } + if err := ipt.Append("raw", notrackChain, rule...); err != nil { + return fmt.Errorf("failed to add %s notrack rule for %s: %w", family, cidr, err) + } + } + + klog.V(2).Infof("NotrackManager: rebuilt %s chain (%d podCIDR RETURNs, %d supernet NOTRACKs)", + family, len(podCIDRs), len(supernets)) + + return nil +} + +// ensureChain creates the UNBOUNDED-NOTRACK chain if it does not exist. +func (m *NotrackManager) ensureChain(ipt *iptables.IPTables, family string) error { + exists, err := ipt.ChainExists("raw", notrackChain) + if err != nil { + return fmt.Errorf("failed to check if %s chain exists: %w", family, err) + } + + if !exists { + if err := ipt.NewChain("raw", notrackChain); err != nil { + return fmt.Errorf("failed to create %s chain: %w", family, err) + } + + klog.V(2).Infof("NotrackManager: created %s chain %s in raw table", family, notrackChain) + } + + return nil +} + +// Cleanup removes all PREROUTING jump rules, flushes and deletes the +// UNBOUNDED-NOTRACK chain. Safe to call even if rules were never installed. +func (m *NotrackManager) Cleanup() { + m.mu.Lock() + defer m.mu.Unlock() + + m.cleanupFamily(m.ipt4, "IPv4") + + if m.ipt6 != nil { + m.cleanupFamily(m.ipt6, "IPv6") + } + + m.currentPodCIDRs = "" + m.currentSupernets = "" + + klog.V(2).Info("NotrackManager: cleaned up notrack chain and rules") +} + +func (m *NotrackManager) cleanupFamily(ipt *iptables.IPTables, family string) { + // Remove all PREROUTING jump rules referencing our chain. + rules, err := ipt.List("raw", "PREROUTING") + if err != nil { + klog.V(4).Infof("NotrackManager: failed to list %s PREROUTING rules for cleanup: %v", family, err) + } else { + for _, rule := range rules { + if !strings.Contains(rule, notrackChain) || !strings.Contains(rule, notrackComment) { + continue + } + + ifName := parseInterfaceFromRule(rule, "-i") + if ifName != "" { + jRule := notrackJumpRule(ifName) + if delErr := ipt.DeleteIfExists("raw", "PREROUTING", jRule...); delErr != nil { + klog.V(4).Infof("NotrackManager: failed to remove %s PREROUTING jump for %s during cleanup: %v", family, ifName, delErr) + } + } + } + } + + // Flush and delete the chain. + exists, err := ipt.ChainExists("raw", notrackChain) + if err != nil { + klog.Warningf("NotrackManager: failed to check if %s chain exists during cleanup: %v", family, err) + + return + } + + if exists { + if err := ipt.ClearChain("raw", notrackChain); err != nil { + klog.Warningf("NotrackManager: failed to flush %s chain during cleanup: %v", family, err) + } + + if err := ipt.DeleteChain("raw", notrackChain); err != nil { + klog.Warningf("NotrackManager: failed to delete %s chain during cleanup: %v", family, err) + } + } +} + +// splitByFamily separates a list of CIDRs into IPv4 and IPv6 slices. +func splitByFamily(cidrs []string) (v4, v6 []string) { + for _, cidr := range cidrs { + ip, _, err := net.ParseCIDR(cidr) + if err != nil { + klog.V(4).Infof("NotrackManager: skipping unparseable CIDR %q: %v", cidr, err) + + continue + } + + if ip.To4() != nil { + v4 = append(v4, cidr) + } else { + v6 = append(v6, cidr) + } + } + + return v4, v6 +} + +// dedupeAndSort returns a sorted, deduplicated copy of the input slice. +func dedupeAndSort(items []string) []string { + if len(items) == 0 { + return nil + } + + seen := make(map[string]struct{}, len(items)) + result := make([]string, 0, len(items)) + + for _, item := range items { + if _, ok := seen[item]; ok { + continue + } + + seen[item] = struct{}{} + result = append(result, item) + } + + sort.Strings(result) + + return result +} diff --git a/internal/net/netlink/notrack_manager_test.go b/internal/net/netlink/notrack_manager_test.go new file mode 100644 index 00000000..b7e40e3b --- /dev/null +++ b/internal/net/netlink/notrack_manager_test.go @@ -0,0 +1,107 @@ +// Copyright (c) Microsoft Corporation. +// Licensed under the MIT License. + +package netlink + +import ( + "testing" +) + +func TestSplitByFamily(t *testing.T) { + t.Parallel() + + v4, v6 := splitByFamily([]string{ + "10.244.0.0/16", + "fd00::/48", + "192.168.1.0/24", + "2001:db8::/32", + }) + + if len(v4) != 2 { + t.Fatalf("expected 2 IPv4 CIDRs, got %d: %v", len(v4), v4) + } + + if len(v6) != 2 { + t.Fatalf("expected 2 IPv6 CIDRs, got %d: %v", len(v6), v6) + } + + if v4[0] != "10.244.0.0/16" || v4[1] != "192.168.1.0/24" { + t.Errorf("unexpected IPv4 CIDRs: %v", v4) + } + + if v6[0] != "fd00::/48" || v6[1] != "2001:db8::/32" { + t.Errorf("unexpected IPv6 CIDRs: %v", v6) + } +} + +func TestSplitByFamilyInvalidCIDR(t *testing.T) { + t.Parallel() + + v4, v6 := splitByFamily([]string{"not-a-cidr", "10.0.0.0/8"}) + + if len(v4) != 1 || v4[0] != "10.0.0.0/8" { + t.Errorf("expected [10.0.0.0/8], got %v", v4) + } + + if len(v6) != 0 { + t.Errorf("expected no IPv6, got %v", v6) + } +} + +func TestSplitByFamilyEmpty(t *testing.T) { + t.Parallel() + + v4, v6 := splitByFamily(nil) + + if v4 != nil || v6 != nil { + t.Errorf("expected nil slices, got v4=%v v6=%v", v4, v6) + } +} + +func TestDedupeAndSort(t *testing.T) { + t.Parallel() + + result := dedupeAndSort([]string{"c", "a", "b", "a", "c"}) + expected := []string{"a", "b", "c"} + + if len(result) != len(expected) { + t.Fatalf("expected %d items, got %d: %v", len(expected), len(result), result) + } + + for i, v := range result { + if v != expected[i] { + t.Errorf("result[%d] = %q, want %q", i, v, expected[i]) + } + } +} + +func TestDedupeAndSortEmpty(t *testing.T) { + t.Parallel() + + result := dedupeAndSort(nil) + if result != nil { + t.Errorf("expected nil, got %v", result) + } + + result = dedupeAndSort([]string{}) + if result != nil { + t.Errorf("expected nil for empty slice, got %v", result) + } +} + +func TestNotrackJumpRuleSpec(t *testing.T) { + t.Parallel() + + rule := notrackJumpRule("wg0") + expected := []string{"-i", "wg0", "-m", "comment", "--comment", notrackComment, "-j", notrackChain} + + if len(rule) != len(expected) { + t.Fatalf("notrackJumpRule length = %d, want %d", len(rule), len(expected)) + } + + for i, v := range rule { + if v != expected[i] { + t.Errorf("notrackJumpRule[%d] = %q, want %q", i, v, expected[i]) + } + } +} diff --git a/internal/net/webhook/server.go b/internal/net/webhook/server.go index fca89911..4b55774a 100644 --- a/internal/net/webhook/server.go +++ b/internal/net/webhook/server.go @@ -5,12 +5,14 @@ package webhook import ( "context" + "crypto/tls" "crypto/x509" "encoding/json" "fmt" "io" "net/http" "os" + "sort" "time" admissionv1 "k8s.io/api/admission/v1" @@ -164,13 +166,27 @@ func (s *Server) registerAggregatedDiscoveryHandlers() { // isTrustedAggregatedRequest validates that aggregated API requests arrive with // a verified client certificate signed by the cluster trust roots. func (s *Server) isTrustedAggregatedRequest(r *http.Request) bool { - if r == nil || r.TLS == nil || len(r.TLS.PeerCertificates) == 0 { - klog.V(2).Info("Rejecting aggregated request without client certificate") + if r == nil { + klog.V(2).Info("Rejecting aggregated request: nil request") + return false + } + + reqCtx := requestContextForLog(r) + + if r.TLS == nil || len(r.TLS.PeerCertificates) == 0 { + klog.V(2).Infof("Rejecting aggregated request without client certificate: %s; tls=%t handshakeComplete=%t serverName=%q negotiatedProtocol=%q", + reqCtx, + r.TLS != nil, + r.TLS != nil && r.TLS.HandshakeComplete, + tlsServerName(r.TLS), + tlsNegotiatedProto(r.TLS), + ) + return false } if s.aggregatedClientCAs == nil { - klog.Warning("Rejecting aggregated request because client CA pool is not configured") + klog.Warningf("Rejecting aggregated request because client CA pool is not configured: %s", reqCtx) return false } @@ -186,13 +202,17 @@ func (s *Server) isTrustedAggregatedRequest(r *http.Request) bool { Intermediates: intermediates, KeyUsages: []x509.ExtKeyUsage{x509.ExtKeyUsageClientAuth}, }); err != nil { - klog.V(2).Infof("Rejecting aggregated request with untrusted client certificate: %v", err) + klog.V(2).Infof("Rejecting aggregated request with untrusted client certificate: %s; %s; verifyErr=%v", + reqCtx, certDescription(leaf), err) + return false } if len(s.aggregatedClientAllowedCNs) > 0 { if _, ok := s.aggregatedClientAllowedCNs[leaf.Subject.CommonName]; !ok { - klog.V(2).Infof("Rejecting aggregated request with unexpected client certificate CN %q", leaf.Subject.CommonName) + klog.V(2).Infof("Rejecting aggregated request with unexpected client certificate CN: %s; %s; allowedCNs=%v", + reqCtx, certDescription(leaf), allowedCNsForLog(s.aggregatedClientAllowedCNs)) + return false } } @@ -200,6 +220,65 @@ func (s *Server) isTrustedAggregatedRequest(r *http.Request) bool { return true } +// requestContextForLog returns a single-line summary of an HTTP request +// suitable for inclusion in rejection log lines. +func requestContextForLog(r *http.Request) string { + path := "" + if r.URL != nil { + path = r.URL.Path + } + + return fmt.Sprintf("method=%s path=%q host=%q remoteAddr=%s userAgent=%q", + r.Method, path, r.Host, r.RemoteAddr, r.UserAgent()) +} + +// certDescription returns a single-line summary of an X.509 certificate. +func certDescription(c *x509.Certificate) string { + if c == nil { + return "cert=" + } + + return fmt.Sprintf("certSubject=%q certIssuer=%q certSerial=%s certNotBefore=%s certNotAfter=%s certDNSNames=%v", + c.Subject.String(), + c.Issuer.String(), + c.SerialNumber.String(), + c.NotBefore.UTC().Format(time.RFC3339), + c.NotAfter.UTC().Format(time.RFC3339), + c.DNSNames, + ) +} + +// tlsServerName returns the SNI server name from a TLS connection state, or +// the empty string when none was provided. +func tlsServerName(state *tls.ConnectionState) string { + if state == nil { + return "" + } + + return state.ServerName +} + +// tlsNegotiatedProto returns the negotiated ALPN protocol or empty string. +func tlsNegotiatedProto(state *tls.ConnectionState) string { + if state == nil { + return "" + } + + return state.NegotiatedProtocol +} + +// allowedCNsForLog returns a slice of the allowed CNs for inclusion in logs. +func allowedCNsForLog(set map[string]struct{}) []string { + out := make([]string, 0, len(set)) + for cn := range set { + out = append(out, cn) + } + + sort.Strings(out) + + return out +} + // GetClientCAs returns the front-proxy client CA pool so callers can set it // on the unified TLS server's ClientCAs. The returned pool may be nil if the // extension-apiserver-authentication ConfigMap has not been loaded yet.