diff --git a/CHANGELOG.md b/CHANGELOG.md index eb77f27f..e3f3f024 100644 --- a/CHANGELOG.md +++ b/CHANGELOG.md @@ -11,6 +11,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/). - OCI labels missing from `docker/metadata-action` on the Topograph container image: `org.opencontainers.image.documentation`, `authors`, and `vendor` ([#377](https://github.com/NVIDIA/topograph/pull/377)). - Helm chart metadata: `home`, `icon`, `maintainers`, `keywords`, and Artifact Hub annotations ([#377](https://github.com/NVIDIA/topograph/pull/377)). +### Changed + +- Slinky engine `useGpuCliqueLabel` now emits an actionable diagnostic when no block domains can be built: the error reports how many nodes were scanned and why each was skipped (no Slurm mapping, missing `nvidia.com/gpu.clique` label, or missing the node-data-broker-written `topograph.nvidia.com/instance` annotation), and lists the offending node names. + ### Fixed - Helm node-observer now targets the rendered Topograph Service fullname in `generateTopologyUrl`. diff --git a/pkg/engines/slinky/engine.go b/pkg/engines/slinky/engine.go index b7c3b08e..12450fbb 100644 --- a/pkg/engines/slinky/engine.go +++ b/pkg/engines/slinky/engine.go @@ -262,21 +262,35 @@ func getComputeInstances(nodes *corev1.NodeList, nodeMap map[string]string) ([]t func withGPUCliqueDomains(graph *topology.Graph, clusterNodes *clusterNodes) (*topology.Graph, *httperr.Error) { domains := topology.NewDomainMap() + + // Diagnostic counters to explain why no domains were built. The instance + // annotation is written per-node by the node-data-broker DaemonSet, so a + // node with the clique label but no annotation points at a broker that has + // not (yet) annotated that specific node. + totalNodes := len(clusterNodes.nodes.Items) + var noSlurmName, noCliqueLabel int + missingAnnotation := []string{} + for _, node := range clusterNodes.nodes.Items { slurmName, ok := clusterNodes.nodeMap[node.Name] if !ok || slurmName == "" { + noSlurmName++ klog.V(4).Infof("Skipping node %s as it does not have a corresponding SLURM name", node.Name) continue } gpuClique := strings.TrimSpace(node.Labels[topology.KeyNvidiaGPUClique]) if gpuClique == "" { + noCliqueLabel++ + klog.V(4).Infof("Skipping node %s (SLURM node %s): missing/empty %q label", node.Name, slurmName, topology.KeyNvidiaGPUClique) continue } instance, ok := node.Annotations[topology.KeyNodeInstance] if !ok { - klog.Warningf("missing %q annotation in node %s", topology.KeyNodeInstance, node.Name) + missingAnnotation = append(missingAnnotation, node.Name) + klog.Warningf("node %s (SLURM node %s) has label %s=%q but is missing annotation %q (expected from node-data-broker on that node)", + node.Name, slurmName, topology.KeyNvidiaGPUClique, gpuClique, topology.KeyNodeInstance) continue } @@ -285,8 +299,11 @@ func withGPUCliqueDomains(graph *topology.Graph, clusterNodes *clusterNodes) (*t if len(domains) == 0 { return nil, httperr.NewError(http.StatusBadGateway, - fmt.Sprintf("useGpuCliqueLabel=true but no matching nodes found; check label %q and annotation %q", - topology.KeyNvidiaGPUClique, topology.KeyNodeInstance)) + fmt.Sprintf("useGpuCliqueLabel=true but no matching nodes found; check label %q and annotation %q. "+ + "Scanned %d node(s): %d without a SLURM node mapping (no Ready slurmd pod), %d missing the %q label, %d with the label but missing the %q annotation%s", + topology.KeyNvidiaGPUClique, topology.KeyNodeInstance, + totalNodes, noSlurmName, noCliqueLabel, topology.KeyNvidiaGPUClique, + len(missingAnnotation), topology.KeyNodeInstance, formatMissingAnnotationNodes(missingAnnotation))) } if graph == nil { @@ -300,6 +317,23 @@ func withGPUCliqueDomains(graph *topology.Graph, clusterNodes *clusterNodes) (*t return graph, nil } +// formatMissingAnnotationNodes renders the node names that carry the GPU clique +// label but lack the node-data-broker-written instance annotation. The list is +// capped so the error message stays bounded on large clusters. +func formatMissingAnnotationNodes(nodes []string) string { + if len(nodes) == 0 { + return "" + } + + const maxListed = 10 + if len(nodes) <= maxListed { + return fmt.Sprintf(" (nodes missing annotation: %s)", strings.Join(nodes, ", ")) + } + + return fmt.Sprintf(" (nodes missing annotation: %s, and %d more)", + strings.Join(nodes[:maxListed], ", "), len(nodes)-maxListed) +} + func usesBlockTopology(cfg *translate.Config) bool { if cfg == nil { return false diff --git a/pkg/engines/slinky/engine_test.go b/pkg/engines/slinky/engine_test.go index bde0e743..8034773c 100644 --- a/pkg/engines/slinky/engine_test.go +++ b/pkg/engines/slinky/engine_test.go @@ -404,6 +404,42 @@ func TestWithGPUCliqueDomainsNoMatchingNodes(t *testing.T) { got, httpErr := withGPUCliqueDomains(&topology.Graph{}, clusterNodes) require.Nil(t, got) require.ErrorContains(t, httpErr, "useGpuCliqueLabel=true but no matching nodes found") + // The node maps to a SLURM node but has no clique label. + require.ErrorContains(t, httpErr, "Scanned 1 node(s)") + require.ErrorContains(t, httpErr, fmt.Sprintf("1 missing the %q label", topology.KeyNvidiaGPUClique)) +} + +func TestWithGPUCliqueDomainsMissingBrokerAnnotation(t *testing.T) { + ctx := context.Background() + client := fake.NewSimpleClientset() + + // Node has the GPU clique label but is missing the node-data-broker-written + // instance annotation - the exact scenario that produces the error in the field. + _, err := client.CoreV1().Nodes().Create(ctx, &corev1.Node{ + ObjectMeta: metav1.ObjectMeta{ + Name: "k8s-node-0", + Labels: map[string]string{topology.KeyNvidiaGPUClique: "clique-a"}, + }, + }, metav1.CreateOptions{}) + require.NoError(t, err) + + _, err = client.CoreV1().Pods("test-ns").Create(ctx, makeReadySlurmdPod("pod-0", "k8s-node-0", "slurm-0"), metav1.CreateOptions{}) + require.NoError(t, err) + + eng := &SlinkyEngine{ + client: client, + params: &Params{ + Namespace: "test-ns", + podListOpt: &metav1.ListOptions{LabelSelector: "app=slinky"}, + }, + } + + clusterNodes, httpErr := eng.getClusterNodes(ctx) + require.Nil(t, httpErr) + got, httpErr := withGPUCliqueDomains(&topology.Graph{}, clusterNodes) + require.Nil(t, got) + require.ErrorContains(t, httpErr, fmt.Sprintf("1 with the label but missing the %q annotation", topology.KeyNodeInstance)) + require.ErrorContains(t, httpErr, "nodes missing annotation: k8s-node-0") } func TestGenerateOutputUsesGPUCliqueDomains(t *testing.T) {