Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions CHANGELOG.md
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,10 @@ The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.1.0/).
- OCI labels missing from `docker/metadata-action` on the Topograph container image: `org.opencontainers.image.documentation`, `authors`, and `vendor` ([#377](https://github.com/NVIDIA/topograph/pull/377)).
- Helm chart metadata: `home`, `icon`, `maintainers`, `keywords`, and Artifact Hub annotations ([#377](https://github.com/NVIDIA/topograph/pull/377)).

### Changed

- Slinky engine `useGpuCliqueLabel` now emits an actionable diagnostic when no block domains can be built: the error reports how many nodes were scanned and why each was skipped (no Slurm mapping, missing `nvidia.com/gpu.clique` label, or missing the node-data-broker-written `topograph.nvidia.com/instance` annotation), and lists the offending node names.

### Fixed

- Helm node-observer now targets the rendered Topograph Service fullname in `generateTopologyUrl`.
Expand Down
40 changes: 37 additions & 3 deletions pkg/engines/slinky/engine.go
Original file line number Diff line number Diff line change
Expand Up @@ -262,21 +262,35 @@ func getComputeInstances(nodes *corev1.NodeList, nodeMap map[string]string) ([]t

func withGPUCliqueDomains(graph *topology.Graph, clusterNodes *clusterNodes) (*topology.Graph, *httperr.Error) {
domains := topology.NewDomainMap()

// Diagnostic counters to explain why no domains were built. The instance
// annotation is written per-node by the node-data-broker DaemonSet, so a
// node with the clique label but no annotation points at a broker that has
// not (yet) annotated that specific node.
totalNodes := len(clusterNodes.nodes.Items)
var noSlurmName, noCliqueLabel int
missingAnnotation := []string{}

for _, node := range clusterNodes.nodes.Items {
slurmName, ok := clusterNodes.nodeMap[node.Name]
if !ok || slurmName == "" {
noSlurmName++
klog.V(4).Infof("Skipping node %s as it does not have a corresponding SLURM name", node.Name)
continue
}

gpuClique := strings.TrimSpace(node.Labels[topology.KeyNvidiaGPUClique])
if gpuClique == "" {
noCliqueLabel++
klog.V(4).Infof("Skipping node %s (SLURM node %s): missing/empty %q label", node.Name, slurmName, topology.KeyNvidiaGPUClique)
continue
}

instance, ok := node.Annotations[topology.KeyNodeInstance]
if !ok {
klog.Warningf("missing %q annotation in node %s", topology.KeyNodeInstance, node.Name)
missingAnnotation = append(missingAnnotation, node.Name)
klog.Warningf("node %s (SLURM node %s) has label %s=%q but is missing annotation %q (expected from node-data-broker on that node)",
node.Name, slurmName, topology.KeyNvidiaGPUClique, gpuClique, topology.KeyNodeInstance)
continue
}

Expand All @@ -285,8 +299,11 @@ func withGPUCliqueDomains(graph *topology.Graph, clusterNodes *clusterNodes) (*t

if len(domains) == 0 {
return nil, httperr.NewError(http.StatusBadGateway,
fmt.Sprintf("useGpuCliqueLabel=true but no matching nodes found; check label %q and annotation %q",
topology.KeyNvidiaGPUClique, topology.KeyNodeInstance))
fmt.Sprintf("useGpuCliqueLabel=true but no matching nodes found; check label %q and annotation %q. "+
"Scanned %d node(s): %d without a SLURM node mapping (no Ready slurmd pod), %d missing the %q label, %d with the label but missing the %q annotation%s",
topology.KeyNvidiaGPUClique, topology.KeyNodeInstance,
totalNodes, noSlurmName, noCliqueLabel, topology.KeyNvidiaGPUClique,
len(missingAnnotation), topology.KeyNodeInstance, formatMissingAnnotationNodes(missingAnnotation)))
}

if graph == nil {
Expand All @@ -300,6 +317,23 @@ func withGPUCliqueDomains(graph *topology.Graph, clusterNodes *clusterNodes) (*t
return graph, nil
}

// formatMissingAnnotationNodes renders the node names that carry the GPU clique
// label but lack the node-data-broker-written instance annotation. The list is
// capped so the error message stays bounded on large clusters.
func formatMissingAnnotationNodes(nodes []string) string {
if len(nodes) == 0 {
return ""
}

const maxListed = 10
if len(nodes) <= maxListed {
return fmt.Sprintf(" (nodes missing annotation: %s)", strings.Join(nodes, ", "))
}

return fmt.Sprintf(" (nodes missing annotation: %s, and %d more)",
strings.Join(nodes[:maxListed], ", "), len(nodes)-maxListed)
}

func usesBlockTopology(cfg *translate.Config) bool {
if cfg == nil {
return false
Expand Down
36 changes: 36 additions & 0 deletions pkg/engines/slinky/engine_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -404,6 +404,42 @@ func TestWithGPUCliqueDomainsNoMatchingNodes(t *testing.T) {
got, httpErr := withGPUCliqueDomains(&topology.Graph{}, clusterNodes)
require.Nil(t, got)
require.ErrorContains(t, httpErr, "useGpuCliqueLabel=true but no matching nodes found")
// The node maps to a SLURM node but has no clique label.
require.ErrorContains(t, httpErr, "Scanned 1 node(s)")
require.ErrorContains(t, httpErr, fmt.Sprintf("1 missing the %q label", topology.KeyNvidiaGPUClique))
}

func TestWithGPUCliqueDomainsMissingBrokerAnnotation(t *testing.T) {
ctx := context.Background()
client := fake.NewSimpleClientset()

// Node has the GPU clique label but is missing the node-data-broker-written
// instance annotation - the exact scenario that produces the error in the field.
_, err := client.CoreV1().Nodes().Create(ctx, &corev1.Node{
ObjectMeta: metav1.ObjectMeta{
Name: "k8s-node-0",
Labels: map[string]string{topology.KeyNvidiaGPUClique: "clique-a"},
},
}, metav1.CreateOptions{})
require.NoError(t, err)

_, err = client.CoreV1().Pods("test-ns").Create(ctx, makeReadySlurmdPod("pod-0", "k8s-node-0", "slurm-0"), metav1.CreateOptions{})
require.NoError(t, err)

eng := &SlinkyEngine{
client: client,
params: &Params{
Namespace: "test-ns",
podListOpt: &metav1.ListOptions{LabelSelector: "app=slinky"},
},
}

clusterNodes, httpErr := eng.getClusterNodes(ctx)
require.Nil(t, httpErr)
got, httpErr := withGPUCliqueDomains(&topology.Graph{}, clusterNodes)
require.Nil(t, got)
require.ErrorContains(t, httpErr, fmt.Sprintf("1 with the label but missing the %q annotation", topology.KeyNodeInstance))
require.ErrorContains(t, httpErr, "nodes missing annotation: k8s-node-0")
}

func TestGenerateOutputUsesGPUCliqueDomains(t *testing.T) {
Expand Down
Loading