Skip to content
Closed
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions internal/config/config_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -28,10 +28,30 @@ import (
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
)

func useTempHomeDir(t *testing.T) string {
t.Helper()

origGeteuid := osGeteuid
origHomeDirFn := homeDirFn

tmpHome := t.TempDir()
osGeteuid = func() int { return 1000 }
homeDirFn = func() (string, error) { return tmpHome, nil }

t.Cleanup(func() {
osGeteuid = origGeteuid
homeDirFn = origHomeDirFn
})

return tmpHome
}

func TestDefault(t *testing.T) {
ctx := context.Background()

t.Run("default values", func(t *testing.T) {
useTempHomeDir(t)

cfg, err := Default(ctx)
require.NoError(t, err)

Expand All @@ -45,6 +65,8 @@ func TestDefault(t *testing.T) {
})

t.Run("with infiniband class dir option", func(t *testing.T) {
useTempHomeDir(t)

classDir := "/custom/class"

cfg, err := Default(ctx, WithInfinibandClassRootDir(classDir))
Expand Down Expand Up @@ -551,6 +573,8 @@ func TestDefaultWithHealthExporter(t *testing.T) {
ctx := context.Background()

t.Run("default includes health exporter", func(t *testing.T) {
useTempHomeDir(t)

cfg, err := Default(ctx)
require.NoError(t, err)

Expand Down
116 changes: 103 additions & 13 deletions internal/exporter/collector/collector.go
Original file line number Diff line number Diff line change
Expand Up @@ -20,7 +20,9 @@ import (
"context"
"crypto/rand"
"encoding/hex"
"errors"
"fmt"
"sync"
"time"

apiv1 "github.com/NVIDIA/fleet-intelligence-sdk/api/v1"
Expand Down Expand Up @@ -72,6 +74,10 @@ type collector struct {
lastAttestationCollection time.Time
machineID string // Agent's stable identity from server initialization
dcgmGPUIndexes map[string]string // UUID → DCGM device ID override for GPU indices
machineInfoMu sync.RWMutex
cachedMachineInfo *machineinfo.MachineInfo
machineInfoCollecting bool
machineInfoFetcher func(nvidianvml.Instance, ...machineinfo.MachineInfoOption) (*machineinfo.MachineInfo, error)
}

// New creates a new health data collector
Expand Down Expand Up @@ -104,6 +110,7 @@ func New(
attestationManager: attestationManager,
machineID: machineID,
dcgmGPUIndexes: dcgmGPUIndexes,
machineInfoFetcher: machineinfo.GetMachineInfo,
}
}

Expand All @@ -124,33 +131,58 @@ func (c *collector) Collect(ctx context.Context) (*HealthData, error) {
Timestamp: time.Now().UTC(),
}

if err := ctx.Err(); err != nil {
return data, err
}

// Collect machine info if enabled
if c.config.IncludeMachineInfo {
if err := c.collectMachineInfo(data); err != nil {
if err := c.collectMachineInfo(ctx, data); err != nil {
log.Logger.Errorw("Failed to collect machine info", "error", err)
}
}
if err := ctx.Err(); err != nil {
return data, err
Comment on lines 139 to +145
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟡 Minor

Treat machine-info cancellation like the other partial-data phases.

This branch still logs context.Canceled / context.DeadlineExceeded as Errorw before returning the same partial-data error path. Metrics, events, and component data already suppress that noise; machine info should match.

🔧 Suggested adjustment
 	if c.config.IncludeMachineInfo {
 		if err := c.collectMachineInfo(ctx, data); err != nil {
+			if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
+				return data, err
+			}
 			log.Logger.Errorw("Failed to collect machine info", "error", err)
 		}
 	}
📝 Committable suggestion

‼️ IMPORTANT
Carefully review the code before committing. Ensure that it accurately replaces the highlighted code, contains no missing lines, and has no issues with indentation. Thoroughly test & benchmark the code to ensure it meets the requirements.

Suggested change
if c.config.IncludeMachineInfo {
if err := c.collectMachineInfo(data); err != nil {
if err := c.collectMachineInfo(ctx, data); err != nil {
log.Logger.Errorw("Failed to collect machine info", "error", err)
}
}
if err := ctx.Err(); err != nil {
return data, err
if c.config.IncludeMachineInfo {
if err := c.collectMachineInfo(ctx, data); err != nil {
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
return data, err
}
log.Logger.Errorw("Failed to collect machine info", "error", err)
}
}
if err := ctx.Err(); err != nil {
return data, err
🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@internal/exporter/collector/collector.go` around lines 139 - 145, The
machine-info collection currently logs all errors from c.collectMachineInfo as
Errorw and then lets the function hit the ctx.Err() return path; change this to
mirror metrics/events/component behavior: after calling
c.collectMachineInfo(ctx, data) check the returned err, and if err is
context.Canceled or context.DeadlineExceeded (use errors.Is(err,
context.Canceled) or errors.Is(err, context.DeadlineExceeded)) do NOT call
log.Logger.Errorw — instead return the partial data and the ctx error
immediately; for other non-nil errors keep the existing Errorw call and
continue. Ensure you update the logic around c.config.IncludeMachineInfo and the
ctx.Err() handling so machine-info cancellation is suppressed like the other
partial-data phases.

}

// Collect metrics if enabled
if c.config.IncludeMetrics {
if err := c.collectMetrics(ctx, data); err != nil {
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
return data, err
}
log.Logger.Errorw("Failed to collect metrics", "error", err)
}
}
if err := ctx.Err(); err != nil {
return data, err
}

// Collect events if enabled
if c.config.IncludeEvents {
if err := c.collectEvents(ctx, data); err != nil {
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
return data, err
}
log.Logger.Errorw("Failed to collect events", "error", err)
}
}
if err := ctx.Err(); err != nil {
return data, err
}

// Collect component data if enabled
if c.config.IncludeComponentData {
if err := c.collectComponentData(data); err != nil {
if err := c.collectComponentData(ctx, data); err != nil {
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
return data, err
}
log.Logger.Errorw("Failed to collect component data", "error", err)
}
}
if err := ctx.Err(); err != nil {
return data, err
}

// Collect attestation data if provider is available
// Attestation is always enabled if manager is available
Expand All @@ -166,27 +198,67 @@ func (c *collector) Collect(ctx context.Context) (*HealthData, error) {
return data, nil
}

// collectMachineInfo collects machine hardware information
func (c *collector) collectMachineInfo(data *HealthData) error {
// collectMachineInfo attaches cached machine info when available and triggers
// a background refresh only until the cache has been populated.
func (c *collector) collectMachineInfo(ctx context.Context, data *HealthData) error {
if c.nvmlInstance == nil {
return fmt.Errorf("NVML instance not available")
}

var opts []machineinfo.MachineInfoOption
if len(c.dcgmGPUIndexes) > 0 {
opts = append(opts, machineinfo.WithDCGMGPUIndexes(c.dcgmGPUIndexes))
if cached := c.getCachedMachineInfo(); cached != nil {
data.MachineInfo = cached
log.Logger.Debugw("Using cached machine info")
}

machineInfo, err := machineinfo.GetMachineInfo(c.nvmlInstance, opts...)
if err != nil {
return fmt.Errorf("failed to get machine info: %w", err)
c.startMachineInfoRefresh()
if err := ctx.Err(); err != nil {
return err
Comment on lines +203 to +215
Copy link
Copy Markdown

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

⚠️ Potential issue | 🟠 Major

Don't enqueue uncancellable machine-info work from a dead collection.

collectMachineInfo() checks ctx.Err() only after startMachineInfoRefresh(), and the refresh goroutine itself has no cancel/deadline control. A timed-out collection can still launch NVML work, and one wedged machineInfoFetcher call leaves machineInfoCollecting stuck true, so later collections never retry.

Also applies to: 221-252

🤖 Prompt for AI Agents
Verify each finding against the current code and only fix it if needed.

In `@internal/exporter/collector/collector.go` around lines 203 - 215,
collectMachineInfo currently launches startMachineInfoRefresh (which spawns
machineInfoFetcher) before checking ctx.Err() and the fetcher has no
cancellation or guaranteed reset of machineInfoCollecting; change the code to
check ctx.Err() before calling startMachineInfoRefresh and modify
startMachineInfoRefresh/machineInfoFetcher to accept a context (or return a
cancellable func) so the goroutine selects on ctx.Done() to abort early, and
ensure machineInfoCollecting is always cleared (use defer or a finalizer) even
when fetcher exits early or errors so the flag cannot remain stuck true.

}

data.MachineInfo = machineInfo
log.Logger.Debugw("Collected machine info", "machine_info", data.MachineInfo)
return nil
}

func (c *collector) startMachineInfoRefresh() {
c.machineInfoMu.Lock()
if c.machineInfoCollecting || c.cachedMachineInfo != nil {
c.machineInfoMu.Unlock()
return
}
c.machineInfoCollecting = true
c.machineInfoMu.Unlock()

go func() {
defer func() {
c.machineInfoMu.Lock()
c.machineInfoCollecting = false
c.machineInfoMu.Unlock()
}()

var opts []machineinfo.MachineInfoOption
if len(c.dcgmGPUIndexes) > 0 {
opts = append(opts, machineinfo.WithDCGMGPUIndexes(c.dcgmGPUIndexes))
}

machineInfo, err := c.machineInfoFetcher(c.nvmlInstance, opts...)
if err != nil {
log.Logger.Errorw("Failed to refresh machine info", "error", err)
return
}

c.machineInfoMu.Lock()
c.cachedMachineInfo = machineInfo
c.machineInfoMu.Unlock()
log.Logger.Debugw("Refreshed machine info", "machine_info", machineInfo)
}()
}

func (c *collector) getCachedMachineInfo() *machineinfo.MachineInfo {
c.machineInfoMu.RLock()
defer c.machineInfoMu.RUnlock()

return c.cachedMachineInfo
}

// collectMetrics collects metrics data from the metrics store
func (c *collector) collectMetrics(ctx context.Context, data *HealthData) error {
if c.metricsStore == nil {
Expand Down Expand Up @@ -219,8 +291,17 @@ func (c *collector) collectEvents(ctx context.Context, data *HealthData) error {
}

for _, component := range components {
if err := ctx.Err(); err != nil {
data.Events = allEvents
return err
}

componentEvents, err := component.Events(ctx, since)
if err != nil {
if errors.Is(err, context.Canceled) || errors.Is(err, context.DeadlineExceeded) {
data.Events = allEvents
return err
}
log.Logger.Errorw("Failed to get events from component",
"component", component.Name(), "error", err)
continue
Expand Down Expand Up @@ -250,7 +331,7 @@ func (c *collector) collectEvents(ctx context.Context, data *HealthData) error {
}

// collectComponentData collects health states from all components
func (c *collector) collectComponentData(data *HealthData) error {
func (c *collector) collectComponentData(ctx context.Context, data *HealthData) error {
if c.componentsRegistry == nil {
return fmt.Errorf("components registry not available")
}
Expand All @@ -259,10 +340,19 @@ func (c *collector) collectComponentData(data *HealthData) error {
components := c.componentsRegistry.All()

for _, component := range components {
if err := ctx.Err(); err != nil {
data.ComponentData = componentData
return err
}

componentName := component.Name()

// Get health states
healthStates := component.LastHealthStates()
if err := ctx.Err(); err != nil {
data.ComponentData = componentData
return err
}
log.Logger.Debugw("Collecting health states",
"component", componentName, "health_states", healthStates)

Expand Down
Loading
Loading