Skip to content

Commit bab8fd9

Browse files
committed
feat(agent): poll guest metrics via VSOCK and expose as Prometheus gauges
1 parent bb0ebc4 commit bab8fd9

2 files changed

Lines changed: 96 additions & 7 deletions

File tree

internal/agent/firecracker_driver.go

Lines changed: 42 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -207,15 +207,16 @@ func (d *FirecrackerDriver) Start(ctx context.Context, vm *impdevv1alpha1.ImpVM)
207207

208208
proc := &fcProc{machine: m, pid: int64(pid), socket: sockPath, netInfo: netInfo}
209209

210-
// Capture probe arguments before goroutine launch to avoid data race on vm.
210+
// Capture goroutine args before launch to avoid data race on vm.
211211
var probeCtx context.Context
212212
var probes *impdevv1alpha1.ProbeSpec
213-
var vsockPath, vmNamespace, vmName string
214-
if gaEnabled && vm.Spec.Probes != nil {
213+
var vsockPath, vmNamespace, vmName, className string
214+
if gaEnabled {
215215
vsockPath = strings.TrimSuffix(sockPath, ".sock") + ".vsock"
216-
probes = vm.Spec.Probes
216+
probes = vm.Spec.Probes // may be nil — runProbes handles nil probes
217217
vmNamespace = vm.Namespace
218218
vmName = vm.Name
219+
className = vm.Spec.ClassRef.Name // safe: checked non-nil at top of Start
219220
var probeCancel context.CancelFunc
220221
probeCtx, probeCancel = context.WithCancel(context.Background())
221222
proc.probeCancel = probeCancel
@@ -228,7 +229,7 @@ func (d *FirecrackerDriver) Start(ctx context.Context, vm *impdevv1alpha1.ImpVM)
228229
d.mu.Unlock()
229230

230231
if probeCtx != nil {
231-
go d.runProbes(probeCtx, probes, vsockPath, vmNamespace, vmName)
232+
go d.runProbes(probeCtx, probes, vsockPath, vmNamespace, vmName, className)
232233
}
233234

234235
return int64(pid), nil
@@ -476,16 +477,28 @@ func (d *FirecrackerDriver) guestAgentPath() string {
476477
}
477478

478479
// runProbes dials the guest VSOCK and runs probe polling until ctx is cancelled.
479-
// Called in a goroutine after the VM reaches Running. probes must not be nil.
480+
// Called in a goroutine after the VM reaches Running. probes may be nil — when
481+
// nil, runProbes keeps the VSOCK connection open for metrics until ctx is done.
480482
// vmNamespace and vmName identify the ImpVM to patch conditions onto.
481-
func (d *FirecrackerDriver) runProbes(ctx context.Context, probes *impdevv1alpha1.ProbeSpec, vsockPath, vmNamespace, vmName string) {
483+
func (d *FirecrackerDriver) runProbes(ctx context.Context, probes *impdevv1alpha1.ProbeSpec, vsockPath, vmNamespace, vmName, className string) {
482484
conn, err := agentvsock.Dial(ctx, vsockPath, 10000)
483485
if err != nil {
484486
logf.Log.Error(err, "runProbes: VSOCK dial failed", "vsock", vsockPath)
485487
return
486488
}
487489
defer conn.Close() //nolint:errcheck
488490
client := pb.NewGuestAgentClient(conn)
491+
492+
// Always poll metrics when collector is set.
493+
if d.Metrics != nil {
494+
go d.pollMetrics(ctx, client, vmNamespace+"/"+vmName, className)
495+
}
496+
497+
if probes == nil {
498+
<-ctx.Done() // keep connection open for metrics until VM stops
499+
return
500+
}
501+
489502
runner := probe.NewRunner(client, probes, func(conds []metav1.Condition) {
490503
if d.Client == nil {
491504
logf.Log.Error(nil, "probe patcher: client is nil, skipping condition patch", "vm", vmNamespace+"/"+vmName)
@@ -508,6 +521,28 @@ func (d *FirecrackerDriver) runProbes(ctx context.Context, probes *impdevv1alpha
508521
runner.Run(ctx)
509522
}
510523

524+
// pollMetrics calls the guest Metrics RPC on every metricsInterval tick and
525+
// forwards results to the Metrics collector. Errors are logged at V(1) and
526+
// skipped — the guest may be unavailable during startup or shutdown.
527+
// Runs until ctx is cancelled.
528+
func (d *FirecrackerDriver) pollMetrics(ctx context.Context, client pb.GuestAgentClient, vmKey, className string) {
529+
ticker := time.NewTicker(metricsInterval)
530+
defer ticker.Stop()
531+
for {
532+
select {
533+
case <-ctx.Done():
534+
return
535+
case <-ticker.C:
536+
resp, err := client.Metrics(ctx, &pb.MetricsRequest{})
537+
if err != nil {
538+
logf.Log.V(1).Info("pollMetrics: guest agent unavailable", "vm", vmKey, "err", err)
539+
continue
540+
}
541+
d.Metrics.SetGuestMetrics(vmKey, d.NodeName, className, resp.CpuUsageRatio, resp.MemoryUsedBytes, resp.DiskUsedBytes)
542+
}
543+
}
544+
}
545+
511546
// Snapshot pauses the VM, writes state+memory files to destDir, then resumes it.
512547
// The VM is always resumed before returning, even on error (enforced via defer).
513548
// destDir must exist; files are named vm.state and vm.mem.

internal/agent/firecracker_driver_test.go

Lines changed: 54 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,14 +7,17 @@ import (
77
"os"
88
"os/exec"
99
"testing"
10+
"time"
1011

1112
firecracker "github.com/firecracker-microvm/firecracker-go-sdk"
13+
"google.golang.org/grpc"
1214
"k8s.io/apimachinery/pkg/runtime"
1315
"sigs.k8s.io/controller-runtime/pkg/client/fake"
1416

1517
impdevv1alpha1 "github.com/syscode-labs/imp/api/v1alpha1"
1618
"github.com/syscode-labs/imp/internal/agent/network"
1719
"github.com/syscode-labs/imp/internal/agent/rootfs"
20+
pb "github.com/syscode-labs/imp/internal/proto/guest"
1821
)
1922

2023
// hasFirecrackerBin returns true if the firecracker binary is available.
@@ -601,3 +604,54 @@ func TestFirecrackerDriver_Stop_doesNotCallRemoveNATWhenNotLast(t *testing.T) {
601604
t.Errorf("expected RemoveNAT NOT called, but got calls: %v", stub.RemoveNATCalls)
602605
}
603606
}
607+
608+
func TestFirecrackerDriver_PollMetrics(t *testing.T) {
609+
// Override interval so test completes in milliseconds.
610+
old := metricsInterval
611+
metricsInterval = 1 * time.Millisecond
612+
defer func() { metricsInterval = old }()
613+
614+
mc := NewVMMetricsCollector()
615+
d := &FirecrackerDriver{
616+
Metrics: mc,
617+
NodeName: "node-1",
618+
procs: make(map[string]*fcProc),
619+
}
620+
621+
called := make(chan struct{}, 1)
622+
fakeClient := &fakeGuestAgentClient{
623+
metricsResp: &pb.MetricsResponse{
624+
CpuUsageRatio: 0.5,
625+
MemoryUsedBytes: 1024,
626+
DiskUsedBytes: 2048,
627+
},
628+
onMetrics: func() { select { case called <- struct{}{}: default: } },
629+
}
630+
631+
ctx, cancel := context.WithTimeout(context.Background(), 500*time.Millisecond)
632+
defer cancel()
633+
634+
go d.pollMetrics(ctx, fakeClient, "default/vm1", "small")
635+
636+
select {
637+
case <-called:
638+
// success — Metrics RPC was called
639+
case <-ctx.Done():
640+
t.Fatal("pollMetrics never called Metrics RPC within timeout")
641+
}
642+
}
643+
644+
// fakeGuestAgentClient implements pb.GuestAgentClient for testing.
645+
// Embed pb.GuestAgentClient (interface) so unimplemented methods panic rather than silently no-op.
646+
type fakeGuestAgentClient struct {
647+
pb.GuestAgentClient
648+
metricsResp *pb.MetricsResponse
649+
onMetrics func()
650+
}
651+
652+
func (f *fakeGuestAgentClient) Metrics(_ context.Context, _ *pb.MetricsRequest, _ ...grpc.CallOption) (*pb.MetricsResponse, error) {
653+
if f.onMetrics != nil {
654+
f.onMetrics()
655+
}
656+
return f.metricsResp, nil
657+
}

0 commit comments

Comments
 (0)