Skip to content

Commit bbcfd61

Browse files
committed
feat(agent): StartedAt timeout for stuck Starting VMs; hostPath socketDir default
Add StartedAt to ImpVMStatus to track when a VM entered the Starting phase. handleStarting times out VMs stuck in Starting (default 5 min, configurable via ImpVMReconciler.StartTimeout) by calling finishFailed. Clear StartedAt on successful transition to Running. Flip socketDir.enabled to true in Helm values so Firecracker sockets survive agent pod restarts, enabling PID-based reattach for running VMs.
1 parent 26abf45 commit bbcfd61

6 files changed

Lines changed: 148 additions & 3 deletions

File tree

api/v1alpha1/impvm_types.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,11 @@ type ImpVMStatus struct {
147147
// +optional
148148
RunningAt *metav1.Time `json:"runningAt,omitempty"`
149149

150+
// StartedAt is the time the VM last transitioned to phase Starting.
151+
// Used to detect and time out stuck start attempts; cleared on the transition to Running.
152+
// +optional
153+
StartedAt *metav1.Time `json:"startedAt,omitempty"`
154+
150155
// RestartCount is the cumulative number of times this VM has been restarted.
151156
// +optional
152157
RestartCount int32 `json:"restartCount,omitempty"`

api/v1alpha1/zz_generated.deepcopy.go

Lines changed: 4 additions & 0 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

charts/imp/values.yaml

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -39,7 +39,10 @@ agent:
3939
kernelPath: ""
4040
hostPaths:
4141
socketDir:
42-
enabled: false
42+
# enabled: true ensures sockets survive agent pod restarts (required for
43+
# PID-based reattach). Setting this to false will orphan Firecracker
44+
# sockets on restart — breaking graceful stop for running VMs.
45+
enabled: true
4346
path: /run/imp/sockets
4447
imageCache:
4548
enabled: false

config/crd/bases/imp.dev_impvms.yaml

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -747,6 +747,12 @@ spec:
747747
a node.
748748
format: date-time
749749
type: string
750+
startedAt:
751+
description: |-
752+
StartedAt is the time the VM last transitioned to phase Starting.
753+
Used to detect and time out stuck start attempts.
754+
format: date-time
755+
type: string
750756
type: object
751757
type: object
752758
served: true

internal/agent/reconciler.go

Lines changed: 26 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@ import (
99
"time"
1010

1111
apierrors "k8s.io/apimachinery/pkg/api/errors"
12+
metav1 "k8s.io/apimachinery/pkg/apis/meta/v1"
1213
"k8s.io/apimachinery/pkg/runtime"
1314
"k8s.io/client-go/util/retry"
1415
ctrl "sigs.k8s.io/controller-runtime"
@@ -35,6 +36,9 @@ type ImpVMReconciler struct {
3536
// Alloc is the in-memory IP allocator. When non-nil, Reserve is called during
3637
// lazy reattach to restore IP state after agent restart.
3738
Alloc *network.Allocator
39+
// StartTimeout is how long a VM may remain in Starting before being
40+
// transitioned to Failed. Defaults to 5 minutes when zero or negative.
41+
StartTimeout time.Duration
3842
}
3943

4044
// +kubebuilder:rbac:groups=imp.dev,resources=impvms,verbs=get;list;watch;update;patch
@@ -65,20 +69,39 @@ func (r *ImpVMReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl
6569
case impdevv1alpha1.VMPhaseRunning:
6670
return r.handleRunning(ctx, vm)
6771
case impdevv1alpha1.VMPhaseStarting:
68-
log.Info("VM is Starting — requeuing")
69-
return ctrl.Result{RequeueAfter: 2 * time.Second}, nil
72+
return r.handleStarting(ctx, vm)
7073
default:
7174
// Pending, Succeeded, Failed — not our concern.
7275
return ctrl.Result{}, nil
7376
}
7477
}
7578

79+
func (r *ImpVMReconciler) startTimeout() time.Duration {
80+
if r.StartTimeout > 0 {
81+
return r.StartTimeout
82+
}
83+
return 5 * time.Minute
84+
}
85+
86+
func (r *ImpVMReconciler) handleStarting(ctx context.Context, vm *impdevv1alpha1.ImpVM) (ctrl.Result, error) {
87+
log := logf.FromContext(ctx)
88+
if vm.Status.StartedAt != nil {
89+
if elapsed := time.Since(vm.Status.StartedAt.Time); elapsed > r.startTimeout() {
90+
log.Info("VM stuck in Starting — timing out", "elapsed", elapsed, "timeout", r.startTimeout())
91+
return r.finishFailed(ctx, vm)
92+
}
93+
}
94+
return ctrl.Result{RequeueAfter: 2 * time.Second}, nil
95+
}
96+
7697
func (r *ImpVMReconciler) handleScheduled(ctx context.Context, vm *impdevv1alpha1.ImpVM) (ctrl.Result, error) {
7798
log := logf.FromContext(ctx)
7899

79100
// Set phase=Starting before calling driver to make concurrent reconciles idempotent.
80101
base := vm.DeepCopy()
81102
vm.Status.Phase = impdevv1alpha1.VMPhaseStarting
103+
now := metav1.Now()
104+
vm.Status.StartedAt = &now
82105
if err := r.Status().Patch(ctx, vm, client.MergeFrom(base)); err != nil {
83106
if apierrors.IsConflict(err) {
84107
return ctrl.Result{Requeue: true}, nil
@@ -100,6 +123,7 @@ func (r *ImpVMReconciler) handleScheduled(ctx context.Context, vm *impdevv1alpha
100123

101124
base = vm.DeepCopy()
102125
vm.Status.Phase = impdevv1alpha1.VMPhaseRunning
126+
vm.Status.StartedAt = nil
103127
vm.Status.IP = state.IP
104128
vm.Status.RuntimePID = pid
105129
if err := r.Status().Patch(ctx, vm, client.MergeFrom(base)); err != nil {

internal/agent/reconciler_test.go

Lines changed: 103 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -549,6 +549,109 @@ var _ = Describe("ImpVM Agent: registerVTEP — concurrent writes", func() {
549549
})
550550
})
551551

552+
var _ = Describe("ImpVM Agent: Starting — stuck timeout", func() {
553+
ctx := context.Background()
554+
555+
It("transitions to Failed when StartedAt exceeds StartTimeout", func() {
556+
r := &ImpVMReconciler{
557+
Client: k8sClient,
558+
NodeName: testNode,
559+
Driver: NewStubDriver(),
560+
StartTimeout: 5 * time.Minute,
561+
}
562+
563+
sixMinsAgo := metav1.NewTime(time.Now().Add(-6 * time.Minute))
564+
vm := &impdevv1alpha1.ImpVM{
565+
ObjectMeta: metav1.ObjectMeta{
566+
Name: "tc-starting-timeout", Namespace: "default",
567+
Finalizers: []string{"imp/finalizer"},
568+
},
569+
Spec: impdevv1alpha1.ImpVMSpec{NodeName: testNode},
570+
}
571+
Expect(k8sClient.Create(ctx, vm)).To(Succeed())
572+
DeferCleanup(func() { k8sClient.Delete(ctx, vm) }) //nolint:errcheck
573+
574+
base := vm.DeepCopy()
575+
vm.Status.Phase = impdevv1alpha1.VMPhaseStarting
576+
vm.Status.StartedAt = &sixMinsAgo
577+
Expect(k8sClient.Status().Patch(ctx, vm, client.MergeFrom(base))).To(Succeed())
578+
579+
_, err := r.Reconcile(ctx, reconcile.Request{
580+
NamespacedName: types.NamespacedName{Name: "tc-starting-timeout", Namespace: "default"},
581+
})
582+
Expect(err).NotTo(HaveOccurred())
583+
584+
updated := &impdevv1alpha1.ImpVM{}
585+
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: "tc-starting-timeout", Namespace: "default"}, updated)).To(Succeed())
586+
Expect(updated.Status.Phase).To(Equal(impdevv1alpha1.VMPhaseFailed))
587+
})
588+
589+
It("stays in Starting when StartedAt is within StartTimeout", func() {
590+
r := &ImpVMReconciler{
591+
Client: k8sClient,
592+
NodeName: testNode,
593+
Driver: NewStubDriver(),
594+
StartTimeout: 5 * time.Minute,
595+
}
596+
597+
oneMinsAgo := metav1.NewTime(time.Now().Add(-1 * time.Minute))
598+
vm := &impdevv1alpha1.ImpVM{
599+
ObjectMeta: metav1.ObjectMeta{
600+
Name: "tc-starting-notimeout", Namespace: "default",
601+
Finalizers: []string{"imp/finalizer"},
602+
},
603+
Spec: impdevv1alpha1.ImpVMSpec{NodeName: testNode},
604+
}
605+
Expect(k8sClient.Create(ctx, vm)).To(Succeed())
606+
DeferCleanup(func() { k8sClient.Delete(ctx, vm) }) //nolint:errcheck
607+
608+
base := vm.DeepCopy()
609+
vm.Status.Phase = impdevv1alpha1.VMPhaseStarting
610+
vm.Status.StartedAt = &oneMinsAgo
611+
Expect(k8sClient.Status().Patch(ctx, vm, client.MergeFrom(base))).To(Succeed())
612+
613+
result, err := r.Reconcile(ctx, reconcile.Request{
614+
NamespacedName: types.NamespacedName{Name: "tc-starting-notimeout", Namespace: "default"},
615+
})
616+
Expect(err).NotTo(HaveOccurred())
617+
Expect(result.RequeueAfter).To(Equal(2 * time.Second))
618+
619+
updated := &impdevv1alpha1.ImpVM{}
620+
Expect(k8sClient.Get(ctx, types.NamespacedName{Name: "tc-starting-notimeout", Namespace: "default"}, updated)).To(Succeed())
621+
Expect(updated.Status.Phase).To(Equal(impdevv1alpha1.VMPhaseStarting))
622+
})
623+
624+
It("stays in Starting when StartedAt is nil (timeout not yet set)", func() {
625+
r := &ImpVMReconciler{
626+
Client: k8sClient,
627+
NodeName: testNode,
628+
Driver: NewStubDriver(),
629+
StartTimeout: 5 * time.Minute,
630+
}
631+
632+
vm := &impdevv1alpha1.ImpVM{
633+
ObjectMeta: metav1.ObjectMeta{
634+
Name: "tc-starting-nostartedat", Namespace: "default",
635+
Finalizers: []string{"imp/finalizer"},
636+
},
637+
Spec: impdevv1alpha1.ImpVMSpec{NodeName: testNode},
638+
}
639+
Expect(k8sClient.Create(ctx, vm)).To(Succeed())
640+
DeferCleanup(func() { k8sClient.Delete(ctx, vm) }) //nolint:errcheck
641+
642+
base := vm.DeepCopy()
643+
vm.Status.Phase = impdevv1alpha1.VMPhaseStarting
644+
// StartedAt intentionally not set
645+
Expect(k8sClient.Status().Patch(ctx, vm, client.MergeFrom(base))).To(Succeed())
646+
647+
result, err := r.Reconcile(ctx, reconcile.Request{
648+
NamespacedName: types.NamespacedName{Name: "tc-starting-nostartedat", Namespace: "default"},
649+
})
650+
Expect(err).NotTo(HaveOccurred())
651+
Expect(result.RequeueAfter).To(Equal(2 * time.Second))
652+
})
653+
})
654+
552655
var _ = Describe("ImpVM Agent: handleTerminating driver.Stop error", func() {
553656
ctx := context.Background()
554657

0 commit comments

Comments
 (0)