Skip to content

Commit 4c39042

Browse files
authored
adds vms healthcheck (#84)
* adds vms healthcheck Signed-off-by: Ashraf Fouda <ashraf.m.fouda@gmail.com> * fixing iperf test Signed-off-by: Ashraf Fouda <ashraf.m.fouda@gmail.com> --------- Signed-off-by: Ashraf Fouda <ashraf.m.fouda@gmail.com>
1 parent 6272c40 commit 4c39042

6 files changed

Lines changed: 263 additions & 5 deletions

File tree

pkg/app/flag.go

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -17,6 +17,8 @@ const (
1717
ReadonlyCache = "readonly-cache"
1818
// NotReachable represents the flag when a grid service is not reachable
1919
NotReachable = "not-reachable"
20+
// VMTestFailed represents the flag when VM health check fails
21+
VMTestFailed = "vm-test-failed"
2022
)
2123

2224
// SetFlag is used when the /var/cache cannot be mounted on a SSD or HDD,

pkg/perf/healthcheck/healthcheck.go

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -25,6 +25,7 @@ func NewTask() perf.Task {
2525
checks := map[string]checkFunc{
2626
"cache": cacheCheck,
2727
"network": networkCheck,
28+
"vm": vmCheck,
2829
}
2930
return &healthcheckTask{
3031
checks: checks,
@@ -97,7 +98,7 @@ func (h *healthcheckTask) Run(ctx context.Context) (interface{}, error) {
9798
}
9899

99100
bo := backoff.NewExponentialBackOff()
100-
bo.InitialInterval = 30 * time.Second
101+
bo.InitialInterval = 3 * time.Minute
101102
bo.MaxInterval = 30 * time.Second
102103
bo.MaxElapsedTime = 10 * time.Minute
103104

pkg/perf/healthcheck/vm.go

Lines changed: 252 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,252 @@
1+
package healthcheck
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"net/url"
7+
"os"
8+
"path/filepath"
9+
"strings"
10+
"time"
11+
12+
"github.com/pkg/errors"
13+
"github.com/rs/zerolog/log"
14+
"github.com/threefoldtech/zosbase/pkg"
15+
"github.com/threefoldtech/zosbase/pkg/app"
16+
"github.com/threefoldtech/zosbase/pkg/environment"
17+
"github.com/threefoldtech/zosbase/pkg/gridtypes"
18+
"github.com/threefoldtech/zosbase/pkg/perf"
19+
"github.com/threefoldtech/zosbase/pkg/stubs"
20+
)
21+
22+
// FListInfo records where the bootable artifacts of a mounted flist live.
// A field left empty means the corresponding file was not found.
type FListInfo struct {
	// ImagePath is the location of the raw disk image, if any.
	ImagePath string
	// KernelPath is the location of the kernel, if any.
	KernelPath string
	// InitrdPath is the location of the initial ramdisk, if any.
	InitrdPath string
}

// IsContainer reports whether the flist carries no disk image, in which case
// it is treated as a container rather than a full VM.
func (f *FListInfo) IsContainer() bool {
	return f.ImagePath == ""
}
33+
34+
// vmCheck deploys a test VM, waits, then decommissions it
35+
func vmCheck(ctx context.Context) []error {
36+
var errs []error
37+
38+
log.Debug().Msg("starting VM health check")
39+
40+
cl := perf.MustGetZbusClient(ctx)
41+
vmd := stubs.NewVMModuleStub(cl)
42+
flist := stubs.NewFlisterStub(cl)
43+
44+
// Create a test VM ID
45+
vmID := fmt.Sprintf("healthcheck-vm-%d", time.Now().Unix())
46+
flistURL := "https://hub.threefold.me/tf-official-apps/redis_zinit.flist"
47+
48+
log.Debug().Str("vm_id", vmID).Str("flist", flistURL).Msg("deploying test VM")
49+
50+
// Deploy the VM with timeout
51+
deployCtx, cancel := context.WithTimeout(ctx, 2*time.Minute)
52+
defer cancel()
53+
54+
// Mount cloud-container flist for kernel and initrd
55+
env := environment.MustGet()
56+
cloudContainerFlist, err := url.JoinPath(env.HubURL, "tf-autobuilder", "cloud-container-9dba60e.flist")
57+
if err != nil {
58+
errs = append(errs, errors.Wrap(err, "failed to construct cloud-container flist url"))
59+
if err := app.SetFlag(app.VMTestFailed); err != nil {
60+
log.Error().Err(err).Msg("failed to set VM test failed flag")
61+
}
62+
return errs
63+
}
64+
65+
cloudImageID := fmt.Sprintf("healthcheck-cloud-%d", time.Now().Unix())
66+
cloudImage, err := flist.Mount(deployCtx, cloudImageID, cloudContainerFlist, pkg.ReadOnlyMountOptions)
67+
if err != nil {
68+
errs = append(errs, errors.Wrap(err, "failed to mount cloud container base image"))
69+
if err := app.SetFlag(app.VMTestFailed); err != nil {
70+
log.Error().Err(err).Msg("failed to set VM test failed flag")
71+
}
72+
return errs
73+
}
74+
75+
// Ensure we unmount the cloud image when done
76+
defer func() {
77+
if unmountErr := flist.Unmount(context.Background(), cloudImageID); unmountErr != nil {
78+
log.Error().Err(unmountErr).Str("id", cloudImageID).Msg("failed to unmount cloud image")
79+
}
80+
}()
81+
82+
// Mount the flist to inspect its contents
83+
log.Debug().Str("flist", flistURL).Msg("mounting flist")
84+
mountPath, err := flist.Mount(deployCtx, vmID, flistURL, pkg.MountOptions{
85+
ReadOnly: true,
86+
})
87+
if err != nil {
88+
errs = append(errs, errors.Wrap(err, "failed to mount flist"))
89+
if err := app.SetFlag(app.VMTestFailed); err != nil {
90+
log.Error().Err(err).Msg("failed to set VM test failed flag")
91+
}
92+
return errs
93+
}
94+
95+
// Ensure we unmount the flist when done
96+
defer func() {
97+
if unmountErr := flist.Unmount(context.Background(), vmID); unmountErr != nil {
98+
log.Error().Err(unmountErr).Str("vm_id", vmID).Msg("failed to unmount flist")
99+
}
100+
}()
101+
102+
log.Debug().Str("mount_path", mountPath).Msg("flist mounted successfully")
103+
104+
// Get flist info (kernel, initrd, image paths)
105+
flistInfo, err := getFlistInfo(mountPath)
106+
if err != nil {
107+
errs = append(errs, errors.Wrap(err, "failed to get flist info"))
108+
if err := app.SetFlag(app.VMTestFailed); err != nil {
109+
log.Error().Err(err).Msg("failed to set VM test failed flag")
110+
}
111+
return errs
112+
}
113+
114+
log.Debug().
115+
Bool("is_container", flistInfo.IsContainer()).
116+
Str("kernel", flistInfo.KernelPath).
117+
Str("initrd", flistInfo.InitrdPath).
118+
Str("image", flistInfo.ImagePath).
119+
Msg("flist info retrieved")
120+
121+
// Create VM configuration
122+
vmConfig := pkg.VM{
123+
Name: vmID,
124+
CPU: 1,
125+
Memory: 512 * gridtypes.Megabyte,
126+
Network: pkg.VMNetworkInfo{},
127+
NoKeepAlive: false,
128+
}
129+
130+
// Configure boot based on flist type
131+
if flistInfo.IsContainer() {
132+
// Container mode - boot from virtio-fs
133+
log.Debug().Msg("configuring as container VM")
134+
// Use kernel from cloud-container flist
135+
vmConfig.KernelImage = filepath.Join(cloudImage, "kernel")
136+
vmConfig.InitrdImage = filepath.Join(cloudImage, "initramfs-linux.img")
137+
138+
// Can be overridden from the flist itself if exists
139+
if len(flistInfo.KernelPath) != 0 {
140+
vmConfig.KernelImage = flistInfo.KernelPath
141+
if len(flistInfo.InitrdPath) != 0 {
142+
vmConfig.InitrdImage = flistInfo.InitrdPath
143+
}
144+
}
145+
vmConfig.Boot = pkg.Boot{
146+
Type: pkg.BootVirtioFS,
147+
Path: mountPath,
148+
}
149+
} else {
150+
// VM mode - boot from disk image
151+
log.Debug().Msg("configuring as full VM with disk image")
152+
vmConfig.KernelImage = flistInfo.KernelPath
153+
if len(flistInfo.InitrdPath) != 0 {
154+
vmConfig.InitrdImage = flistInfo.InitrdPath
155+
}
156+
vmConfig.Boot = pkg.Boot{
157+
Type: pkg.BootDisk,
158+
Path: flistInfo.ImagePath,
159+
}
160+
}
161+
162+
log.Debug().Str("vm_id", vmID).Str("boot_path", vmConfig.Boot.Path).Msg("deploying VM")
163+
164+
// Deploy the VM
165+
machineInfo, err := vmd.Run(deployCtx, vmConfig)
166+
if err != nil {
167+
errs = append(errs, errors.Wrap(err, "failed to deploy VM"))
168+
if err := app.SetFlag(app.VMTestFailed); err != nil {
169+
log.Error().Err(err).Msg("failed to set VM test failed flag")
170+
}
171+
return errs
172+
}
173+
174+
log.Debug().
175+
Str("vm_id", vmID).
176+
Str("console_url", machineInfo.ConsoleURL).
177+
Msg("test VM deployed successfully")
178+
179+
// Wait 2 minutes to let the VM run
180+
time.Sleep(30 * time.Second)
181+
182+
// Decommission the VM
183+
log.Debug().Str("vm_id", vmID).Msg("decommissioning test VM")
184+
185+
decommissionCtx, cancelDecommission := context.WithTimeout(ctx, 1*time.Minute)
186+
defer cancelDecommission()
187+
188+
if err := vmd.Delete(decommissionCtx, vmID); err != nil {
189+
errs = append(errs, errors.Wrap(err, "failed to decommission VM"))
190+
if err := app.SetFlag(app.VMTestFailed); err != nil {
191+
log.Error().Err(err).Msg("failed to set VM test failed flag")
192+
}
193+
return errs
194+
}
195+
196+
log.Debug().Str("vm_id", vmID).Msg("test VM decommissioned successfully")
197+
198+
// All checks passed, delete the flag if it was set
199+
if err := app.DeleteFlag(app.VMTestFailed); err != nil {
200+
log.Error().Err(err).Msg("failed to delete VM test failed flag")
201+
}
202+
203+
return errs
204+
}
205+
206+
// getFlistInfo inspects a mounted flist and extracts kernel, initrd, and image paths
207+
func getFlistInfo(flistPath string) (flist FListInfo, err error) {
208+
files := map[string]*string{
209+
"/image.raw": &flist.ImagePath,
210+
"/boot/vmlinuz": &flist.KernelPath,
211+
"/boot/initrd.img": &flist.InitrdPath,
212+
}
213+
214+
for rel, ptr := range files {
215+
path := filepath.Join(flistPath, rel)
216+
217+
stat, err := os.Stat(path)
218+
if os.IsNotExist(err) {
219+
continue
220+
} else if err != nil {
221+
return flist, errors.Wrapf(err, "couldn't stat %s", rel)
222+
}
223+
224+
if stat.IsDir() {
225+
return flist, fmt.Errorf("path '%s' cannot be a directory", rel)
226+
}
227+
228+
mod := stat.Mode()
229+
switch mod.Type() {
230+
case 0:
231+
// regular file, do nothing
232+
case os.ModeSymlink:
233+
// this is a symlink, validate it points inside the flist
234+
link, err := os.Readlink(path)
235+
if err != nil {
236+
return flist, errors.Wrapf(err, "failed to read link at '%s", rel)
237+
}
238+
// the link if joined with path (and cleaned) must point to somewhere under flistPath
239+
abs := filepath.Clean(filepath.Join(flistPath, link))
240+
if !strings.HasPrefix(abs, flistPath) {
241+
return flist, fmt.Errorf("path '%s' points to invalid location", rel)
242+
}
243+
default:
244+
return flist, fmt.Errorf("path '%s' is of invalid type: %s", rel, mod.Type().String())
245+
}
246+
247+
// set the value
248+
*ptr = path
249+
}
250+
251+
return flist, nil
252+
}

pkg/perf/iperf/iperf_task.go

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -29,8 +29,6 @@ const (
2929
maxElapsedTime = 7 * time.Minute
3030
iperfTimeout = 90 * time.Second
3131

32-
errServerBusy = "the server is busy running a test. try again later"
33-
3432
iperf3ServersURL = "https://export.iperf3serverlist.net/listed_iperf3_servers.json"
3533
)
3634

pkg/perf/iperf/iperf_task_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -25,7 +25,7 @@ func TestIperfTest_Run_Success(t *testing.T) {
2525

2626
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
2727
w.WriteHeader(http.StatusOK)
28-
w.Write(serversJSON)
28+
_, _ = w.Write(serversJSON)
2929
}))
3030
defer server.Close()
3131

@@ -139,7 +139,7 @@ func TestIperfTest_Run_NoServersAvailable(t *testing.T) {
139139
// Create mock HTTP server that returns empty list
140140
server := httptest.NewServer(http.HandlerFunc(func(w http.ResponseWriter, r *http.Request) {
141141
w.WriteHeader(http.StatusOK)
142-
w.Write([]byte("[]"))
142+
_, _ = w.Write([]byte("[]"))
143143
}))
144144
defer server.Close()
145145

pkg/vm/monitor.go

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -5,6 +5,7 @@ import (
55
"fmt"
66
"os"
77
"path/filepath"
8+
"strings"
89
"syscall"
910
"time"
1011

@@ -158,6 +159,10 @@ func (m *Module) monitorID(ctx context.Context, running map[string]Process, id s
158159
stub := stubs.NewProvisionStub(m.client)
159160
log := log.With().Str("id", id).Logger()
160161

162+
// skip healthcheck vms
163+
if strings.HasPrefix(id, "healthcheck-vm") {
164+
return nil
165+
}
161166
if ps, ok := running[id]; ok {
162167
state, exists, err := stub.GetWorkloadStatus(ctx, id)
163168
if err != nil {

0 commit comments

Comments
 (0)