Skip to content

Commit f59b0e0

Browse files
add debug commands (#92)
* feat: implement debug endpoints for deployments
* feat: implement debug VM info endpoint
* feat: add debug provisioning health endpoint and related checks
* refactor: add debug commands for deployment retrieval, listing, and VM info
* feat: enhance provisioning health checks with custom system probe support
* refactor: rename debug handlers for consistency and clarity
* refactor: unify deployment handling in debug commands by replacing twin_id and contract_id with a single deployment field
* refactor: streamline debug commands for listing, retrieving, health checking, and info retrieval of deployments
* feat: restructure the network/system/machine health checks
* refactor: improve health check structure and logging in debug commands
* refactor: enhance health check logic to enforce deployment requirement when system probe is not specified
1 parent b53aa00 commit f59b0e0

23 files changed

Lines changed: 984 additions & 3 deletions

pkg/debugcmd/checks/check.go

Lines changed: 62 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,62 @@
1+
package checks
2+
3+
import (
4+
"context"
5+
6+
"github.com/threefoldtech/zosbase/pkg/gridtypes"
7+
"github.com/threefoldtech/zosbase/pkg/gridtypes/zos"
8+
)
9+
10+
type Checker interface {
11+
Name() string
12+
Run(ctx context.Context, data *CheckData) []HealthCheck
13+
}
14+
15+
// HealthCheck is the outcome of a single check. It serializes to JSON with
// lower-case keys; Message and Evidence are omitted when empty.
type HealthCheck struct {
	// Name identifies the check, for example "network.config".
	Name string `json:"name"`
	// OK is true when the check passed.
	OK bool `json:"ok"`
	// Message is a human-readable summary of the outcome.
	Message string `json:"message,omitempty"`
	// Evidence carries free-form details backing the outcome.
	Evidence map[string]interface{} `json:"evidence,omitempty"`
}
21+
22+
type CheckData struct {
23+
Twin uint32
24+
Contract uint64
25+
Workload gridtypes.Workload
26+
VM func(ctx context.Context, id string) bool
27+
Network func(ctx context.Context, id zos.NetID) string
28+
}
29+
30+
func success(name, message string, evidence map[string]interface{}) HealthCheck {
31+
if evidence == nil {
32+
evidence = make(map[string]interface{})
33+
}
34+
return HealthCheck{Name: name, OK: true, Message: message, Evidence: evidence}
35+
}
36+
37+
func failure(name, message string, evidence map[string]interface{}) HealthCheck {
38+
if evidence == nil {
39+
evidence = make(map[string]interface{})
40+
}
41+
return HealthCheck{Name: name, OK: false, Message: message, Evidence: evidence}
42+
}
43+
44+
func IsHealthy(checks []HealthCheck) bool {
45+
for _, check := range checks {
46+
if !check.OK {
47+
return false
48+
}
49+
}
50+
return true
51+
}
52+
53+
func Run(ctx context.Context, workloadType gridtypes.WorkloadType, data *CheckData) []HealthCheck {
54+
switch workloadType {
55+
case zos.NetworkType, zos.NetworkLightType:
56+
return NetworkCheckerInstance.Run(ctx, data)
57+
case zos.ZMachineType, zos.ZMachineLightType:
58+
return VMCheckerInstance.Run(ctx, data)
59+
default:
60+
return nil
61+
}
62+
}

pkg/debugcmd/checks/network.go

Lines changed: 140 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,140 @@
1+
package checks
2+
3+
import (
4+
"context"
5+
"encoding/json"
6+
"fmt"
7+
"os"
8+
"path/filepath"
9+
10+
cnins "github.com/containernetworking/plugins/pkg/ns"
11+
"github.com/threefoldtech/zosbase/pkg"
12+
"github.com/threefoldtech/zosbase/pkg/gridtypes/zos"
13+
"github.com/threefoldtech/zosbase/pkg/network/namespace"
14+
"github.com/threefoldtech/zosbase/pkg/network/nr"
15+
"github.com/threefoldtech/zosbase/pkg/versioned"
16+
"github.com/threefoldtech/zosbase/pkg/zinit"
17+
"github.com/vishvananda/netlink"
18+
)
19+
20+
const (
	// networkdVolatileDir is the root directory the network checks look under.
	networkdVolatileDir = "/var/run/cache/networkd"
	// networksDir is the sub-directory holding one config file per network ID.
	networksDir = "networks"
	// myceliumKeyDir is the sub-directory handed to nr.New as the mycelium key location.
	myceliumKeyDir = "mycelium-key"
)
25+
26+
type NetworkChecker struct {
27+
netID zos.NetID
28+
nsName string
29+
netCfgPath string
30+
nrr *nr.NetResource
31+
}
32+
33+
// Name returns the identifier of this check group.
func (nc *NetworkChecker) Name() string {
	return "network"
}
34+
35+
func (nc *NetworkChecker) Run(ctx context.Context, data *CheckData) []HealthCheck {
36+
netID := zos.NetworkID(data.Twin, data.Workload.Name)
37+
nc.netID = netID
38+
nc.nsName = data.Network(ctx, netID)
39+
nc.netCfgPath = filepath.Join(networkdVolatileDir, networksDir, netID.String())
40+
nc.nrr = nr.New(pkg.Network{NetID: netID}, filepath.Join(networkdVolatileDir, myceliumKeyDir))
41+
42+
return []HealthCheck{
43+
nc.checkConfig(),
44+
nc.checkNamespace(),
45+
nc.checkInterfaces(),
46+
nc.checkBridge(),
47+
nc.checkMycelium(),
48+
}
49+
}
50+
51+
func (nc *NetworkChecker) checkConfig() HealthCheck {
52+
_, raw, err := versioned.ReadFile(nc.netCfgPath)
53+
if err != nil {
54+
return failure("network.config", fmt.Sprintf("config file not found: %v", err), map[string]interface{}{"path": nc.netCfgPath, "netid": nc.netID.String()})
55+
}
56+
57+
var netCfg pkg.Network
58+
if err := json.Unmarshal(raw, &netCfg); err != nil {
59+
return failure("network.config", fmt.Sprintf("config file invalid: %v", err), map[string]interface{}{"path": nc.netCfgPath, "netid": nc.netID.String()})
60+
}
61+
62+
if netCfg.NetID != nc.netID {
63+
return failure("network.config", fmt.Sprintf("netid mismatch: expected %s, got %s", nc.netID.String(), netCfg.NetID.String()), map[string]interface{}{"expected": nc.netID.String(), "got": netCfg.NetID.String()})
64+
}
65+
66+
return success("network.config", "config valid", map[string]interface{}{"path": nc.netCfgPath, "netid": nc.netID.String()})
67+
}
68+
69+
func (nc *NetworkChecker) checkNamespace() HealthCheck {
70+
if !namespace.Exists(nc.nsName) {
71+
return failure("network.namespace", "namespace not found", map[string]interface{}{"namespace": nc.nsName})
72+
}
73+
return success("network.namespace", "namespace exists", map[string]interface{}{"namespace": nc.nsName})
74+
}
75+
76+
func (nc *NetworkChecker) checkInterfaces() HealthCheck {
77+
wgIface, _ := nc.nrr.WGName()
78+
nrIface, _ := nc.nrr.NRIface()
79+
pubIface := "public"
80+
81+
netnsLinks := map[string]struct{}{}
82+
if netNS, err := namespace.GetByName(nc.nsName); err == nil {
83+
_ = netNS.Do(func(_ cnins.NetNS) error {
84+
links, err := netlink.LinkList()
85+
if err == nil {
86+
for _, l := range links {
87+
netnsLinks[l.Attrs().Name] = struct{}{}
88+
}
89+
}
90+
return nil
91+
})
92+
netNS.Close()
93+
}
94+
95+
missing := []string{}
96+
for _, iface := range []string{wgIface, nrIface, pubIface} {
97+
if _, ok := netnsLinks[iface]; !ok {
98+
missing = append(missing, iface)
99+
}
100+
}
101+
102+
if len(missing) > 0 {
103+
return failure("network.interfaces", fmt.Sprintf("missing interfaces: %v", missing), map[string]interface{}{"namespace": nc.nsName, "missing": missing})
104+
}
105+
106+
return success("network.interfaces", "all required interfaces present", map[string]interface{}{"namespace": nc.nsName})
107+
}
108+
109+
func (nc *NetworkChecker) checkBridge() HealthCheck {
110+
brName, _ := nc.nrr.BridgeName()
111+
brPath := filepath.Join("/sys/class/net", brName)
112+
113+
if _, err := os.Stat(brPath); err != nil {
114+
return failure("network.bridge", fmt.Sprintf("bridge not found: %v", err), map[string]interface{}{"bridge": brName})
115+
}
116+
117+
brifDir := filepath.Join(brPath, "brif")
118+
ents, err := os.ReadDir(brifDir)
119+
if err != nil || len(ents) == 0 {
120+
return failure("network.bridge", fmt.Sprintf("bridge has no members: %v", err), map[string]interface{}{"bridge": brName})
121+
}
122+
123+
return success("network.bridge", "bridge has members", map[string]interface{}{"bridge": brName})
124+
}
125+
126+
func (nc *NetworkChecker) checkMycelium() HealthCheck {
127+
service := nc.nrr.MyceliumServiceName()
128+
st, err := zinit.Default().Status(service)
129+
if err != nil {
130+
return failure("network.mycelium", fmt.Sprintf("cannot get service status: %v", err), map[string]interface{}{"service": service})
131+
}
132+
133+
if !st.State.Is(zinit.ServiceStateRunning) {
134+
return failure("network.mycelium", fmt.Sprintf("service not running: %s", st.State.String()), map[string]interface{}{"service": service, "state": st.State.String()})
135+
}
136+
137+
return success("network.mycelium", "service running", map[string]interface{}{"service": service, "pid": st.Pid})
138+
}
139+
140+
// NetworkCheckerInstance is the shared NetworkChecker used by the package-level Run.
var NetworkCheckerInstance = new(NetworkChecker)

pkg/debugcmd/checks/system.go

Lines changed: 46 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,46 @@
1+
package checks
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"os/exec"
7+
"strings"
8+
"time"
9+
)
10+
11+
// systemProbeTimeout bounds how long a system probe command may run.
const systemProbeTimeout = time.Minute
12+
13+
// SystemChecker runs a caller-supplied command as a system probe.
type SystemChecker struct {
	// command is the probe command line; Run splits it on whitespace.
	command string
}
16+
17+
// Name returns the identifier of this check group.
func (sc *SystemChecker) Name() string {
	return "system"
}
18+
19+
func (sc *SystemChecker) Run(ctx context.Context, data *CheckData) []HealthCheck {
20+
if sc.command == "" {
21+
return nil
22+
}
23+
24+
parts := strings.Fields(sc.command)
25+
if len(parts) == 0 {
26+
return []HealthCheck{failure("system.probe", "empty probe command", nil)}
27+
}
28+
29+
probeCtx, cancel := context.WithTimeout(ctx, systemProbeTimeout)
30+
defer cancel()
31+
32+
cmd := exec.CommandContext(probeCtx, parts[0], parts[1:]...)
33+
output, err := cmd.CombinedOutput()
34+
if err != nil {
35+
return []HealthCheck{failure("system.probe", fmt.Sprintf("probe failed: %v", err), map[string]interface{}{"error": err.Error()})}
36+
}
37+
38+
return []HealthCheck{success("system.probe", "probe executed successfully", map[string]interface{}{
39+
"output": string(output),
40+
"exit_code": cmd.ProcessState.ExitCode(),
41+
})}
42+
}
43+
44+
func NewSystemChecker(command string) *SystemChecker {
45+
return &SystemChecker{command: command}
46+
}

pkg/debugcmd/checks/vm.go

Lines changed: 120 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
package checks
2+
3+
import (
4+
"context"
5+
"fmt"
6+
"os"
7+
"path/filepath"
8+
9+
"github.com/threefoldtech/zosbase/pkg/gridtypes"
10+
"github.com/threefoldtech/zosbase/pkg/vm"
11+
)
12+
13+
// vmdVolatileDir is the directory in which the VM checks look for a config
// file named after the workload ID.
const vmdVolatileDir = "/var/run/cache/vmd"
14+
15+
type VMChecker struct {
16+
workloadID gridtypes.WorkloadID
17+
vmID string
18+
cfgPath string
19+
machine *vm.Machine
20+
vmExists func(ctx context.Context, id string) bool
21+
}
22+
23+
// Name returns the identifier of this check group.
func (vc *VMChecker) Name() string {
	return "vm"
}
24+
25+
func (vc *VMChecker) Run(ctx context.Context, data *CheckData) []HealthCheck {
26+
workloadID, err := gridtypes.NewWorkloadID(data.Twin, data.Contract, data.Workload.Name)
27+
if err != nil {
28+
return []HealthCheck{failure("vm.init", fmt.Sprintf("invalid workload ID: %v", err), nil)}
29+
}
30+
31+
vc.workloadID = workloadID
32+
vc.vmID = workloadID.String()
33+
vc.cfgPath = filepath.Join(vmdVolatileDir, workloadID.String())
34+
vc.vmExists = data.VM
35+
36+
return []HealthCheck{
37+
vc.checkConfig(),
38+
vc.checkVMD(ctx),
39+
vc.checkProcess(),
40+
vc.checkDisks(),
41+
vc.checkVirtioFS(),
42+
}
43+
}
44+
45+
func (vc *VMChecker) loadMachine() (*vm.Machine, error) {
46+
if vc.machine != nil {
47+
return vc.machine, nil
48+
}
49+
machine, err := vm.MachineFromFile(vc.cfgPath)
50+
if err != nil {
51+
return nil, err
52+
}
53+
vc.machine = machine
54+
return machine, nil
55+
}
56+
57+
func (vc *VMChecker) checkConfig() HealthCheck {
58+
if _, err := os.Stat(vc.cfgPath); err != nil {
59+
return failure("vm.config", fmt.Sprintf("config file not found: %v", err), map[string]interface{}{"path": vc.cfgPath})
60+
}
61+
if _, err := vm.MachineFromFile(vc.cfgPath); err != nil {
62+
return failure("vm.config", fmt.Sprintf("config file invalid: %v", err), map[string]interface{}{"path": vc.cfgPath})
63+
}
64+
return success("vm.config", "config valid", map[string]interface{}{"path": vc.cfgPath, "vm_id": vc.vmID})
65+
}
66+
67+
func (vc *VMChecker) checkVMD(ctx context.Context) HealthCheck {
68+
if !vc.vmExists(ctx, vc.vmID) {
69+
return failure("vm.vmd", "vmd reports VM does not exist", map[string]interface{}{"vm_id": vc.vmID})
70+
}
71+
return success("vm.vmd", "vmd reports VM exists", map[string]interface{}{"vm_id": vc.vmID})
72+
}
73+
74+
func (vc *VMChecker) checkProcess() HealthCheck {
75+
ps, err := vm.Find(vc.vmID)
76+
if err != nil {
77+
return failure("vm.process", fmt.Sprintf("process not found: %v", err), map[string]interface{}{"vm_id": vc.vmID})
78+
}
79+
return success("vm.process", "process running", map[string]interface{}{"vm_id": vc.vmID, "pid": ps.Pid})
80+
}
81+
82+
func (vc *VMChecker) checkDisks() HealthCheck {
83+
machine, err := vc.loadMachine()
84+
if err != nil {
85+
return failure("vm.disks", "config not available", map[string]interface{}{"vm_id": vc.vmID})
86+
}
87+
88+
for _, disk := range machine.Disks {
89+
if disk.Path == "" {
90+
continue
91+
}
92+
if _, err := os.Stat(disk.Path); err != nil {
93+
return failure("vm.disks", fmt.Sprintf("disk missing: %s", disk.Path), map[string]interface{}{"path": disk.Path, "vm_id": vc.vmID})
94+
}
95+
}
96+
97+
// TODO: check for files on disks?
98+
99+
return success("vm.disks", "all disks valid", map[string]interface{}{"vm_id": vc.vmID})
100+
}
101+
102+
func (vc *VMChecker) checkVirtioFS() HealthCheck {
103+
machine, err := vc.loadMachine()
104+
if err != nil {
105+
return failure("vm.virtiofs", fmt.Sprintf("config unavailable: %v", err), map[string]interface{}{"vm_id": vc.vmID})
106+
}
107+
108+
for i := range machine.FS {
109+
sock := filepath.Join("/var/run", fmt.Sprintf("virtio-%s-%d.socket", vc.vmID, i))
110+
if _, err := os.Stat(sock); err != nil {
111+
return failure("vm.virtiofs", fmt.Sprintf("socket missing: %s", sock), map[string]interface{}{"socket": sock, "vm_id": vc.vmID})
112+
}
113+
}
114+
115+
return success("vm.virtiofs", "all virtiofs sockets present", map[string]interface{}{"vm_id": vc.vmID})
116+
}
117+
118+
// TODO: add cloud-console check
119+
120+
// VMCheckerInstance is the shared VMChecker used by the package-level Run.
var VMCheckerInstance = new(VMChecker)

0 commit comments

Comments
 (0)