Skip to content

Commit 9a9b2bf

Browse files
committed
gather-info: implement CX feedback phases 1-4
Phase 1 — Services bug fix + transfer commands: - Batch D-Bus service status via ListUnitsByNamesContext (one call, existence-gated) - Distro-aware service name resolution (sshd→ssh on Debian) - Fabricmanager false-positive handling (tri-state NVSwitch detection) - OpenStack floating IP detection via metadata service - SCP transfer commands with default-route annotation (netlink.RouteGet) Phase 2 — Severity expansion + collection modes: - Sentinel SeverityUnspecified=0, SeverityInfo=1, explicit integer values - Hidden field on Issue for SUMMARY.txt visibility control - JSON-consistent severity serialization (MarshalJSON/UnmarshalJSON) - Collection modes: --mode=safe|quick|standard|deep with CLI flag override - DCGM level-2 diag gated behind --enable-active-gpu-diag Phase 3-4 — Triage analysis + output: - Xid classifier with 24-code NVIDIA catalog, Xid 154 dynamic severity - Firewall posture detection (iptables/ufw/nftables/firewalld) - Critical log extraction with hidden low-confidence findings - Triage output in triage/ (human) + triage/_data/ (JSON) - Synthetic triage collector result in manifest/report UI + branding: - Rebrand NexGenCloud → Hyperstack (magenta/light-magenta) - Dynamic SystemInfo box widths, rounded Unicode borders - Unified transfer box with live spinner feedback - Schema version bump 1.0.0 → 1.1.0
1 parent 924b27f commit 9a9b2bf

38 files changed

Lines changed: 3570 additions & 182 deletions
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1 +1,2 @@
11
bin/
2+
gather-info

customers/vm-troubleshooting/CODEMAP.md

Lines changed: 14 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -91,6 +91,18 @@ Keep this file updated in the same change as architecture or collector changes.
9191
- Does NOT use `apt-key` or `add-apt-repository` (conflicts with cuda-keyring).
9292
- Must remain explicit, interactive, and non-fatal to the overall diagnostics run.
9393

94+
### `internal/transfer/`
95+
- Owns SCP/transfer command generation for the diagnostic archive.
96+
- Discovers IPs via `net.Interfaces()`, detects default route via `netlink.RouteGet`, detects SSH username via `SUDO_USER`/`os/user`.
97+
- Template-first output: editable placeholders always first, discovered IPs as hints with default-route annotation (★).
98+
- `transfer_commands.txt` saved in archive; rendered in styled Unicode boxes on terminal.
99+
100+
### `internal/triage/`
101+
- Owns post-collection analysis: Xid classification, firewall posture detection, critical log extraction.
102+
- Runs after collectors, before summary/manifest generation. Findings injected as a synthetic "triage" collector result.
103+
- Output: `triage/*.txt` (human) + `triage/_data/*.json` (machine). Hidden findings omitted from SUMMARY.txt but present in manifest/report.
104+
- Xid catalog: 24 datacenter-relevant codes from NVIDIA r590, with Xid 154 dynamic severity from recovery action text.
105+
94106
### `internal/collector/`
95107
- Owns domain collectors and shared collector helpers.
96108
- Keep collectors narrow and artifact-oriented.
@@ -101,10 +113,10 @@ Keep this file updated in the same change as architecture or collector changes.
101113
|---|---|---|---|
102114
| `SystemCollector` | `system/`, `hardware/`, `processes/` | Mixed | Native summary + command artifacts for support familiarity |
103115
| `NetworkCollector` | `network/` | Mixed | Native link/route/neigh probes plus network stack command/config artifacts |
104-
| `NvidiaCollector` | `nvidia/` | No | `nvidia-smi`, `dmesg`, `systemctl` |
116+
| `NvidiaCollector` | `nvidia/` | No | `nvidia-smi`, `dmesg`, `systemctl`; sets `nvswitch_present` fact from lspci |
105117
| `DcgmCollector` | `dcgm/` | No | `dcgmi` discovery, health, stats, and level 2 diag (1-min cap) |
106118
| `DockerCollector` | `docker/` | Mostly shell | Docker CLI plus Go sanitization |
107-
| `ServicesCollector` | `services/` | Mixed | `go-systemd` facts plus `systemctl status` artifacts |
119+
| `ServicesCollector` | `services/` | Mixed | Batch D-Bus resolution (`ListUnitsByNamesContext`), distro-aware SSH name (`sshd` → `ssh` on Debian), existence-gated per-service artifacts, fabricmanager false-positive handling (tri-state NVSwitch detection) |
108120
| `JournalCollector` | `logs/` | No | `journalctl`/`dmesg` are authoritative |
109121
| `PackagesCollector` | `packages/` | No | Package managers remain distro authority |
110122
| `AdditionalCollector` | `system/`, `hardware/` | Mixed | Limits, sysctl, LVM, sensors, mounts |

customers/vm-troubleshooting/internal/cli/root.go

Lines changed: 15 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -2,7 +2,6 @@ package cli
22

33
import (
44
"context"
5-
"fmt"
65
"os"
76
"os/signal"
87
"syscall"
@@ -12,27 +11,34 @@ import (
1211
"github.com/NexGenCloud/vm-diagnostics/internal/runner"
1312
"github.com/NexGenCloud/vm-diagnostics/internal/ui"
1413
"github.com/spf13/cobra"
14+
"github.com/spf13/pflag"
1515
)
1616

1717
func NewRootCmd() *cobra.Command {
1818
cfg := &config.Config{OutputDir: "/tmp", JournalSince: "boot"}
1919
cmd := &cobra.Command{
2020
Use: "gather-info",
21-
Short: "Collect VM diagnostics for NexGenCloud support",
21+
Short: "Collect VM diagnostics for Hyperstack support",
2222
Version: config.Version,
2323
SilenceUsage: true,
2424
SilenceErrors: true,
2525
RunE: func(cmd *cobra.Command, _ []string) error {
26+
// Collect explicitly-set flags before applying mode
27+
explicit := make(map[string]bool)
28+
cmd.Flags().Visit(func(f *pflag.Flag) {
29+
explicit[f.Name] = true
30+
})
31+
if err := cfg.ApplyMode(explicit); err != nil {
32+
return err
33+
}
34+
2635
uiImpl := ui.New(cfg.Verbosity, cfg.NonInteractive)
2736

28-
// First signal: cancel context (graceful shutdown).
29-
// After the first signal, stop() resets signal handling to default
30-
// so a second Ctrl+C immediately kills the process.
3137
ctx, stop := signal.NotifyContext(context.Background(), os.Interrupt, syscall.SIGTERM)
32-
uiImpl.SetDone(ctx.Done()) // unblocks interactive prompts on Ctrl+C
38+
uiImpl.SetDone(ctx.Done())
3339
go func() {
3440
<-ctx.Done()
35-
stop() // reset to default: next signal = immediate kill
41+
stop()
3642
uiImpl.Warn("Interrupted. Cleaning up... (press Ctrl+C again to force quit)")
3743
}()
3844

@@ -45,7 +51,6 @@ func NewRootCmd() *cobra.Command {
4551
}
4652
return exitError{code: config.ExitFatal, err: err}
4753
}
48-
fmt.Fprintln(os.Stdout, res.ArchivePath)
4954
if res.ExitCode != config.ExitSuccess {
5055
return exitError{code: res.ExitCode}
5156
}
@@ -55,11 +60,13 @@ func NewRootCmd() *cobra.Command {
5560
flags := cmd.Flags()
5661
flags.StringVarP(&cfg.OutputDir, "output-dir", "o", cfg.OutputDir, "Output directory for the archive")
5762
flags.StringVarP(&cfg.JournalSince, "journal-since", "j", cfg.JournalSince, "Collect journal logs since TIMESPEC")
63+
flags.StringVarP(&cfg.Mode, "mode", "m", "", "Collection mode: safe, quick, standard, deep")
5864
flags.BoolVar(&cfg.SkipNvidia, "skip-nvidia", false, "Skip NVIDIA diagnostics")
5965
flags.BoolVar(&cfg.SkipDocker, "skip-docker", false, "Skip Docker diagnostics")
6066
flags.BoolVar(&cfg.SkipDCGM, "skip-dcgm", false, "Skip DCGM diagnostics")
6167
flags.BoolVar(&cfg.SkipJournal, "skip-journal", false, "Skip journal collection")
6268
flags.BoolVar(&cfg.SkipServices, "skip-services", false, "Skip service collection")
69+
flags.BoolVar(&cfg.SkipPackages, "skip-packages", false, "Skip package collection")
6370
flags.BoolVar(&cfg.IncludeFullJournal, "include-full-journal", false, "Include full journal dump")
6471
flags.BoolVar(&cfg.IncludeContainerLogs, "include-container-logs", false, "Include container logs")
6572
flags.BoolVar(&cfg.EnableActiveGPUDiag, "enable-active-gpu-diag", false, "Enable active DCGM GPU diagnostics")

customers/vm-troubleshooting/internal/cli/root_test.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ func TestRootCmdHelpIncludesCoreFlags(t *testing.T) {
2121
t.Fatalf("help failed: %v", err)
2222
}
2323
help := stdout.String()
24-
for _, flag := range []string{"--output-dir", "--journal-since", "--skip-nvidia", "--skip-docker", "--skip-dcgm", "--skip-journal", "--skip-services", "--include-full-journal", "--include-container-logs", "--enable-active-gpu-diag", "--non-interactive", "--verbose"} {
24+
for _, flag := range []string{"--output-dir", "--journal-since", "--mode", "--skip-nvidia", "--skip-docker", "--skip-dcgm", "--skip-journal", "--skip-services", "--skip-packages", "--include-full-journal", "--include-container-logs", "--enable-active-gpu-diag", "--non-interactive", "--verbose"} {
2525
if !strings.Contains(help, flag) {
2626
t.Fatalf("help missing %s\n%s", flag, help)
2727
}

customers/vm-troubleshooting/internal/collector/collector.go

Lines changed: 50 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -4,6 +4,7 @@ import (
44
"context"
55
"fmt"
66
"sort"
7+
"strings"
78
"time"
89

910
"github.com/NexGenCloud/vm-diagnostics/internal/ui"
@@ -43,7 +44,7 @@ var ValidTags = map[string]bool{
4344
"docker": true, "docker-security": true,
4445
"services": true, "journal": true, "oom": true,
4546
"packages": true, "storage": true, "infiniband": true,
46-
"processes": true, "config": true,
47+
"processes": true, "config": true, "triage": true,
4748
}
4849

4950
// ValidParserHints is the controlled vocabulary for parser hints.
@@ -172,13 +173,16 @@ type Issue struct {
172173
Severity Severity
173174
Category string
174175
Message string
176+
Hidden bool // default false = shown in SUMMARY.txt
175177
}
176178

177179
type Severity int
178180

179181
const (
180-
SeverityWarning Severity = iota
181-
SeverityCritical
182+
SeverityUnspecified Severity = 0 // sentinel — catches uninitialized Issue{}
183+
SeverityInfo Severity = 1
184+
SeverityWarning Severity = 2
185+
SeverityCritical Severity = 3
182186
)
183187

184188
type Registry struct{ collectors []Collector }
@@ -239,12 +243,52 @@ func (r *Registry) RunAll(ctx context.Context, skip map[string]bool, u ui.UI) ([
239243
return results, nil
240244
}
241245

242-
// SeverityString returns the string representation of a severity level.
243-
func SeverityString(s Severity) string {
246+
// String returns the string representation of a severity level.
247+
func (s Severity) String() string {
244248
switch s {
249+
case SeverityInfo:
250+
return "info"
251+
case SeverityWarning:
252+
return "warning"
245253
case SeverityCritical:
246254
return "critical"
247255
default:
248-
return "warning"
256+
return "unspecified"
257+
}
258+
}
259+
260+
// ParseSeverity converts a string to Severity. Returns error on unknown input.
261+
func ParseSeverity(s string) (Severity, error) {
262+
switch s {
263+
case "info":
264+
return SeverityInfo, nil
265+
case "warning":
266+
return SeverityWarning, nil
267+
case "critical":
268+
return SeverityCritical, nil
269+
default:
270+
return SeverityUnspecified, fmt.Errorf("unknown severity %q", s)
249271
}
250272
}
273+
274+
// MarshalJSON serializes Severity as a JSON string (e.g. "info", "warning", "critical").
275+
// This ensures consistent serialization across all JSON outputs (manifest, report, triage).
276+
func (s Severity) MarshalJSON() ([]byte, error) {
277+
return []byte(`"` + s.String() + `"`), nil
278+
}
279+
280+
// UnmarshalJSON deserializes a JSON string to Severity.
281+
func (s *Severity) UnmarshalJSON(data []byte) error {
282+
str := strings.Trim(string(data), `"`)
283+
parsed, err := ParseSeverity(str)
284+
if err != nil {
285+
return err
286+
}
287+
*s = parsed
288+
return nil
289+
}
290+
291+
// Valid reports whether the severity is a known non-sentinel value.
292+
func (s Severity) Valid() bool {
293+
return s >= SeverityInfo && s <= SeverityCritical
294+
}

0 commit comments

Comments
 (0)