Skip to content

Commit e24941d

Browse files
committed
fix: use 5-minute load average for resource alerts to reduce false positives
The 1-minute load average spikes during normal build/test runs, triggering false "critically low resources" alerts. The 5-minute average smooths transient spikes while still catching sustained resource pressure.

Spec: specs/load-alert-5min-average.md

Signed-off-by: Jose Alekhinne <jose@parlakisik.com>
1 parent 685829a commit e24941d

4 files changed

Lines changed: 26 additions & 7 deletions

File tree

internal/cli/doctor/doctor_test.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -403,7 +403,7 @@ func TestAddResourceResults_DangerMapsToError(t *testing.T) {
403403
Supported: true,
404404
},
405405
Load: sysinfo.LoadInfo{
406-
Load1: 12.0,
406+
Load5: 12.0,
407407
NumCPU: 8, // 1.5x -> DANGER
408408
Supported: true,
409409
},
@@ -479,7 +479,7 @@ func TestAddResourceResults_MessageFormat(t *testing.T) {
479479
},
480480
Disk: sysinfo.DiskInfo{Supported: false},
481481
Load: sysinfo.LoadInfo{
482-
Load1: 2.0,
482+
Load5: 2.0,
483483
NumCPU: 8,
484484
Supported: true,
485485
},

internal/sysinfo/threshold.go

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -96,9 +96,9 @@ func Evaluate(snap Snapshot) []ResourceAlert {
9696
}
9797
}
9898

99-
// Load (1m)
99+
// Load (5m) — 5-minute average smooths transient build/test spikes.
100100
if snap.Load.Supported && snap.Load.NumCPU > 0 {
101-
ratio := snap.Load.Load1 / float64(snap.Load.NumCPU)
101+
ratio := snap.Load.Load5 / float64(snap.Load.NumCPU)
102102
msg := fmt.Sprintf(desc.Text(text.DescKeyResourcesAlertLoad), ratio)
103103
if ratio >= stats.ThresholdLoadDangerRatio {
104104
alerts = append(alerts, ResourceAlert{

internal/sysinfo/threshold_test.go

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -122,7 +122,7 @@ func TestEvaluate_DiskBoundaries(t *testing.T) {
122122
func TestEvaluate_LoadBoundaries(t *testing.T) {
123123
tests := []struct {
124124
name string
125-
load1 float64
125+
load5 float64
126126
ncpu int
127127
wantSev Severity
128128
wantN int
@@ -137,7 +137,7 @@ func TestEvaluate_LoadBoundaries(t *testing.T) {
137137
t.Run(tt.name, func(t *testing.T) {
138138
snap := Snapshot{
139139
Load: LoadInfo{
140-
Load1: tt.load1,
140+
Load5: tt.load5,
141141
NumCPU: tt.ncpu,
142142
Supported: true,
143143
},
@@ -171,7 +171,7 @@ func TestEvaluate_AllDanger(t *testing.T) {
171171
Supported: true,
172172
},
173173
Load: LoadInfo{
174-
Load1: 12.0,
174+
Load5: 12.0,
175175
NumCPU: 8,
176176
Supported: true,
177177
},

specs/load-alert-5min-average.md

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
# Load Alert: Use 5-Minute Average
2+
3+
## Problem
4+
5+
The resource load alert uses the 1-minute load average, which spikes
6+
during normal build/test runs and produces false positives. A single
7+
`make test` run triggers "critically low resources" even when the
8+
system is healthy.
9+
10+
## Approach
11+
12+
Switch from `Load1` to `Load5` in `sysinfo.Evaluate`. The 5-minute
13+
average smooths transient build/test spikes while still catching
14+
sustained resource pressure.
15+
16+
## Non-Goals
17+
18+
- Changing the load-per-CPU threshold ratios (0.8 warn, 1.5 danger)
19+
- Adding configurable averaging windows

0 commit comments

Comments
 (0)