Skip to content

Commit 00575f6

Browse files
committed
device-health-oracle: address code review findings
- Close ClickHouse client on shutdown (defer chClient.Close()) - Validate database name against [a-zA-Z0-9_-]+ to prevent injection - Use status.IsDrained() instead of == for forward compatibility - Pass tick timestamp through BurnInTimes.Now instead of per-device time.Now() calls - Add comment explaining 2-tick minimum for ReadyForUsers progression - Remove obvious comments per CLAUDE.md guidelines - Add test for expectedMinutes == 0 edge case (new environment)
1 parent ade95e5 commit 00575f6

6 files changed

Lines changed: 44 additions & 16 deletions

File tree

controlplane/device-health-oracle/cmd/device-health-oracle/main.go

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -147,6 +147,7 @@ func main() {
147147
if err != nil {
148148
log.Warn("ClickHouse connection failed, continuing without controller_success criterion", "addr", chAddr, "error", err)
149149
} else {
150+
defer chClient.Close()
150151
log.Info("ClickHouse enabled", "addr", chAddr, "db", chDB, "user", chUser, "tls", !chTLSDisabled)
151152
controllerSuccess := worker.NewControllerSuccessCriterion(chClient, log)
152153
deviceCriteria = append(deviceCriteria, controllerSuccess)

controlplane/device-health-oracle/internal/worker/clickhouse.go

Lines changed: 7 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -4,12 +4,15 @@ import (
44
"context"
55
"crypto/tls"
66
"fmt"
7+
"regexp"
78
"strings"
89
"time"
910

1011
"github.com/ClickHouse/clickhouse-go/v2"
1112
)
1213

14+
var validDBName = regexp.MustCompile(`^[a-zA-Z0-9_-]+$`)
15+
1316
// ControllerCallChecker queries ClickHouse for controller call records.
1417
type ControllerCallChecker interface {
1518
ControllerCallCoverage(ctx context.Context, devicePubkey string, start, end time.Time) (minutesWithCalls int64, err error)
@@ -22,8 +25,11 @@ type ClickHouseClient struct {
2225
db string
2326
}
2427

25-
// NewClickHouseClient creates a new ClickHouse client connection.
2628
func NewClickHouseClient(addr, db, user, pass string, disableTLS bool) (*ClickHouseClient, error) {
29+
if !validDBName.MatchString(db) {
30+
return nil, fmt.Errorf("invalid clickhouse database name: %q", db)
31+
}
32+
2733
addr = strings.TrimPrefix(addr, "https://")
2834
addr = strings.TrimPrefix(addr, "http://")
2935

@@ -74,7 +80,6 @@ func (c *ClickHouseClient) ControllerCallCoverage(ctx context.Context, devicePub
7480
return int64(minutesWithCalls), nil
7581
}
7682

77-
// Close closes the ClickHouse connection.
7883
func (c *ClickHouseClient) Close() error {
7984
return c.conn.Close()
8085
}

controlplane/device-health-oracle/internal/worker/controller_success.go

Lines changed: 2 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,6 @@ import (
44
"context"
55
"fmt"
66
"log/slog"
7-
"time"
87

98
"github.com/gagliardetto/solana-go"
109
"github.com/malbeclabs/doublezero/smartcontract/sdk/go/serviceability"
@@ -21,7 +20,6 @@ type ControllerSuccessCriterion struct {
2120
log *slog.Logger
2221
}
2322

24-
// NewControllerSuccessCriterion creates a new ControllerSuccessCriterion.
2523
func NewControllerSuccessCriterion(checker ControllerCallChecker, log *slog.Logger) *ControllerSuccessCriterion {
2624
return &ControllerSuccessCriterion{
2725
checker: checker,
@@ -34,7 +32,7 @@ func (c *ControllerSuccessCriterion) Name() string {
3432
}
3533

3634
func (c *ControllerSuccessCriterion) Check(ctx context.Context, device serviceability.Device) (bool, string) {
37-
start, expectedMinutes, ok := DeviceBurnIn(ctx, device.Status)
35+
start, now, expectedMinutes, ok := DeviceBurnIn(ctx, device.Status)
3836
if !ok {
3937
return false, "burn-in times not available in context"
4038
}
@@ -43,7 +41,7 @@ func (c *ControllerSuccessCriterion) Check(ctx context.Context, device serviceab
4341
}
4442

4543
pubkey := solana.PublicKeyFromBytes(device.PubKey[:]).String()
46-
minutesWithCalls, err := c.checker.ControllerCallCoverage(ctx, pubkey, start, time.Now())
44+
minutesWithCalls, err := c.checker.ControllerCallCoverage(ctx, pubkey, start, now)
4745
if err != nil {
4846
c.log.Error("Failed to query controller call coverage",
4947
"device", pubkey, "code", device.Code, "error", err)

controlplane/device-health-oracle/internal/worker/controller_success_test.go

Lines changed: 24 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ func TestControllerSuccessCriterion_Passes(t *testing.T) {
3737
start := now.Add(-33 * time.Minute)
3838
ctx := ContextWithBurnInTimes(context.Background(), BurnInTimes{
3939
DrainedStart: start,
40+
Now: now,
4041
})
4142

4243
checker := &mockControllerCallChecker{minutesWithCalls: 33}
@@ -53,6 +54,7 @@ func TestControllerSuccessCriterion_Fails_InsufficientCoverage(t *testing.T) {
5354
start := now.Add(-33 * time.Minute)
5455
ctx := ContextWithBurnInTimes(context.Background(), BurnInTimes{
5556
DrainedStart: start,
57+
Now: now,
5658
})
5759

5860
checker := &mockControllerCallChecker{minutesWithCalls: 20}
@@ -69,6 +71,7 @@ func TestControllerSuccessCriterion_Fails_ClickHouseError(t *testing.T) {
6971
start := now.Add(-1 * time.Hour)
7072
ctx := ContextWithBurnInTimes(context.Background(), BurnInTimes{
7173
ProvisioningStart: start,
74+
Now: now,
7275
})
7376

7477
checker := &mockControllerCallChecker{err: errors.New("connection refused")}
@@ -95,6 +98,7 @@ func TestControllerSuccessCriterion_UsesProvisioningStart(t *testing.T) {
9598
ctx := ContextWithBurnInTimes(context.Background(), BurnInTimes{
9699
ProvisioningStart: now.Add(-60 * time.Minute),
97100
DrainedStart: now.Add(-10 * time.Minute),
101+
Now: now,
98102
})
99103

100104
// Provide enough coverage for provisioning (60 min) but check that
@@ -112,6 +116,7 @@ func TestControllerSuccessCriterion_UsesDrainedStart(t *testing.T) {
112116
ctx := ContextWithBurnInTimes(context.Background(), BurnInTimes{
113117
ProvisioningStart: now.Add(-60 * time.Minute),
114118
DrainedStart: now.Add(-10 * time.Minute),
119+
Now: now,
115120
})
116121

117122
// Only 10 minutes of coverage — enough for drained but not provisioning.
@@ -122,3 +127,22 @@ func TestControllerSuccessCriterion_UsesDrainedStart(t *testing.T) {
122127
passed, _ := c.Check(ctx, device)
123128
assert.True(t, passed)
124129
}
130+
131+
func TestControllerSuccessCriterion_ZeroBurnIn_Passes(t *testing.T) {
132+
// When burn-in start == now (zero-length window), the criterion should pass
133+
// without querying ClickHouse. This happens in newly created environments
134+
// where the burn-in slot is 0.
135+
now := time.Now()
136+
ctx := ContextWithBurnInTimes(context.Background(), BurnInTimes{
137+
ProvisioningStart: now,
138+
Now: now,
139+
})
140+
141+
checker := &mockControllerCallChecker{err: errors.New("should not be called")}
142+
c := NewControllerSuccessCriterion(checker, testLoggerSlog())
143+
144+
device := serviceability.Device{Status: serviceability.DeviceStatusDeviceProvisioning}
145+
passed, reason := c.Check(ctx, device)
146+
assert.True(t, passed)
147+
assert.Empty(t, reason)
148+
}

controlplane/device-health-oracle/internal/worker/criteria.go

Lines changed: 9 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ type burnInTimesKey struct{}
1515
type BurnInTimes struct {
1616
ProvisioningStart time.Time
1717
DrainedStart time.Time
18+
Now time.Time // tick timestamp, used as the end of the burn-in window
1819
}
1920

2021
// ContextWithBurnInTimes returns a new context carrying the given BurnInTimes.
@@ -26,20 +27,17 @@ func ContextWithBurnInTimes(ctx context.Context, times BurnInTimes) context.Cont
2627
// start time and expected number of minutes for the given device status.
2728
// Returns ok=false if the context has no BurnInTimes, and expectedMinutes=0
2829
// when the burn-in window has zero length (e.g. a newly created environment).
29-
func DeviceBurnIn(ctx context.Context, status serviceability.DeviceStatus) (start time.Time, expectedMinutes int64, ok bool) {
30+
func DeviceBurnIn(ctx context.Context, status serviceability.DeviceStatus) (start time.Time, now time.Time, expectedMinutes int64, ok bool) {
3031
burnIn, ok := ctx.Value(burnInTimesKey{}).(BurnInTimes)
3132
if !ok {
32-
return time.Time{}, 0, false
33+
return time.Time{}, time.Time{}, 0, false
3334
}
3435
start = burnIn.ProvisioningStart
35-
if status == serviceability.DeviceStatusDrained {
36+
if status.IsDrained() {
3637
start = burnIn.DrainedStart
3738
}
38-
expectedMinutes = int64(time.Since(start).Minutes())
39-
if expectedMinutes < 0 {
40-
expectedMinutes = 0
41-
}
42-
return start, expectedMinutes, true
39+
expectedMinutes = max(int64(burnIn.Now.Sub(start).Minutes()), 0)
40+
return start, burnIn.Now, expectedMinutes, true
4341
}
4442

4543
// DeviceCriterion evaluates whether a device meets a specific readiness requirement.
@@ -73,7 +71,9 @@ func (e *DeviceHealthEvaluator) Evaluate(ctx context.Context, device serviceabil
7371
return current
7472
}
7573

76-
// Stage 1: Pending/Unknown → ReadyForLinks
74+
// Stage 1: Pending/Unknown → ReadyForLinks.
75+
// Evaluate advances at most one stage per call, so a device needs a minimum
76+
// of two ticks (two worker intervals) to go from Pending to ReadyForUsers.
7777
if current < serviceability.DeviceHealthReadyForLinks {
7878
if !e.checkAll(ctx, device, e.ReadyForLinksCriteria) {
7979
return current

controlplane/device-health-oracle/internal/worker/worker.go

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -75,7 +75,7 @@ func (w *Worker) tick(ctx context.Context) {
7575
"drainedSlot", drainedSlot)
7676

7777
// Resolve burn-in boundary slots to wall-clock times for criteria evaluation.
78-
var burnIn BurnInTimes
78+
burnIn := BurnInTimes{Now: time.Now()}
7979
if provisioningSlot > 0 {
8080
bt, err := w.cfg.LedgerRPCClient.GetBlockTime(ctx, provisioningSlot)
8181
if err != nil {

0 commit comments

Comments
 (0)