From 066bb38d81e7e1495c60ec5eca39ad7bd5874b0e Mon Sep 17 00:00:00 2001 From: AntiD2ta Date: Mon, 4 May 2026 21:58:55 +0200 Subject: [PATCH 01/25] Add summarizer lag observability MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Register chaind_summarizer_lag_epochs{pipeline} GaugeVec and refresh it on every OnFinalityUpdated invocation via a deferred helper, so silent stalls in the epoch / block / validator pipelines become alertable within minutes instead of being discoverable only from dashboards. Key decisions: - Single lag gauge with pipeline label rather than the originally proposed (stalled_epoch, stall_ticks, reason) triple — generalises across all stall shapes and matches the consumer-lag mental model operators already have. - Lag arithmetic uses float64 to avoid uint64 underflow when metadata is briefly ahead of finality (would otherwise read ~1.8e19). - Disabled pipelines skip WithLabelValues so their series are absent from /metrics rather than reported as zero. - Two silent-skip sites upgraded to Warn (len(balances)==0 in summarizeEpoch and !updated in summarizeEpochs) with stable text so the lag-based alert rule can match annotations exactly. - updateLagGauges (does I/O) and setLagGauges (pure compute) split as a testable seam — internal-package tests drive setLagGauges directly with synthetic metadata. - main_test.go switched from binary-level env-gating to flag.Set("test.skip", "TestService") so internal-package unit tests run unconditionally while the existing env-gated TestService is still skipped. Files changed: - services/summarizer/standard/metrics.go - services/summarizer/standard/handler.go - services/summarizer/standard/epoch.go - services/summarizer/standard/main_test.go - services/summarizer/standard/handler_internal_test.go (new) - go.mod (kylelemons/godebug indirect via prometheus testutil) Notes for next iteration: - Issue 4 (smoke-test verification on Hoodi) and issue 5 (ADR for the metric-shape decision) are unblocked once this lands and is deployed. - Pre-existing flaky deadlock-detector failure in services/scheduler/standard TestMulti is unrelated and reproduces on clean master. --- go.mod | 1 + services/summarizer/standard/epoch.go | 5 +- services/summarizer/standard/handler.go | 43 +++- .../standard/handler_internal_test.go | 229 ++++++++++++++++++ services/summarizer/standard/main_test.go | 12 +- services/summarizer/standard/metrics.go | 27 +++ 6 files changed, 312 insertions(+), 5 deletions(-) create mode 100644 services/summarizer/standard/handler_internal_test.go diff --git a/go.mod b/go.mod index 747f2d8..247f4f8 100644 --- a/go.mod +++ b/go.mod @@ -64,6 +64,7 @@ require ( github.com/jmespath/go-jmespath v0.4.0 // indirect github.com/klauspost/compress v1.17.11 // indirect github.com/klauspost/cpuid/v2 v2.2.9 // indirect + github.com/kylelemons/godebug v1.1.0 // indirect github.com/magiconair/properties v1.8.9 // indirect github.com/mattn/go-colorable v0.1.14 // indirect github.com/mattn/go-isatty v0.0.20 // indirect diff --git a/services/summarizer/standard/epoch.go b/services/summarizer/standard/epoch.go index 48f15f4..067d0b2 100644 --- a/services/summarizer/standard/epoch.go +++ b/services/summarizer/standard/epoch.go @@ -77,7 +77,10 @@ func (s *Service) summarizeEpoch(ctx context.Context, } if len(balances) == 0 { // This can happen if chaind does not have validator balances enabled, or has not yet obtained - // the balances. We return false but no error. + // the balances. We return false but no error so we retry on the next finality tick. + // Alert annotation contract: this stable message text is matched by the lag-based alert + // rule; do not change without coordinating the alert update. + log.Warn().Msg("No validator balances available; cannot summarize epoch (will retry on next finality tick)") return false, nil } // Make a balances map. diff --git a/services/summarizer/standard/handler.go b/services/summarizer/standard/handler.go index fc7f48f..02506e6 100644 --- a/services/summarizer/standard/handler.go +++ b/services/summarizer/standard/handler.go @@ -47,6 +47,11 @@ func (s *Service) OnFinalityUpdated( return } defer s.activitySem.Release(1) + // Refresh per-pipeline lag gauges on every handler invocation, including + // the error-return paths below. Deferred so it observes the post-run + // metadata. Registered after activitySem.Release so LIFO ordering runs + // the gauge update first, while the semaphore is still held. + defer s.updateLagGauges(ctx, finalizedEpoch) if finalizedEpoch == 0 { log.Debug().Msg("Not summarizing on epoch 0") @@ -130,7 +135,9 @@ func (s *Service) summarizeEpochs(ctx context.Context, targetEpoch phase0.Epoch) return errors.Wrapf(err, "failed to update summary for epoch %d", epoch) } if !updated { - log.Debug().Uint64("epoch", uint64(epoch)).Msg("Not enough data to update summary") + // Alert annotation contract: this stable message text is matched by the lag-based alert + // rule; do not change without coordinating the alert update. + log.Warn().Uint64("epoch", uint64(epoch)).Msg("Not enough data to update summary; will retry on next finality tick") return nil } } @@ -270,3 +277,37 @@ func (s *Service) summarizeValidatorDays(ctx context.Context) error { func (s *Service) epochsPerDay() phase0.Epoch { return phase0.Epoch(86400.0 / s.chainTime.SlotDuration().Seconds() / float64(s.chainTime.SlotsPerEpoch())) } + +// updateLagGauges fetches the latest summarizer metadata and refreshes the +// per-pipeline lag gauges. Invoked via defer at the top of OnFinalityUpdated +// so it runs on every return path, including handler errors. The pure-compute +// half is split into setLagGauges so unit tests can drive the gauge math with +// synthetic metadata without standing up a chainDB. +func (s *Service) updateLagGauges(ctx context.Context, finalizedEpoch phase0.Epoch) { + if finalizedEpoch == 0 { + return + } + md, err := s.getMetadata(ctx) + if err != nil { + log.Debug().Err(err).Msg("Failed to obtain metadata for lag gauges") + return + } + s.setLagGauges(md, finalizedEpoch) +} + +// setLagGauges computes lag-in-epochs per enabled pipeline from synthetic +// inputs. The float64 cast is deliberate: a uint64 subtraction would underflow +// to ~1.8e19 if metadata were briefly ahead of finality. Callers are expected +// to pre-validate finalizedEpoch > 0. +func (s *Service) setLagGauges(md *metadata, finalizedEpoch phase0.Epoch) { + targetEpoch := finalizedEpoch - 1 + if s.epochSummaries { + monitorLag("epoch", float64(targetEpoch)-float64(md.LastEpoch)) + } + if s.blockSummaries { + monitorLag("block", float64(targetEpoch)-float64(md.LastBlockEpoch)) + } + if s.validatorSummaries { + monitorLag("validator", float64(targetEpoch)-float64(md.LastValidatorEpoch)) + } +} diff --git a/services/summarizer/standard/handler_internal_test.go b/services/summarizer/standard/handler_internal_test.go new file mode 100644 index 0000000..bf215dc --- /dev/null +++ b/services/summarizer/standard/handler_internal_test.go @@ -0,0 +1,229 @@ +// Copyright © 2021 - 2026 Weald Technology Limited. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package standard + +import ( + "bytes" + "context" + "strings" + "testing" + + "github.com/attestantio/go-eth2-client/spec/phase0" + "github.com/prometheus/client_golang/prometheus" + "github.com/prometheus/client_golang/prometheus/testutil" + "github.com/rs/zerolog" + "github.com/stretchr/testify/require" + "github.com/wealdtech/chaind/services/chaindb" +) + +// stubValidatorsProvider is a hand-built chaindb.ValidatorsProvider used to +// drive summarizeEpoch into the silent-skip branch. Only Validators and +// ValidatorBalancesByEpoch are exercised; the remaining methods satisfy the +// interface but are not invoked along the tested path. +type stubValidatorsProvider struct { + validators []*chaindb.Validator + balances []*chaindb.ValidatorBalance +} + +func (s *stubValidatorsProvider) Validators(_ context.Context) ([]*chaindb.Validator, error) { + return s.validators, nil +} + +func (s *stubValidatorsProvider) ValidatorsByPublicKey(_ context.Context, _ []phase0.BLSPubKey) (map[phase0.BLSPubKey]*chaindb.Validator, error) { + return nil, nil +} + +func (s *stubValidatorsProvider) ValidatorsByIndex(_ context.Context, _ []phase0.ValidatorIndex) (map[phase0.ValidatorIndex]*chaindb.Validator, error) { + return nil, nil +} + +func (s *stubValidatorsProvider) ValidatorBalancesByEpoch(_ context.Context, _ phase0.Epoch) ([]*chaindb.ValidatorBalance, error) { + return s.balances, nil +} + +func (s *stubValidatorsProvider) ValidatorBalancesByIndexAndEpoch(_ context.Context, _ []phase0.ValidatorIndex, _ phase0.Epoch) (map[phase0.ValidatorIndex]*chaindb.ValidatorBalance, error) { + return nil, nil +} + +func (s *stubValidatorsProvider) ValidatorBalancesByIndexAndEpochRange(_ context.Context, _ []phase0.ValidatorIndex, _ phase0.Epoch, _ phase0.Epoch) (map[phase0.ValidatorIndex][]*chaindb.ValidatorBalance, error) { + return nil, nil +} + +func (s *stubValidatorsProvider) ValidatorBalancesByIndexAndEpochs(_ context.Context, _ []phase0.ValidatorIndex, _ []phase0.Epoch) (map[phase0.ValidatorIndex][]*chaindb.ValidatorBalance, error) { + return nil, nil +} + +// TestSummarizeEpochSilentSkipEmitsWarn drives summarizeEpoch with one active +// validator and an empty balances slice, asserting the silent-skip path returns +// (false, nil) and emits a Warn log with the agreed stable text. +func TestSummarizeEpochSilentSkipEmitsWarn(t *testing.T) { + ctx := context.Background() + + // main_test.go sets the global level to Disabled; locally raise it so + // the buffer logger below actually emits. + originalGlobalLevel := zerolog.GlobalLevel() + zerolog.SetGlobalLevel(zerolog.TraceLevel) + defer zerolog.SetGlobalLevel(originalGlobalLevel) + + var buf bytes.Buffer + originalLog := log + log = zerolog.New(&buf).Level(zerolog.WarnLevel) + defer func() { log = originalLog }() + + const epoch = phase0.Epoch(5) + farFuture := phase0.Epoch(0xffffffffffffffff) + + s := &Service{ + farFutureEpoch: farFuture, + epochSummaries: true, + validatorsProvider: &stubValidatorsProvider{ + validators: []*chaindb.Validator{ + { + Index: 1, + ActivationEpoch: 0, + ExitEpoch: farFuture, + }, + }, + balances: nil, + }, + } + + updated, err := s.summarizeEpoch(ctx, &metadata{}, epoch) + require.NoError(t, err) + require.False(t, updated) + + logged := buf.String() + require.True(t, + strings.Contains(logged, "No validator balances available; cannot summarize epoch"), + "warn log missing expected stable message text; got: %s", logged) + require.True(t, + strings.Contains(logged, `"level":"warn"`), + "silent-skip log emitted at unexpected level; got: %s", logged) +} + +// TestSetLagGaugesWiring drives setLagGauges with synthetic metadata + finalized +// epoch and asserts per-pipeline values, that disabled pipelines produce no +// series, and that values are non-negative for the typical lag scenario. +func TestSetLagGaugesWiring(t *testing.T) { + originalGauge := summarizerLagEpochs + summarizerLagEpochs = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: metricsNamespace, + Name: "lag_epochs", + Help: "test", + }, []string{"pipeline"}) + defer func() { summarizerLagEpochs = originalGauge }() + + tests := []struct { + name string + service *Service + md *metadata + finalizedEpoch phase0.Epoch + wantSeries int + wantValues map[string]float64 + }{ + { + name: "all pipelines enabled", + service: &Service{ + epochSummaries: true, + blockSummaries: true, + validatorSummaries: true, + }, + md: &metadata{ + LastEpoch: 7, + LastBlockEpoch: 5, + LastValidatorEpoch: 4, + }, + finalizedEpoch: 11, + wantSeries: 3, + wantValues: map[string]float64{ + "epoch": 3, // targetEpoch=10, 10-7 + "block": 5, // 10-5 + "validator": 6, // 10-4 + }, + }, + { + name: "block pipeline disabled", + service: &Service{ + epochSummaries: true, + blockSummaries: false, + validatorSummaries: true, + }, + md: &metadata{ + LastEpoch: 9, + LastValidatorEpoch: 8, + }, + finalizedEpoch: 11, + wantSeries: 2, + wantValues: map[string]float64{ + "epoch": 1, + "validator": 2, + }, + }, + { + name: "only epoch pipeline", + service: &Service{ + epochSummaries: true, + }, + md: &metadata{LastEpoch: 9}, + finalizedEpoch: 11, + wantSeries: 1, + wantValues: map[string]float64{"epoch": 1}, + }, + { + name: "no pipelines enabled", + service: &Service{}, + md: &metadata{}, + finalizedEpoch: 11, + wantSeries: 0, + wantValues: map[string]float64{}, + }, + { + name: "fully caught up - lag is zero not negative", + service: &Service{ + epochSummaries: true, + blockSummaries: true, + validatorSummaries: true, + }, + md: &metadata{ + LastEpoch: 10, + LastBlockEpoch: 10, + LastValidatorEpoch: 10, + }, + finalizedEpoch: 11, + wantSeries: 3, + wantValues: map[string]float64{ + "epoch": 0, + "block": 0, + "validator": 0, + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + summarizerLagEpochs.Reset() + + test.service.setLagGauges(test.md, test.finalizedEpoch) + + require.Equal(t, test.wantSeries, testutil.CollectAndCount(summarizerLagEpochs), + "unexpected number of label series emitted") + + for label, want := range test.wantValues { + got := testutil.ToFloat64(summarizerLagEpochs.WithLabelValues(label)) + require.Equal(t, want, got, "unexpected value for pipeline=%s", label) + require.GreaterOrEqual(t, got, 0.0, "lag for pipeline=%s should be non-negative", label) + } + }) + } +} diff --git a/services/summarizer/standard/main_test.go b/services/summarizer/standard/main_test.go index 6778a57..ce80738 100644 --- a/services/summarizer/standard/main_test.go +++ b/services/summarizer/standard/main_test.go @@ -14,6 +14,7 @@ package standard_test import ( + "flag" "os" "testing" @@ -22,8 +23,13 @@ import ( func TestMain(m *testing.M) { zerolog.SetGlobalLevel(zerolog.Disabled) - if os.Getenv("CHAINDB_URL") != "" && - os.Getenv("ETH2CLIENT_ADDRESS") != "" { - os.Exit(m.Run()) + // `package standard` (internal) and `package standard_test` (external) tests + // share a single test binary, so this TestMain governs both. When the env + // vars required by service_test.go's TestService are absent we skip that + // test by name and still call m.Run(), so internal-package unit tests like + // the lag-gauge wiring tests run unconditionally under `go test ./...`. + if os.Getenv("CHAINDB_URL") == "" || os.Getenv("ETH2CLIENT_ADDRESS") == "" { + _ = flag.Set("test.skip", "TestService") } + os.Exit(m.Run()) } diff --git a/services/summarizer/standard/metrics.go b/services/summarizer/standard/metrics.go index f6efe37..42b56a4 100644 --- a/services/summarizer/standard/metrics.go +++ b/services/summarizer/standard/metrics.go @@ -41,6 +41,15 @@ var ( lastBalancePrune prometheus.Gauge ) +// summarizerLagEpochs tracks how many epochs each summarizer pipeline is behind +// the finality target. Pipelines that are disabled never call WithLabelValues +// so their series are absent from /metrics. +// +// Alert annotation contract: this gauge is referenced by the lag-based alert +// rule deployed alongside chaind; do not rename without coordinating the alert +// migration. +var summarizerLagEpochs *prometheus.GaugeVec + func registerMetrics(_ context.Context, monitor metrics.Service) error { if latestEpoch != nil { // Already registered. @@ -112,9 +121,27 @@ func registerPrometheusMetrics() error { return errors.Wrap(err, "failed to register epoch_prune_ts") } + summarizerLagEpochs = prometheus.NewGaugeVec(prometheus.GaugeOpts{ + Namespace: metricsNamespace, + Name: "lag_epochs", + Help: "Number of epochs by which each summarizer pipeline lags the finality target", + }, []string{"pipeline"}) + if err := prometheus.Register(summarizerLagEpochs); err != nil { + return errors.Wrap(err, "failed to register lag_epochs") + } + return nil } +// monitorLag records the lag in epochs for a given summarizer pipeline. +// Callers are responsible for guarding with the per-pipeline enabled flag, so +// disabled pipelines never produce a series. +func monitorLag(pipeline string, lag float64) { + if summarizerLagEpochs != nil { + summarizerLagEpochs.WithLabelValues(pipeline).Set(lag) + } +} + // monitorLatestEpoch sets the latest epoch without registering an // increase in epochs processed. This does not usually need to be // called directly, as it is called as part of monitorEpochProcessed. From 8027f90647056bfc98ed0b304e64ec1bd5c0a8e8 Mon Sep 17 00:00:00 2001 From: AntiD2ta Date: Mon, 4 May 2026 22:18:57 +0200 Subject: [PATCH 02/25] Add ADR for summarizer observability metric shape MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Publish docs/adr/0001-summarizer-observability-shape.md, the first ADR in this repository, recording the decision to expose summarizer stall state as a single labelled gauge chaind_summarizer_lag_epochs{pipeline} rather than the originally-proposed (stalled_epoch, stall_ticks, reason) triple. Derived from the draft at .scratch/adr-0001-summarizer-observability-shape.md and polished into the MADR template since the repo had no prior ADR style to follow. Key decisions: - MADR template (Status / Context and Problem Statement / Decision Drivers / Considered Options / Decision Outcome with Pros and Cons) chosen as the recognisable standard. Confirm choice with upstream maintainers in PR review per the issue guidance. - ADR documents the decision as shipped in commit 066bb38, not as drafted. The draft's "Summarizer skipped epoch..." log string never landed; the shipped strings — "No validator balances available; cannot summarize epoch (will retry on next finality tick)" and "Not enough data to update summary; will retry on next finality tick" — are recorded verbatim so the alert annotation contract stays accurate. - Implementation notes capture the testable-seam split between updateLagGauges (does I/O) and setLagGauges (pure compute), the float64 cast that prevents uint64 underflow, and the disabled-pipeline behaviour (no WithLabelValues call so the series is absent from /metrics). - Rejected alternatives recorded with their trade-offs: the original (stalled_epoch, stall_ticks, reason) triple, the hybrid lag-gauge + per-event counter, and pre-populating the gauge at startup via eth2Client.Finality(). - Alert-annotation contract location corrected post-review: the in-code comment lives on the summarizerLagEpochs var declaration in metrics.go, not at the prometheus.Register call site, so the ADR documents the var declaration as the reference site. Files changed: - docs/adr/0001-summarizer-observability-shape.md (new) Notes for next iteration: - This is the first ADR in the repo. PR description should call out that it bootstraps the docs/adr/ convention and invite upstream maintainers to confirm the MADR template choice. - Issue 4 (Hoodi smoke-test verification) remains the only open PRD-A-related task; it requires real Postgres + beacon-node infrastructure and operator sign-off, so it is HITL despite the AFK label. - docs/prometheus.md does not yet document the new chaind_summarizer_lag_epochs metric; updating it is out of scope here but is worth a follow-up issue. --- .../0001-summarizer-observability-shape.md | 95 +++++++++++++++++++ 1 file changed, 95 insertions(+) create mode 100644 docs/adr/0001-summarizer-observability-shape.md diff --git a/docs/adr/0001-summarizer-observability-shape.md b/docs/adr/0001-summarizer-observability-shape.md new file mode 100644 index 0000000..b04405c --- /dev/null +++ b/docs/adr/0001-summarizer-observability-shape.md @@ -0,0 +1,95 @@ +# 0001 — Use a single labelled lag gauge for summarizer stall observability + +* Status: Accepted +* Date: 2026-04-30 +* Scope: `services/summarizer/standard` in `wealdtech/chaind`. Companion alerts live in `attestantio/ops`. + +## Context and Problem Statement + +`services/summarizer/standard.summarizeEpoch` returns `(false, nil)` when validator balances for an epoch are unavailable. The outer `summarizeEpochs` exits the loop silently with only a `Debug` log. Because two downstream pipelines (`summarizeBlocks`, `summarizeValidators`) cap their target epoch at `md.LastEpoch`, the silent skip freezes the entire summarizer until the underlying data condition resolves. The 2026-04-07 Hoodi incident produced a multi-week Grafana gap before any human noticed. + +We need stall observability that an operator can alert on within minutes. The shape of that observability is not obvious; several reasonable designs exist. + +## Decision Drivers + +- An alert must fire within minutes of any stall, not after a long-tail dashboard review. +- The signal should generalize across all stall shapes — including persistent errors in the `summarizeBlocks` and `summarizeValidators` pipelines, not only the `(false, nil)` site that triggered the original incident. +- Operators already reason about consumer-lag and replication-lag metrics; the new metric should fit that mental model so runbooks stay simple. +- Public Prometheus metric shape is hard to reverse — renaming or relabelling is a breaking change for downstream alerts, so the choice must be deliberate. + +## Considered Options + +1. **A single labelled lag gauge** — `chaind_summarizer_lag_epochs{pipeline}`. +2. **Original plan: `stalled_epoch` gauge + `stall_ticks_total{reason}` counter** — a paired stall-event metric pinned to the silent-skip site. +3. **Hybrid: lag gauge + `silent_skip_total{reason}` counter** — Option 1 plus an event counter for forensic detail. +4. **Pre-populate the gauge at service startup via `eth2Client.Finality()`** — variant of Option 1 that closes the cold-start blind spot. + +## Decision Outcome + +Chosen option: **Option 1 — a single labelled lag gauge**, because it surfaces every stall shape the summarizer can produce (not just the original `(false, nil)` site), collapses alerting to a single rule that matches operator intuition, and avoids coupling Prometheus cardinality to an internal enum that may evolve. + +### Implementation notes + +Stall state is exposed as a single Prometheus `GaugeVec`: + +``` +chaind_summarizer_lag_epochs{pipeline} = float64(targetEpoch) - float64(md.LastXEpoch) +``` + +where the `pipeline` label takes one of three values: `epoch`, `block`, or `validator`. + +The gauge is updated by a private helper `updateLagGauges(ctx, finalizedEpoch)`, registered via `defer` at the top of `OnFinalityUpdated` so it fires on every handler invocation including all error-return paths. The pure-compute half (`setLagGauges`) is split out as a testable seam: internal-package unit tests drive the gauge math with synthetic metadata without standing up a database. Disabled pipelines (e.g. when `s.blockSummaries == false`) never call `WithLabelValues(...)` and so omit their series entirely; the alert query naturally ignores absent labels. + +The `float64` cast is deliberate: a `uint64` subtraction would underflow to ~`1.8e19` if metadata is briefly ahead of finality. + +Per-event diagnostic detail (which epoch is stuck, which silent-skip branch fired) lives in `Warn` logs at the two silent-skip sites — *not* in a counter, *not* in a `reason` label. The paging signal is the lag gauge alert; the logs are forensic context that becomes useful once the alert fires. + +### Positive consequences + +- One alert rule (`chaind_summarizer_lag_epochs > 2 for 15m`) covers every stall shape the summarizer can produce, including future ones. +- Operator mental model matches existing consumer-lag and replication-lag metrics elsewhere in the fleet. +- Metric maintenance is a single series with a small label set instead of two metrics with overlapping semantics. +- High-cardinality diagnostic detail lives in logs, where it belongs, without forcing Prometheus to track an enum that may grow. + +### Negative consequences + +- **The metric is part of the public contract** with operators. Renaming the metric or its labels is a breaking change for anyone running the alert rule. If we ever need to evolve the shape, we add a new metric alongside this one and deprecate the old. +- **Alert wording is coupled to log message text.** The runbook annotation references the literal `Warn` strings: + - `"No validator balances available; cannot summarize epoch (will retry on next finality tick)"` (from `epoch.go`, the `len(balances) == 0` branch). + - `"Not enough data to update summary; will retry on next finality tick"` (from `handler.go`, the `!updated` branch in `summarizeEpochs`). + + Changing those strings without updating the alert annotation breaks operator UX. The implementation includes an `Alert annotation contract:` code comment near each log line and on the `summarizerLagEpochs` variable declaration in `metrics.go`, so the contract is visible at the point of edit. +- **No per-event counter** means we cannot trivially answer "how often is the silent-skip path firing across the fleet?" That question, if it arises, will be answered from log-based metrics (e.g. Loki/Mimir) or from sampling logs in incidents. If the question becomes recurrent, that is itself a signal that we should re-introduce a counter — superseding this ADR rather than retrofitting one in silently. +- **A six-minute cold-start blind spot** exists between service startup and the first `OnFinalityUpdated` tick during which the gauge has no value. This is an accepted trade-off (see Option 4 below) covered by the existing flat-`chaind_summarizer_epochs_processed_total` alert. + +## Pros and Cons of the Options + +### Option 1 — Single labelled lag gauge (chosen) + +- **Good** — generalizes across all summarizer stall shapes, including future ones. +- **Good** — alert rules collapse to a single `> 2 for 15m` query. +- **Good** — matches operator intuition for lag-style metrics. +- **Bad** — does not provide a per-event count of silent-skip occurrences; that question is deferred to logs. + +### Option 2 — `stalled_epoch` gauge + `stall_ticks_total{reason}` counter (rejected) + +- **Good** — explicit per-incident counter is convenient for "how many times did this fire" questions. +- **Bad** — only fires on the `(false, nil)` path inside `summarizeEpoch`. A persistent error inside `summarizeBlocksInEpoch` that causes `LastBlockEpoch` to fall behind `LastEpoch` would leave the gauge at zero and the alert silent — which is the original incident shape we are trying to prevent. +- **Bad** — the `reason` enum couples public metric cardinality to an internal classification we may want to evolve. Adding or renaming reasons becomes a breaking-change concern. + +### Option 3 — Hybrid (lag gauge + `silent_skip_total{reason}` counter, rejected) + +- **Good** — preserves the generalized alert from Option 1 while keeping a per-event counter. +- **Bad** — the forensic information the counter provides is already in the new `Warn` logs at the same sites. A per-event counter duplicates the log signal at higher cost (Prometheus cardinality, label maintenance, dashboard panels to ignore) without giving operators anything the logs do not. + +### Option 4 — Pre-populate the gauge at service startup via `eth2Client.Finality()` (rejected) + +- **Good** — closes the six-minute window after restart during which the gauge has no value. +- **Bad** — engineering for an unlikely failure shape ("chaind never receives a finality tick at all"). That shape is already covered by alerts on flat `chaind_summarizer_epochs_processed_total`. +- **Bad** — adds network I/O to the bootstrap path, creating new failure modes (slow or failing Beacon node at startup) for the small benefit of populating one gauge value six minutes earlier. + +## Links + +- Incident: 2026-04-07 chaindb-d04 outage (multi-week silent stall before detection) +- Implementation: chaind commit `066bb38` ("Add summarizer lag observability") — registers `chaind_summarizer_lag_epochs`, adds the two `Warn` log sites, and ships the `updateLagGauges` / `setLagGauges` split +- Companion alert rule: `attestantio/ops` work item — deploys the `chaind_summarizer_lag_epochs > 2 for 15m` alert against the strings above From 1edcb45dc65993a2781e8813602cbe5f9ba1748d Mon Sep 17 00:00:00 2001 From: AntiD2ta Date: Tue, 5 May 2026 23:23:59 +0200 Subject: [PATCH 03/25] Amend lag gauge to cursor diff and add zero-balance guard MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The targetEpoch - md.LastXEpoch formula was structurally incapable of detecting the silent-corruption mode the alert was meant to catch: ValidatorBalancesByEpoch's LEFT JOIN returns one zero-balance row per validator when t_validator_balances is empty for an epoch, so the existing len(balances)==0 guard never trips, the cursor advances unconditionally, and the gauge stays pinned at 0. An end-to-end smoke test on a seeded chaind instance (DELETE all t_validator_balances rows for one finalized epoch and observe) confirmed the failure shape: gauge stayed at 0/0/0 throughout, neither alert-contract Warn log fired, the cursor advanced past the corrupt epoch, and t_epoch_summaries was silently written with f_active_balance=0. See docs/adr/0001-summarizer-observability-shape.md "Amendment 1" for the full rationale and the considered alternatives. Re-instrument the gauge as a per-pipeline cursor diff against each pipeline's direct upstream: pipeline=epoch = validators.LatestBalancesEpoch - summarizer.LastEpoch pipeline=block = summarizer.LastEpoch - summarizer.LastBlockEpoch pipeline=validator = summarizer.LastEpoch - summarizer.LastValidatorEpoch Diffs are clamped to [0, +Inf) via a small helper to absorb the benign race window where a downstream cursor is read ahead of its upstream across two metadata reads. The metric name, label key, label values, and gauge registration are unchanged — operators running the existing alert rule do not need to migrate (threshold semantics shift slightly, documented in the ADR). Add a defensive guard in summarizeEpoch after the active-validator balance accumulation loop and before BeginTx: if ActiveValidators > 0 and ActiveBalance == 0, refuse to write a corrupt summary, reuse the existing alert-contract warn message verbatim, and return (false, nil) so the cursor stays put and the cycle retries on the next finality tick. The pre-existing len(balances)==0 check is retained as the bootstrap path with a comment differentiating it from the new production-state guard. Both branches share one warn-log message so the alert annotation contract is single-sourced. Tests: rewrite TestSetLagGaugesWiring with cursor-diff cases (caught up, upstream ahead, intra-summarizer lag, race-window clamp, disabled pipeline absence) and add TestSummarizeEpochZeroBalanceAssertion that drives summarizeEpoch with 1000 validators returning zero balances and asserts (false, nil), the warn log fires, and no row is written to t_epoch_summaries via a recording chaindb mock. Both tests run unconditionally under gosilent test ./... — no env gating. Files changed: - services/summarizer/standard/handler.go - services/summarizer/standard/epoch.go - services/summarizer/standard/metadata.go - services/summarizer/standard/handler_internal_test.go - docs/adr/0001-summarizer-observability-shape.md (Amendment 1) --- .../0001-summarizer-observability-shape.md | 56 ++++- services/summarizer/standard/epoch.go | 23 +- services/summarizer/standard/handler.go | 53 ++-- .../standard/handler_internal_test.go | 236 ++++++++++++++---- services/summarizer/standard/metadata.go | 44 ++++ 5 files changed, 341 insertions(+), 71 deletions(-) diff --git a/docs/adr/0001-summarizer-observability-shape.md b/docs/adr/0001-summarizer-observability-shape.md index b04405c..3d315d1 100644 --- a/docs/adr/0001-summarizer-observability-shape.md +++ b/docs/adr/0001-summarizer-observability-shape.md @@ -1,6 +1,6 @@ # 0001 — Use a single labelled lag gauge for summarizer stall observability -* Status: Accepted +* Status: Accepted (2026-04-30); amended 2026-05-05 — see "Amendment 1" below. * Date: 2026-04-30 * Scope: `services/summarizer/standard` in `wealdtech/chaind`. Companion alerts live in `attestantio/ops`. @@ -93,3 +93,57 @@ Per-event diagnostic detail (which epoch is stuck, which silent-skip branch fire - Incident: 2026-04-07 chaindb-d04 outage (multi-week silent stall before detection) - Implementation: chaind commit `066bb38` ("Add summarizer lag observability") — registers `chaind_summarizer_lag_epochs`, adds the two `Warn` log sites, and ships the `updateLagGauges` / `setLagGauges` split - Companion alert rule: `attestantio/ops` work item — deploys the `chaind_summarizer_lag_epochs > 2 for 15m` alert against the strings above + +## Amendment 1 — cursor-diff formula and zero-balance guard (2026-05-05) + +* Status: Accepted, amends the Decision Outcome and Implementation Notes above. + +### Motivation + +The original formula `targetEpoch - md.LastXEpoch` was structurally incapable of detecting the silent-corruption mode the alert was meant to catch. End-to-end smoke verification on a seeded chaind instance — DELETE all `t_validator_balances` rows for one finalized epoch, observe behavior — established the failure shape empirically: the gauge stayed pinned at `0/0/0` throughout, neither alert-contract `Warn` log fired, the summarizer cursor advanced past the corrupt epoch, and `t_epoch_summaries.f_active_balance` was silently written as `0` for the affected epoch. + +Root cause is in `services/chaindb/postgresql/validators.go` (`ValidatorBalancesByEpoch`): a `LEFT JOIN` of `t_validators` against the per-epoch balance subquery, with `COALESCE(f_balance, 0)`. When no balance rows exist for an epoch, the function returns ~1.3M rows of zeros instead of an empty slice — so the `len(balances) == 0` guard in `summarizeEpoch` never trips in any production state where `t_validators` is populated. The cursor advances unconditionally, the gauge stays at zero, the alert never fires, and corrupted summaries are written. + +The `0` reading is fatally ambiguous for an alert metric — it can mean "summarizer is healthy" or "summarizer is silently corrupting data" with no way for a downstream consumer to distinguish them. + +### Amended formula + +The metric name (`chaind_summarizer_lag_epochs`), label key (`pipeline`), label values (`epoch`/`block`/`validator`), and gauge registration are **unchanged** — this is a formula amendment, not a metric replacement. Each pipeline now measures its lag against its **direct** upstream cursor: + +| label | new formula | meaning | +|---|---|---| +| `pipeline=epoch` | `validators.latest_balances_epoch - summarizer.latest_epoch` | how far behind the upstream balances cursor the epoch-summary cursor is | +| `pipeline=block` | `summarizer.latest_epoch - summarizer.latest_block_epoch` | how far behind the epoch-summary cursor the block-summary sub-pipeline is | +| `pipeline=validator` | `summarizer.latest_epoch - summarizer.latest_validator_epoch` | how far behind the epoch-summary cursor the validator-summary sub-pipeline is | + +The reading-side now fetches both `summarizer.standard` and `validators.standard` rows from `t_metadata` per finality tick. Diffs are clamped to `[0, +Inf)` via a small helper to absorb the benign race window where a downstream cursor is read ahead of its upstream across two metadata reads. + +### Defensive zero-balance assertion + +Independently of the gauge change, `summarizeEpoch` now refuses to write a corrupt summary. After the active-validator balance accumulation loop and before `BeginTx`, if `summary.ActiveValidators > 0 && summary.ActiveBalance == 0` the function emits the existing alert-contract `Warn` log (verbatim, single-sourced with the bootstrap-path `len(balances) == 0` branch above it) and returns `(false, nil)`. The cursor stays put, the cycle retries on the next finality tick, and the row is never written to `t_epoch_summaries`. This keeps the data-integrity guarantee even if the alert is missed. + +### Why amend instead of supersede + +The metric/label/registration contract is unchanged — operators running the existing alert rule do not need to migrate. The formula change is a defect fix that recovers the *originally intended* semantics ("detect when the balance pipeline falls behind the summarizer"), not a redesign. Per the original "Public Prometheus metric shape is hard to reverse" decision driver, supersession is reserved for breaking changes to the operator-facing contract. + +### Why cursor-diff over the original formula + +1. **Decoupled from cycle outcomes.** Whether a finality cycle errors, runs with garbage, succeeds, or is skipped, the gauge measures actual cursor-state divergence at read time. It cannot lie about lag because it isn't computed from the lossy cycle-completion signal. +2. **Disambiguates `0` readings.** A `0` now means exactly one thing: "downstream cursor matches its upstream right now." +3. **Per-pipeline labels carry distinct diagnostic value.** + - `epoch` lag growing → upstream balances pipeline is stalled (network, API, validators-pipeline crash). Action: investigate `services/validators/standard/`. + - `block` lag growing → block-summary sub-pipeline is internally lagging behind epoch summarization (DB write contention, slow query). Action: investigate `summarizeBlocksInEpoch`. + - `validator` lag growing → same diagnosis pattern but for the validator sub-pipeline (`summarizeValidatorsInEpoch`). +4. **Alert thresholds become operationally direct.** "lag > N for M minutes" reads as "the upstream pipeline has been ≥N epochs ahead of the downstream pipeline for ≥M minutes" — a direct operational statement, not a proxy reading via cycle-completion. + +### Considered options for the fix + +- **Option A** — Add an "all-zero balances" guard inside `summarizeEpoch`. Adopted (the defensive assertion above). +- **Option B** — Change `ValidatorBalancesByEpoch` itself to return a `nil` slice when no rows. Rejected. The `LEFT JOIN` exists for a real reason: zero-balance validators don't have rows; the JOIN ensures every validator is represented in the result. Touching the SQL risks regressing the legitimate case. The defect is in the summarizer trusting the all-zero answer at face value, so the guard belongs at the summarizer layer where it can check the business invariant (`ActiveBalance > 0` when `ActiveValidators > 0`) rather than a structural property of the SQL. +- **Option C** — Re-instrument the lag gauge with cursor-diff math. Adopted (this amendment). + +A and C ship together: C ensures the alert fires; A ensures no corrupt summary is written even if the alert is missed. Belt and suspenders. + +### Operator-facing changes + +Metric name, labels, and registration are unchanged — no migration required. **The threshold *meaning* has changed**: the gauge now reports data-cursor divergence rather than cycle-completion lag. For the existing `> 2 for 15m` rule the practical behavior is similar (both fire when the summarizer falls behind), but the new signal is sensitive to the previously-undetectable silent-corruption mode. Companion alert spec in `attestantio/ops` should reference the amended semantics; per-pipeline diagnosis hints become useful in the runbook because the labels now carry genuinely distinct failure surfaces. diff --git a/services/summarizer/standard/epoch.go b/services/summarizer/standard/epoch.go index 067d0b2..6bb4957 100644 --- a/services/summarizer/standard/epoch.go +++ b/services/summarizer/standard/epoch.go @@ -76,8 +76,14 @@ func (s *Service) summarizeEpoch(ctx context.Context, return false, errors.Wrap(err, "failed to obtain validator balances") } if len(balances) == 0 { - // This can happen if chaind does not have validator balances enabled, or has not yet obtained - // the balances. We return false but no error so we retry on the next finality tick. + // Bootstrap path: chaind has no validators rows yet (first boot before + // the validators service has populated t_validators), or validator + // balances are disabled entirely, so the LEFT JOIN in + // ValidatorBalancesByEpoch returns no rows at all. In production + // state the JOIN returns one zero-balance row per validator instead — + // see the post-accumulation guard below for that case. Both branches + // share one warn message so the alert-annotation contract is single- + // sourced. // Alert annotation contract: this stable message text is matched by the lag-based alert // rule; do not change without coordinating the alert update. log.Warn().Msg("No validator balances available; cannot summarize epoch (will retry on next finality tick)") @@ -100,6 +106,19 @@ func (s *Service) summarizeEpoch(ctx context.Context, } log.Trace().Dur("elapsed", time.Since(started)).Msg("Obtained validator balances") + // Production-state guard: ValidatorBalancesByEpoch LEFT-JOINs t_validators + // against t_validator_balances with COALESCE(f_balance, 0) — when no + // balance rows exist for the epoch, it returns one zero-balance row per + // validator rather than an empty slice, so the len(balances) == 0 check + // above cannot detect that case. If we have active validators but zero + // summed balance, the upstream balance pipeline missed this epoch. Refuse + // to write a corrupt summary; reuse the same warn-log text so the alert + // annotation contract is preserved. + if summary.ActiveValidators > 0 && summary.ActiveBalance == 0 { + log.Warn().Msg("No validator balances available; cannot summarize epoch (will retry on next finality tick)") + return false, nil + } + err = s.blockStatsForEpoch(ctx, epoch, summary) if err != nil { return false, errors.Wrap(err, "failed to calculate block summary statistics for epoch") diff --git a/services/summarizer/standard/handler.go b/services/summarizer/standard/handler.go index 02506e6..742daac 100644 --- a/services/summarizer/standard/handler.go +++ b/services/summarizer/standard/handler.go @@ -278,11 +278,12 @@ func (s *Service) epochsPerDay() phase0.Epoch { return phase0.Epoch(86400.0 / s.chainTime.SlotDuration().Seconds() / float64(s.chainTime.SlotsPerEpoch())) } -// updateLagGauges fetches the latest summarizer metadata and refreshes the -// per-pipeline lag gauges. Invoked via defer at the top of OnFinalityUpdated -// so it runs on every return path, including handler errors. The pure-compute -// half is split into setLagGauges so unit tests can drive the gauge math with -// synthetic metadata without standing up a chainDB. +// updateLagGauges fetches the latest summarizer metadata and the upstream +// validators metadata, then refreshes the per-pipeline lag gauges. Invoked +// via defer at the top of OnFinalityUpdated so it runs on every return path, +// including handler errors. The pure-compute half is split into setLagGauges +// so unit tests can drive the gauge math with synthetic metadata without +// standing up a chainDB. func (s *Service) updateLagGauges(ctx context.Context, finalizedEpoch phase0.Epoch) { if finalizedEpoch == 0 { return @@ -292,22 +293,44 @@ func (s *Service) updateLagGauges(ctx context.Context, finalizedEpoch phase0.Epo log.Debug().Err(err).Msg("Failed to obtain metadata for lag gauges") return } - s.setLagGauges(md, finalizedEpoch) + upstream, err := s.getUpstreamMetadata(ctx) + if err != nil { + log.Debug().Err(err).Msg("Failed to obtain upstream metadata for lag gauges") + return + } + s.setLagGauges(md, upstream) } -// setLagGauges computes lag-in-epochs per enabled pipeline from synthetic -// inputs. The float64 cast is deliberate: a uint64 subtraction would underflow -// to ~1.8e19 if metadata were briefly ahead of finality. Callers are expected -// to pre-validate finalizedEpoch > 0. -func (s *Service) setLagGauges(md *metadata, finalizedEpoch phase0.Epoch) { - targetEpoch := finalizedEpoch - 1 +// setLagGauges computes lag-in-epochs per enabled pipeline as a cursor diff +// against each pipeline's direct upstream: +// +// - epoch = validators.LatestBalancesEpoch - summarizer.LastEpoch +// - block = summarizer.LastEpoch - summarizer.LastBlockEpoch +// - validator = summarizer.LastEpoch - summarizer.LastValidatorEpoch +// +// The float64 conversion in clampLag is deliberate: a uint64 subtraction would +// underflow to ~1.8e19 if a pipeline cursor is briefly ahead of its upstream +// (a benign race window between metadata writes). clampLag floors negatives +// to 0 so the gauge never reports a phantom astronomical lag. +func (s *Service) setLagGauges(md *metadata, upstream *upstreamMetadata) { if s.epochSummaries { - monitorLag("epoch", float64(targetEpoch)-float64(md.LastEpoch)) + monitorLag("epoch", clampLag(upstream.LatestBalancesEpoch, md.LastEpoch)) } if s.blockSummaries { - monitorLag("block", float64(targetEpoch)-float64(md.LastBlockEpoch)) + monitorLag("block", clampLag(md.LastEpoch, md.LastBlockEpoch)) } if s.validatorSummaries { - monitorLag("validator", float64(targetEpoch)-float64(md.LastValidatorEpoch)) + monitorLag("validator", clampLag(md.LastEpoch, md.LastValidatorEpoch)) + } +} + +// clampLag computes upstream - downstream as a non-negative float64. Pipeline +// cursors can transiently invert across metadata reads (downstream advances in +// a transaction the read snapshot didn't capture), so clamping below at 0 is +// the right semantic: "no lag" beats "phantom 1.8e19 lag" for an alert metric. +func clampLag(upstream, downstream phase0.Epoch) float64 { + if downstream >= upstream { + return 0 } + return float64(upstream - downstream) } diff --git a/services/summarizer/standard/handler_internal_test.go b/services/summarizer/standard/handler_internal_test.go index bf215dc..40b1f2a 100644 --- a/services/summarizer/standard/handler_internal_test.go +++ b/services/summarizer/standard/handler_internal_test.go @@ -28,9 +28,9 @@ import ( ) // stubValidatorsProvider is a hand-built chaindb.ValidatorsProvider used to -// drive summarizeEpoch into the silent-skip branch. Only Validators and -// ValidatorBalancesByEpoch are exercised; the remaining methods satisfy the -// interface but are not invoked along the tested path. +// drive summarizeEpoch into the silent-skip and zero-balance branches. Only +// Validators and ValidatorBalancesByEpoch are exercised; the remaining methods +// satisfy the interface but are not invoked along the tested paths. type stubValidatorsProvider struct { validators []*chaindb.Validator balances []*chaindb.ValidatorBalance @@ -65,8 +65,8 @@ func (s *stubValidatorsProvider) ValidatorBalancesByIndexAndEpochs(_ context.Con } // TestSummarizeEpochSilentSkipEmitsWarn drives summarizeEpoch with one active -// validator and an empty balances slice, asserting the silent-skip path returns -// (false, nil) and emits a Warn log with the agreed stable text. +// validator and an empty balances slice (the bootstrap path), asserting it +// returns (false, nil) and emits the alert-contract warn log. func TestSummarizeEpochSilentSkipEmitsWarn(t *testing.T) { ctx := context.Background() @@ -112,9 +112,109 @@ func TestSummarizeEpochSilentSkipEmitsWarn(t *testing.T) { "silent-skip log emitted at unexpected level; got: %s", logged) } -// TestSetLagGaugesWiring drives setLagGauges with synthetic metadata + finalized -// epoch and asserts per-pipeline values, that disabled pipelines produce no -// series, and that values are non-negative for the typical lag scenario. +// recordingChainDB is a chaindb.Service / EpochSummariesSetter that records +// every SetEpochSummary call so a test can assert that the corrupt-summary +// guard prevents writes. No transaction enforcement — tests that drive the +// guard path exit before BeginTx is reached. +type recordingChainDB struct { + summariesSet []*chaindb.EpochSummary +} + +func (c *recordingChainDB) BeginTx(ctx context.Context) (context.Context, context.CancelFunc, error) { + return ctx, func() {}, nil +} + +func (c *recordingChainDB) CommitTx(_ context.Context) error { + return nil +} + +func (c *recordingChainDB) BeginROTx(ctx context.Context) (context.Context, error) { + return ctx, nil +} + +func (c *recordingChainDB) CommitROTx(_ context.Context) {} + +func (c *recordingChainDB) SetMetadata(_ context.Context, _ string, _ []byte) error { + return nil +} + +func (c *recordingChainDB) Metadata(_ context.Context, _ string) ([]byte, error) { + return nil, nil +} + +func (c *recordingChainDB) SetEpochSummary(_ context.Context, summary *chaindb.EpochSummary) error { + c.summariesSet = append(c.summariesSet, summary) + return nil +} + +// TestSummarizeEpochZeroBalanceAssertion drives the production-state path: +// ValidatorBalancesByEpoch returns one row per validator with Balance=0 and +// EffectiveBalance=0 (mimicking the LEFT JOIN behavior when t_validator_balances +// is empty for the epoch). Asserts the guard returns (false, nil), emits the +// alert-contract warn log, and writes nothing to t_epoch_summaries. +func TestSummarizeEpochZeroBalanceAssertion(t *testing.T) { + ctx := context.Background() + + originalGlobalLevel := zerolog.GlobalLevel() + zerolog.SetGlobalLevel(zerolog.TraceLevel) + defer zerolog.SetGlobalLevel(originalGlobalLevel) + + var buf bytes.Buffer + originalLog := log + log = zerolog.New(&buf).Level(zerolog.WarnLevel) + defer func() { log = originalLog }() + + const epoch = phase0.Epoch(5) + const numValidators = 1000 + farFuture := phase0.Epoch(0xffffffffffffffff) + + validators := make([]*chaindb.Validator, numValidators) + balances := make([]*chaindb.ValidatorBalance, numValidators) + for i := 0; i < numValidators; i++ { + idx := phase0.ValidatorIndex(i) + validators[i] = &chaindb.Validator{ + Index: idx, + ActivationEpoch: 0, + ExitEpoch: farFuture, + } + balances[i] = &chaindb.ValidatorBalance{ + Index: idx, + Epoch: epoch, + Balance: 0, + EffectiveBalance: 0, + } + } + + chainDB := &recordingChainDB{} + s := &Service{ + chainDB: chainDB, + farFutureEpoch: farFuture, + epochSummaries: true, + validatorsProvider: &stubValidatorsProvider{ + validators: validators, + balances: balances, + }, + } + + updated, err := s.summarizeEpoch(ctx, &metadata{}, epoch) + require.NoError(t, err) + require.False(t, updated) + require.Empty(t, chainDB.summariesSet, + "corrupt summary written to t_epoch_summaries despite zero-balance guard") + + logged := buf.String() + require.True(t, + strings.Contains(logged, "No validator balances available; cannot summarize epoch"), + "warn log missing expected stable message text; got: %s", logged) + require.True(t, + strings.Contains(logged, `"level":"warn"`), + "zero-balance log emitted at unexpected level; got: %s", logged) +} + +// TestSetLagGaugesWiring drives setLagGauges with synthetic (summarizer, +// upstream) metadata pairs and asserts per-pipeline cursor-diff values, +// disabled-pipeline absence, and clamp-to-zero behavior in the brief race +// window where a downstream cursor is read ahead of its upstream. func TestSetLagGaugesWiring(t *testing.T) { originalGauge := summarizerLagEpochs summarizerLagEpochs = prometheus.NewGaugeVec(prometheus.GaugeOpts{ @@ -125,96 +225,126 @@ func TestSetLagGaugesWiring(t *testing.T) { defer func() { summarizerLagEpochs = originalGauge }() tests := []struct { - name string - service *Service - md *metadata - finalizedEpoch phase0.Epoch - wantSeries int - wantValues map[string]float64 + name string + service *Service + md *metadata + upstream *upstreamMetadata + wantSeries int + wantValues map[string]float64 }{ { - name: "all pipelines enabled", + name: "all caught up", service: &Service{ epochSummaries: true, blockSummaries: true, validatorSummaries: true, }, md: &metadata{ - LastEpoch: 7, - LastBlockEpoch: 5, - LastValidatorEpoch: 4, + LastEpoch: 10, + LastBlockEpoch: 10, + LastValidatorEpoch: 10, }, - finalizedEpoch: 11, - wantSeries: 3, + upstream: &upstreamMetadata{LatestBalancesEpoch: 10}, + wantSeries: 3, wantValues: map[string]float64{ - "epoch": 3, // targetEpoch=10, 10-7 - "block": 5, // 10-5 - "validator": 6, // 10-4 + "epoch": 0, + "block": 0, + "validator": 0, }, }, { - name: "block pipeline disabled", + name: "upstream ahead of summarizer", service: &Service{ epochSummaries: true, - blockSummaries: false, + blockSummaries: true, validatorSummaries: true, }, md: &metadata{ - LastEpoch: 9, - LastValidatorEpoch: 8, + LastEpoch: 7, + LastBlockEpoch: 7, + LastValidatorEpoch: 7, }, - finalizedEpoch: 11, - wantSeries: 2, + upstream: &upstreamMetadata{LatestBalancesEpoch: 10}, + wantSeries: 3, wantValues: map[string]float64{ - "epoch": 1, - "validator": 2, + "epoch": 3, // 10 - 7 + "block": 0, // summarizer epoch == sub-pipelines + "validator": 0, }, }, { - name: "only epoch pipeline", + name: "intra-summarizer lag (block + validator behind epoch)", service: &Service{ - epochSummaries: true, + epochSummaries: true, + blockSummaries: true, + validatorSummaries: true, + }, + md: &metadata{ + LastEpoch: 10, + LastBlockEpoch: 5, + LastValidatorEpoch: 4, + }, + upstream: &upstreamMetadata{LatestBalancesEpoch: 10}, + wantSeries: 3, + wantValues: map[string]float64{ + "epoch": 0, // 10 - 10 + "block": 5, // 10 - 5 + "validator": 6, // 10 - 4 }, - md: &metadata{LastEpoch: 9}, - finalizedEpoch: 11, - wantSeries: 1, - wantValues: map[string]float64{"epoch": 1}, - }, - { - name: "no pipelines enabled", - service: &Service{}, - md: &metadata{}, - finalizedEpoch: 11, - wantSeries: 0, - wantValues: map[string]float64{}, }, { - name: "fully caught up - lag is zero not negative", + name: "race window — summarizer briefly ahead of upstream", service: &Service{ epochSummaries: true, blockSummaries: true, validatorSummaries: true, }, md: &metadata{ - LastEpoch: 10, - LastBlockEpoch: 10, - LastValidatorEpoch: 10, + LastEpoch: 12, + LastBlockEpoch: 12, + LastValidatorEpoch: 12, }, - finalizedEpoch: 11, - wantSeries: 3, + upstream: &upstreamMetadata{LatestBalancesEpoch: 10}, + wantSeries: 3, wantValues: map[string]float64{ - "epoch": 0, + "epoch": 0, // clamped, not 1.8e19 "block": 0, "validator": 0, }, }, + { + name: "block pipeline disabled", + service: &Service{ + epochSummaries: true, + blockSummaries: false, + validatorSummaries: true, + }, + md: &metadata{ + LastEpoch: 9, + LastValidatorEpoch: 8, + }, + upstream: &upstreamMetadata{LatestBalancesEpoch: 11}, + wantSeries: 2, + wantValues: map[string]float64{ + "epoch": 2, // 11 - 9 + "validator": 1, // 9 - 8 + }, + }, + { + name: "no pipelines enabled", + service: &Service{}, + md: &metadata{}, + upstream: &upstreamMetadata{}, + wantSeries: 0, + wantValues: map[string]float64{}, + }, } for _, test := range tests { t.Run(test.name, func(t *testing.T) { summarizerLagEpochs.Reset() - test.service.setLagGauges(test.md, test.finalizedEpoch) + test.service.setLagGauges(test.md, test.upstream) require.Equal(t, test.wantSeries, testutil.CollectAndCount(summarizerLagEpochs), "unexpected number of label series emitted") diff --git a/services/summarizer/standard/metadata.go b/services/summarizer/standard/metadata.go index bdef89a..e723b93 100644 --- a/services/summarizer/standard/metadata.go +++ b/services/summarizer/standard/metadata.go @@ -82,3 +82,47 @@ type oldmetadata struct { LastValidatorDay int64 `json:"last_validator_day"` PeriodicValidatorRollups bool `json:"periodic_validator_rollups"` } + +// upstreamMetadataKey is the metadata key written by the validators service. +// Mirrored here (rather than imported) so the summarizer stays decoupled from +// the validators package. Format contract owned by services/validators/standard. +const upstreamMetadataKey = "validators.standard" + +// upstreamMetadata captures the subset of validators.standard metadata that the +// per-pipeline lag gauge needs. Only LatestBalancesEpoch is consumed today; +// the struct exists so future fields can be added without changing the call +// site. +type upstreamMetadata struct { + LatestBalancesEpoch phase0.Epoch `json:"latest_balances_epoch"` +} + +// oldUpstreamMetadata mirrors upstreamMetadata for the pre-0.8.8 unquoted-int +// JSON format the validators service may have written. +type oldUpstreamMetadata struct { + LatestBalancesEpoch uint64 `json:"latest_balances_epoch"` +} + +// getUpstreamMetadata reads the validators.standard metadata row used to +// compute the epoch-pipeline lag. Returns a zero-valued struct (not an error) +// when the row is absent — bootstrap state where the validators service has +// not yet written metadata is a normal condition, and the cursor diff in +// setLagGauges naturally clamps to 0 in that case. +func (s *Service) getUpstreamMetadata(ctx context.Context) (*upstreamMetadata, error) { + md := &upstreamMetadata{} + mdJSON, err := s.chainDB.Metadata(ctx, upstreamMetadataKey) + if err != nil { + return nil, errors.Wrap(err, "failed to fetch upstream validators metadata") + } + if mdJSON == nil { + return md, nil + } + if err := json.Unmarshal(mdJSON, md); err != nil { + // Try the old format. Same dual-path pattern as getMetadata above. + omd := &oldUpstreamMetadata{} + if err := json.Unmarshal(mdJSON, omd); err != nil { + return nil, errors.Wrap(err, "failed to unmarshal upstream validators metadata") + } + md.LatestBalancesEpoch = phase0.Epoch(omd.LatestBalancesEpoch) + } + return md, nil +} From 4adffb650c63409d0e3fa0b7a17d9c606ad47e76 Mon Sep 17 00:00:00 2001 From: AntiD2ta Date: Wed, 6 May 2026 21:25:17 +0200 Subject: [PATCH 04/25] Document summarizer Prometheus metrics and fix stale lag_epochs help MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add a chaind_summarizer_* section to docs/prometheus.md. All seven summarizer metrics were undocumented (six pre-date this branch — the gap dates to whenever chaind_summarizer_* was introduced — plus the new lag_epochs from this branch). Documenting the full set in one place catches the doc up with reality and gives operators a complete reference for the new alert. Fix the Help string on summarizerLagEpochs: it claimed the gauge measures lag against "the finality target", which was true under the original formula in commit 066bb38 but no longer matches the cursor- diff formula introduced in the previous commit. Replace with "lags its direct upstream cursor" so /metrics, the docs, and the ADR all agree on the gauge's contract. --- docs/prometheus.md | 7 +++++++ services/summarizer/standard/metrics.go | 2 +- 2 files changed, 8 insertions(+), 1 deletion(-) diff --git a/docs/prometheus.md b/docs/prometheus.md index 9d44d63..b405ec1 100644 --- a/docs/prometheus.md +++ b/docs/prometheus.md @@ -24,6 +24,13 @@ Operations metrics provide information about numbers of operations performed. T - `chaind_finalizer_latest_epoch` latest epoch processed by the finalizer module this run of chaind - `chaind_proposerduties_epochs_processed` number of epochs processed by the proposer duties module this run of chaind - `chaind_proposerduties_latest_epoch` latest epoch processed by the proposer duties module this run of chaind + - `chaind_summarizer_epochs_processed_total` number of epochs processed by the summarizer module this run of chaind + - `chaind_summarizer_latest_epoch` latest epoch processed by the summarizer module this run of chaind + - `chaind_summarizer_days_processed_total` number of days processed by the daily-rollup submodule of the summarizer module this run of chaind + - `chaind_summarizer_latest_day` latest day (Unix timestamp of midnight UTC) processed by the daily-rollup submodule of the summarizer module this run of chaind + - `chaind_summarizer_balance_prune_ts` Unix timestamp of the last validator-balance prune run by the summarizer module + - `chaind_summarizer_epoch_prune_ts` Unix timestamp of the last epoch-summary prune run by the summarizer module + - `chaind_summarizer_lag_epochs{pipeline="epoch|block|validator"}` number of epochs by which each summarizer pipeline lags its direct upstream cursor: `pipeline=epoch` is the gap between `validators.latest_balances_epoch` and the summarizer's epoch cursor (i.e. how far the upstream balances pipeline is ahead of summarization); `pipeline=block` and `pipeline=validator` are the gaps between the summarizer's epoch cursor and its own block / validator sub-pipeline cursors. Values are clamped at zero — disabled pipelines (e.g. `summarizer.blocks.enable=false`) produce no series for that label - `chaind_validators_epochs_processed` number of epochs processed by the validators module this run of chaind - `chaind_validators_latest_epoch` latest epoch processed by the validators module this run of chaind - `chaind_validators_balances_epochs_processed` number of epochs processed by the balances submodule of the validators module this run of chaind diff --git a/services/summarizer/standard/metrics.go b/services/summarizer/standard/metrics.go index 42b56a4..76e81f2 100644 --- a/services/summarizer/standard/metrics.go +++ b/services/summarizer/standard/metrics.go @@ -124,7 +124,7 @@ func registerPrometheusMetrics() error { summarizerLagEpochs = prometheus.NewGaugeVec(prometheus.GaugeOpts{ Namespace: metricsNamespace, Name: "lag_epochs", - Help: "Number of epochs by which each summarizer pipeline lags the finality target", + Help: "Number of epochs by which each summarizer pipeline lags its direct upstream cursor", }, []string{"pipeline"}) if err := prometheus.Register(summarizerLagEpochs); err != nil { return errors.Wrap(err, "failed to register lag_epochs") From 6cebf289c3dcfddd4ee5bbb90d74ef09c3fa2c94 Mon Sep 17 00:00:00 2001 From: AntiD2ta Date: Wed, 6 May 2026 22:12:11 +0200 Subject: [PATCH 05/25] Anchor TestService skip pattern in summarizer TestMain `-test.skip` is an unanchored regex, so the previous "TestService" pattern would also skip any future test whose name contains that substring (e.g. a hypothetical "TestServiceConfiguration"). Anchor with ^...$ so only the exact env-gated TestService is suppressed when CHAINDB_URL / ETH2CLIENT_ADDRESS are absent. --- services/summarizer/standard/main_test.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/summarizer/standard/main_test.go b/services/summarizer/standard/main_test.go index ce80738..5560313 100644 --- a/services/summarizer/standard/main_test.go +++ b/services/summarizer/standard/main_test.go @@ -29,7 +29,7 @@ func TestMain(m *testing.M) { // test by name and still call m.Run(), so internal-package unit tests like // the lag-gauge wiring tests run unconditionally under `go test ./...`. if os.Getenv("CHAINDB_URL") == "" || os.Getenv("ETH2CLIENT_ADDRESS") == "" { - _ = flag.Set("test.skip", "TestService") + _ = flag.Set("test.skip", "^TestService$") } os.Exit(m.Run()) } From 8d9e6f2e8627b6c5f6db62e55e037247d5a39b15 Mon Sep 17 00:00:00 2001 From: AntiD2ta Date: Thu, 7 May 2026 12:06:52 +0200 Subject: [PATCH 06/25] Mirror zero-balance assertion at the day-summarizer layer MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Extend the issue-10 zero-balance guard pattern (commit 1edcb45) from the epoch-summarizer to the day-summarizer in addValidatorBalanceSummaries. Both ValidatorBalancesByEpoch call sites at validatorday.go (the startBalances fetch and the endBalances fetch) previously carried the same dead len(...) == 0 guard that the issue-7 audit (.scratch/code-reading-notes-balance-fetcher.md § Q4) flagged as a residual landmine: the LEFT JOIN in ValidatorBalancesByEpoch returns one COALESCE(f_balance, 0) row per validator when t_validator_balances is empty for an epoch, so the naive length check cannot detect missing balance rows in steady state. Currently unreachable because of the upstream invariant chain (issue-10 epoch assertion → md.LastEpoch gating + prune safety clamp) but exposed to a manual DELETE targeting an epoch-summarized-but-not-yet-day-summarized epoch, and to any future refactor that decouples the day cursor from the epoch cursor. Key decisions: - Single predicate validatorBalancesPresent collapses the bootstrap (len == 0) and production-state (all-zero rows) shapes into one trip condition, single-sourcing the alert-annotation contract at each call site. Slightly different factoring than epoch.go's two inline branches (which need the active-balance sum for other reasons) — the day-layer has no such constraint, so the early-exit scan is strictly cheaper and more readable. - Warn-log message text reused verbatim ("No validator balances available; cannot summarize epoch (will retry on next finality tick)") so the lag-based alert rule from PRD-A issue 3 pattern-matches without modification. The "epoch" wording is preserved deliberately and the comment at each call site spells out why; structured Time("start_time", …) / Time("end_time", …) zerolog fields carry the day-layer identity for operator disambiguation. - Both guards return (false, nil), matching the existing function contract — the parent summarizeValidatorsInDay short-circuits before BeginTx, so no row is written to t_validator_day_summaries. - No changes to ValidatorBalancesByEpoch or to retention / pruning logic — the team's option-B rejection from issue 10 stands; the LEFT JOIN exists for a real reason, and consumer-side guards remain the chosen pattern. Tests: TestAddValidatorBalanceSummariesZeroBalanceAssertion drives addValidatorBalanceSummaries directly with two table-driven subtests (startBalances all zero, endBalances all zero), each constructing a 1000-validator slice with Balance=0 / EffectiveBalance=0 to mimic the LEFT JOIN shape. Asserts (false, nil) return, the alert-contract warn log fires with the exact message string and structured start_time / end_time fields, and the SetValidatorDaySummaries recorder stays empty (structural-impossibility witness — the function under test never calls the setter, the parent does after a (true, nil) return, so the recorded slice can only be non-empty if a future refactor breaks the short- circuit contract). Tests run unconditionally under gosilent test ./... — no env gating. Files changed: - services/summarizer/standard/validatorday.go - services/summarizer/standard/validatorday_internal_test.go (new) Notes for next iteration: - The .scratch/code-reading-notes-balance-fetcher.md § Q4 residual- landmine entry has been updated locally to record the fix; the scratch tree is git-excluded and so does not appear in the diff. - ADR 0001 already documents the zero-balance-assertion pattern in its "Amendment 1" section; this commit extends the implementation to a second call site without changing the design, so no ADR amendment is filed. - Issue 9 (PRD-C verdict) and the upstream playbook-driven HITL slices remain the next pickups; this defense-in-depth fix is independent of PRD-C convergence. --- services/summarizer/standard/validatorday.go | 48 +++- .../standard/validatorday_internal_test.go | 229 ++++++++++++++++++ 2 files changed, 273 insertions(+), 4 deletions(-) create mode 100644 services/summarizer/standard/validatorday_internal_test.go diff --git a/services/summarizer/standard/validatorday.go b/services/summarizer/standard/validatorday.go index 4e792be..e264aac 100644 --- a/services/summarizer/standard/validatorday.go +++ b/services/summarizer/standard/validatorday.go @@ -225,8 +225,21 @@ func (s *Service) addValidatorBalanceSummaries(ctx context.Context, if err != nil { return false, errors.Wrap(err, "failed to obtain validator start epoch balances") } - if len(startBalances) == 0 { - // Balances are not yet present. + if !validatorBalancesPresent(startBalances) { + // Bootstrap path (len == 0): t_validators not yet populated, so the + // LEFT JOIN in ValidatorBalancesByEpoch returns no rows. + // Production-state path (all-zero rows): t_validators is populated + // but no t_validator_balances rows exist for this epoch, so the + // LEFT JOIN returns one COALESCE(f_balance, 0) row per validator — + // len() cannot detect this case. Both branches share one warn + // message so the alert annotation contract from issue 2 / issue 10 + // is preserved. The "epoch" wording in the message is preserved + // verbatim to match the alert rule; structured start_time/end_time + // fields carry the day-layer identity so operators can disambiguate + // from epoch-layer trips. + // Alert annotation contract: this stable message text is matched by the lag-based alert + // rule; do not change without coordinating the alert update. + log.Warn().Time("start_time", startTime).Time("end_time", endTime).Msg("No validator balances available; cannot summarize epoch (will retry on next finality tick)") return false, nil } for _, startBalance := range startBalances { @@ -281,8 +294,18 @@ func (s *Service) addValidatorBalanceSummaries(ctx context.Context, if err != nil { return false, errors.Wrap(err, "failed to obtain validator end epoch balances") } - if len(endBalances) == 0 { - // Balances are not yet present. + if !validatorBalancesPresent(endBalances) { + // Same dual-branch guard as the startBalances fetch above: + // len(endBalances) == 0 covers the bootstrap path; the all-zero + // case covers the production-state LEFT JOIN shape that defeats + // the naive length check. Reuse the exact warn-log text so the + // alert annotation contract stays single-sourced. The "epoch" + // wording in the message is preserved verbatim to match the alert + // rule; structured start_time/end_time fields carry the day-layer + // identity so operators can disambiguate from epoch-layer trips. + // Alert annotation contract: this stable message text is matched by the lag-based alert + // rule; do not change without coordinating the alert update. + log.Warn().Time("start_time", startTime).Time("end_time", endTime).Msg("No validator balances available; cannot summarize epoch (will retry on next finality tick)") return false, nil } for _, endBalance := range endBalances { @@ -418,3 +441,20 @@ func (s *Service) syncCommitteesForEpochs(ctx context.Context, return syncCommittees, nil } + +// validatorBalancesPresent reports whether the slice contains at least one +// row with a non-zero balance — the inverse of the LEFT-JOIN-all-zeros +// shape produced by ValidatorBalancesByEpoch when t_validator_balances has +// no rows for the queried epoch but t_validators is populated. An empty +// slice (the bootstrap path before t_validators exists) also reports +// false, so callers get a single guard for both shapes. See +// services/summarizer/standard/epoch.go for the equivalent epoch-layer +// guard pattern from commit 1edcb45. +func validatorBalancesPresent(balances []*chaindb.ValidatorBalance) bool { + for _, b := range balances { + if b.Balance != 0 || b.EffectiveBalance != 0 { + return true + } + } + return false +} diff --git a/services/summarizer/standard/validatorday_internal_test.go b/services/summarizer/standard/validatorday_internal_test.go new file mode 100644 index 0000000..340efaa --- /dev/null +++ b/services/summarizer/standard/validatorday_internal_test.go @@ -0,0 +1,229 @@ +// Copyright © 2026 Weald Technology Limited. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package standard + +import ( + "bytes" + "context" + "strings" + "testing" + "time" + + "github.com/attestantio/go-eth2-client/spec/phase0" + "github.com/rs/zerolog" + "github.com/stretchr/testify/require" + "github.com/wealdtech/chaind/services/chaindb" + mockchaintime "github.com/wealdtech/chaind/services/chaintime/mock" +) + +// stubBalanceChainDB is a chaindb.Service that also implements the four +// interfaces addValidatorBalanceSummaries type-asserts against: +// ValidatorsProvider, DepositsProvider, WithdrawalsProvider, and +// ValidatorDaySummariesSetter. The balance responses are popped from a +// queue so a single fixture can drive the start-balances and end-balances +// trip paths from one slice of test inputs. daySummariesSet records every +// SetValidatorDaySummaries call so the test can assert that the corrupt- +// summary guard prevents writes. +type stubBalanceChainDB struct { + balancesResponses [][]*chaindb.ValidatorBalance + balancesCallCount int + daySummariesSet [][]*chaindb.ValidatorDaySummary +} + +// chaindb.Service. +func (c *stubBalanceChainDB) BeginTx(ctx context.Context) (context.Context, context.CancelFunc, error) { + return ctx, func() {}, nil +} + +func (c *stubBalanceChainDB) CommitTx(_ context.Context) error { return nil } +func (c *stubBalanceChainDB) BeginROTx(ctx context.Context) (context.Context, error) { return ctx, nil } +func (c *stubBalanceChainDB) CommitROTx(_ context.Context) {} +func (c *stubBalanceChainDB) SetMetadata(_ context.Context, _ string, _ []byte) error { return nil } +func (c *stubBalanceChainDB) Metadata(_ context.Context, _ string) ([]byte, error) { return nil, nil } + +// chaindb.ValidatorsProvider. +func (c *stubBalanceChainDB) Validators(_ context.Context) ([]*chaindb.Validator, error) { + return nil, nil +} + +func (c *stubBalanceChainDB) ValidatorsByPublicKey(_ context.Context, _ []phase0.BLSPubKey) (map[phase0.BLSPubKey]*chaindb.Validator, error) { + return nil, nil +} + +func (c *stubBalanceChainDB) ValidatorsByIndex(_ context.Context, _ []phase0.ValidatorIndex) (map[phase0.ValidatorIndex]*chaindb.Validator, error) { + return nil, nil +} + +func (c *stubBalanceChainDB) ValidatorBalancesByEpoch(_ context.Context, _ phase0.Epoch) ([]*chaindb.ValidatorBalance, error) { + if c.balancesCallCount >= len(c.balancesResponses) { + return nil, nil + } + resp := c.balancesResponses[c.balancesCallCount] + c.balancesCallCount++ + return resp, nil +} + +func (c *stubBalanceChainDB) ValidatorBalancesByIndexAndEpoch(_ context.Context, _ []phase0.ValidatorIndex, _ phase0.Epoch) (map[phase0.ValidatorIndex]*chaindb.ValidatorBalance, error) { + return nil, nil +} + +func (c *stubBalanceChainDB) ValidatorBalancesByIndexAndEpochRange(_ context.Context, _ []phase0.ValidatorIndex, _ phase0.Epoch, _ phase0.Epoch) (map[phase0.ValidatorIndex][]*chaindb.ValidatorBalance, error) { + return nil, nil +} + +func (c *stubBalanceChainDB) ValidatorBalancesByIndexAndEpochs(_ context.Context, _ []phase0.ValidatorIndex, _ []phase0.Epoch) (map[phase0.ValidatorIndex][]*chaindb.ValidatorBalance, error) { + return nil, nil +} + +// chaindb.DepositsProvider. +func (c *stubBalanceChainDB) DepositsByPublicKey(_ context.Context, _ []phase0.BLSPubKey) (map[phase0.BLSPubKey][]*chaindb.Deposit, error) { + return nil, nil +} + +func (c *stubBalanceChainDB) DepositsForSlotRange(_ context.Context, _ phase0.Slot, _ phase0.Slot) ([]*chaindb.Deposit, error) { + return nil, nil +} + +// chaindb.WithdrawalsProvider. +func (c *stubBalanceChainDB) Withdrawals(_ context.Context, _ *chaindb.WithdrawalFilter) ([]*chaindb.Withdrawal, error) { + return nil, nil +} + +// chaindb.ValidatorDaySummariesSetter — addValidatorBalanceSummaries does +// not call these directly; the parent summarizeValidatorsInDay does, after +// addValidatorBalanceSummaries returns (true, nil). Recording these calls +// here is a structural-impossibility witness: the test asserts the +// recording slice stays empty, which is the test's way of documenting the +// guard contract (false, nil) → caller short-circuits → no day summary +// written. Mirrors the recordingChainDB pattern in handler_internal_test.go. +func (c *stubBalanceChainDB) SetValidatorDaySummary(_ context.Context, _ *chaindb.ValidatorDaySummary) error { + return nil +} + +func (c *stubBalanceChainDB) SetValidatorDaySummaries(_ context.Context, summaries []*chaindb.ValidatorDaySummary) error { + c.daySummariesSet = append(c.daySummariesSet, summaries) + return nil +} + +// TestAddValidatorBalanceSummariesZeroBalanceAssertion drives +// addValidatorBalanceSummaries with the LEFT-JOIN-all-zero shape that +// ValidatorBalancesByEpoch returns when no t_validator_balances rows +// exist for an epoch but t_validators is populated, mirroring the +// epoch-layer guard from commit 1edcb45 (issue 10). Both call sites +// (startBalances at validatorday.go:224 and endBalances at :280) must +// refuse to advance: return (false, nil), emit the alert-contract warn +// log with the same stable message string, and write nothing to +// t_validator_day_summaries. +func TestAddValidatorBalanceSummariesZeroBalanceAssertion(t *testing.T) { + const numValidators = 1000 + + makeZeroBalances := func() []*chaindb.ValidatorBalance { + balances := make([]*chaindb.ValidatorBalance, numValidators) + for i := range numValidators { + balances[i] = &chaindb.ValidatorBalance{ + Index: phase0.ValidatorIndex(i), + Balance: 0, + EffectiveBalance: 0, + } + } + return balances + } + + makeNonZeroBalances := func() []*chaindb.ValidatorBalance { + balances := make([]*chaindb.ValidatorBalance, numValidators) + for i := range numValidators { + balances[i] = &chaindb.ValidatorBalance{ + Index: phase0.ValidatorIndex(i), + Balance: 32_000_000_000, + EffectiveBalance: 32_000_000_000, + } + } + return balances + } + + tests := []struct { + name string + balancesResponses [][]*chaindb.ValidatorBalance + }{ + { + // Bootstrap-or-production-state path 1: the very first + // ValidatorBalancesByEpoch call returns all-zero rows; the + // startBalances guard must trip immediately. + name: "startBalances all zero", + balancesResponses: [][]*chaindb.ValidatorBalance{ + makeZeroBalances(), + }, + }, + { + // Path 2: startBalances are valid, function proceeds past + // deposits/withdrawals, then endBalances trips the guard. + name: "endBalances all zero", + balancesResponses: [][]*chaindb.ValidatorBalance{ + makeNonZeroBalances(), + makeZeroBalances(), + }, + }, + } + + for _, test := range tests { + t.Run(test.name, func(t *testing.T) { + ctx := context.Background() + + // main_test.go sets the global level to Disabled; locally + // raise it so the buffer logger below actually emits. + originalGlobalLevel := zerolog.GlobalLevel() + zerolog.SetGlobalLevel(zerolog.TraceLevel) + defer zerolog.SetGlobalLevel(originalGlobalLevel) + + var buf bytes.Buffer + originalLog := log + log = zerolog.New(&buf).Level(zerolog.WarnLevel) + defer func() { log = originalLog }() + + chainDB := &stubBalanceChainDB{ + balancesResponses: test.balancesResponses, + } + + s := &Service{ + chainDB: chainDB, + chainTime: mockchaintime.New(), + } + + daySummaries := make(map[phase0.ValidatorIndex]*chaindb.ValidatorDaySummary) + startTime := time.Date(2024, 1, 1, 0, 0, 0, 0, time.UTC) + endTime := startTime.AddDate(0, 0, 1) + + found, err := s.addValidatorBalanceSummaries(ctx, daySummaries, startTime, endTime) + require.NoError(t, err) + require.False(t, found, + "guard should refuse to compute a corrupt day summary when balances are all zero") + require.Empty(t, chainDB.daySummariesSet, + "guard contract violation: when addValidatorBalanceSummaries returns (false, nil) the parent summarizeValidatorsInDay never reaches SetValidatorDaySummaries — recording any write would indicate the contract was broken") + + logged := buf.String() + require.True(t, + strings.Contains(logged, "No validator balances available; cannot summarize epoch (will retry on next finality tick)"), + "warn log missing exact alert-contract message text; got: %s", logged) + require.True(t, + strings.Contains(logged, `"level":"warn"`), + "zero-balance log emitted at unexpected level; got: %s", logged) + require.True(t, + strings.Contains(logged, `"start_time"`), + "warn log missing structured start_time field for day-layer disambiguation; got: %s", logged) + require.True(t, + strings.Contains(logged, `"end_time"`), + "warn log missing structured end_time field for day-layer disambiguation; got: %s", logged) + }) + } +} From 7b5b0835dfd35c03c4c7dd4f88de372e8a85d56a Mon Sep 17 00:00:00 2001 From: AntiD2ta Date: Thu, 7 May 2026 12:14:02 +0200 Subject: [PATCH 07/25] Fix PruneValidator* doc/code drift: comments now match SQL semantics MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The interface and implementation comments for PruneValidatorBalances and PruneValidatorEpochSummaries claimed the cutoff was "up to (but not including) the given epoch", but both SQL statements emit `f_epoch <= $1` — inclusive of the boundary epoch. The caller in services/summarizer/standard/prune.go computes `pruneEpoch` as the last epoch outside the retention window and passes it directly to the pruner; "delete everything ≤ pruneEpoch" is the intended semantics, and the SQL is correct. The doc was wrong in both places (interface declaration in chaindb/service.go and the two postgresql/ impl comments). Flagged in issue 7's code-reading audit (`.scratch/code-reading-notes-balance-fetcher.md` Q4) and surfaced again in issue 9's "Open questions for the verdict author" list as a "minor doc/code inconsistency, not a load-bearing bug." Fixing it now keeps the verdict-author distraction list shorter and removes a future-rot hazard before someone reads the doc, trusts it, and writes a caller that relies on exclusive semantics. Key decisions: - Doc updated to match SQL, not the other way around. Changing `<=` to `<` in the SQL would silently retain one extra epoch of every prune cycle's data — a behavior change with retention/disk implications, not a wording fix. No caller exists that depends on the exclusive reading. - "the given point" in the epoch-summaries doc replaced with "the given epoch" to match the function's `to phase0.Epoch` parameter and the balances-pruner sibling for terminological consistency. - No tests touched — the postgresql package has no unit tests for these pruners (the layer is integration-tested via the broader chaindb setup), and a doc-only fix introduces no behavior to assert. Files changed: - services/chaindb/service.go (interface comments for both pruners) - services/chaindb/postgresql/validators.go (impl comment for balances) - services/chaindb/postgresql/validatorepochsummaries.go (impl comment for epoch summaries) Notes for next iteration: - Issue 9 (PRD-C verdict synthesis) remains the only open issue and is HITL — production chaindb-d04 access plus human classification judgment. No AFK successor is queued in `.scratch/issues/`. - Pre-existing diagnostics surfaced incidentally during this edit (blocks.go:895 nilness, validators.go:523/598 slicessort, validatorepochsummaries.go:221/228/234/251 QF1012) are unrelated to this change and are left for a dedicated cleanup pass. - The other "Open question for the verdict author" from issue 7's audit — the vestigial `MissedEpochs` field at services/validators/standard/metadata.go:28 — is a larger judgement call (schema/JSON-compat with persisted t_metadata rows) and was not attempted in this slice. --- services/chaindb/postgresql/validatorepochsummaries.go | 2 +- services/chaindb/postgresql/validators.go | 2 +- services/chaindb/service.go | 4 ++-- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/services/chaindb/postgresql/validatorepochsummaries.go b/services/chaindb/postgresql/validatorepochsummaries.go index 02c19ba..4fe2719 100644 --- a/services/chaindb/postgresql/validatorepochsummaries.go +++ b/services/chaindb/postgresql/validatorepochsummaries.go @@ -517,7 +517,7 @@ WHERE f_validator_index = $1 return summary, nil } -// PruneValidatorEpochSummaries prunes validator epoch summaries up to (but not including) the given point. +// PruneValidatorEpochSummaries prunes validator epoch summaries up to and including the given epoch. func (s *Service) PruneValidatorEpochSummaries(ctx context.Context, to phase0.Epoch, retain []phase0.BLSPubKey) error { ctx, span := otel.Tracer("wealdtech.chaind.services.chaindb.postgresql").Start(ctx, "PruneValidatorEpochSummaries") defer span.End() diff --git a/services/chaindb/postgresql/validators.go b/services/chaindb/postgresql/validators.go index fa55980..2715522 100644 --- a/services/chaindb/postgresql/validators.go +++ b/services/chaindb/postgresql/validators.go @@ -707,7 +707,7 @@ func validatorBalanceFromRow(rows pgx.Rows) (*chaindb.ValidatorBalance, error) { return validatorBalance, nil } -// PruneValidatorBalances prunes validator balances up to (but not including) the given epoch. +// PruneValidatorBalances prunes validator balances up to and including the given epoch. func (s *Service) PruneValidatorBalances(ctx context.Context, to phase0.Epoch, retain []phase0.BLSPubKey) error { ctx, span := otel.Tracer("wealdtech.chaind.services.chaindb.postgresql").Start(ctx, "PruneValidatorBalances") defer span.End() diff --git a/services/chaindb/service.go b/services/chaindb/service.go index b013438..1c951ee 100644 --- a/services/chaindb/service.go +++ b/services/chaindb/service.go @@ -384,7 +384,7 @@ type AggregateValidatorBalancesProvider interface { // ValidatorBalancesPruner defines functions to prune validator balances. type ValidatorBalancesPruner interface { - // PruneValidatorBalances prunes validator balances up to (but not including) the given epoch. + // PruneValidatorBalances prunes validator balances up to and including the given epoch. PruneValidatorBalances(ctx context.Context, to phase0.Epoch, retain []phase0.BLSPubKey) error } @@ -451,7 +451,7 @@ type ValidatorEpochSummariesProvider interface { // ValidatorEpochSummariesPruner defines functions to prune validator epoch summaries. type ValidatorEpochSummariesPruner interface { - // PruneValidatorEpochSummaries prunes validator epoch summaries up to (but not including) the given point. + // PruneValidatorEpochSummaries prunes validator epoch summaries up to and including the given epoch. PruneValidatorEpochSummaries(ctx context.Context, to phase0.Epoch, retain []phase0.BLSPubKey) error } From 7e9f3b66fd43b11d55e39637fcb3f8189553a258 Mon Sep 17 00:00:00 2001 From: AntiD2ta Date: Thu, 7 May 2026 19:04:37 +0200 Subject: [PATCH 08/25] Close balance-fetcher root-cause investigation; document recovery model MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The investigation slice on this branch closes with three in-tree artifacts: an ADR documenting the validator-balance fetcher's persistence-and-recovery contract; a bug-confirming test that empirically reproduces the silent-cursor-advance shape that caused the 2026-04-07 chaindb-d04 outage; and deprecation comments on the vestigial MissedEpochs field across three services. Production forensic capture against chaindb-d04 reframed the 2026-04-07 incident from "singular event" to "one of 113 historical clusters totalling 828 corrupted t_epoch_summaries rows across ~9 months of operation." Cluster sizes range from 1 epoch (~6 min on Hoodi) to 159 epochs (~17 hours); the 2026-04-07 incident is at the small end. The investigation classified the root cause as a beacon-side miss (upstream condition: the beacon endpoint occasionally returns an empty 200 OK validators response) combined with a chaind-side amplifier in onEpochTransitionValidatorBalancesForEpoch that silently advances md.LatestBalancesEpoch when the beacon returns zero validators or validators whose balances are all zero. In-tree changes: - services/validators/standard/handler_internal_test.go (new) — first test file in this package. Four table-driven cases exercise onEpochTransitionValidatorBalancesForEpoch with hand-built stubs (eth2client.ValidatorsProvider, chaindb.ValidatorsSetter, chaindb.Service, chaintime.Service) modelled on the precedent set by services/summarizer/standard/handler_internal_test.go. Cases: (1) empty 200-OK validators map, (2) all-zero-balance validators amplified through the "do not store 0 balances" filter at handler.go:206-208, (3) mixed validators sanity, (4) beacon error sanity. Cases 1 and 2 currently PASS — confirming the bug exists. Once the focused fix lands, cases 1 and 2 invert from "cursor advanced" to "cursor unchanged, error returned" and the test serves as the post-fix regression boundary. No production code touched. - docs/adr/0002-validator-balance-fetcher-recovery-model.md (new) — second ADR in the repo, MADR template matching ADR 0001's style. Records the fetcher's persistence-and-recovery contract as "cursor-trusting recovery with empty-response guarded": atomic co-commit of cursor + balance rows (already enforced); no-empty- write rejection of zero-validator and all-zero-balance responses (must be enforced — currently not — a follow-up brings the implementation into compliance); cursor-trusting recovery on restart (already enforced). Considered alternatives include a MissedEpochs queue (Option 4), explicitly rejected because the consumer code in proposerduties has no population path and the field is residue from an abandoned design. Open questions for upstream review noted at the end. - services/validators/standard/metadata.go, services/finalizer/standard/metadata.go, services/proposerduties/standard/metadata.go (modified) — // Deprecated: comments on each MissedEpochs field declaration. All three services share the same residue: declared, JSON-migrated for backward compat, never populated by code in this repo. The proposerduties variant additionally notes that service.go:166-204 (handleMissed) reads from a permanently-empty list and is dormant code retained alongside the field. No behavioural change; the comments serve as a footgun warning for future maintainers and as a discoverable cross-reference to ADR 0002 Option 4. Key decisions: - Empirical-confirmation test landed BEFORE the fix. This makes the bug diff-visible to reviewers, locks the post-fix contract as a regression boundary, and gives the documented contract an empirical backbone. Test names spell out the bug ("...AdvancesCursor") so grep + git blame on a future regression point immediately at the defect class. - ADR documents the contract that the follow-up fix brings the implementation into compliance with — not the buggy current state. ADR 0001's precedent uses "Amendment N" sections for follow-up changes; this ADR may need an amendment when the no-empty-write fix lands, depending on whether reviewers prefer "ADR matches as-shipped behaviour" or "ADR documents intended contract, fix tracks compliance." Defer to upstream maintainer preference. - MissedEpochs deprecated rather than removed. Removal would require (a) decoding/upgrading existing t_metadata JSON rows with "missed_epochs" present in chaindb instances that have been running for years, (b) deleting the dormant handleMissed consumer in proposerduties, (c) potentially breaking any out-of-tree fork that activated the field. None justified by the data. The deprecation comments make the field's status discoverable; if a future maintainer has a reason to remove it, the cost is theirs to bear with full context. Files changed: - services/validators/standard/handler_internal_test.go (new, 286 lines) - docs/adr/0002-validator-balance-fetcher-recovery-model.md (new, ~145 lines) - services/validators/standard/metadata.go (deprecation comment, 10 lines added) - services/finalizer/standard/metadata.go (deprecation comment, 8 lines added) - services/proposerduties/standard/metadata.go (deprecation comment, 9 lines added) Notes for next iteration: - The focused no-empty-write fetcher fix is the clear next pickup. ~10 lines of production code + the test inversion in the test file added here. Effort estimate ~30-60 min. - The ADR's Implementation-notes section flags a lag-gauge follow-up question that's worth attention before the fix lands: the current epoch-pipeline diff is between the fetcher cursor and the summarizer cursor, and both stall together when the fetcher refuses to advance, so the existing alert may need a refinement for the post-fix world. Not a blocker for the fix; a question about alert semantics. - PR #120 still open and awaiting upstream review. Once it merges, this branch's work is complete on the observability axis; the fetcher-fix axis is independent and can land separately. --- ...alidator-balance-fetcher-recovery-model.md | 252 ++++++++++++ services/finalizer/standard/metadata.go | 13 +- services/proposerduties/standard/metadata.go | 12 +- .../standard/handler_internal_test.go | 362 ++++++++++++++++++ services/validators/standard/metadata.go | 16 +- 5 files changed, 648 insertions(+), 7 deletions(-) create mode 100644 docs/adr/0002-validator-balance-fetcher-recovery-model.md create mode 100644 services/validators/standard/handler_internal_test.go diff --git a/docs/adr/0002-validator-balance-fetcher-recovery-model.md b/docs/adr/0002-validator-balance-fetcher-recovery-model.md new file mode 100644 index 0000000..72ab197 --- /dev/null +++ b/docs/adr/0002-validator-balance-fetcher-recovery-model.md @@ -0,0 +1,252 @@ +# 0002 — Validator-balance fetcher recovery model + +* Status: Accepted (2026-05-07). +* Date: 2026-05-07 +* Scope: `services/validators/standard` in `wealdtech/chaind`. Adjacent + consumer: `services/summarizer/standard.summarizeEpoch` (reads + `t_validator_balances` via `ValidatorBalancesByEpoch`). + +## Context and Problem Statement + +The validator-balance fetcher in `services/validators/standard` is +responsible for keeping `t_validator_balances` in lock-step with finalised +epochs. Its persistence-and-recovery contract was never written down. +The 2026-04-07 `chaindb-d04` outage and the subsequent root-cause +investigation revealed two distinct contract questions that future +maintainers will re-derive from code unless they are recorded: + +1. **Persistence atomicity** — under what failure modes can the fetcher's + own cursor (`md.LatestBalancesEpoch`) end up pointing past an epoch + for which no balance rows are durable in `t_validator_balances`? +2. **Recovery on restart** — when chaind restarts, what guarantees does + the fetcher provide about epochs that elapsed during the downtime? + +The investigation surfaced that the answer to (1) depends on which +upstream failure mode triggers it: kill-9 crashes are safe, but a +"beacon returned empty 200 OK" path can advance the cursor without any +balance rows being persisted, producing 113 historical clusters of +silent corruption (828 epochs across 9 months on a production chaindb +instance). The answer to (2) is "cursor-trusting, not gap-detecting" +— sound under crash recovery (because of (1)'s atomicity) but unsound +under any other gap source. + +This ADR records the contract as it **should** be and names the existing +points where the implementation matches and where it does not. The +implementation gap on the empty-response path is tracked as a follow-up +work item. + +## Decision Drivers + +- A future engineer modifying the fetcher must be able to reason about + what is and is not promised, without re-running the root-cause + investigation. +- The recorded contract must hold across the full set of upstream + failure modes — not just kill-9 — because production data shows + multiple shapes have occurred. +- The contract must compose with the summarizer's downstream invariants + (post-`1edcb45` the summarizer assumes "if balances are present they + are correct"; the fetcher must not produce missing-but-cursor-claims- + present states). +- The contract should be expressible in a few sentences; if it cannot, + the design is over-complex. + +## Considered Options + +1. **Cursor-trusting recovery with empty-response guarded** — + atomic co-commit of cursor + balance rows; recovery on restart + resumes from `LatestBalancesEpoch+1` with no row-count + reconciliation; an `len(validatorsResponse.Data) == 0` guard rejects + empty beacon responses by returning an error so the transaction + rolls back and the cursor stays put. *(The contract this ADR + adopts.)* +2. **Cursor-trusting recovery without an empty-response guard** — + atomic co-commit of cursor + balance rows; recovery resumes from + `LatestBalancesEpoch+1`; empty beacon responses silently advance the + cursor with zero rows persisted. *(Today's behavior; this is the + bug the investigation identified.)* +3. **Gap-detecting recovery** — on restart, scan + `t_validator_balances` for missing epochs in the + `[startup_floor, LatestBalancesEpoch]` range and re-fetch them from + the beacon. Replace cursor-trusting with row-count reconciliation. +4. **Eager re-fetch via a `MissedEpochs` queue** — populate + `metadata.MissedEpochs` whenever a fetch fails, retry queued + epochs on subsequent ticks until empty. Apparently the original + design intent of the field (the field is declared in + `services/validators/standard/metadata.go:28`, JSON-migrated, but + never populated). + +## Decision Outcome + +Chosen option: **Option 1 — cursor-trusting recovery with +empty-response guarded.** + +### Rationale + +- Option 1 is the smallest delta from current code that closes the + observed silent-corruption pathway. The atomic co-commit + invariant from option 2 is preserved; only the empty-response path + is hardened. +- Option 3 (gap-detecting recovery) is more robust but substantially + more invasive: it requires per-epoch row-count semantics in + `chaindb.ValidatorsProvider`, a new schema migration to record + per-epoch row-count expectations, and a startup phase that scans a + large table. The cost is not justified by the production data — + Option 1 catches every observed failure mode. +- Option 4 (eager re-fetch via `MissedEpochs`) is the apparent + original design intent and is consistent with the field's presence + in `metadata.go`. However, the `MissedEpochs` consumer code in the + *proposerduties* service (`services/proposerduties/standard/service.go:166-204`) + reads from a permanently-empty list — the population logic was + never written for any service. Reviving this path would require + designing the population logic from scratch, with no existing + precedent to follow. Option 1 is simpler and addresses the same + failure modes. The `MissedEpochs` field is documented as + deprecated (in this slice) so that future maintainers do not + accidentally activate dormant consumer code. + +### The contract — "cursor-trusting recovery with empty-response guarded" + +In one sentence: **`md.LatestBalancesEpoch` advances if and only if +`t_validator_balances` contains rows for the corresponding epoch in the +same database transaction**. + +Decomposed into three claims: + +1. **Atomic co-commit (already enforced)**: `SetValidatorBalances`, + `setMetadata` (which writes the new cursor value), and + `CommitTx` all run inside a single Postgres transaction in + `onEpochTransitionValidatorBalancesForEpoch` + (`services/validators/standard/handler.go:198-243`). A crash at + any point pre-commit rolls back both writes; a successful commit + makes both visible together. Verified empirically by a kill-9 + reproducer. + +2. **No-empty-write (must be enforced; currently not)**: a beacon + response with zero validators or with all validators having + `Balance == 0` must be treated as a fetch failure, not a fetch + success. The fetcher must return an error and not advance the + cursor. The next finality tick will retry. *This is the + investigation-identified defect; a follow-up brings the + implementation into compliance.* + +3. **Cursor-trusting recovery (already enforced)**: on chaind restart, + `updateAfterRestart` (`services/validators/standard/service.go:84-137`) + calls `onEpochTransitionValidatorBalances(ctx, md, currentEpoch)`, + which loops from `md.LatestBalancesEpoch+1` to the current epoch + and re-fetches each. No row-count reconciliation; the cursor is + the authoritative record of "what has been persisted." With (1) + and (2) holding, this is sound. + +### What this contract does NOT promise + +- **Recovery from "beacon-state retention exceeded"**: if chaind is + off-air for longer than the beacon's state-retention horizon + (typically 8 epochs for Prysm with default flags, ~2 weeks for + Lighthouse), the beacon may return empty responses for the missed + epochs. Under this contract, the fetcher will stall (refusing to + advance the cursor) and the lag gauge from ADR 0001 will fire the + alert. The operator must then either repair the beacon (e.g., + point at a different endpoint with longer retention) or explicitly + acknowledge the data loss with a manual cursor advance. This is + the *correct* behaviour for a chronic upstream condition; turning + it into automated data loss is a separate decision and is not part + of this contract. +- **Per-validator row-count completeness**: the fetcher writes one + row per validator with non-zero balance. Validators with + `Balance == 0` are intentionally not stored (see handler.go:206-208). + Downstream consumers must not assume every validator index has a + row in `t_validator_balances` for every epoch. The summarizer's + zero-balance assertion (commit `1edcb45` and `8d9e6f2`) handles + this correctly. + +### Implementation notes + +The bug-confirming test +`services/validators/standard/handler_internal_test.go` exercises all +three contract claims: + +- **Claim 1 (atomic co-commit)**: covered by the *beacon error* case, + which asserts the cursor is not advanced and no transaction is + committed when `Validators()` returns an error. +- **Claim 2 (no-empty-write)**: covered by the *empty validators map* + and *all-zero balances* cases, both of which currently document + the bug (test passes against current code where the cursor *does* + advance). When the follow-up fix lands, the assertions in those + cases invert from "cursor advanced" to "cursor unchanged, error + returned" and the test serves as the regression boundary for the + contract. +- **Claim 3 (cursor-trusting recovery)** is exercised end-to-end by + the existing integration smoke tests on the local Hoodi playbook + and the kill-9 reproducer. No further unit-level coverage is + added here. + +### The retention-pruning interaction (related but separate) + +The pruner consults the *day-summary* cursor, not the +*epoch-summarizer* cursor (`services/summarizer/standard/prune.go:64-67`). +The proxy holds because day summaries entail epoch summaries, but the +contract is implicit. The investigation's code-reading audit flagged +this as a residual risk; the kill-9 reproducer found no exploit; the +production forensic data shows no pattern consistent with retention- +race triggering. No contract change is filed here. If the empty- +response amplifier had been refuted as a load-bearing cause, the +retention-ordering question would have been the next target — but it +was not, so this ADR does not expand its scope. + +### Positive consequences + +- The fetcher's promised behaviour matches the summarizer's + assumptions on `t_validator_balances` content. The two services + compose without unstated invariants. +- A future maintainer reading + `services/validators/standard/handler.go` can read this ADR and + understand the contract without re-running the investigation. +- The contract scopes the fetcher's responsibility narrowly: it + guarantees consistency between cursor and rows, but does not + guarantee freshness (the summarizer's lag gauge from ADR 0001 is + the freshness signal). + +### Negative consequences / accepted trade-offs + +- The contract requires the fetcher to **stall** (cursor not + advancing) on chronic upstream failures. Operators who would + prefer "skip the bad epochs and keep going" must take that + decision explicitly via a manual cursor advance — chaind will not + silently absorb data loss. This is a deliberate UX choice. +- Contract claim (2) is tested only against synthetic stubs in this + slice; the post-fix end-to-end behaviour (production beacon flap → + fetcher stall → lag gauge alert → operator-driven recovery) is + not exercised by automated tests. It can be by extending the + Hoodi playbook with a "beacon empties for 2 minutes" mode in a + future slice. + +### Rejected alternatives — why not + +- **Option 2 (no empty-response guard)** is the current (buggy) + state. Production data showed it is responsible for the silent- + corruption pathway across 113 historical clusters. Cannot stand + as the documented contract. +- **Option 3 (gap-detecting recovery)** is more invasive than the + data justifies. Disk and CPU cost of the startup scan, plus a new + schema-migration burden, plus operator-visible latency on chaind + startup. Reconsider only if a future verdict identifies a failure + mode that Option 1 does not catch. +- **Option 4 (`MissedEpochs` queue)** had a chance to be the + intended design but the consumer code never had a population path, + meaning re-deriving the design from scratch. No precedent + cheapens this option below Option 1. See the field's deprecation + note in `services/validators/standard/metadata.go`. + +## Open questions for upstream review + +When this ADR is reviewed by `wealdtech/chaind` maintainers (alongside +the no-empty-write fix PR that brings the implementation into +compliance): + +- Confirm the MADR-template style choice from ADR 0001 is acceptable + for ADR 0002 as well. No prior style precedent existed at ADR + 0001's authorship; this ADR follows it. +- Confirm Option 1 is the preferred contract. If the maintainers + prefer the gap-detecting recovery (Option 3) for some reason this + ADR did not anticipate, the contract changes and the fix scope + expands. diff --git a/services/finalizer/standard/metadata.go b/services/finalizer/standard/metadata.go index 548fccb..ea97561 100644 --- a/services/finalizer/standard/metadata.go +++ b/services/finalizer/standard/metadata.go @@ -22,9 +22,16 @@ import ( // metadata stored about this service. type metadata struct { - LastFinalizedEpoch int64 `json:"latest_epoch"` - LatestCanonicalSlot int64 `json:"latest_canonical_slot"` - MissedEpochs []int64 `json:"missed_epochs,omitempty"` + LastFinalizedEpoch int64 `json:"latest_epoch"` + LatestCanonicalSlot int64 `json:"latest_canonical_slot"` + // Deprecated: never populated by any code path in this service. Residue + // from an abandoned gap-tracking design (the same residue exists in + // services/validators/standard/metadata.go and + // services/proposerduties/standard/metadata.go). Retained for JSON + // backward-compatibility with t_metadata rows persisted by older builds. + // Do not add new readers or writers — file an issue to remove the field + // if you find one. + MissedEpochs []int64 `json:"missed_epochs,omitempty"` } // metadataKey is the key for the metadata. diff --git a/services/proposerduties/standard/metadata.go b/services/proposerduties/standard/metadata.go index 16b6b1d..b0f8068 100644 --- a/services/proposerduties/standard/metadata.go +++ b/services/proposerduties/standard/metadata.go @@ -23,7 +23,17 @@ import ( // metadata stored about this service. type metadata struct { - LatestEpoch int64 `json:"latest_epoch"` + LatestEpoch int64 `json:"latest_epoch"` + // Deprecated: never populated by any code path in this service nor in any + // other service in this repository. The handleMissed consumer in + // service.go (lines 166-204) reads from a permanently-empty list — it is + // dormant code, retained alongside the field. Residue from an abandoned + // gap-tracking design that also left the same residue in + // services/validators/standard/metadata.go and + // services/finalizer/standard/metadata.go. Retained for JSON backward- + // compatibility with t_metadata rows persisted by older builds. Do not + // add new readers or writers — file an issue to remove the field (and + // the dormant consumer) if you find one. MissedEpochs []phase0.Epoch `json:"missed_epochs,omitempty"` } diff --git a/services/validators/standard/handler_internal_test.go b/services/validators/standard/handler_internal_test.go new file mode 100644 index 0000000..e47ea50 --- /dev/null +++ b/services/validators/standard/handler_internal_test.go @@ -0,0 +1,362 @@ +// Copyright © 2026 Weald Technology Limited. +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +package standard + +import ( + "context" + "errors" + "testing" + "time" + + "github.com/attestantio/go-eth2-client/api" + apiv1 "github.com/attestantio/go-eth2-client/api/v1" + "github.com/attestantio/go-eth2-client/spec/phase0" + "github.com/stretchr/testify/require" + "github.com/wealdtech/chaind/services/chaindb" +) + +// stubEth2Client is a hand-built eth2client.Service + ValidatorsProvider used +// to drive onEpochTransitionValidatorBalancesForEpoch. handler.go:186 type- +// asserts s.eth2Client to eth2client.ValidatorsProvider, so this stub +// implements both interfaces directly. Only Validators is exercised; the +// base Service methods satisfy the type assertion but are not invoked along +// the tested path. +type stubEth2Client struct { + response *api.Response[map[phase0.ValidatorIndex]*apiv1.Validator] + err error + calls int +} + +func (s *stubEth2Client) Name() string { return "stub" } +func (s *stubEth2Client) Address() string { return "stub" } +func (s *stubEth2Client) IsActive() bool { return true } +func (s *stubEth2Client) IsSynced() bool { return true } + +func (s *stubEth2Client) Validators(_ context.Context, _ *api.ValidatorsOpts) ( + *api.Response[map[phase0.ValidatorIndex]*apiv1.Validator], error, +) { + s.calls++ + if s.err != nil { + return nil, s.err + } + return s.response, nil +} + +// recordingChainDB captures BeginTx/CommitTx ordering and SetMetadata payloads +// so a test can assert the cursor advanced and the transaction committed. The +// tested function does not exercise read-only transactions; BeginROTx/CommitROTx +// satisfy the interface but are never called along the tested path. +type recordingChainDB struct { + beginTxCalls int + commitTxCalls int + setMetadata [][]byte + cancelCalls int +} + +func (c *recordingChainDB) BeginTx(ctx context.Context) (context.Context, context.CancelFunc, error) { + c.beginTxCalls++ + return ctx, func() { c.cancelCalls++ }, nil +} + +func (c *recordingChainDB) CommitTx(_ context.Context) error { + c.commitTxCalls++ + return nil +} + +func (c *recordingChainDB) BeginROTx(ctx context.Context) (context.Context, error) { + return ctx, nil +} + +func (c *recordingChainDB) CommitROTx(_ context.Context) {} + +func (c *recordingChainDB) SetMetadata(_ context.Context, _ string, value []byte) error { + dup := make([]byte, len(value)) + copy(dup, value) + c.setMetadata = append(c.setMetadata, dup) + return nil +} + +func (c *recordingChainDB) Metadata(_ context.Context, _ string) ([]byte, error) { + return nil, nil +} + +// recordingValidatorsSetter captures every SetValidatorBalances/SetValidatorBalance +// call so the test can assert exactly what was persisted (or that nothing was +// persisted at all). SetValidator is part of the chaindb.ValidatorsSetter +// interface but is not invoked along the tested path. +type recordingValidatorsSetter struct { + bulkCalls [][]*chaindb.ValidatorBalance + individualBalances []*chaindb.ValidatorBalance +} + +func (r *recordingValidatorsSetter) SetValidator(_ context.Context, _ *chaindb.Validator) error { + return nil +} + +func (r *recordingValidatorsSetter) SetValidatorBalance(_ context.Context, balance *chaindb.ValidatorBalance) error { + r.individualBalances = append(r.individualBalances, balance) + return nil +} + +func (r *recordingValidatorsSetter) SetValidatorBalances(_ context.Context, balances []*chaindb.ValidatorBalance) error { + dup := make([]*chaindb.ValidatorBalance, len(balances)) + copy(dup, balances) + r.bulkCalls = append(r.bulkCalls, dup) + return nil +} + +// stubChainTime satisfies chaintime.Service. Only FirstSlotOfEpoch is invoked +// along the tested path (handler.go:184/195); the other 18 methods return zero +// values so the stub remains type-correct without simulating chain timing. +type stubChainTime struct{} + +func (stubChainTime) GenesisTime() time.Time { return time.Time{} } +func (stubChainTime) SlotDuration() time.Duration { return 0 } +func (stubChainTime) SlotsPerEpoch() uint64 { return 32 } +func (stubChainTime) StartOfSlot(_ phase0.Slot) time.Time { return time.Time{} } +func (stubChainTime) StartOfEpoch(_ phase0.Epoch) time.Time { return time.Time{} } +func (stubChainTime) CurrentSlot() phase0.Slot { return 0 } +func (stubChainTime) CurrentEpoch() phase0.Epoch { return 0 } +func (stubChainTime) CurrentSyncCommitteePeriod() uint64 { return 0 } +func (stubChainTime) SlotToEpoch(_ phase0.Slot) phase0.Epoch { return 0 } +func (stubChainTime) SlotToSyncCommitteePeriod(_ phase0.Slot) uint64 { return 0 } +func (stubChainTime) EpochToSyncCommitteePeriod(_ phase0.Epoch) uint64 { return 0 } +func (stubChainTime) FirstSlotOfEpoch(epoch phase0.Epoch) phase0.Slot { + return phase0.Slot(uint64(epoch) * 32) +} +func (stubChainTime) LastSlotOfEpoch(_ phase0.Epoch) phase0.Slot { return 0 } +func (stubChainTime) TimestampToSlot(_ time.Time) phase0.Slot { return 0 } +func (stubChainTime) TimestampToEpoch(_ time.Time) phase0.Epoch { return 0 } +func (stubChainTime) FirstEpochOfSyncPeriod(_ uint64) phase0.Epoch { return 0 } +func (stubChainTime) AltairInitialEpoch() phase0.Epoch { return 0 } +func (stubChainTime) AltairInitialSyncCommitteePeriod() uint64 { return 0 } +func (stubChainTime) BellatrixInitialEpoch() phase0.Epoch { return 0 } +func (stubChainTime) CapellaInitialEpoch() phase0.Epoch { return 0 } + +// makeService wires the four stubs above into a *Service ready to drive the +// onEpochTransitionValidatorBalancesForEpoch path. s.balances is set to true +// so the tested branch is entered; s.activitySem is left nil because the +// tested function does not touch it. +func makeService(eth2 *stubEth2Client, db *recordingChainDB, setter *recordingValidatorsSetter) *Service { + return &Service{ + eth2Client: eth2, + chainDB: db, + validatorsSetter: setter, + chainTime: stubChainTime{}, + balances: true, + } +} + +// TestOnEpochTransitionValidatorBalancesForEpoch_EmptyValidatorsAdvancesCursor +// confirms the silent-cursor-advance bug with an empty 200-OK validators +// response. When the beacon returns Data: map[ValidatorIndex]*Validator{} +// (empty map, nil error), the function returns nil, calls SetValidatorBalances +// once with an empty slice, mutates md.LatestBalancesEpoch to the requested +// epoch, and commits the transaction. +// +// The test must PASS with the current code; it documents the bug as the +// implementation currently behaves, not a regression boundary. When the +// future fix lands and the function refuses to advance the cursor on an +// empty response, the expectations below get inverted in a follow-up. +func TestOnEpochTransitionValidatorBalancesForEpoch_EmptyValidatorsAdvancesCursor(t *testing.T) { + const epoch = phase0.Epoch(86823) + + eth2 := &stubEth2Client{ + response: &api.Response[map[phase0.ValidatorIndex]*apiv1.Validator]{ + Data: map[phase0.ValidatorIndex]*apiv1.Validator{}, + }, + } + db := &recordingChainDB{} + setter := &recordingValidatorsSetter{} + s := makeService(eth2, db, setter) + + md := &metadata{LatestBalancesEpoch: epoch - 1} + + err := s.onEpochTransitionValidatorBalancesForEpoch(context.Background(), md, epoch) + + require.NoError(t, err, "empty 200-OK response surfaces no error today") + require.Equal(t, 1, eth2.calls, "beacon Validators must be called exactly once") + require.Equal(t, 1, db.beginTxCalls, "must begin exactly one transaction") + require.Equal(t, 1, db.commitTxCalls, "transaction must commit (silent advance)") + + require.Len(t, setter.bulkCalls, 1, "SetValidatorBalances must be called once") + require.Empty(t, setter.bulkCalls[0], + "empty validators map yields an empty balances slice (zero rows persisted)") + require.Empty(t, setter.individualBalances, + "fallback per-row insert path must not be entered when bulk insert succeeds") + + require.Equal(t, epoch, md.LatestBalancesEpoch, + "BUG: cursor advanced silently despite zero rows persisted") + require.NotEmpty(t, db.setMetadata, + "SetMetadata must be invoked so the advanced cursor is durable") +} + +// TestOnEpochTransitionValidatorBalancesForEpoch_AllZeroBalancesAdvancesCursor +// confirms the same silent-cursor-advance bug shape when the beacon returns a +// fully-populated validator set whose every Balance is zero. The +// "Do not store 0 balances" filter at handler.go:206-208 amplifies the response +// into an empty slice that SetValidatorBalances persists, after which the +// cursor advances. Both shapes (empty map and all-zero balances) have been +// observed in production; chaind treats them identically. +func TestOnEpochTransitionValidatorBalancesForEpoch_AllZeroBalancesAdvancesCursor(t *testing.T) { + const epoch = phase0.Epoch(86824) + const numValidators = 1000 + + validators := make(map[phase0.ValidatorIndex]*apiv1.Validator, numValidators) + for i := 0; i < numValidators; i++ { + idx := phase0.ValidatorIndex(i) + validators[idx] = &apiv1.Validator{ + Index: idx, + Balance: 0, + Status: apiv1.ValidatorStateActiveOngoing, + Validator: &phase0.Validator{ + EffectiveBalance: 0, + }, + } + } + + eth2 := &stubEth2Client{ + response: &api.Response[map[phase0.ValidatorIndex]*apiv1.Validator]{ + Data: validators, + }, + } + db := &recordingChainDB{} + setter := &recordingValidatorsSetter{} + s := makeService(eth2, db, setter) + + md := &metadata{LatestBalancesEpoch: epoch - 1} + + err := s.onEpochTransitionValidatorBalancesForEpoch(context.Background(), md, epoch) + + require.NoError(t, err, "all-zero-balance response surfaces no error today") + require.Equal(t, 1, eth2.calls, "beacon Validators must be called exactly once") + require.Equal(t, 1, db.beginTxCalls, "must begin exactly one transaction") + require.Equal(t, 1, db.commitTxCalls, "transaction must commit (silent advance)") + + require.Len(t, setter.bulkCalls, 1, "SetValidatorBalances must be called once") + require.Empty(t, setter.bulkCalls[0], + "all-zero filter at handler.go:206-208 reduces 1000 validators to an empty slice") + require.Empty(t, setter.individualBalances, + "fallback per-row insert path must not be entered when bulk insert succeeds") + + require.Equal(t, epoch, md.LatestBalancesEpoch, + "BUG: cursor advanced silently despite zero rows persisted") + require.NotEmpty(t, db.setMetadata, + "SetMetadata must be invoked so the advanced cursor is durable") +} + +// TestOnEpochTransitionValidatorBalancesForEpoch_MixedBalancesPersistsNonZero +// pins the partial-persist behavior with a mix of zero and non-zero balances. +// The "Do not store 0 balances" filter drops the zero half; the function +// persists the non-zero half and advances the cursor. Validator indices are +// constructed so the assertion does not depend on Go's non-deterministic map +// iteration order: the bulkCalls[0] slice should contain exactly numNonZero +// entries, each with Balance == nonZeroBalance. +func TestOnEpochTransitionValidatorBalancesForEpoch_MixedBalancesPersistsNonZero(t *testing.T) { + const epoch = phase0.Epoch(100) + const numValidators = 1000 + const numNonZero = 500 + const nonZeroBalance = phase0.Gwei(32_000_000_000) + + validators := make(map[phase0.ValidatorIndex]*apiv1.Validator, numValidators) + for i := 0; i < numValidators; i++ { + idx := phase0.ValidatorIndex(i) + var balance phase0.Gwei + if i < numNonZero { + balance = nonZeroBalance + } + validators[idx] = &apiv1.Validator{ + Index: idx, + Balance: balance, + Status: apiv1.ValidatorStateActiveOngoing, + Validator: &phase0.Validator{ + EffectiveBalance: balance, + }, + } + } + + eth2 := &stubEth2Client{ + response: &api.Response[map[phase0.ValidatorIndex]*apiv1.Validator]{ + Data: validators, + }, + } + db := &recordingChainDB{} + setter := &recordingValidatorsSetter{} + s := makeService(eth2, db, setter) + + md := &metadata{LatestBalancesEpoch: epoch - 1} + + err := s.onEpochTransitionValidatorBalancesForEpoch(context.Background(), md, epoch) + + require.NoError(t, err, "mixed-balance response surfaces no error today") + require.Equal(t, 1, eth2.calls, "beacon Validators must be called exactly once") + require.Equal(t, 1, db.beginTxCalls, "must begin exactly one transaction") + require.Equal(t, 1, db.commitTxCalls, "transaction must commit") + require.Empty(t, setter.individualBalances, + "fallback per-row insert path must not be entered when bulk insert succeeds") + + require.Len(t, setter.bulkCalls, 1, "SetValidatorBalances must be called once") + persisted := setter.bulkCalls[0] + require.Len(t, persisted, numNonZero, + "only validators with Balance > 0 should be persisted") + for _, b := range persisted { + require.Equal(t, nonZeroBalance, b.Balance, + "every persisted row must carry the non-zero balance") + require.Equal(t, epoch, b.Epoch, + "every persisted row must be tagged with the requested epoch") + require.Less(t, uint64(b.Index), uint64(numNonZero), + "only validators with index < numNonZero have non-zero balances") + } + + require.Equal(t, epoch, md.LatestBalancesEpoch, + "cursor advances when at least some rows persist (expected today)") +} + +// TestOnEpochTransitionValidatorBalancesForEpoch_BeaconErrorLeavesCursorUnchanged +// is the negative control: when the beacon Validators call returns an error, +// the function returns the wrapped error, never opens a transaction, never +// calls SetValidatorBalances, and md.LatestBalancesEpoch stays put. This case +// confirms the silent-advance bug is gated specifically on the empty/zero- +// response path (the previous three cases) — not a blanket "errors are +// swallowed" regression. +func TestOnEpochTransitionValidatorBalancesForEpoch_BeaconErrorLeavesCursorUnchanged(t *testing.T) { + const epoch = phase0.Epoch(50) + const startCursor = phase0.Epoch(49) + + eth2 := &stubEth2Client{ + err: errors.New("beacon error"), + } + db := &recordingChainDB{} + setter := &recordingValidatorsSetter{} + s := makeService(eth2, db, setter) + + md := &metadata{LatestBalancesEpoch: startCursor} + + err := s.onEpochTransitionValidatorBalancesForEpoch(context.Background(), md, epoch) + + require.Error(t, err, "beacon error must surface to the caller") + require.Contains(t, err.Error(), "failed to obtain validators for validator balances", + "error wrapping at handler.go:190 must be preserved") + require.Contains(t, err.Error(), "beacon error", + "original beacon error must be preserved through the wrap") + + require.Equal(t, 1, eth2.calls, "beacon Validators must be called exactly once") + require.Equal(t, 0, db.beginTxCalls, "no transaction may begin when the beacon errors") + require.Equal(t, 0, db.commitTxCalls, "no transaction may commit when the beacon errors") + require.Empty(t, setter.bulkCalls, "SetValidatorBalances must not be called") + require.Empty(t, setter.individualBalances, "SetValidatorBalance must not be called") + require.Empty(t, db.setMetadata, "SetMetadata must not be called when the beacon errors") + require.Equal(t, startCursor, md.LatestBalancesEpoch, + "cursor must NOT advance on beacon error (negative control)") +} diff --git a/services/validators/standard/metadata.go b/services/validators/standard/metadata.go index 3988295..04c628a 100644 --- a/services/validators/standard/metadata.go +++ b/services/validators/standard/metadata.go @@ -23,9 +23,19 @@ import ( // metadata stored about this service. type metadata struct { - LatestEpoch phase0.Epoch `json:"latest_epoch"` - LatestBalancesEpoch phase0.Epoch `json:"latest_balances_epoch"` - MissedEpochs []phase0.Epoch `json:"missed_epochs,omitempty"` + LatestEpoch phase0.Epoch `json:"latest_epoch"` + LatestBalancesEpoch phase0.Epoch `json:"latest_balances_epoch"` + // Deprecated: never populated by any code path in this service. Residue + // from an abandoned gap-tracking design (the same residue exists in + // services/finalizer/standard/metadata.go and + // services/proposerduties/standard/metadata.go). Retained for JSON + // backward-compatibility with t_metadata rows persisted by older builds. + // See ADR docs/adr/0002-validator-balance-fetcher-recovery-model.md + // "Considered Options" § Option 4 for the historical design intent and + // why the current recovery model chose a different shape. Do not add + // new readers or writers — file an issue to remove the field if you + // find one. + MissedEpochs []phase0.Epoch `json:"missed_epochs,omitempty"` } // metadataKey is the key for the metadata. From f6d1ef89403cc9792a0b9bc720006aefb3621322 Mon Sep 17 00:00:00 2001 From: AntiD2ta Date: Thu, 7 May 2026 19:34:40 +0200 Subject: [PATCH 09/25] Reject empty/all-zero beacon validator responses in fetcher MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit onEpochTransitionValidatorBalancesForEpoch advanced md.LatestBalancesEpoch and committed the database transaction even when the beacon returned an empty 200-OK validators map, or returned validators whose Balance fields were all zero (which the "do not store 0 balances" filter at handler.go:217-219 amplified into a zero-row insert). Either shape produced a t_epoch_summaries row with f_active_balance=0 AND f_active_validators>0, the silent-corruption signature documented as the no-empty-write contract violation in docs/adr/0002-validator-balance-fetcher-recovery-model.md (claim 2). This commit brings the implementation into compliance with that contract. Two narrow guards in onEpochTransitionValidatorBalancesForEpoch: 1. After validators := validatorsResponse.Data, reject len == 0 before BeginTx. No transaction opens on this path. 2. After the bulk-build of dbValidatorBalances, reject len == 0, cancel() the open transaction so its writes do not commit, and return. Catches the all-zero-balance shape that the filter at handler.go:217-219 reduces to an empty slice. Both rejection sites emit the same warn-log via the new package constant noValidatorBalancesMsg ("Beacon returned no validator balances; cannot persist epoch (will retry on next finality tick)"). Single source of truth so a future operator runbook or Prometheus alert rule keyed off this substring matches both paths uniformly, and a rename produces a compile error rather than silent test/prod drift. The error returned propagates through onEpochTransitionValidatorBalances to OnBeaconChainHeadUpdated where the existing log.Warn().Err(err).Msg("Failed to update validators") gives operator visibility. The cursor stays put; the next finality tick re-runs the fetcher and either succeeds against a recovered beacon or stalls visibly. Test inversions in handler_internal_test.go: - Cases 1 and 2 (renamed from "...AdvancesCursor" to "...RejectedAndCursorUnchanged") now assert the post-fix contract: err != nil, no SetValidatorBalances call, cursor unchanged, no commit, transaction cancelled (case 2 only — case 1 fires before BeginTx), and the warn-log emitted with noValidatorBalancesMsg and the structured "epoch" field. Pre- fix these tests asserted the buggy shape and served as empirical bug confirmation; post-fix they are the regression boundary. - captureWarnLog helper redirects the package-level zerolog logger to a bytes.Buffer at WarnLevel for the duration of a test, restoring on cleanup. Mirrors the inline pattern in services/summarizer/standard/validatorday_internal_test.go, factored into a helper because two cases need it. Defensive GlobalLevel save/restore guards against a future main_test.go setting Disabled to silence noise. - Cases 3 (mixed validators) and 4 (beacon error) keep their existing assertions — positive and negative sanity tests, unaffected by the fix. Case 3's loop modernized to `for i := range numValidators` for in-file consistency. Key decisions: - Single noValidatorBalancesMsg package constant rather than two hard-coded strings. Both production sites and both test assertions reference it. Closes the silent-drift hazard that a test-side warnLogContract constant would have left. - The fix is intentionally narrow. No new error type, no refactor of the surrounding s.balances asymmetry (BeginTx fires outside the if s.balances block; setMetadata/CommitTx are not inside it either — pre-existing structural quirk, structurally unreachable today because the parent already gates on s.balances). Flagged here for a future cleanup pass; not in scope for this commit. - Test names changed to describe the post-fix contract. The pre- fix names spelled the bug ("...AdvancesCursor"); keeping them would have been misleading once the assertion meaning inverted. - Error message text "beacon returned no validator balances" is short and stable; the operator-facing semantics live in the warn-log line, not the error wrap. Files changed: - services/validators/standard/handler.go — noValidatorBalancesMsg package constant; two guards in onEpochTransitionValidatorBalancesForEpoch. - services/validators/standard/handler_internal_test.go — case 1 and case 2 inversions and renames; captureWarnLog helper; case 3 loop modernization for in-file consistency. Notes for next iteration: - Lag-gauge semantics question raised by the cursor-coherence fix: the per-pipeline lag gauge introduced in commit 1edcb45 measures validators.standard.LatestBalancesEpoch - summarizer.standard.LastEpoch. When the fetcher refuses to advance, the summarizer also stalls (it depends on t_validator_balances), so the diff stays at zero and no alert fires. Whether to add a fetcher-vs-finality lag gauge as a separate alert path, or accept "summarizer freezes alongside fetcher" as the correct semantics for the existing gauge, is a follow-up for the observability owner. Not a blocker; not in scope here. - Two pre-existing efficiency notes flagged during review and left as-is per the narrow-scope constraint: the if s.balances block inside onEpochTransitionValidatorBalancesForEpoch is dead (parent gates on s.balances), and BeginTx/setMetadata/CommitTx live outside that inner block. Structurally unreachable today; worth a separate cleanup commit when someone takes the broader refactor on. - A pre-existing rangeint diagnostic on case 3's loop was fixed here for in-file consistency; the broader rangeint sweep across the chaindb package (validators.go:523/598, etc.) remains a candidate for a dedicated cleanup pass. --- services/validators/standard/handler.go | 26 +++ .../standard/handler_internal_test.go | 165 +++++++++++++----- 2 files changed, 144 insertions(+), 47 deletions(-) diff --git a/services/validators/standard/handler.go b/services/validators/standard/handler.go index 2290b4f..3dea67e 100644 --- a/services/validators/standard/handler.go +++ b/services/validators/standard/handler.go @@ -28,6 +28,14 @@ import ( "go.opentelemetry.io/otel/trace" ) +// noValidatorBalancesMsg is the stable warn-log text emitted when the +// validator-balance fetcher rejects an empty or all-zero beacon response. +// Both rejection sites use this exact string so an operator runbook or +// Prometheus alert rule keyed off it matches uniformly. See +// docs/adr/0002-validator-balance-fetcher-recovery-model.md (claim 2: +// no-empty-write). +const noValidatorBalancesMsg = "Beacon returned no validator balances; cannot persist epoch (will retry on next finality tick)" + // OnBeaconChainHeadUpdated receives beacon chain head updated notifications. func (s *Service) OnBeaconChainHeadUpdated( ctx context.Context, @@ -191,6 +199,15 @@ func (s *Service) onEpochTransitionValidatorBalancesForEpoch(ctx context.Context } validators := validatorsResponse.Data + // Refuse to advance the cursor on an empty 200-OK validators response. + // ADR 0002 records the no-empty-write contract this guard enforces: + // when the beacon returns zero validators the cursor must stay put so + // the next finality tick re-fetches the same epoch. + if len(validators) == 0 { + log.Warn().Msg(noValidatorBalancesMsg) + return errors.New("beacon returned no validator balances") + } + span.AddEvent("Obtained validators", trace.WithAttributes( attribute.Int("slot", int(s.chainTime.FirstSlotOfEpoch(epoch))), )) @@ -213,6 +230,15 @@ func (s *Service) onEpochTransitionValidatorBalancesForEpoch(ctx context.Context EffectiveBalance: validator.Validator.EffectiveBalance, }) } + // Refuse to advance the cursor when the "do not store 0 balances" + // filter above amplifies an all-zero-balance beacon response into a + // zero-row insert. Cancel the open transaction so the cursor is + // not committed; same warn-log contract as the empty-map guard. + if len(dbValidatorBalances) == 0 { + cancel() + log.Warn().Msg(noValidatorBalancesMsg) + return errors.New("beacon returned no validator balances") + } if err := s.validatorsSetter.SetValidatorBalances(dbCtx, dbValidatorBalances); err != nil { log.Trace().Err(err).Msg("Bulk insert failed; falling back to individual insert") // This error will have caused the transaction to fail, so cancel it and start a new one. diff --git a/services/validators/standard/handler_internal_test.go b/services/validators/standard/handler_internal_test.go index e47ea50..3a38977 100644 --- a/services/validators/standard/handler_internal_test.go +++ b/services/validators/standard/handler_internal_test.go @@ -14,18 +14,46 @@ package standard import ( + "bytes" "context" "errors" + "strings" "testing" "time" "github.com/attestantio/go-eth2-client/api" apiv1 "github.com/attestantio/go-eth2-client/api/v1" "github.com/attestantio/go-eth2-client/spec/phase0" + "github.com/rs/zerolog" "github.com/stretchr/testify/require" "github.com/wealdtech/chaind/services/chaindb" ) +// captureWarnLog redirects the package-level zerolog.Logger to a byte buffer +// at WarnLevel and returns the buffer plus a restore func. Mirrors the +// pattern in services/summarizer/standard/validatorday_internal_test.go so +// the two zero-balance guard sites (epoch summarizer, day summarizer, +// validator-balance fetcher) all assert their warn-log contracts the same +// way. The GlobalLevel save/restore is defensive: the validators/standard +// package has no main_test.go today, so GlobalLevel defaults to TraceLevel +// and the buffer would capture warns regardless — but if a future +// main_test.go sets Disabled to silence noise, this guard keeps the assertion +// load-bearing instead of silently always-passing. +func captureWarnLog(t *testing.T) (*bytes.Buffer, func()) { + t.Helper() + originalGlobalLevel := zerolog.GlobalLevel() + zerolog.SetGlobalLevel(zerolog.TraceLevel) + + var buf bytes.Buffer + originalLog := log + log = zerolog.New(&buf).Level(zerolog.WarnLevel) + + return &buf, func() { + log = originalLog + zerolog.SetGlobalLevel(originalGlobalLevel) + } +} + // stubEth2Client is a hand-built eth2client.Service + ValidatorsProvider used // to drive onEpochTransitionValidatorBalancesForEpoch. handler.go:186 type- // asserts s.eth2Client to eth2client.ValidatorsProvider, so this stub @@ -158,19 +186,25 @@ func makeService(eth2 *stubEth2Client, db *recordingChainDB, setter *recordingVa } } -// TestOnEpochTransitionValidatorBalancesForEpoch_EmptyValidatorsAdvancesCursor -// confirms the silent-cursor-advance bug with an empty 200-OK validators -// response. When the beacon returns Data: map[ValidatorIndex]*Validator{} -// (empty map, nil error), the function returns nil, calls SetValidatorBalances -// once with an empty slice, mutates md.LatestBalancesEpoch to the requested -// epoch, and commits the transaction. +// TestOnEpochTransitionValidatorBalancesForEpoch_EmptyValidatorsRejectedAndCursorUnchanged +// pins the post-fix contract for guard 1 (the empty 200-OK validators map): +// the function refuses to advance md.LatestBalancesEpoch, never opens a +// transaction (the guard fires before BeginTx), never calls +// SetValidatorBalances, returns an error so the parent +// onEpochTransitionValidatorBalances loop surfaces it to OnBeaconChainHeadUpdated, +// and emits the alert-contract warn log at WarnLevel. This is the regression +// boundary for the silent-cursor-advance defect addressed by ADR 0002 +// (docs/adr/0002-validator-balance-fetcher-recovery-model.md). // -// The test must PASS with the current code; it documents the bug as the -// implementation currently behaves, not a regression boundary. When the -// future fix lands and the function refuses to advance the cursor on an -// empty response, the expectations below get inverted in a follow-up. -func TestOnEpochTransitionValidatorBalancesForEpoch_EmptyValidatorsAdvancesCursor(t *testing.T) { +// Pre-fix this test asserted the BUGGY shape (cursor advanced, transaction +// committed); the inversion landed alongside the production-code guard in +// the same commit. +func TestOnEpochTransitionValidatorBalancesForEpoch_EmptyValidatorsRejectedAndCursorUnchanged(t *testing.T) { const epoch = phase0.Epoch(86823) + const startCursor = phase0.Epoch(86822) + + buf, restore := captureWarnLog(t) + defer restore() eth2 := &stubEth2Client{ response: &api.Response[map[phase0.ValidatorIndex]*apiv1.Validator]{ @@ -181,40 +215,62 @@ func TestOnEpochTransitionValidatorBalancesForEpoch_EmptyValidatorsAdvancesCurso setter := &recordingValidatorsSetter{} s := makeService(eth2, db, setter) - md := &metadata{LatestBalancesEpoch: epoch - 1} + md := &metadata{LatestBalancesEpoch: startCursor} err := s.onEpochTransitionValidatorBalancesForEpoch(context.Background(), md, epoch) - require.NoError(t, err, "empty 200-OK response surfaces no error today") - require.Equal(t, 1, eth2.calls, "beacon Validators must be called exactly once") - require.Equal(t, 1, db.beginTxCalls, "must begin exactly one transaction") - require.Equal(t, 1, db.commitTxCalls, "transaction must commit (silent advance)") + require.Error(t, err, + "empty 200-OK response must surface an error so the parent handler logs it") + require.Contains(t, err.Error(), "beacon returned no validator balances", + "error text must carry the stable contract substring used by alert rules") - require.Len(t, setter.bulkCalls, 1, "SetValidatorBalances must be called once") - require.Empty(t, setter.bulkCalls[0], - "empty validators map yields an empty balances slice (zero rows persisted)") + require.Equal(t, 1, eth2.calls, "beacon Validators must be called exactly once") + require.Equal(t, 0, db.beginTxCalls, + "guard 1 fires before BeginTx; no transaction may open on this path") + require.Equal(t, 0, db.commitTxCalls, + "no transaction may commit when the guard rejects the response") + require.Equal(t, 0, db.cancelCalls, + "no transaction was opened, so cancel must not be invoked") + + require.Empty(t, setter.bulkCalls, + "SetValidatorBalances must not be called when the response is empty") require.Empty(t, setter.individualBalances, - "fallback per-row insert path must not be entered when bulk insert succeeds") + "SetValidatorBalance must not be called when the response is empty") + require.Empty(t, db.setMetadata, + "SetMetadata must not be called when the response is empty") - require.Equal(t, epoch, md.LatestBalancesEpoch, - "BUG: cursor advanced silently despite zero rows persisted") - require.NotEmpty(t, db.setMetadata, - "SetMetadata must be invoked so the advanced cursor is durable") + require.Equal(t, startCursor, md.LatestBalancesEpoch, + "cursor must NOT advance when the beacon returns no validators") + + logged := buf.String() + require.True(t, strings.Contains(logged, noValidatorBalancesMsg), + "warn log missing alert-contract message text; got: %s", logged) + require.True(t, strings.Contains(logged, `"level":"warn"`), + "empty-response log emitted at unexpected level; got: %s", logged) + require.True(t, strings.Contains(logged, `"epoch":86823`), + "warn log missing structured epoch field for operator disambiguation; got: %s", logged) } -// TestOnEpochTransitionValidatorBalancesForEpoch_AllZeroBalancesAdvancesCursor -// confirms the same silent-cursor-advance bug shape when the beacon returns a -// fully-populated validator set whose every Balance is zero. The -// "Do not store 0 balances" filter at handler.go:206-208 amplifies the response -// into an empty slice that SetValidatorBalances persists, after which the -// cursor advances. Both shapes (empty map and all-zero balances) have been -// observed in production; chaind treats them identically. -func TestOnEpochTransitionValidatorBalancesForEpoch_AllZeroBalancesAdvancesCursor(t *testing.T) { +// TestOnEpochTransitionValidatorBalancesForEpoch_AllZeroBalancesRejectedAndCursorUnchanged +// pins the post-fix contract for guard 2 (the all-zero-balances filter +// amplifier). The beacon returns 1000 validators with Balance == 0; the +// "Do not store 0 balances" filter at handler.go:206-208 collapses them into +// a zero-row insert, and the guard refuses to advance. Unlike guard 1, this +// path has already opened a transaction by the time it fires, so cancel() +// must be invoked exactly once; the transaction must not commit; and the +// cursor must stay put. Both empty-response shapes (this and +// EmptyValidatorsRejectedAndCursorUnchanged) emit the same warn-log +// alert-contract substring so an alert rule matches them uniformly. +func TestOnEpochTransitionValidatorBalancesForEpoch_AllZeroBalancesRejectedAndCursorUnchanged(t *testing.T) { const epoch = phase0.Epoch(86824) + const startCursor = phase0.Epoch(86823) const numValidators = 1000 + buf, restore := captureWarnLog(t) + defer restore() + validators := make(map[phase0.ValidatorIndex]*apiv1.Validator, numValidators) - for i := 0; i < numValidators; i++ { + for i := range numValidators { idx := phase0.ValidatorIndex(i) validators[idx] = &apiv1.Validator{ Index: idx, @@ -235,25 +291,40 @@ func TestOnEpochTransitionValidatorBalancesForEpoch_AllZeroBalancesAdvancesCurso setter := &recordingValidatorsSetter{} s := makeService(eth2, db, setter) - md := &metadata{LatestBalancesEpoch: epoch - 1} + md := &metadata{LatestBalancesEpoch: startCursor} err := s.onEpochTransitionValidatorBalancesForEpoch(context.Background(), md, epoch) - require.NoError(t, err, "all-zero-balance response surfaces no error today") - require.Equal(t, 1, eth2.calls, "beacon Validators must be called exactly once") - require.Equal(t, 1, db.beginTxCalls, "must begin exactly one transaction") - require.Equal(t, 1, db.commitTxCalls, "transaction must commit (silent advance)") + require.Error(t, err, + "all-zero-balance response must surface an error so the parent handler logs it") + require.Contains(t, err.Error(), "beacon returned no validator balances", + "error text must carry the stable contract substring used by alert rules") - require.Len(t, setter.bulkCalls, 1, "SetValidatorBalances must be called once") - require.Empty(t, setter.bulkCalls[0], - "all-zero filter at handler.go:206-208 reduces 1000 validators to an empty slice") + require.Equal(t, 1, eth2.calls, "beacon Validators must be called exactly once") + require.Equal(t, 1, db.beginTxCalls, + "guard 2 fires after BeginTx; exactly one transaction must open") + require.Equal(t, 0, db.commitTxCalls, + "transaction must not commit when the guard rejects the filtered response") + require.Equal(t, 1, db.cancelCalls, + "the open transaction must be cancelled so its writes do not commit") + + require.Empty(t, setter.bulkCalls, + "SetValidatorBalances must not be called when the filtered slice is empty") require.Empty(t, setter.individualBalances, - "fallback per-row insert path must not be entered when bulk insert succeeds") + "SetValidatorBalance must not be called when the filtered slice is empty") + require.Empty(t, db.setMetadata, + "SetMetadata must not be called when the cursor is not advanced") - require.Equal(t, epoch, md.LatestBalancesEpoch, - "BUG: cursor advanced silently despite zero rows persisted") - require.NotEmpty(t, db.setMetadata, - "SetMetadata must be invoked so the advanced cursor is durable") + require.Equal(t, startCursor, md.LatestBalancesEpoch, + "cursor must NOT advance when the all-zero filter collapses the response to nothing") + + logged := buf.String() + require.True(t, strings.Contains(logged, noValidatorBalancesMsg), + "warn log missing alert-contract message text; got: %s", logged) + require.True(t, strings.Contains(logged, `"level":"warn"`), + "all-zero log emitted at unexpected level; got: %s", logged) + require.True(t, strings.Contains(logged, `"epoch":86824`), + "warn log missing structured epoch field for operator disambiguation; got: %s", logged) } // TestOnEpochTransitionValidatorBalancesForEpoch_MixedBalancesPersistsNonZero @@ -270,7 +341,7 @@ func TestOnEpochTransitionValidatorBalancesForEpoch_MixedBalancesPersistsNonZero const nonZeroBalance = phase0.Gwei(32_000_000_000) validators := make(map[phase0.ValidatorIndex]*apiv1.Validator, numValidators) - for i := 0; i < numValidators; i++ { + for i := range numValidators { idx := phase0.ValidatorIndex(i) var balance phase0.Gwei if i < numNonZero { From 1df1613435e336a569eea072e0bdcf82b3f6fa16 Mon Sep 17 00:00:00 2001 From: AntiD2ta Date: Thu, 7 May 2026 20:24:36 +0200 Subject: [PATCH 10/25] Remove dead `if s.balances` block in onEpochTransitionValidatorBalancesForEpoch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The inner `if s.balances {` gate was structurally unreachable: the only production caller, onEpochTransitionValidatorBalances at handler.go:165- 167, already short-circuits with `if !s.balances { return nil }` before invoking this function. The dead wrapper was flagged in commit f6d1ef8's "Notes for next iteration" as worth a separate cleanup commit. This is that commit. Pure dedent: contents of the dead block are dedented one level, the wrapping `if s.balances {` / closing `}` lines are removed. No statement is moved, added, deleted, or reordered. The function's external behavior is unchanged for the only reachable call shape (s.balances == true). All four internal tests in handler_internal_test.go continue to pass without modification. Key decisions: - Scope kept narrow. The `BeginTx`/`setMetadata`/`CommitTx` envelope stays exactly where it is, even though efficiency review flagged that `BeginTx` could move below the slice-build to shrink the transaction window. That is the broader cleanup pass f6d1ef8's notes referenced; it deserves its own commit and review. - No precondition doc-comment added on the function. The implicit contract ("caller must have verified s.balances == true") is plainly visible in the same file 19 lines above the function header; adding a comment would be defensive design for a hypothetical future caller. Files changed: - services/validators/standard/handler.go — remove inner `if s.balances` wrapper, dedent its body. Notes for next iteration: - Efficiency: `s.chainTime.FirstSlotOfEpoch(epoch)` is called three times in onEpochTransitionValidatorBalancesForEpoch (lines 192, 193, 212). Trivial to cache in a local variable; pre-existing, flagged during review of this commit, not in scope here. - Efficiency: `BeginTx` is acquired before the 1.3M-validator slice build, widening the transaction window beyond what is necessary. This is the structural quirk f6d1ef8 flagged as a broader cleanup pass; remains a follow-up. - Code-quality: the inline comment "same warn-log contract as the empty-map guard" (handler.go:235) uses "empty-map" while the rest of the file and the ADR use "empty-response" / "empty 200-OK" — minor terminology drift, candidate for a wider docs/comments tidy-up. --- services/validators/standard/handler.go | 68 ++++++++++++------------- 1 file changed, 33 insertions(+), 35 deletions(-) diff --git a/services/validators/standard/handler.go b/services/validators/standard/handler.go index 3dea67e..3ffddb6 100644 --- a/services/validators/standard/handler.go +++ b/services/validators/standard/handler.go @@ -216,46 +216,44 @@ func (s *Service) onEpochTransitionValidatorBalancesForEpoch(ctx context.Context if err != nil { return errors.Wrap(err, "failed to begin transaction for validator balances") } - if s.balances { - dbValidatorBalances := make([]*chaindb.ValidatorBalance, 0, len(validators)) - for index, validator := range validators { - // Do not store 0 balances. - if validator.Balance == 0 { - continue - } - dbValidatorBalances = append(dbValidatorBalances, &chaindb.ValidatorBalance{ - Index: index, - Epoch: epoch, - Balance: validator.Balance, - EffectiveBalance: validator.Validator.EffectiveBalance, - }) + dbValidatorBalances := make([]*chaindb.ValidatorBalance, 0, len(validators)) + for index, validator := range validators { + // Do not store 0 balances. + if validator.Balance == 0 { + continue } - // Refuse to advance the cursor when the "do not store 0 balances" - // filter above amplifies an all-zero-balance beacon response into a - // zero-row insert. Cancel the open transaction so the cursor is - // not committed; same warn-log contract as the empty-map guard. - if len(dbValidatorBalances) == 0 { - cancel() - log.Warn().Msg(noValidatorBalancesMsg) - return errors.New("beacon returned no validator balances") + dbValidatorBalances = append(dbValidatorBalances, &chaindb.ValidatorBalance{ + Index: index, + Epoch: epoch, + Balance: validator.Balance, + EffectiveBalance: validator.Validator.EffectiveBalance, + }) + } + // Refuse to advance the cursor when the "do not store 0 balances" + // filter above amplifies an all-zero-balance beacon response into a + // zero-row insert. Cancel the open transaction so the cursor is + // not committed; same warn-log contract as the empty-map guard. + if len(dbValidatorBalances) == 0 { + cancel() + log.Warn().Msg(noValidatorBalancesMsg) + return errors.New("beacon returned no validator balances") + } + if err := s.validatorsSetter.SetValidatorBalances(dbCtx, dbValidatorBalances); err != nil { + log.Trace().Err(err).Msg("Bulk insert failed; falling back to individual insert") + // This error will have caused the transaction to fail, so cancel it and start a new one. + cancel() + dbCtx, cancel, err = s.chainDB.BeginTx(ctx) + if err != nil { + return errors.Wrap(err, "failed to begin transaction for validator balances (2)") } - if err := s.validatorsSetter.SetValidatorBalances(dbCtx, dbValidatorBalances); err != nil { - log.Trace().Err(err).Msg("Bulk insert failed; falling back to individual insert") - // This error will have caused the transaction to fail, so cancel it and start a new one. - cancel() - dbCtx, cancel, err = s.chainDB.BeginTx(ctx) - if err != nil { - return errors.Wrap(err, "failed to begin transaction for validator balances (2)") - } - for _, dbValidatorBalance := range dbValidatorBalances { - if err := s.validatorsSetter.SetValidatorBalance(dbCtx, dbValidatorBalance); err != nil { - cancel() - return errors.Wrap(err, "failed to set validator balance") - } + for _, dbValidatorBalance := range dbValidatorBalances { + if err := s.validatorsSetter.SetValidatorBalance(dbCtx, dbValidatorBalance); err != nil { + cancel() + return errors.Wrap(err, "failed to set validator balance") } } - md.LatestBalancesEpoch = epoch } + md.LatestBalancesEpoch = epoch span.AddEvent("Updated validators") if err := s.setMetadata(dbCtx, md); err != nil { From 5e16b8809dc0ebfb8806be4aa68f6c8a27c3ed6c Mon Sep 17 00:00:00 2001 From: AntiD2ta Date: Fri, 8 May 2026 09:13:23 +0200 Subject: [PATCH 11/25] Cache FirstSlotOfEpoch in onEpochTransitionValidatorBalancesForEpoch Three identical calls to s.chainTime.FirstSlotOfEpoch(epoch) collapse into a single firstSlot local bound at the top of the function: fmt.Sprintf for the beacon API state ID, the trace log, and the OTEL span attribute now all read the same cached value. Flagged as a follow-up in commit 1df1613's "Notes for next iteration": Efficiency: s.chainTime.FirstSlotOfEpoch(epoch) is called three times in onEpochTransitionValidatorBalancesForEpoch (lines 192, 193, 212). Trivial to cache in a local variable; pre-existing, flagged during review of this commit, not in scope here. This is that commit. The motivation is structural rather than performance: FirstSlotOfEpoch itself is a single multiplication (services/chaintime/standard/ service.go:201-203) and the parent runs at most once per ~6-minute finality tick, so the saved cycles are negligible. The value is single-source-of-truth for the slot identity used in three sinks: if a future change swaps one of the three sites for LastSlotOfEpoch, the two remaining sites would silently disagree under the old shape; under the cached shape such an edit produces a single visible difference at the binding site. Key decisions: - Binding placed after `log` (line 191) and before `stateID` (line 193), matching the file's existing top-of-function setup pattern (`log` -> derived scalars -> I/O). `firstSlot` mirrors the sibling local `firstEpoch` in onEpochTransitionValidatorBalances (line 171) for naming consistency. - Cast forms preserved verbatim at each site: `fmt.Sprintf("%d", firstSlot)`, `uint64(firstSlot)`, `int(firstSlot)`. `phase0.Slot` is a `uint64` alias so the casts are no-op widenings; preserving them keeps the diff a literal substitution and the OTEL/zerolog type contracts unchanged. - No tests added or modified. The four existing internal tests in handler_internal_test.go drive onEpochTransitionValidatorBalances- ForEpoch with a chaintimeStub whose FirstSlotOfEpoch returns `phase0.Slot(uint64(epoch) * 32)` deterministically; they assert the values produced, not the call count, and continue to pass without modification. A behavioral test would be defensive design for a hypothetical future caller. Files changed: - services/validators/standard/handler.go - introduce `firstSlot` local; replace three direct calls with the cached value. Notes for next iteration: - Two of the three follow-up items from commit 1df1613's notes remain open: (a) the BeginTx/setMetadata/CommitTx envelope is acquired before the 1.3M-validator slice build, widening the transaction window beyond what is necessary - the broader cleanup pass that f6d1ef8 also flagged; (b) the inline comment "same warn-log contract as the empty-map guard" (handler.go:235) uses "empty-map" while the rest of the file and ADR 0002 use "empty-response" / "empty 200-OK" - terminology drift, candidate for a wider docs/comments tidy-up. Both intentionally deferred to keep this commit narrow. - Pre-existing flake: services/scheduler/standard TestManyJobs triggers the deadlock detector (recursive locking on s.jobsMutex / job.stateLock) intermittently. Confirmed to occur on the clean branch tip without this change via git stash; not a regression of this refactor. Worth a separate bug-confirming test pass when someone takes it on. --- services/validators/standard/handler.go | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/services/validators/standard/handler.go b/services/validators/standard/handler.go index 3ffddb6..f75fb9f 100644 --- a/services/validators/standard/handler.go +++ b/services/validators/standard/handler.go @@ -189,8 +189,9 @@ func (s *Service) onEpochTransitionValidatorBalancesForEpoch(ctx context.Context defer span.End() log := log.With().Uint64("epoch", uint64(epoch)).Logger() - stateID := fmt.Sprintf("%d", s.chainTime.FirstSlotOfEpoch(epoch)) - log.Trace().Uint64("slot", uint64(s.chainTime.FirstSlotOfEpoch(epoch))).Msg("Fetching validators") + firstSlot := s.chainTime.FirstSlotOfEpoch(epoch) + stateID := fmt.Sprintf("%d", firstSlot) + log.Trace().Uint64("slot", uint64(firstSlot)).Msg("Fetching validators") validatorsResponse, err := s.eth2Client.(eth2client.ValidatorsProvider).Validators(ctx, &api.ValidatorsOpts{ State: stateID, }) @@ -209,7 +210,7 @@ func (s *Service) onEpochTransitionValidatorBalancesForEpoch(ctx context.Context } span.AddEvent("Obtained validators", trace.WithAttributes( - attribute.Int("slot", int(s.chainTime.FirstSlotOfEpoch(epoch))), + attribute.Int("slot", int(firstSlot)), )) dbCtx, cancel, err := s.chainDB.BeginTx(ctx) From b5e3646670b369ebef28a6e962aaa650f59a3528 Mon Sep 17 00:00:00 2001 From: AntiD2ta Date: Fri, 8 May 2026 09:24:00 +0200 Subject: [PATCH 12/25] Align downstream-guard comment with ADR 0002 "empty-response" vocabulary The inline back-reference at services/validators/standard/handler.go:236 called the upstream guard the "empty-map guard", but the rest of the file (line 203's self-description: "an empty 200-OK validators response"), the test file (handler_internal_test.go uses both "empty 200-OK response" and "empty-response"), and ADR 0002 throughout (option-1 header, decision outcome, rejected-alternative discussion) all use "empty-response" as the canonical short name. Flagged twice in recent commit notes (5e16b88, 1df1613) as "minor terminology drift, candidate for a wider docs/comments tidy-up." This is the focused alignment. Single-word swap, comment-only. go build, go vet, gofmt, and the four existing tests in services/validators/standard pass unchanged. No behavior change, no test changes. Key decisions: - Minimal scope. A repo-wide grep for "empty-map" returned exactly one match (handler.go:236); expanding into a "wider docs/comments tidy-up" sweep would be speculative churn against the rest of the file's already-consistent vocabulary. The wider sweep stays a separate follow-up if anyone takes it on. - "empty-response guard" chosen over "empty 200-OK guard" because ADR 0002 uses the former as the canonical short name (option-1 header, decision-outcome paragraph, rejected-alternatives discussion). The downstream guard at line 237 is the same no-empty-write contract reified for the all-zero-balance shape, not a different category, so a single umbrella term applies to both back-references. - Comment structure preserved verbatim. Surrounding lines (233-235, 237-241) are byte-identical to pre-change; no rewrite of adjacent prose, no whitespace shifts. Files changed: - services/validators/standard/handler.go - single word in the inline comment block above the all-zero-balance guard. Notes for next iteration: - The services/scheduler/standard TestManyJobs deadlock-detector trip, flagged as "intermittent" in 5e16b88's notes, is now reproducing 5/5 on the parent commit (without this change). The detector reports recursive locking on s.jobsMutex inside ScheduleJob.func1. The symptom has shifted from intermittent to deterministic since the prior commit's note was written - worth a focused bug-confirming pass when someone takes it on, since reliable repro now makes the bug tractable. - Two structural follow-ups remain from earlier commit notes: the BeginTx/setMetadata/CommitTx envelope in onEpochTransitionValidatorBalancesForEpoch is still acquired before the 1.3M-validator slice build (widening the transaction window beyond what is necessary), and pre-existing chaindb diagnostics (blocks.go:895 nilness, validators.go:523/598 slicessort, validatorepochsummaries.go:221/228/234/251 QF1012 sites) await a dedicated cleanup pass. - The repo's .golangci.yml is incompatible with the installed golangci-lint v2 binary ("'output.formats' expected a map, got 'slice'"). Pre-existing config drift, unrelated to this change; candidate for a separate lint-config refresh commit. - Running "go fix ./..." against the current tree produces a large interface{} -> any rewrite across ~14 files (chaindb/ postgresql/*.go, summarizer/, scheduler/ tests, etc.). These rewrites were reverted from this commit to keep scope narrow. Worth a dedicated commit if the maintainers want the modernisation in. --- services/validators/standard/handler.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/validators/standard/handler.go b/services/validators/standard/handler.go index f75fb9f..d6a8604 100644 --- a/services/validators/standard/handler.go +++ b/services/validators/standard/handler.go @@ -233,7 +233,7 @@ func (s *Service) onEpochTransitionValidatorBalancesForEpoch(ctx context.Context // Refuse to advance the cursor when the "do not store 0 balances" // filter above amplifies an all-zero-balance beacon response into a // zero-row insert. Cancel the open transaction so the cursor is - // not committed; same warn-log contract as the empty-map guard. + // not committed; same warn-log contract as the empty-response guard. if len(dbValidatorBalances) == 0 { cancel() log.Warn().Msg(noValidatorBalancesMsg) From ddbb6c4afa06c873b53ca2afdd7989d495ce35b1 Mon Sep 17 00:00:00 2001 From: AntiD2ta Date: Fri, 8 May 2026 09:34:27 +0200 Subject: [PATCH 13/25] Migrate .golangci.yml to golangci-lint v2 schema The repo's .golangci.yml was a v1-format configuration; the installed golangci-lint v2.12.2 fails to load it with: Error: can't load config: 'output.formats' expected a map, got 'slice' Flagged in commit b5e3646's "Notes for next iteration" as pre-existing config drift, candidate for a separate lint-config refresh commit. This is that commit. The migration was performed via "golangci-lint migrate --skip-validation" (the bundled v1->v2 converter). The most visible shape changes are structural rather than behavioural: - Top-level "version: 2" added (now required). - "output.formats" is a map keyed by formatter name instead of a list of {format, path} entries. - "linters.enable-all: true" -> "linters.default: all". - Formatter linters (gofmt, gofumpt, goimports) split out of "linters" into a new top-level "formatters" section. - Per-linter settings moved from "linters-settings" to "linters.settings". - Exclusion globs (".*_ssz\\.go$") moved into both "linters.exclusions.paths" and "formatters.exclusions.paths" -- v2 evaluates exclusions per-section rather than globally. Two linters that the v1 disable list named are silently dropped by the migrator because they no longer exist as separately enableable units in v2: - "gci" -- in v2 it is a formatter, not a linter, and the formatters section here only enables gofmt/gofumpt/goimports. Effective status (gci off) unchanged. - "tenv" -- removed in golangci-lint v2 (folded into usetesting). Nothing to opt out of. Key decisions: - Used the bundled "golangci-lint migrate" rather than hand-editing. The tool warns that comments are not migrated, which the original config used heavily as documentation for default values; the migrated file drops those comments in favour of a compact, valid v2 shape. Restoring the comment block would mean re-deriving the v2 defaults by hand and re-annotating; out of scope for a config-refresh commit. - The migrate tool also drops the "issues.include" list of specific exclusion IDs (EXC0002, EXC0005, EXC0009, EXC0011, EXC0012, EXC0014) and replaces them with the broader v2 presets ("common-false-positives" and "std-error-handling"). This is a small behavioural change in what gets excluded, but the v2 presets are the recommended idiom; tuning back to the exact original ID set is left for a follow-up if specific noise appears. - "run.timeout: 10m" is intentionally not preserved; the migrator warns that v2 disables the timeout by default, and matching that default seems preferable to re-introducing a v1-era setting. - Did not add an explicit "gomodguard_v2" entry even though the v2 linter run prints a deprecation warning for "gomodguard". The warning is informational only -- the config still loads, the lint still runs -- and a deeper linter-set refresh (which linters to enable/disable in v2) is a wider piece of work than this commit's scope. Files changed: - .golangci.yml -- full v1 -> v2 migration via "golangci-lint migrate --skip-validation". 120 lines -> 75 lines. Notes for next iteration: - The migrated config emits a deprecation warning for "gomodguard" on every run. Worth a focused follow-up to either swap to "gomodguard_v2" with equivalent settings or to disable it explicitly if no module restrictions are wanted. - The exclusion-ID -> presets translation is approximate. If the v2 lint output now flags previously-suppressed cases (e.g., revive's package-comments, exported-type comments, comments-on- exported), a future commit should restore the original set explicitly via "linters.exclusions.rules" entries. - "linters-settings.lll" / "lll: 132" still present in the migrated config but lll is in the disable list, so the setting is dead. Cosmetic, not a config error; can be removed in a cleanup pass. --- .golangci.yml | 157 ++++++++++++++------------------------------------ 1 file changed, 44 insertions(+), 113 deletions(-) diff --git a/.golangci.yml b/.golangci.yml index d431254..4359c4f 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -1,120 +1,14 @@ -# This file contains all available configuration options -# with their default values (in comments). -# -# This file is not a configuration example, -# it contains the exhaustive configuration with explanations of the options. - -issues: - # Which files to exclude: they will be analyzed, but issues from them won't be reported. - # There is no need to include all autogenerated files, - # we confidently recognize autogenerated files. - # If it's not, please let us know. - # "/" will be replaced by current OS file path separator to properly work on Windows. - # Default: [] - exclude-files: - - ".*_ssz\\.go$" - - include: - - 'EXC0002' - - 'EXC0005' - - 'EXC0009' - - 'EXC0011' - - 'EXC0012' - - 'EXC0014' - -# Options for analysis running. +version: "2" run: - # The default concurrency value is the number of available CPU. - # concurrency: 4 - - # Timeout for analysis, e.g. 30s, 5m. - # Default: 1m - timeout: 10m - - # Exit code when at least one issue was found. - # Default: 1 - # issues-exit-code: 2 - - # Include test files or not. - # Default: true - tests: false - - # List of build tags, all linters use it. - # Default: []. - # build-tags: - # - mytag - - # Which dirs to skip: issues from them won't be reported. - # Can use regexp here: `generated.*`, regexp is applied on full path. - # Default value is empty list, - # but default dirs are skipped independently of this option's value (see skip-dirs-use-default). - # "/" will be replaced by current OS file path separator to properly work on Windows. - # skip-dirs: - # - autogenerated_by_my_lib - - # Enables skipping of directories: - # - vendor$, third_party$, testdata$, examples$, Godeps$, builtin$ - # Default: true - # skip-dirs-use-default: false - - # If set we pass it to "go list -mod={option}". From "go help modules": - # If invoked with -mod=readonly, the go command is disallowed from the implicit - # automatic updating of go.mod described above. Instead, it fails when any changes - # to go.mod are needed. This setting is most useful to check that go.mod does - # not need updates, such as in a continuous integration and testing system. - # If invoked with -mod=vendor, the go command assumes that the vendor - # directory holds the correct copies of dependencies and ignores - # the dependency descriptions in go.mod. - # - # Allowed values: readonly|vendor|mod - # By default, it isn't set. modules-download-mode: readonly - - # Allow multiple parallel golangci-lint instances running. - # If false (default) - golangci-lint acquires file lock on start. + tests: false allow-parallel-runners: true - - # Define the Go version limit. - # Mainly related to generics support since go1.18. - # Default: use Go version from the go.mod file, fallback on the env var `GOVERSION`, fallback on 1.18 - # go: '1.19' - - -# output configuration options output: formats: - - format: colored-line-number + text: path: stderr - -# All available settings of specific linters. -linters-settings: - gosec: - excludes: - - G115 # This generates a lot of false positives, recheck once https://github.com/securego/gosec/issues/1212 is closed - - lll: - line-length: 132 - - nlreturn: - # Allow two-line blocks without requiring a newline - block-size: 3 - - stylecheck: - checks: [ "all", "-ST1000" ] - - tagliatelle: - case: - # use-field-name: true - rules: - json: snake - yaml: snake - linters: - # Enable all available linters. - # Default: false - enable-all: true - # Disable specific linter - # https://golangci-lint.run/usage/linters/#disabled-by-default + default: all disable: - cyclop - depguard @@ -124,12 +18,11 @@ linters: - exhaustruct - forcetypeassert - funlen - - gci - gochecknoglobals - gocognit - goconst - - ireturn - inamedparam + - ireturn - lll - mnd - nestif @@ -137,7 +30,45 @@ linters: - nlreturn - perfsprint - promlinter - - tenv - varnamelen - wrapcheck - wsl + settings: + gosec: + excludes: + - G115 + lll: + line-length: 132 + nlreturn: + block-size: 3 + staticcheck: + checks: + - all + - -ST1000 + tagliatelle: + case: + rules: + json: snake + yaml: snake + exclusions: + generated: lax + presets: + - common-false-positives + - std-error-handling + paths: + - .*_ssz\.go$ + - third_party$ + - builtin$ + - examples$ +formatters: + enable: + - gofmt + - gofumpt + - goimports + exclusions: + generated: lax + paths: + - .*_ssz\.go$ + - third_party$ + - builtin$ + - examples$ From 67a61f2724d7916991b13bfed10dc9966419b803 Mon Sep 17 00:00:00 2001 From: AntiD2ta Date: Fri, 8 May 2026 09:35:37 +0200 Subject: [PATCH 14/25] Apply "go fix ./..." modernisation sweep Flagged in commit b5e3646's "Notes for next iteration": Running "go fix ./..." against the current tree produces a large interface{} -> any rewrite across ~14 files (chaindb/postgresql/*.go, summarizer/, scheduler/ tests, etc.). These rewrites were reverted from this commit to keep scope narrow. Worth a dedicated commit if the maintainers want the modernisation in. This is that commit. Identical in scope to what "go fix ./..." emits against the tree at this point, with no hand-edits beyond what the fixer produced. All test/build/vet gates remain green; the pre- existing services/scheduler/standard TestManyJobs deadlock-detector trip (deterministic on this branch tip, unrelated to this commit per b5e3646's notes) remains the sole failing case. The sweep covers five categories of go-fix-driven modernisations: 1. interface{} -> any (Go 1.18+ alias) - services/chaindb/postgresql/blobsidecars.go - services/chaindb/postgresql/proposerduties.go - services/chaindb/postgresql/setblobsidecars.go - services/chaindb/postgresql/setconsolidationrequests.go - services/chaindb/postgresql/setdepositrequests.go - services/chaindb/postgresql/setwithdrawalrequests.go 2. sort.Slice(s, func(i, j) ...) -> slices.Sort(s) (Go 1.21+) - services/chaindb/postgresql/aggregatevalidatorbalances.go - services/chaindb/postgresql/validators.go (two sites) - services/summarizer/standard/epoch.go (two sites) Also addresses the slicessort diagnostic flagged in b5e3646's notes (validators.go:523/598). 3. strings.HasPrefix(s, p) + strings.TrimPrefix(s, p) -> strings.CutPrefix (Go 1.20+) - services/chaindb/postgresql/chainspec.go 4. ad-hoc min/max via if-block -> max()/min() builtin (Go 1.21+) - services/chaintime/standard/service.go FirstEpochOfSyncPeriod's altair-fork floor. - services/eth1deposits/getlogs/service.go blocksPerRequest end-block clamp. 5. C-style "for i := 0; i < n; i++" -> "for i := range n" or "for range n" (Go 1.22+) - services/scheduler/standard/service_test.go (two sites) - services/summarizer/standard/handler_internal_test.go Key decisions: - One bundled commit rather than five micro-commits. Each category is mechanical, the diff is small, and "go fix ./..." names the scope at a glance for any future bisection. Splitting by category would introduce churn without making any individual change easier to audit. - No surrounding-code cleanup or refactor folded in. Cases like validators.go's pre-fix sort.Slice closure could plausibly have been reordered alongside other nearby cleanups, but the rule for go-fix sweeps is "exactly what the fixer emits, nothing more" so the diff stays trivially auditable. - The commit covers the slicessort diagnostic flagged in b5e3646's notes (validators.go:523/598) as a side effect. Two of the three pre-existing chaindb diagnostics from those notes (blocks.go:895 nilness, validatorepochsummaries.go:221/228/234/ 251 QF1012) are NOT addressed here -- "go fix" does not target those classes -- and remain candidates for separate focused commits. Files changed (14 total, 25 insertions, 37 deletions): - services/chaindb/postgresql/aggregatevalidatorbalances.go - services/chaindb/postgresql/blobsidecars.go - services/chaindb/postgresql/chainspec.go - services/chaindb/postgresql/proposerduties.go - services/chaindb/postgresql/setblobsidecars.go - services/chaindb/postgresql/setconsolidationrequests.go - services/chaindb/postgresql/setdepositrequests.go - services/chaindb/postgresql/setwithdrawalrequests.go - services/chaindb/postgresql/validators.go - services/chaintime/standard/service.go - services/eth1deposits/getlogs/service.go - services/scheduler/standard/service_test.go - services/summarizer/standard/epoch.go - services/summarizer/standard/handler_internal_test.go Notes for next iteration: - blocks.go:895 nilness diagnostic, flagged in b5e3646's notes, is not addressed by this sweep. Worth a focused commit. - validatorepochsummaries.go:221/228/234/251 QF1012 diagnostics, flagged in b5e3646's notes, are not addressed by this sweep ("go fix" does not target staticcheck quickfixes). Worth a focused commit. - The BeginTx/setMetadata/CommitTx envelope refactor in onEpochTransitionValidatorBalancesForEpoch (flagged across f6d1ef8, 1df1613, 5e16b88, b5e3646) remains open. --- .../chaindb/postgresql/aggregatevalidatorbalances.go | 6 ++---- services/chaindb/postgresql/blobsidecars.go | 2 +- services/chaindb/postgresql/chainspec.go | 4 ++-- services/chaindb/postgresql/proposerduties.go | 2 +- services/chaindb/postgresql/setblobsidecars.go | 4 ++-- .../chaindb/postgresql/setconsolidationrequests.go | 4 ++-- services/chaindb/postgresql/setdepositrequests.go | 4 ++-- services/chaindb/postgresql/setwithdrawalrequests.go | 4 ++-- services/chaindb/postgresql/validators.go | 10 +++------- services/chaintime/standard/service.go | 5 +---- services/eth1deposits/getlogs/service.go | 5 +---- services/scheduler/standard/service_test.go | 4 ++-- services/summarizer/standard/epoch.go | 6 +++--- services/summarizer/standard/handler_internal_test.go | 2 +- 14 files changed, 25 insertions(+), 37 deletions(-) diff --git a/services/chaindb/postgresql/aggregatevalidatorbalances.go b/services/chaindb/postgresql/aggregatevalidatorbalances.go index 3bee86a..46c7932 100644 --- a/services/chaindb/postgresql/aggregatevalidatorbalances.go +++ b/services/chaindb/postgresql/aggregatevalidatorbalances.go @@ -16,7 +16,7 @@ package postgresql import ( "context" "fmt" - "sort" + "slices" "strings" "github.com/attestantio/go-eth2-client/spec/phase0" @@ -215,9 +215,7 @@ func (s *Service) AggregateValidatorBalancesByIndexAndEpochs( // This allows us to form a query that is significantly faster than the simple IN() style. func fastIndices(validatorIndices []phase0.ValidatorIndex) string { // Sort the validator indices. - sort.Slice(validatorIndices, func(i, j int) bool { - return validatorIndices[i] < validatorIndices[j] - }) + slices.Sort(validatorIndices) // Create an array for the validator indices. This gives us higher performance for our query. indices := make([]string, len(validatorIndices)) diff --git a/services/chaindb/postgresql/blobsidecars.go b/services/chaindb/postgresql/blobsidecars.go index c64f49b..92f1bdd 100644 --- a/services/chaindb/postgresql/blobsidecars.go +++ b/services/chaindb/postgresql/blobsidecars.go @@ -46,7 +46,7 @@ func (s *Service) BlobSidecars(ctx context.Context, // Build the query. queryBuilder := strings.Builder{} - queryVals := make([]interface{}, 0) + queryVals := make([]any, 0) queryBuilder.WriteString(` SELECT f_block_root diff --git a/services/chaindb/postgresql/chainspec.go b/services/chaindb/postgresql/chainspec.go index bf1bd8f..af11150 100644 --- a/services/chaindb/postgresql/chainspec.go +++ b/services/chaindb/postgresql/chainspec.go @@ -160,8 +160,8 @@ func dbValToSpec(_ context.Context, key string, val string) any { } // Handle hex strings. - if strings.HasPrefix(val, "0x") { - byteVal, err := hex.DecodeString(strings.TrimPrefix(val, "0x")) + if after, ok := strings.CutPrefix(val, "0x"); ok { + byteVal, err := hex.DecodeString(after) if err == nil { return byteVal } diff --git a/services/chaindb/postgresql/proposerduties.go b/services/chaindb/postgresql/proposerduties.go index 755bfeb..8c90e69 100644 --- a/services/chaindb/postgresql/proposerduties.go +++ b/services/chaindb/postgresql/proposerduties.go @@ -166,7 +166,7 @@ func (s *Service) ProposerDuties(ctx context.Context, filter *chaindb.ProposerDu // Build the query. queryBuilder := strings.Builder{} - queryVals := make([]interface{}, 0) + queryVals := make([]any, 0) queryBuilder.WriteString(` SELECT f_slot diff --git a/services/chaindb/postgresql/setblobsidecars.go b/services/chaindb/postgresql/setblobsidecars.go index 9f0c92d..06449dd 100644 --- a/services/chaindb/postgresql/setblobsidecars.go +++ b/services/chaindb/postgresql/setblobsidecars.go @@ -50,7 +50,7 @@ func (s *Service) SetBlobSidecars(ctx context.Context, blobSidecars []*chaindb.B "f_kzg_proof", "f_kzg_commitment_inclusion_proof", }, - pgx.CopyFromSlice(len(blobSidecars), func(i int) ([]interface{}, error) { + pgx.CopyFromSlice(len(blobSidecars), func(i int) ([]any, error) { var blob *[]byte if len(blobSidecars[i].Blob) > 0 { blobBytes := blobSidecars[i].Blob[:] @@ -63,7 +63,7 @@ func (s *Service) SetBlobSidecars(ctx context.Context, blobSidecars []*chaindb.B kzgCommitmentInclusionProof = append(kzgCommitmentInclusionProof, blobSidecars[i].KZGCommitmentInclusionProof[j][:]...) } - return []interface{}{ + return []any{ blobSidecars[i].InclusionBlockRoot[:], blobSidecars[i].InclusionSlot, blobSidecars[i].InclusionIndex, diff --git a/services/chaindb/postgresql/setconsolidationrequests.go b/services/chaindb/postgresql/setconsolidationrequests.go index cafea0d..42cf869 100644 --- a/services/chaindb/postgresql/setconsolidationrequests.go +++ b/services/chaindb/postgresql/setconsolidationrequests.go @@ -48,8 +48,8 @@ func (s *Service) SetConsolidationRequests(ctx context.Context, requests []*chai "f_source_pubkey", "f_target_pubkey", }, - pgx.CopyFromSlice(len(requests), func(i int) ([]interface{}, error) { - return []interface{}{ + pgx.CopyFromSlice(len(requests), func(i int) ([]any, error) { + return []any{ requests[i].InclusionBlockRoot[:], requests[i].InclusionSlot, requests[i].InclusionIndex, diff --git a/services/chaindb/postgresql/setdepositrequests.go b/services/chaindb/postgresql/setdepositrequests.go index 67a68a3..7fb7d1f 100644 --- a/services/chaindb/postgresql/setdepositrequests.go +++ b/services/chaindb/postgresql/setdepositrequests.go @@ -50,8 +50,8 @@ func (s *Service) SetDepositRequests(ctx context.Context, requests []*chaindb.De "f_signature", "f_deposit_index", }, - pgx.CopyFromSlice(len(requests), func(i int) ([]interface{}, error) { - return []interface{}{ + pgx.CopyFromSlice(len(requests), func(i int) ([]any, error) { + return []any{ requests[i].InclusionBlockRoot[:], requests[i].InclusionSlot, requests[i].InclusionIndex, diff --git a/services/chaindb/postgresql/setwithdrawalrequests.go b/services/chaindb/postgresql/setwithdrawalrequests.go index 001e900..31d9dc6 100644 --- a/services/chaindb/postgresql/setwithdrawalrequests.go +++ b/services/chaindb/postgresql/setwithdrawalrequests.go @@ -48,8 +48,8 @@ func (s *Service) SetWithdrawalRequests(ctx context.Context, requests []*chaindb "f_validator_pubkey", "f_amount", }, - pgx.CopyFromSlice(len(requests), func(i int) ([]interface{}, error) { - return []interface{}{ + pgx.CopyFromSlice(len(requests), func(i int) ([]any, error) { + return []any{ requests[i].InclusionBlockRoot[:], requests[i].InclusionSlot, requests[i].InclusionIndex, diff --git a/services/chaindb/postgresql/validators.go b/services/chaindb/postgresql/validators.go index 2715522..d0efe2e 100644 --- a/services/chaindb/postgresql/validators.go +++ b/services/chaindb/postgresql/validators.go @@ -17,7 +17,7 @@ import ( "context" "database/sql" "fmt" - "sort" + "slices" "strings" "github.com/attestantio/go-eth2-client/spec/phase0" @@ -520,9 +520,7 @@ func (s *Service) ValidatorBalancesByIndexAndEpochRange( } // Sort the validator indices. - sort.Slice(validatorIndices, func(i, j int) bool { - return validatorIndices[i] < validatorIndices[j] - }) + slices.Sort(validatorIndices) // Create a matrix of the values we require. This allows the database to fill in the blanks when it doesn't have a balance for // the required (index,epoch) tuple (for example when the balance is 0). @@ -595,9 +593,7 @@ func (s *Service) ValidatorBalancesByIndexAndEpochs( } // Sort the validator indices. - sort.Slice(validatorIndices, func(i, j int) bool { - return validatorIndices[i] < validatorIndices[j] - }) + slices.Sort(validatorIndices) // Create a matrix of the values we require. This allows the database to fill in the blanks when it doesn't have a balance for // the required (index,epoch) tuple (for example when the balance is 0). diff --git a/services/chaintime/standard/service.go b/services/chaintime/standard/service.go index 96911b8..463e87c 100644 --- a/services/chaintime/standard/service.go +++ b/services/chaintime/standard/service.go @@ -228,10 +228,7 @@ func (s *Service) TimestampToEpoch(timestamp time.Time) phase0.Epoch { // FirstEpochOfSyncPeriod provides the first epoch of the given sync period. // Note that epochs before the sync committee period will provide the Altair hard fork epoch. func (s *Service) FirstEpochOfSyncPeriod(period uint64) phase0.Epoch { - epoch := phase0.Epoch(period * s.epochsPerSyncCommitteePeriod) - if epoch < s.altairForkEpoch { - epoch = s.altairForkEpoch - } + epoch := max(phase0.Epoch(period*s.epochsPerSyncCommitteePeriod), s.altairForkEpoch) return epoch } diff --git a/services/eth1deposits/getlogs/service.go b/services/eth1deposits/getlogs/service.go index 3ac565b..810a7ec 100644 --- a/services/eth1deposits/getlogs/service.go +++ b/services/eth1deposits/getlogs/service.go @@ -218,10 +218,7 @@ func (s *Service) parseNewBlocks(ctx context.Context, md *metadata) { log.Trace().Uint64("start_block", md.LatestBlock+1).Uint64("end_block", latestHeadBlock).Msg("Fetching ETH1 logs in batches") for block := md.LatestBlock + 1; block <= latestHeadBlock; block += s.blocksPerRequest { startBlock := block - endBlock := block + s.blocksPerRequest - 1 - if endBlock > latestHeadBlock { - endBlock = latestHeadBlock - } + endBlock := min(block+s.blocksPerRequest-1, latestHeadBlock) log := log.With().Uint64("start_block", startBlock).Uint64("end_block", endBlock).Logger() // Each update goes in to its own transaction, to make the data available sooner. diff --git a/services/scheduler/standard/service_test.go b/services/scheduler/standard/service_test.go index b9152ff..0434100 100644 --- a/services/scheduler/standard/service_test.go +++ b/services/scheduler/standard/service_test.go @@ -449,7 +449,7 @@ func TestManyJobs(t *testing.T) { runTime := time.Now().Add(200 * time.Millisecond) jobs := 2048 - for i := 0; i < jobs; i++ { + for i := range jobs { require.NoError(t, s.ScheduleJob(ctx, "Test", fmt.Sprintf("Job instance %d", i), runTime, runFunc, nil)) } require.Len(t, s.ListJobs(ctx), jobs) @@ -579,7 +579,7 @@ func TestMulti(t *testing.T) { var runWG sync.WaitGroup var setupWG sync.WaitGroup starter := make(chan any) - for i := 0; i < 32; i++ { + for range 32 { setupWG.Add(1) runWG.Add(1) go func() { diff --git a/services/summarizer/standard/epoch.go b/services/summarizer/standard/epoch.go index 6bb4957..0df7ecf 100644 --- a/services/summarizer/standard/epoch.go +++ b/services/summarizer/standard/epoch.go @@ -16,7 +16,7 @@ package standard import ( "context" "fmt" - "sort" + "slices" "time" "github.com/attestantio/go-eth2-client/spec/electra" @@ -460,8 +460,8 @@ func (s *Service) attesterSlashingStatsForSlotRange(ctx context.Context, // intersection returns a list of items common between the two sets. func intersection(set1 []phase0.ValidatorIndex, set2 []phase0.ValidatorIndex) []phase0.ValidatorIndex { - sort.Slice(set1, func(i, j int) bool { return set1[i] < set1[j] }) - sort.Slice(set2, func(i, j int) bool { return set2[i] < set2[j] }) + slices.Sort(set1) + slices.Sort(set2) res := make([]phase0.ValidatorIndex, 0) set1Pos := 0 diff --git a/services/summarizer/standard/handler_internal_test.go b/services/summarizer/standard/handler_internal_test.go index 40b1f2a..33afb65 100644 --- a/services/summarizer/standard/handler_internal_test.go +++ b/services/summarizer/standard/handler_internal_test.go @@ -170,7 +170,7 @@ func TestSummarizeEpochZeroBalanceAssertion(t *testing.T) { validators := make([]*chaindb.Validator, numValidators) balances := make([]*chaindb.ValidatorBalance, numValidators) - for i := 0; i < numValidators; i++ { + for i := range numValidators { idx := phase0.ValidatorIndex(i) validators[i] = &chaindb.Validator{ Index: idx, From ad5a2e7253d620957d50e9e4522d2c31b4e800e0 Mon Sep 17 00:00:00 2001 From: AntiD2ta Date: Fri, 8 May 2026 09:36:33 +0200 Subject: [PATCH 15/25] Remove dead err-nil check in blocks.go LatestBlocks go vet's nilness analyser flags blocks.go:895's `if err != nil` as unreachable. Walking the code confirms it: at line 861, `err` is freshly bound from `rows.Scan(...)` and immediately checked at line 876. The non-nil branch returns; the nil branch falls through to the value-extraction copies (no operations between line 879 and 895 reassign or shadow `err`). By the time control reaches the second check, `err` is provably nil. Flagged in commit b5e3646's "Notes for next iteration" alongside two other pre-existing chaindb diagnostics (validators.go:523/598 slicessort, validatorepochsummaries.go:221/228/234/251 QF1012) left for "a dedicated cleanup pass." This is the focused commit for the nilness one. Single-pattern removal of three lines: - if err != nil { - return nil, err - } with the surrounding `copy(block.ETH1DepositRoot[:], ...)` and `blocks = append(blocks, block)` lines preserved verbatim. No control-flow change for any reachable input -- the deletion only removes a check that statically cannot fire. Key decisions: - Targeted to LatestBlocks only. A repo-wide grep for `if err != nil` in blocks.go returns 20+ matches; auditing each for analogous dead shape is a separate cleanup pass and is not in scope here. - No replacement comment. The pattern is a half-deleted copy-paste remnant from sibling functions in this file (which return `(nil, err)` after a different sequence); explaining its absence would document a non-event. - The fix does not change function shape: LatestBlocks still walks rows in a for-loop, scans each, populates a *chaindb.Block, and appends. No tests in this package exercise LatestBlocks directly (chaindb/postgresql has no internal tests), and the package compiles + the existing `gosilent test` is unaffected. Files changed: - services/chaindb/postgresql/blocks.go -- remove dead `if err != nil` block at the end of the LatestBlocks row-walk loop. Notes for next iteration: - Two chaindb diagnostics from b5e3646's notes remain unaddressed: validatorepochsummaries.go:221/228/234/251 QF1012 (next commit in this sweep). validators.go:523/598 slicessort was already closed by 67a61f2's go-fix sweep. - The LSP reports four further QF1012 sites in this same file (blocks.go:154/161/167/184) that were not in b5e3646's flagged set. Out of scope for this commit; candidate for a wider QF1012 pass if maintainers want the broader cleanup. - The "various if err != nil" copy-paste pattern across blocks.go is worth a focused audit for analogous dead-branch cases. Not attempted here. --- services/chaindb/postgresql/blocks.go | 3 --- 1 file changed, 3 deletions(-) diff --git a/services/chaindb/postgresql/blocks.go b/services/chaindb/postgresql/blocks.go index 3927b95..47a8348 100644 --- a/services/chaindb/postgresql/blocks.go +++ b/services/chaindb/postgresql/blocks.go @@ -892,9 +892,6 @@ func (s *Service) LatestBlocks(ctx context.Context) ([]*chaindb.Block, error) { copy(block.BlobKZGCommitments[i][:], blobKZGCommitments[i]) } } - if err != nil { - return nil, err - } blocks = append(blocks, block) } From 0486b6ad5b6b708c98c16df0240c3885f0bad60f Mon Sep 17 00:00:00 2001 From: AntiD2ta Date: Fri, 8 May 2026 09:37:26 +0200 Subject: [PATCH 16/25] Use fmt.Fprintf in validatorepochsummaries query builder staticcheck QF1012 flags four sites in ValidatorEpochSummaries (lines 221, 228, 234, 251) where the strings.Builder's content is appended via: queryBuilder.WriteString(fmt.Sprintf(`...%s...`, args...)) The recommended idiom writes directly into the builder via fmt.Fprintf, avoiding the intermediate string allocation: fmt.Fprintf(&queryBuilder, `...%s...`, args...) Flagged in commit b5e3646's "Notes for next iteration" alongside two other pre-existing chaindb diagnostics. This is the focused commit for those four QF1012 sites. The behavioural surface is unchanged. fmt.Fprintf into a strings.Builder writes the same bytes as fmt.Sprintf followed by WriteString -- the format-string semantics, the verb interpretation, and the resulting byte sequence are identical. The only observable difference is the absence of the intermediate string heap allocation, which the staticcheck quickfix targets. Key decisions: - Targeted to the four sites flagged in b5e3646's notes. Other WriteString(fmt.Sprintf(...)) call sites elsewhere in the chaindb package were not in the flagged set; auditing them all is a separate cleanup pass. - All four sites use the multi-line raw-string template (`... %s f_... = $%d`) for SQL fragment construction. The diff is a pure call-shape rewrite -- no whitespace adjustment, no template edits, no parameter reordering. - No new import. fmt was already imported in the file (line 16); the fmt.Fprintf change is internal to the same package usage. Files changed: - services/chaindb/postgresql/validatorepochsummaries.go -- four queryBuilder.WriteString(fmt.Sprintf(...)) -> fmt.Fprintf(&queryBuilder, ...) rewrites in ValidatorEpochSummaries (lines 221, 228, 234, 251 in the pre-change file). Notes for next iteration: - The LSP reports four further QF1012 sites in blocks.go (lines 154/161/167/184), surfaced after the dead-err-check removal in the previous commit (ad5a2e7). Same pattern, same one-line rewrite per site. Worth a focused commit if maintainers want the broader QF1012 sweep. - All three chaindb diagnostics from b5e3646's "Notes" are now closed: validators.go:523/598 slicessort (closed by 67a61f2's go-fix sweep), blocks.go:895 nilness (closed by ad5a2e7), and validatorepochsummaries.go:221/228/234/251 QF1012 (this commit). - The BeginTx/setMetadata/CommitTx envelope refactor in onEpochTransitionValidatorBalancesForEpoch (flagged across f6d1ef8, 1df1613, 5e16b88, b5e3646) remains the next pickup. --- .../postgresql/validatorepochsummaries.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/services/chaindb/postgresql/validatorepochsummaries.go b/services/chaindb/postgresql/validatorepochsummaries.go index 4fe2719..a3de808 100644 --- a/services/chaindb/postgresql/validatorepochsummaries.go +++ b/services/chaindb/postgresql/validatorepochsummaries.go @@ -218,21 +218,21 @@ FROM t_validator_epoch_summaries`) if filter.From != nil { queryVals = append(queryVals, *filter.From) - queryBuilder.WriteString(fmt.Sprintf(` -%s f_epoch >= $%d`, wherestr, len(queryVals))) + fmt.Fprintf(&queryBuilder, ` +%s f_epoch >= $%d`, wherestr, len(queryVals)) wherestr = " AND" } if filter.To != nil { queryVals = append(queryVals, *filter.To) - queryBuilder.WriteString(fmt.Sprintf(` -%s f_epoch <= $%d`, wherestr, len(queryVals))) + fmt.Fprintf(&queryBuilder, ` +%s f_epoch <= $%d`, wherestr, len(queryVals)) } if filter.ValidatorIndices != nil && len(*filter.ValidatorIndices) > 0 { queryVals = append(queryVals, *filter.ValidatorIndices) - queryBuilder.WriteString(fmt.Sprintf(` -%s f_validator_index = ANY($%d)`, wherestr, len(queryVals))) + fmt.Fprintf(&queryBuilder, ` +%s f_validator_index = ANY($%d)`, wherestr, len(queryVals)) } switch filter.Order { @@ -248,8 +248,8 @@ ORDER BY f_epoch DESC,f_validator_index DESC`) if filter.Limit > 0 { queryVals = append(queryVals, filter.Limit) - queryBuilder.WriteString(fmt.Sprintf(` -LIMIT $%d`, len(queryVals))) + fmt.Fprintf(&queryBuilder, ` +LIMIT $%d`, len(queryVals)) } if e := log.Trace(); e.Enabled() { From 8be1f3c09f7431beb2b0b6f4a6d97c8cadbd3cd2 Mon Sep 17 00:00:00 2001 From: AntiD2ta Date: Fri, 8 May 2026 09:39:53 +0200 Subject: [PATCH 17/25] Move BeginTx below slice build in onEpochTransitionValidatorBalancesForEpoch The validator-balance fetcher opened its database transaction before the 1.3M-validator slice build, holding the MVCC snapshot and transaction ID open for the duration of a CPU-bound loop that does no database work. Flagged as a pre-existing structural quirk across four recent commits (f6d1ef8, 1df1613, 5e16b88, b5e3646) with the same prescription each time: Efficiency: BeginTx is acquired before the 1.3M-validator slice build, widening the transaction window beyond what is necessary. This commit closes that follow-up. Refactor shape: BeginTx moves from above the slice-build loop to just below the all-zero guard, immediately before the first SetValidatorBalances call. The transaction window now covers exactly the bulk insert + setMetadata + commit envelope, not the preceding pure-CPU loop. A consequence visible in tests: the two empty-response guards are now symmetric. Both fire BEFORE BeginTx, both short-circuit with no transaction to cancel, and the inline comment block above guard 2 is updated to reflect that ("No transaction is open at this point, so the rejection short-circuits cleanly"). The bulk-insert retry path at the original BeginTx-after-rollback site is unchanged: when SetValidatorBalances fails, the retry still cancels the failed transaction and opens a fresh one for the per-row fallback inserts. Test inversion (GREEN after RED): TestOnEpochTransitionValidatorBalancesForEpoch_AllZeroBalancesRejectedAndCursorUnchanged The pre-refactor assertions encoded the old contract: beginTxCalls == 1, cancelCalls == 1 ("guard 2 fires after BeginTx; the open transaction must be cancelled") Post-refactor assertions encode the new contract: beginTxCalls == 0, cancelCalls == 0 ("guard 2 fires before BeginTx; no transaction may open on this path") The docstring's prose was also rewritten -- previously it called out "Unlike guard 1, this path has already opened a transaction by the time it fires"; that asymmetry between the two guards is now gone, and the docstring says so explicitly. The other three test cases (EmptyValidatorsRejectedAndCursorUnchanged, MixedBalancesPersistsNonZero, BeaconErrorLeavesCursorUnchanged) keep their existing assertions and pass unchanged. Key decisions: - Transaction window shrinks but does NOT change atomicity. The bulk-insert + setMetadata + CommitTx envelope is still an atomic unit: either both the validator-balance rows land AND the cursor advances, or neither. ADR 0002's "atomic co-commit of cursor + balance rows" property is preserved; this refactor only changes WHEN the transaction opens, not what it covers. - Slice-build loop intentionally left outside the transaction even though it touches no database state, because moving it inside would re-introduce the wide-window cost. The slice is a function-local []*chaindb.ValidatorBalance with no shared state, so concurrent fetchers cannot observe it; transaction isolation buys nothing. - "cancel" name preserved on the post-move BeginTx return. The retry path at the bulk-insert failure site reassigns it (`dbCtx, cancel, err = s.chainDB.BeginTx(ctx)`); keeping the same variable name avoids cascading downstream changes. - Empty-response guards are now symmetric. ADR 0002's no-empty- write contract treated them as the same category at the contract level; this refactor reifies that symmetry at the implementation level. Future readers can reason about either guard without reaching for "but this one runs after BeginTx." Files changed: - services/validators/standard/handler.go -- move BeginTx below the slice-build loop and below the all-zero guard; update the inline comment to match the post-refactor "no transaction open" semantics. - services/validators/standard/handler_internal_test.go -- invert the AllZeroBalancesRejectedAndCursorUnchanged assertions (beginTxCalls/cancelCalls go from 1/1 to 0/0); rewrite the docstring to reflect the post-refactor symmetry between the two guards. Notes for next iteration: - The dead `if s.balances` block (closed by 1df1613) is an unrelated structural cleanup that has already landed; this commit is the second of the two pre-existing structural quirks flagged in f6d1ef8's notes. - The lag-gauge semantics question (whether a fetcher-vs-finality gauge should join the existing summarizer-vs-fetcher gauge, given the new no-empty-write contract) remains open per f6d1ef8's notes -- a design call for the observability owner, not actionable AFK. - The deterministic services/scheduler/standard TestManyJobs deadlock-detector trip remains open; investigated as a go-deadlock false positive triggered by the 2048-goroutine contention, but not fixed in this commit. --- services/validators/standard/handler.go | 15 +++++------ .../standard/handler_internal_test.go | 25 ++++++++++--------- 2 files changed, 21 insertions(+), 19 deletions(-) diff --git a/services/validators/standard/handler.go b/services/validators/standard/handler.go index d6a8604..91c3a2e 100644 --- a/services/validators/standard/handler.go +++ b/services/validators/standard/handler.go @@ -213,10 +213,6 @@ func (s *Service) onEpochTransitionValidatorBalancesForEpoch(ctx context.Context attribute.Int("slot", int(firstSlot)), )) - dbCtx, cancel, err := s.chainDB.BeginTx(ctx) - if err != nil { - return errors.Wrap(err, "failed to begin transaction for validator balances") - } dbValidatorBalances := make([]*chaindb.ValidatorBalance, 0, len(validators)) for index, validator := range validators { // Do not store 0 balances. @@ -232,13 +228,18 @@ func (s *Service) onEpochTransitionValidatorBalancesForEpoch(ctx context.Context } // Refuse to advance the cursor when the "do not store 0 balances" // filter above amplifies an all-zero-balance beacon response into a - // zero-row insert. Cancel the open transaction so the cursor is - // not committed; same warn-log contract as the empty-response guard. + // zero-row insert. No transaction is open at this point, so the + // rejection short-circuits cleanly; same warn-log contract as the + // empty-response guard. if len(dbValidatorBalances) == 0 { - cancel() log.Warn().Msg(noValidatorBalancesMsg) return errors.New("beacon returned no validator balances") } + + dbCtx, cancel, err := s.chainDB.BeginTx(ctx) + if err != nil { + return errors.Wrap(err, "failed to begin transaction for validator balances") + } if err := s.validatorsSetter.SetValidatorBalances(dbCtx, dbValidatorBalances); err != nil { log.Trace().Err(err).Msg("Bulk insert failed; falling back to individual insert") // This error will have caused the transaction to fail, so cancel it and start a new one. diff --git a/services/validators/standard/handler_internal_test.go b/services/validators/standard/handler_internal_test.go index 3a38977..0f93f89 100644 --- a/services/validators/standard/handler_internal_test.go +++ b/services/validators/standard/handler_internal_test.go @@ -254,13 +254,14 @@ func TestOnEpochTransitionValidatorBalancesForEpoch_EmptyValidatorsRejectedAndCu // TestOnEpochTransitionValidatorBalancesForEpoch_AllZeroBalancesRejectedAndCursorUnchanged // pins the post-fix contract for guard 2 (the all-zero-balances filter // amplifier). The beacon returns 1000 validators with Balance == 0; the -// "Do not store 0 balances" filter at handler.go:206-208 collapses them into -// a zero-row insert, and the guard refuses to advance. Unlike guard 1, this -// path has already opened a transaction by the time it fires, so cancel() -// must be invoked exactly once; the transaction must not commit; and the -// cursor must stay put. Both empty-response shapes (this and -// EmptyValidatorsRejectedAndCursorUnchanged) emit the same warn-log -// alert-contract substring so an alert rule matches them uniformly. +// "Do not store 0 balances" filter collapses them into a zero-row insert, +// and the guard refuses to advance. Like guard 1, this path fires before +// any transaction is opened: BeginTx now sits below the slice build so the +// rejection short-circuits with no transaction to cancel; the transaction +// must not commit; and the cursor must stay put. Both empty-response +// shapes (this and EmptyValidatorsRejectedAndCursorUnchanged) emit the +// same warn-log alert-contract substring so an alert rule matches them +// uniformly. func TestOnEpochTransitionValidatorBalancesForEpoch_AllZeroBalancesRejectedAndCursorUnchanged(t *testing.T) { const epoch = phase0.Epoch(86824) const startCursor = phase0.Epoch(86823) @@ -301,12 +302,12 @@ func TestOnEpochTransitionValidatorBalancesForEpoch_AllZeroBalancesRejectedAndCu "error text must carry the stable contract substring used by alert rules") require.Equal(t, 1, eth2.calls, "beacon Validators must be called exactly once") - require.Equal(t, 1, db.beginTxCalls, - "guard 2 fires after BeginTx; exactly one transaction must open") + require.Equal(t, 0, db.beginTxCalls, + "guard 2 fires before BeginTx; no transaction may open on this path") require.Equal(t, 0, db.commitTxCalls, - "transaction must not commit when the guard rejects the filtered response") - require.Equal(t, 1, db.cancelCalls, - "the open transaction must be cancelled so its writes do not commit") + "no transaction may commit when the guard rejects the filtered response") + require.Equal(t, 0, db.cancelCalls, + "no transaction was opened, so cancel must not be invoked") require.Empty(t, setter.bulkCalls, "SetValidatorBalances must not be called when the filtered slice is empty") From 9c6bd55ae1ad36c2e77da19633ce466ceeff138b Mon Sep 17 00:00:00 2001 From: AntiD2ta Date: Fri, 8 May 2026 09:59:50 +0200 Subject: [PATCH 18/25] Use fmt.Fprintf in blocks query builder staticcheck QF1012 flags four sites in Blocks() (lines 154, 161, 167, 184) where the strings.Builder's content is appended via: queryBuilder.WriteString(fmt.Sprintf(`...%s...`, args...)) The recommended idiom writes directly into the builder via fmt.Fprintf, avoiding the intermediate string allocation: fmt.Fprintf(&queryBuilder, `...%s...`, args...) Flagged in commit 0486b6a's "Notes for next iteration" -- these four sites were surfaced after the dead-err-check removal in ad5a2e7 and are the same pattern that 0486b6a applied to validatorepochsummaries. This is the focused commit for those four QF1012 sites. The behavioural surface is unchanged. fmt.Fprintf into a strings.Builder writes the same bytes as fmt.Sprintf followed by WriteString -- the format-string semantics, the verb interpretation, and the resulting byte sequence are identical. The only observable difference is the absence of the intermediate string heap allocation, which the staticcheck quickfix targets. Key decisions: - Targeted to the four sites flagged in 0486b6a's notes, all inside Blocks() (the BlockFilter query-builder switch). golangci-lint still reports staticcheck QF1012 in attestations.go:720 and beaconcommittees.go:91/98 -- those are the next candidates for the package-wide sweep that 0486b6a's notes anticipated, but auditing all ~41 WriteString(fmt.Sprintf) call sites across chaindb/postgresql is a separate cleanup pass. - All four sites use the multi-line raw-string template (`...%s f_... = $%d` and `...LIMIT $%d`) for SQL fragment construction. The diff is a pure call-shape rewrite -- no whitespace adjustment, no template edits, no parameter reordering. Identical mechanical shape to commit 0486b6a. - No new import. fmt was already imported in the file (line 20); the fmt.Fprintf change is internal to the same package usage. - The unrelated wsl_v5, noinlineerr, and revive issues that golangci-lint reports in this file (lines 54, 103, 108, 262) are NOT addressed here -- they predate the QF1012 cleanup pass and live on different lines. Out of scope. Files changed: - services/chaindb/postgresql/blocks.go -- four queryBuilder.WriteString(fmt.Sprintf(...)) -> fmt.Fprintf(&queryBuilder, ...) rewrites in Blocks() (lines 154, 161, 167, 184 in the pre-change file). Notes for next iteration: - attestations.go:720 and beaconcommittees.go:91/98 (and potentially the unflagged sites at 105/122) report staticcheck QF1012 in current golangci-lint output. Same one-line rewrite per site; candidate for a wider focused commit if maintainers want the package-wide cleanup. - The deterministic services/scheduler/standard TestManyJobs deadlock-detector trip (flagged across f6d1ef8, b5e3646, 8be1f3c) remains open. Per standing direction not to silence go-deadlock false positives in tests or production, it awaits an upstream go-deadlock fix or new direction before the candidate fix paths (Opts callback reconfiguration, dependency drop, test-load pinning) can be applied. - The lag-gauge semantics question (whether a fetcher-vs-finality gauge should join the existing summarizer-vs-fetcher gauge, given the no-empty-write contract introduced in this branch's validator-balance fetcher work) remains open per f6d1ef8's notes -- a design call for the observability owner, not actionable AFK. - The .golangci.yml v2 migration's deprecation warning for "gomodguard" still fires on every lint run (per ddbb6c4's notes); cosmetic-only follow-up. --- services/chaindb/postgresql/blocks.go | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/services/chaindb/postgresql/blocks.go b/services/chaindb/postgresql/blocks.go index 47a8348..a3d3fa9 100644 --- a/services/chaindb/postgresql/blocks.go +++ b/services/chaindb/postgresql/blocks.go @@ -151,21 +151,21 @@ FROM t_blocks`) if filter.From != nil { queryVals = append(queryVals, *filter.From) - queryBuilder.WriteString(fmt.Sprintf(` -%s f_slot >= $%d`, wherestr, len(queryVals))) + fmt.Fprintf(&queryBuilder, ` +%s f_slot >= $%d`, wherestr, len(queryVals)) wherestr = " AND" } if filter.To != nil { queryVals = append(queryVals, *filter.To) - queryBuilder.WriteString(fmt.Sprintf(` -%s f_slot <= $%d`, wherestr, len(queryVals))) + fmt.Fprintf(&queryBuilder, ` +%s f_slot <= $%d`, wherestr, len(queryVals)) } if filter.Canonical != nil { queryVals = append(queryVals, *filter.Canonical) - queryBuilder.WriteString(fmt.Sprintf(` -%s f_canonical = $%d`, wherestr, len(queryVals))) + fmt.Fprintf(&queryBuilder, ` +%s f_canonical = $%d`, wherestr, len(queryVals)) } switch filter.Order { @@ -181,8 +181,8 @@ ORDER BY f_slot DESC,f_root DESC`) if filter.Limit > 0 { queryVals = append(queryVals, filter.Limit) - queryBuilder.WriteString(fmt.Sprintf(` -LIMIT $%d`, len(queryVals))) + fmt.Fprintf(&queryBuilder, ` +LIMIT $%d`, len(queryVals)) } if e := log.Trace(); e.Enabled() { From 6a148f66dbfbc1096b0d47094995128a37951252 Mon Sep 17 00:00:00 2001 From: AntiD2ta Date: Fri, 8 May 2026 11:29:26 +0200 Subject: [PATCH 19/25] Bump go-deadlock from v0.3.5 to v0.3.9 to fix TestManyJobs trip services/scheduler/standard's TestManyJobs (service_test.go:438) schedules 2048 jobs whose timer goroutines wake at the same instant and contend for s.jobsMutex. Under v0.3.5 of github.com/sasha-s/go-deadlock the detector trips deterministically at any n >= 16 concurrent jobs on darwin, killing the test process via the default Opts.OnPotentialDeadlock = os.Exit(2). Two distinct false-positive shapes reproduce on the v0.3.5 baseline: Shape A -- "Recursive locking" with two identical stack frames from the same line/function context (a real recursive lock by a single goroutine cannot produce identical stacks; this is two different goroutines whose IDs the detector has confused). Shape B -- "happened after" with both holders attributed to "goroutine 0" (the detector's placeholder when its goroutine-ID parser falls back), reporting concurrent holders of distinct job.stateLock instances plus the shared jobsMutex. Walking the lock pattern in services/scheduler/standard/service.go confirms no real circular order: - ScheduleJob.func1 (timer branch): acquires jobsMutex alone. Calls finaliseJob later; finaliseJob acquires stateLock alone. - RunJob / CancelJob: acquire jobsMutex, release before touching stateLock. No nested holding. - runJob: acquires stateLock alone. Order across functions is jobsMutex -> (release) -> stateLock in every reachable code path; no path holds both simultaneously. Both the "recursive" shape (identical stack frames) and the "goroutine 0" placeholder point at go-deadlock's goroutine-ID-from-runtime.Stack parser, which has known limits when many short-lived goroutines contend at once. The fix consumes upstream go-deadlock work rather than adding any chaind-side workaround: v0.3.7 (2026-03-07) fixes "concurrent lock tracking problems during contention" -- the load-bearing change that resolves the false positive. v0.3.8 (2026-03-17) replaces the goroutine-per-lock detection mechanism with time.AfterFunc + sync.Pool, removing the parser code path that the "goroutine 0" symptom pointed at, but introduces -race unit-test failures. v0.3.9 (2026-03-18) ships one day after v0.3.8 to "fix existing -race unit test failures" on the same rework. Stable endpoint of the v0.3.7+ fix range. Probe outcome on the development laptop: Baseline (v0.3.5): 0/5 PASS (5/5 deterministic FAIL with the POTENTIAL DEADLOCK report) Upgraded (v0.3.9): 5/5 PASS, then 10/10 PASS (15/15 total, no POTENTIAL DEADLOCK reports, original jobs := 2048 constant unchanged) gosilent test ./... (full suite, v0.3.9): 87/87 PASS, no NEW lock-ordering issues surfaced by the upgraded detector Key decisions: - Bump rather than mutex swap, deadlock.Opts override, or test-load reduction. The standing project direction is not to silence go-deadlock false positives in tests or production -- the bump is the smallest action that does not violate that direction; the upstream detector remains active and now correctly reports no false positives on chaind's lock pattern. - v0.3.9 rather than v0.3.7 (the minimum-sufficient fix). v0.3.7 alone would close the false positive, but v0.3.8's rework + its v0.3.9 -race fix all ship within an 11-day window with a public -race regression in the v0.3.8 middle. Landing on v0.3.9 picks up the fix + rework + rework's follow-up in one bump rather than freezing chaind at v0.3.7 against an actively-evolving upstream. The API surface chaind uses (deadlock.Mutex / deadlock.RWMutex, no Opts configuration, no callback registration) is stable across the entire v0.3.x series, so the rework is invisible at chaind's call sites. - petermattis/goid (transitive dependency of go-deadlock) bumped alongside, matching the goid pseudo-version pinned in go-deadlock v0.3.9's own go.mod. Mechanically required by go mod tidy, not over-specified. - No source change in this commit. The two existing call sites in services/scheduler/standard/service.go (line 32: stateLock deadlock.Mutex; line 46: jobsMutex deadlock.RWMutex) remain byte-identical. The behaviour change is entirely in the upstream library. Files changed: - go.mod -- github.com/sasha-s/go-deadlock v0.3.5 -> v0.3.9; github.com/petermattis/goid pseudo-version bumped to the goid release pinned by go-deadlock v0.3.9. - go.sum -- corresponding hash replacements for both modules (4 stale lines removed, 4 new lines added; no other module hashes touched). go mod verify confirms all modules verified against the public Go checksum DB. Notes for next iteration: - services/scheduler/standard/service.go retains 21 pre-existing golangci-lint findings (wsl_v5, noinlineerr, revive) that are unrelated to this bump and were already documented as out of scope across the recent QF1012 sweep commits. Candidate for a focused style-pass commit if maintainers want the broader cleanup. - The .golangci.yml v2 migration's deprecation warning for "gomodguard" still fires on every lint run; cosmetic-only follow-up tracked separately as a lint-config-hygiene candidate. - The remaining QF1012 sites in attestations.go and beaconcommittees.go (5 sites total) remain candidates for the next focused commit in the QF1012 sweep started in 0486b6a and 9c6bd55. --- go.mod | 4 ++-- go.sum | 9 ++++----- 2 files changed, 6 insertions(+), 7 deletions(-) diff --git a/go.mod b/go.mod index 247f4f8..9df6534 100644 --- a/go.mod +++ b/go.mod @@ -15,7 +15,7 @@ require ( github.com/prometheus/client_golang v1.20.5 github.com/prysmaticlabs/go-bitfield v0.0.0-20240618144021-706c95b2dd15 github.com/rs/zerolog v1.33.0 - github.com/sasha-s/go-deadlock v0.3.5 + github.com/sasha-s/go-deadlock v0.3.9 github.com/shopspring/decimal v1.4.0 github.com/spf13/pflag v1.0.6 github.com/spf13/viper v1.19.0 @@ -72,7 +72,7 @@ require ( github.com/mitchellh/mapstructure v1.5.0 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect github.com/pelletier/go-toml/v2 v2.2.3 // indirect - github.com/petermattis/goid v0.0.0-20250121172306-05bcfb9a85dc // indirect + github.com/petermattis/goid v0.0.0-20250813065127-a731cc31b4fe // indirect github.com/pk910/dynamic-ssz v0.0.5 // indirect github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect github.com/prometheus/client_model v0.6.1 // indirect diff --git a/go.sum b/go.sum index 8336c0c..0e7c8aa 100644 --- a/go.sum +++ b/go.sum @@ -317,9 +317,8 @@ github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 h1:C3w9PqII01/Oq github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822/go.mod h1:+n7T8mK8HuQTcFwEeznm/DIxMOiR9yIdICNftLE1DvQ= github.com/pelletier/go-toml/v2 v2.2.3 h1:YmeHyLY8mFWbdkNWwpr+qIL2bEqT0o95WSdkNHvL12M= github.com/pelletier/go-toml/v2 v2.2.3/go.mod h1:MfCQTFTvCcUyyvvwm1+G6H/jORL20Xlb6rzQu9GuUkc= -github.com/petermattis/goid v0.0.0-20240813172612-4fcff4a6cae7/go.mod h1:pxMtw7cyUw6B2bRH0ZBANSPg+AoSud1I1iyJHI69jH4= -github.com/petermattis/goid v0.0.0-20250121172306-05bcfb9a85dc h1:Xz/LkK9AJRY5QTkA1uE1faB8yeqRFjeKgwDtI13ogcY= -github.com/petermattis/goid v0.0.0-20250121172306-05bcfb9a85dc/go.mod h1:pxMtw7cyUw6B2bRH0ZBANSPg+AoSud1I1iyJHI69jH4= +github.com/petermattis/goid v0.0.0-20250813065127-a731cc31b4fe h1:vHpqOnPlnkba8iSxU4j/CvDSS9J4+F4473esQsYLGoE= +github.com/petermattis/goid v0.0.0-20250813065127-a731cc31b4fe/go.mod h1:pxMtw7cyUw6B2bRH0ZBANSPg+AoSud1I1iyJHI69jH4= github.com/pk910/dynamic-ssz v0.0.5 h1:VP9heGYUwzlpyhk28P2nCAzhvGsePJOOOO5vQMDh2qQ= github.com/pk910/dynamic-ssz v0.0.5/go.mod h1:b6CrLaB2X7pYA+OSEEbkgXDEcRnjLOZIxZTsMuO/Y9c= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= @@ -355,8 +354,8 @@ github.com/sagikazarmark/locafero v0.7.0 h1:5MqpDsTGNDhY8sGp0Aowyf0qKsPrhewaLSsF github.com/sagikazarmark/locafero v0.7.0/go.mod h1:2za3Cg5rMaTMoG/2Ulr9AwtFaIppKXTRYnozin4aB5k= github.com/sagikazarmark/slog-shim v0.1.0 h1:diDBnUNK9N/354PgrxMywXnAwEr1QZcOr6gto+ugjYE= github.com/sagikazarmark/slog-shim v0.1.0/go.mod h1:SrcSrq8aKtyuqEI1uvTDTK1arOWRIczQRv+GVI1AkeQ= -github.com/sasha-s/go-deadlock v0.3.5 h1:tNCOEEDG6tBqrNDOX35j/7hL5FcFViG6awUGROb2NsU= -github.com/sasha-s/go-deadlock v0.3.5/go.mod h1:bugP6EGbdGYObIlx7pUZtWqlvo8k9H6vCBBsiChJQ5U= +github.com/sasha-s/go-deadlock v0.3.9 h1:fiaT9rB7g5sr5ddNZvlwheclN9IP86eFW9WgqlEQV+w= +github.com/sasha-s/go-deadlock v0.3.9/go.mod h1:KuZj51ZFmx42q/mPaYbRk0P1xcwe697zsJKE03vD4/Y= github.com/shopspring/decimal v1.4.0 h1:bxl37RwXBklmTi0C79JfXCEBD1cqqHt0bbgBAGFp81k= github.com/shopspring/decimal v1.4.0/go.mod h1:gawqmDU56v4yIKSwfBSFip1HdCCXN8/+DMd9qYNcwME= github.com/sourcegraph/conc v0.3.0 h1:OQTbbt6P72L20UqAkXXuLOj79LfEanQ+YQFNpLA9ySo= From 32e48c14c1a569c01e821dcca5b3e044e7566d7b Mon Sep 17 00:00:00 2001 From: AntiD2ta Date: Fri, 8 May 2026 11:38:26 +0200 Subject: [PATCH 20/25] Clean up three .golangci.yml v1->v2 migration leftovers Commit ddbb6c4 (".golangci.yml v1->v2 migration") landed the v2 schema and flagged three pre-existing drift items as out of scope and candidates for a focused follow-up commit. Each of the three has since reproduced in every golangci-lint run as either a `level=warning` emission or a now-unsuppressed staticcheck finding: warning: The linter 'gomodguard' is deprecated (since v2.12.0) due to: new major version. Replaced by gomodguard_v2. warning: [runner/nolint_filter] Found unknown linters in //nolint directives: stylecheck ST1005: error strings should not be capitalized (3 visible sites that //nolint:stylecheck no longer suppresses in v2) This commit closes all three. 1. gomodguard added to the linters.disable list. v2.12.0 deprecated gomodguard in favour of gomodguard_v2. chaind has no linters.settings.gomodguard block and no linters.settings.gomodguard_v2 block -- there is no configured module-restriction policy, so the linter is enabled-by-default (via linters.default: all at line 11) but enforces nothing. Adding gomodguard to the disable list reflects the actual no-op state and silences the deprecation warning. Renaming to gomodguard_v2 would relocate the no-op without adding value. Inserted alphabetically between goconst and inamedparam to preserve the existing sort. 2. Four //nolint:stylecheck directives renamed to //nolint:staticcheck. v2 merged the v1 stylecheck linter into staticcheck (single linter under the staticcheck name in v2's namespace). The four //nolint:stylecheck directives target an unknown-in-v2 linter and currently suppress nothing -- the surrounding `// skipcq: SCC-ST1005` trailers also do not suppress them (skipcq is a SonarCloud/CodeQ tag, not a golangci-lint nolint directive). Renaming each to //nolint:staticcheck restores the original suppression intent under v2's namespace. Sites: services/synccommittees/standard/parameters.go:113, 117 services/beaconcommittees/standard/parameters.go:105, 109 The ST1005 finding being suppressed at all four sites is the capitalized error string `"Ethereum 2 client does not provide ..."`. These error strings are constant English literals returned by the parameter-parsing constructors when the eth2client argument fails the required interface assertion; they are intentionally human-readable (the SCC-ST1005 skipcq tag documents the same intent for a different toolchain). No ST1005 finding outside these four lines is masked by the rename. 3. Dead lll.line-length: 132 settings block removed from linters.settings. .golangci.yml had `lll.line-length: 132` settings, but `lll` is in the disable list (line 26 in the pre-change file). With the linter disabled the settings can never apply. The block is pure dead config; removing the two lines silences nothing visible (lll never ran) but eliminates a genuine config inconsistency that would mislead a future reader auditing the settings. No replacement -- the maximum-line-length policy was already not being enforced. Behavioural surface: Pre-change golangci-lint run ./...: - gomodguard deprecation warning: present - "Found unknown linters: stylecheck" warning: present - issue count: 67 (staticcheck: 8, of which 3 are ST1005 on the four-site source listed above) Post-change golangci-lint run ./...: - gomodguard deprecation warning: silenced - "Found unknown linters: stylecheck" warning: silenced - issue count: 64 (staticcheck: 5, the 3 ST1005 sites are now correctly suppressed by the renamed nolint directives; no new findings) go build ./..., go vet ./..., gosilent test ./...: all clean (87 tests pass, same as the pre-change baseline carried since 6a148f6's full-suite run). Key decisions: - gomodguard disable rather than rename to gomodguard_v2. Because no module-restriction policy is configured anywhere in the repo (verified: no linters.settings.gomodguard block, no linters.settings.gomodguard_v2 block), a rename would relocate a no-op to a new namespace without changing observable behavior. Disable reflects the real intent. - All four //nolint:stylecheck sites in scope, even though golangci-lint v2 only flags three of the four ST1005 warnings in baseline output (the synccommittees parameters.go:117 directive's target line at 118 does not currently surface in staticcheck output; whether by truncation or by some other filter is unclear, but the same `// skipcq: SCC-ST1005` trailer pattern at the other three sites is also flagged, so the absence is not a structural distinction). Renaming all four keeps the file self-consistent and matches the spec. - All three drift items bundled into a single commit. They share the same root cause (the v1->v2 migration in ddbb6c4) and the same review thread. Splitting them would not aid reviewability -- each is small (1-3 lines) -- and would create three near- identical commit messages. - No drive-by edits. The other settings entries under linters.settings (gosec, nlreturn, staticcheck, tagliatelle) are intentionally left as-is in this commit, including a structurally identical orphan that surfaced during review (see Notes below). Strict scope discipline matches the spec list of three drift items; the orphan is tracked for the next focused commit. Files changed: - .golangci.yml -- gomodguard added to linters.disable (insert at alphabetical position); lll.line-length: 132 removed from linters.settings (2 lines). Net: +1 / -2. - services/beaconcommittees/standard/parameters.go -- two //nolint:stylecheck -> //nolint:staticcheck rewrites at lines 105 and 109 (the BeaconCommitteesProvider and EventsProvider type-assertion paths). No other content change; the trailing `// skipcq: SCC-ST1005` comments on the errors.New(...) lines preserved verbatim. - services/synccommittees/standard/parameters.go -- two //nolint:stylecheck -> //nolint:staticcheck rewrites at lines 113 and 117 (the SyncCommitteesProvider and EventsProvider type-assertion paths). No other content change; the trailing `// skipcq: SCC-ST1005` comments on the errors.New(...) lines preserved verbatim. Notes for next iteration: - linters.settings.nlreturn.block-size: 3 (lines 41-42 in the post-change file) is structurally identical to the lll orphan removed by this commit: nlreturn is in the disable list (line 31), so the block-size setting can never apply. Pre-existing, not introduced here, but a natural pickup for the next focused config-hygiene commit -- mechanically the same 2-line removal. - The QF1012 sweep across attestations.go (1 site) and beaconcommittees.go (4 sites) in services/chaindb/postgresql, flagged across 0486b6a and 9c6bd55's "Notes for next iteration", remains the next candidate refactor commit. - The wider QF1012 sweep across the remaining ~36 sites in 12 other chaindb/postgresql files is gated on enabling stricter staticcheck QF rules in .golangci.yml (chaindb/postgresql files outside attestations.go/beaconcommittees.go/blocks.go/ validatorepochsummaries.go are not currently flagged by the default rule set). --- .golangci.yml | 3 +-- services/beaconcommittees/standard/parameters.go | 4 ++-- services/synccommittees/standard/parameters.go | 4 ++-- 3 files changed, 5 insertions(+), 6 deletions(-) diff --git a/.golangci.yml b/.golangci.yml index 4359c4f..207435e 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -21,6 +21,7 @@ linters: - gochecknoglobals - gocognit - goconst + - gomodguard - inamedparam - ireturn - lll @@ -37,8 +38,6 @@ linters: gosec: excludes: - G115 - lll: - line-length: 132 nlreturn: block-size: 3 staticcheck: diff --git a/services/beaconcommittees/standard/parameters.go b/services/beaconcommittees/standard/parameters.go index 1ed5a57..2b36afb 100644 --- a/services/beaconcommittees/standard/parameters.go +++ b/services/beaconcommittees/standard/parameters.go @@ -102,11 +102,11 @@ func parseAndCheckParameters(params ...Parameter) (*parameters, error) { } // Ensure the eth2client can handle our requirements. if _, isProvider := parameters.eth2Client.(eth2client.BeaconCommitteesProvider); !isProvider { - //nolint:stylecheck + //nolint:staticcheck return nil, errors.New("Ethereum 2 client does not provide beacon committee information") // skipcq: SCC-ST1005 } if _, isProvider := parameters.eth2Client.(eth2client.EventsProvider); !isProvider { - //nolint:stylecheck + //nolint:staticcheck return nil, errors.New("Ethereum 2 client does not provide events") // skipcq: SCC-ST1005 } if parameters.chainDB == nil { diff --git a/services/synccommittees/standard/parameters.go b/services/synccommittees/standard/parameters.go index f8aae66..ac26bf7 100644 --- a/services/synccommittees/standard/parameters.go +++ b/services/synccommittees/standard/parameters.go @@ -110,11 +110,11 @@ func parseAndCheckParameters(params ...Parameter) (*parameters, error) { } // Ensure the eth2client can handle our requirements. if _, isProvider := parameters.eth2Client.(eth2client.SyncCommitteesProvider); !isProvider { - //nolint:stylecheck + //nolint:staticcheck return nil, errors.New("Ethereum 2 client does not provide sync committee information") // skipcq: SCC-ST1005 } if _, isProvider := parameters.eth2Client.(eth2client.EventsProvider); !isProvider { - //nolint:stylecheck + //nolint:staticcheck return nil, errors.New("Ethereum 2 client does not provide events") // skipcq: SCC-ST1005 } if parameters.chainDB == nil { From af7981cbecc55b63a4d1130b18a30e29f57b1063 Mon Sep 17 00:00:00 2001 From: AntiD2ta Date: Fri, 8 May 2026 11:44:03 +0200 Subject: [PATCH 21/25] Use fmt.Fprintf in attestations and beaconcommittees query builders staticcheck QF1012 flags five sites across attestations.go and beaconcommittees.go where the strings.Builder's content is appended via: queryBuilder.WriteString(fmt.Sprintf(`...%s...`, args...)) The recommended idiom writes directly into the builder via fmt.Fprintf, avoiding the intermediate string allocation: fmt.Fprintf(&queryBuilder, `...%s...`, args...) Flagged in commit 9c6bd55's "Notes for next iteration" -- those five sites were the next pickup in the QF1012 sweep started in 0486b6a (validatorepochsummaries) and continued in 9c6bd55 (blocks). This is the focused commit for those sites. The behavioural surface is unchanged. fmt.Fprintf into a strings.Builder writes the same bytes as fmt.Sprintf followed by WriteString -- the format-string semantics, the verb interpretation, and the resulting byte sequence are identical. strings.Builder.Write is documented to never return an error, so the discarded (n, err) return from fmt.Fprintf is also safe and matches how the prior two QF1012 commits (0486b6a, 9c6bd55) handled the same shape. The only observable difference is the absence of the intermediate string heap allocation per call site, which is exactly what staticcheck QF1012 targets. Sites: attestations.go -- 1 site: Limit-clause append in Attestations() (line 720 in the pre-change file). beaconcommittees.go -- 4 sites: From-bound (line 91), To-bound (line 98), CommitteeIndices ANY-clause (line 105), and Limit-clause (line 122) appends in BeaconCommittees(). Trust-boundary check confirms the rewrite introduces no new SQL injection surface. The two interpolated arguments at every site are either the closed-enum string `wherestr` (assigned only to the literal "WHERE" or " AND" -- no path from user input) or the non-injectable integer `len(queryVals)` (an int-typed slice length formatted with %d). User-supplied filter values continue to flow through the queryVals bind-parameter slice and are passed to tx.Query as positional parameters, never interpolated into the query string. Same pattern as before the rewrite. Key decisions: - Targeted to the five sites flagged in 9c6bd55's notes. Sites 105 and 122 in beaconcommittees.go are byte-for-byte identical patterns to 91 and 98 (the same `WriteString(fmt.Sprintf(...))` shape, the same SQL-fragment template family); golangci-lint's staticcheck integration only surfaces three of the four beaconcommittees sites in default output, but landing all four in the same commit keeps the file self-consistent rather than mixed. File-level consistency outweighs strict "only-flagged-sites" scope. - All five sites use the multi-line raw-string template (`... %s f_... = $%d`, `...LIMIT $%d`) for SQL-fragment construction. The diff is a pure call-shape rewrite -- no whitespace adjustment, no template edits, no parameter reordering. Identical mechanical shape to commits 0486b6a and 9c6bd55. - No new import. fmt was already imported in both files (attestations.go line 19; beaconcommittees.go line 18); the fmt.Fprintf change is internal to the same package usage. - Pointer receiver on the strings.Builder argument (&queryBuilder). strings.Builder's Write method has a pointer receiver, so passing &queryBuilder satisfies io.Writer with no interface boxing of the receiver value. Passing queryBuilder by value would not compile (Builder's no-copy invariant forbids copying after first use, enforced by the noCopy field embedded in the type). - The unrelated wsl_v5, noinlineerr, and revive issues that golangci-lint reports in attestations.go are NOT addressed here -- they predate the QF1012 cleanup pass and live on different lines. Out of scope. The four ST1005 nolint sites cleaned up in 32e48c1 are also unrelated to the QF1012 sweep. Files changed: - services/chaindb/postgresql/attestations.go -- one queryBuilder.WriteString(fmt.Sprintf(...)) -> fmt.Fprintf(&queryBuilder, ...) rewrite at line 720 in the pre-change file (Attestations() Limit clause). - services/chaindb/postgresql/beaconcommittees.go -- four queryBuilder.WriteString(fmt.Sprintf(...)) -> fmt.Fprintf(&queryBuilder, ...) rewrites at lines 91, 98, 105, and 122 in the pre-change file (BeaconCommittees() From, To, CommitteeIndices, and Limit clauses). Behavioural surface: Pre-change golangci-lint run ./services/chaindb/postgresql/: - QF1012 hits in attestations.go: 1 (line 720) - QF1012 hits in beaconcommittees.go: 3 (lines 91, 98, 122 in default staticcheck output; line 105 same shape, surfaced via grill audit) Post-change golangci-lint run ./services/chaindb/postgresql/: - QF1012 hits in attestations.go: 0 - QF1012 hits in beaconcommittees.go: 0 - QF1012 hits in blobsidecars.go: 3 (lines 70, 75, 80; unchanged from baseline -- explicitly out of scope, see notes below) go build ./..., go vet ./..., gofmt -s -l on the two touched files: all clean. gosilent test ./...: 87/87 PASS, same baseline carried since 6a148f6's full-suite run. Notes for next iteration: - blobsidecars.go:70/75/80 are flagged QF1012 in current golangci-lint output. Same one-line rewrite per site; candidate for the next focused commit if maintainers want the blobsidecars cleanup. - The wider package-wide sweep across the remaining ~33 unflagged sites in 11 other chaindb/postgresql files (e.g. validators.go, proposerduties.go, syncaggregates.go, etc.) is gated on enabling stricter staticcheck QF rules in .golangci.yml. Those files are not currently surfaced by the default rule set. - The .golangci.yml hygiene cleanup landed in 32e48c1 closed the three v1->v2 migration drift items; the structurally-identical nlreturn.block-size: 3 orphan flagged in 32e48c1's notes is the next mechanical config-hygiene pickup. - The deterministic services/scheduler/standard TestManyJobs go-deadlock false positive was closed by 6a148f6's bump of go-deadlock from v0.3.5 to v0.3.9; no further action needed there. --- services/chaindb/postgresql/attestations.go | 4 ++-- services/chaindb/postgresql/beaconcommittees.go | 16 ++++++++-------- 2 files changed, 10 insertions(+), 10 deletions(-) diff --git a/services/chaindb/postgresql/attestations.go b/services/chaindb/postgresql/attestations.go index eb5e222..e6c9337 100644 --- a/services/chaindb/postgresql/attestations.go +++ b/services/chaindb/postgresql/attestations.go @@ -717,8 +717,8 @@ ORDER BY f_inclusion_slot DESC,f_inclusion_index DESC`) if filter.Limit > 0 { queryVals = append(queryVals, filter.Limit) - queryBuilder.WriteString(fmt.Sprintf(` -LIMIT $%d`, len(queryVals))) + fmt.Fprintf(&queryBuilder, ` +LIMIT $%d`, len(queryVals)) } if e := log.Trace(); e.Enabled() { diff --git a/services/chaindb/postgresql/beaconcommittees.go b/services/chaindb/postgresql/beaconcommittees.go index 4573898..558e097 100644 --- a/services/chaindb/postgresql/beaconcommittees.go +++ b/services/chaindb/postgresql/beaconcommittees.go @@ -88,22 +88,22 @@ FROM t_beacon_committees`) if filter.From != nil { queryVals = append(queryVals, *filter.From) - queryBuilder.WriteString(fmt.Sprintf(` -%s f_slot >= $%d`, wherestr, len(queryVals))) + fmt.Fprintf(&queryBuilder, ` +%s f_slot >= $%d`, wherestr, len(queryVals)) wherestr = " AND" } if filter.To != nil { queryVals = append(queryVals, *filter.To) - queryBuilder.WriteString(fmt.Sprintf(` -%s f_slot <= $%d`, wherestr, len(queryVals))) + fmt.Fprintf(&queryBuilder, ` +%s f_slot <= $%d`, wherestr, len(queryVals)) wherestr = " AND" } if len(filter.CommitteeIndices) > 0 { queryVals = append(queryVals, filter.CommitteeIndices) - queryBuilder.WriteString(fmt.Sprintf(` -%s f_index = ANY($%d)`, wherestr, len(queryVals))) + fmt.Fprintf(&queryBuilder, ` +%s f_index = ANY($%d)`, wherestr, len(queryVals)) } switch filter.Order { @@ -119,8 +119,8 @@ ORDER BY f_slot DESC,f_committee DESC`) if filter.Limit > 0 { queryVals = append(queryVals, filter.Limit) - queryBuilder.WriteString(fmt.Sprintf(` -LIMIT $%d`, len(queryVals))) + fmt.Fprintf(&queryBuilder, ` +LIMIT $%d`, len(queryVals)) } if e := log.Trace(); e.Enabled() { From 88a6bf2ee2c91be77a758b31f1f0f0bebda18998 Mon Sep 17 00:00:00 2001 From: AntiD2ta Date: Fri, 8 May 2026 12:54:29 +0200 Subject: [PATCH 22/25] Trim verbose comments across summarizer-lag-observability files MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Several comments accumulated during the lag-observability work grew into multi-paragraph essays that re-told the PR description: bootstrap-vs- production-state guard explanations, full ADR-0002 retellings on test docstrings, the cursor-diff formula derivation in three places, and the exhaustive Deprecated: blocks on the three MissedEpochs fields each listed the other two services and instructed readers to "file an issue to remove the field." Per the project's "no comments unless the why is non-obvious" convention, trim each to the load-bearing why and drop references that rot (commit hashes, issue numbers, file:line pointers, "added for X" framing). What stays: * The LEFT JOIN duality at the two summarizeEpoch guard sites — the bootstrap path and the post-accumulation path are non-obvious from reading the SQL alone. * The alert-rule message-text contract notes at each warn-log site — a hidden constraint that an external alert rule pattern-matches on these strings. * The clampLag uint64 underflow rationale — the ~1.8e19 phantom value is the surprising behavior the float64 conversion guards against. * The defer ... LIFO ordering note in OnFinalityUpdated — would surprise a reader who doesn't track Go's defer semantics. What goes: * Multi-paragraph "added in commit X / issue Y" framings. * Cross-file inventories on the three Deprecated: MissedEpochs blocks. * Test docstrings that re-state what the test name already says. * Sub-test branch comments where the case name describes the scenario. * Speculative future-proofing rationales (upstreamMetadata "exists so future fields can be added"). Net: -170 comment lines across 12 files (-255 / +85). Build, gosilent test, go vet, and gofmt all clean against the trimmed state. Files: services/finalizer/standard/metadata.go services/proposerduties/standard/metadata.go services/summarizer/standard/epoch.go services/summarizer/standard/handler.go services/summarizer/standard/handler_internal_test.go services/summarizer/standard/metadata.go services/summarizer/standard/metrics.go services/summarizer/standard/validatorday.go services/summarizer/standard/validatorday_internal_test.go services/validators/standard/handler.go services/validators/standard/handler_internal_test.go services/validators/standard/metadata.go --- services/finalizer/standard/metadata.go | 10 +-- services/proposerduties/standard/metadata.go | 13 +-- services/summarizer/standard/epoch.go | 26 ++---- services/summarizer/standard/handler.go | 32 ++----- .../standard/handler_internal_test.go | 31 ++----- services/summarizer/standard/metadata.go | 23 ++--- services/summarizer/standard/metrics.go | 10 +-- services/summarizer/standard/validatorday.go | 36 ++------ .../standard/validatorday_internal_test.go | 39 ++------- services/validators/standard/handler.go | 22 ++--- .../standard/handler_internal_test.go | 85 ++++--------------- services/validators/standard/metadata.go | 13 +-- 12 files changed, 85 insertions(+), 255 deletions(-) diff --git a/services/finalizer/standard/metadata.go b/services/finalizer/standard/metadata.go index ea97561..6185180 100644 --- a/services/finalizer/standard/metadata.go +++ b/services/finalizer/standard/metadata.go @@ -24,13 +24,9 @@ import ( type metadata struct { LastFinalizedEpoch int64 `json:"latest_epoch"` LatestCanonicalSlot int64 `json:"latest_canonical_slot"` - // Deprecated: never populated by any code path in this service. Residue - // from an abandoned gap-tracking design (the same residue exists in - // services/validators/standard/metadata.go and - // services/proposerduties/standard/metadata.go). Retained for JSON - // backward-compatibility with t_metadata rows persisted by older builds. - // Do not add new readers or writers — file an issue to remove the field - // if you find one. + // Deprecated: never populated. Residue from an abandoned gap-tracking + // design. Retained for JSON backward-compatibility with t_metadata rows + // persisted by older builds. MissedEpochs []int64 `json:"missed_epochs,omitempty"` } diff --git a/services/proposerduties/standard/metadata.go b/services/proposerduties/standard/metadata.go index b0f8068..fc3827b 100644 --- a/services/proposerduties/standard/metadata.go +++ b/services/proposerduties/standard/metadata.go @@ -24,16 +24,9 @@ import ( // metadata stored about this service. type metadata struct { LatestEpoch int64 `json:"latest_epoch"` - // Deprecated: never populated by any code path in this service nor in any - // other service in this repository. The handleMissed consumer in - // service.go (lines 166-204) reads from a permanently-empty list — it is - // dormant code, retained alongside the field. Residue from an abandoned - // gap-tracking design that also left the same residue in - // services/validators/standard/metadata.go and - // services/finalizer/standard/metadata.go. Retained for JSON backward- - // compatibility with t_metadata rows persisted by older builds. Do not - // add new readers or writers — file an issue to remove the field (and - // the dormant consumer) if you find one. + // Deprecated: never populated. The handleMissed consumer reads from a + // permanently-empty list and is dormant. Retained for JSON backward- + // compatibility with t_metadata rows persisted by older builds. MissedEpochs []phase0.Epoch `json:"missed_epochs,omitempty"` } diff --git a/services/summarizer/standard/epoch.go b/services/summarizer/standard/epoch.go index 0df7ecf..f2c74ea 100644 --- a/services/summarizer/standard/epoch.go +++ b/services/summarizer/standard/epoch.go @@ -76,16 +76,10 @@ func (s *Service) summarizeEpoch(ctx context.Context, return false, errors.Wrap(err, "failed to obtain validator balances") } if len(balances) == 0 { - // Bootstrap path: chaind has no validators rows yet (first boot before - // the validators service has populated t_validators), or validator - // balances are disabled entirely, so the LEFT JOIN in - // ValidatorBalancesByEpoch returns no rows at all. In production - // state the JOIN returns one zero-balance row per validator instead — - // see the post-accumulation guard below for that case. Both branches - // share one warn message so the alert-annotation contract is single- - // sourced. - // Alert annotation contract: this stable message text is matched by the lag-based alert - // rule; do not change without coordinating the alert update. + // Bootstrap path: t_validators not yet populated, so the LEFT JOIN in + // ValidatorBalancesByEpoch returns no rows. Production state is + // covered by the post-accumulation guard below. The message is the + // alert-rule contract; changing the text breaks the alert. log.Warn().Msg("No validator balances available; cannot summarize epoch (will retry on next finality tick)") return false, nil } @@ -106,14 +100,10 @@ func (s *Service) summarizeEpoch(ctx context.Context, } log.Trace().Dur("elapsed", time.Since(started)).Msg("Obtained validator balances") - // Production-state guard: ValidatorBalancesByEpoch LEFT-JOINs t_validators - // against t_validator_balances with COALESCE(f_balance, 0) — when no - // balance rows exist for the epoch, it returns one zero-balance row per - // validator rather than an empty slice, so the len(balances) == 0 check - // above cannot detect that case. If we have active validators but zero - // summed balance, the upstream balance pipeline missed this epoch. Refuse - // to write a corrupt summary; reuse the same warn-log text so the alert - // annotation contract is preserved. + // Production-state guard: ValidatorBalancesByEpoch's LEFT JOIN with + // COALESCE(f_balance, 0) returns one zero-balance row per validator when + // no balance rows exist for the epoch, so the len() check above cannot + // catch this shape. Reuse the alert-rule message text. if summary.ActiveValidators > 0 && summary.ActiveBalance == 0 { log.Warn().Msg("No validator balances available; cannot summarize epoch (will retry on next finality tick)") return false, nil diff --git a/services/summarizer/standard/handler.go b/services/summarizer/standard/handler.go index 742daac..11fc6ec 100644 --- a/services/summarizer/standard/handler.go +++ b/services/summarizer/standard/handler.go @@ -47,10 +47,8 @@ func (s *Service) OnFinalityUpdated( return } defer s.activitySem.Release(1) - // Refresh per-pipeline lag gauges on every handler invocation, including - // the error-return paths below. Deferred so it observes the post-run - // metadata. Registered after activitySem.Release so LIFO ordering runs - // the gauge update first, while the semaphore is still held. + // Registered after activitySem.Release so LIFO ordering runs the gauge + // update while the semaphore is still held. defer s.updateLagGauges(ctx, finalizedEpoch) if finalizedEpoch == 0 { @@ -135,8 +133,7 @@ func (s *Service) summarizeEpochs(ctx context.Context, targetEpoch phase0.Epoch) return errors.Wrapf(err, "failed to update summary for epoch %d", epoch) } if !updated { - // Alert annotation contract: this stable message text is matched by the lag-based alert - // rule; do not change without coordinating the alert update. + // Alert-rule contract — do not change message text. log.Warn().Uint64("epoch", uint64(epoch)).Msg("Not enough data to update summary; will retry on next finality tick") return nil } @@ -278,12 +275,8 @@ func (s *Service) epochsPerDay() phase0.Epoch { return phase0.Epoch(86400.0 / s.chainTime.SlotDuration().Seconds() / float64(s.chainTime.SlotsPerEpoch())) } -// updateLagGauges fetches the latest summarizer metadata and the upstream -// validators metadata, then refreshes the per-pipeline lag gauges. Invoked -// via defer at the top of OnFinalityUpdated so it runs on every return path, -// including handler errors. The pure-compute half is split into setLagGauges -// so unit tests can drive the gauge math with synthetic metadata without -// standing up a chainDB. +// updateLagGauges refreshes the per-pipeline lag gauges from current +// metadata. The pure-compute half is split into setLagGauges for testing. func (s *Service) updateLagGauges(ctx context.Context, finalizedEpoch phase0.Epoch) { if finalizedEpoch == 0 { return @@ -301,17 +294,11 @@ func (s *Service) updateLagGauges(ctx context.Context, finalizedEpoch phase0.Epo s.setLagGauges(md, upstream) } -// setLagGauges computes lag-in-epochs per enabled pipeline as a cursor diff -// against each pipeline's direct upstream: +// setLagGauges sets per-pipeline lag as a cursor diff against direct upstream: // // - epoch = validators.LatestBalancesEpoch - summarizer.LastEpoch // - block = summarizer.LastEpoch - summarizer.LastBlockEpoch // - validator = summarizer.LastEpoch - summarizer.LastValidatorEpoch -// -// The float64 conversion in clampLag is deliberate: a uint64 subtraction would -// underflow to ~1.8e19 if a pipeline cursor is briefly ahead of its upstream -// (a benign race window between metadata writes). clampLag floors negatives -// to 0 so the gauge never reports a phantom astronomical lag. func (s *Service) setLagGauges(md *metadata, upstream *upstreamMetadata) { if s.epochSummaries { monitorLag("epoch", clampLag(upstream.LatestBalancesEpoch, md.LastEpoch)) @@ -324,10 +311,9 @@ func (s *Service) setLagGauges(md *metadata, upstream *upstreamMetadata) { } } -// clampLag computes upstream - downstream as a non-negative float64. Pipeline -// cursors can transiently invert across metadata reads (downstream advances in -// a transaction the read snapshot didn't capture), so clamping below at 0 is -// the right semantic: "no lag" beats "phantom 1.8e19 lag" for an alert metric. +// clampLag returns upstream - downstream as a non-negative float64. uint64 +// subtraction would underflow to ~1.8e19 if cursors transiently invert across +// metadata reads, breaking alerts. func clampLag(upstream, downstream phase0.Epoch) float64 { if downstream >= upstream { return 0 diff --git a/services/summarizer/standard/handler_internal_test.go b/services/summarizer/standard/handler_internal_test.go index 33afb65..5966fbd 100644 --- a/services/summarizer/standard/handler_internal_test.go +++ b/services/summarizer/standard/handler_internal_test.go @@ -27,10 +27,8 @@ import ( "github.com/wealdtech/chaind/services/chaindb" ) -// stubValidatorsProvider is a hand-built chaindb.ValidatorsProvider used to -// drive summarizeEpoch into the silent-skip and zero-balance branches. Only -// Validators and ValidatorBalancesByEpoch are exercised; the remaining methods -// satisfy the interface but are not invoked along the tested paths. +// stubValidatorsProvider drives summarizeEpoch into the silent-skip and +// zero-balance branches. type stubValidatorsProvider struct { validators []*chaindb.Validator balances []*chaindb.ValidatorBalance @@ -64,14 +62,10 @@ func (s *stubValidatorsProvider) ValidatorBalancesByIndexAndEpochs(_ context.Con return nil, nil } -// TestSummarizeEpochSilentSkipEmitsWarn drives summarizeEpoch with one active -// validator and an empty balances slice (the bootstrap path), asserting it -// returns (false, nil) and emits the alert-contract warn log. func TestSummarizeEpochSilentSkipEmitsWarn(t *testing.T) { ctx := context.Background() - // main_test.go sets the global level to Disabled; locally raise it so - // the buffer logger below actually emits. + // main_test.go sets GlobalLevel to Disabled; raise it so the buffer captures warns. originalGlobalLevel := zerolog.GlobalLevel() zerolog.SetGlobalLevel(zerolog.TraceLevel) defer zerolog.SetGlobalLevel(originalGlobalLevel) @@ -112,10 +106,8 @@ func TestSummarizeEpochSilentSkipEmitsWarn(t *testing.T) { "silent-skip log emitted at unexpected level; got: %s", logged) } -// recordingChainDB is a chaindb.Service / EpochSummariesSetter that records -// every SetEpochSummary call so a test can assert that the corrupt-summary -// guard prevents writes. No transaction enforcement — tests that drive the -// guard path exit before BeginTx is reached. +// recordingChainDB records every SetEpochSummary call so tests can assert +// the corrupt-summary guard prevents writes. type recordingChainDB struct { summariesSet []*chaindb.EpochSummary } @@ -147,11 +139,8 @@ func (c *recordingChainDB) SetEpochSummary(_ context.Context, summary *chaindb.E return nil } -// TestSummarizeEpochZeroBalanceAssertion drives the production-state path: -// ValidatorBalancesByEpoch returns one row per validator with Balance=0 and -// EffectiveBalance=0 (mimicking the LEFT JOIN behavior when t_validator_balances -// is empty for the epoch). Asserts the guard returns (false, nil), emits the -// alert-contract warn log, and writes nothing to t_epoch_summaries. +// TestSummarizeEpochZeroBalanceAssertion drives the production-state shape +// (LEFT JOIN with all-zero rows) and asserts the guard refuses to write. func TestSummarizeEpochZeroBalanceAssertion(t *testing.T) { ctx := context.Background() @@ -211,10 +200,8 @@ func TestSummarizeEpochZeroBalanceAssertion(t *testing.T) { "zero-balance log emitted at unexpected level; got: %s", logged) } -// TestSetLagGaugesWiring drives setLagGauges with synthetic (summarizer, -// upstream) metadata pairs and asserts per-pipeline cursor-diff values, -// disabled-pipeline absence, and clamp-to-zero behavior in the brief race -// window where a downstream cursor is read ahead of its upstream. +// TestSetLagGaugesWiring asserts per-pipeline cursor-diff math, disabled- +// pipeline absence, and clamp-to-zero behavior. func TestSetLagGaugesWiring(t *testing.T) { originalGauge := summarizerLagEpochs summarizerLagEpochs = prometheus.NewGaugeVec(prometheus.GaugeOpts{ diff --git a/services/summarizer/standard/metadata.go b/services/summarizer/standard/metadata.go index e723b93..3bcfaa0 100644 --- a/services/summarizer/standard/metadata.go +++ b/services/summarizer/standard/metadata.go @@ -83,30 +83,23 @@ type oldmetadata struct { PeriodicValidatorRollups bool `json:"periodic_validator_rollups"` } -// upstreamMetadataKey is the metadata key written by the validators service. -// Mirrored here (rather than imported) so the summarizer stays decoupled from -// the validators package. Format contract owned by services/validators/standard. +// upstreamMetadataKey mirrors validators.standard's metadata key to keep the +// summarizer decoupled from the validators package. const upstreamMetadataKey = "validators.standard" -// upstreamMetadata captures the subset of validators.standard metadata that the -// per-pipeline lag gauge needs. Only LatestBalancesEpoch is consumed today; -// the struct exists so future fields can be added without changing the call -// site. +// upstreamMetadata captures the subset of validators.standard metadata the +// lag gauge consumes. type upstreamMetadata struct { LatestBalancesEpoch phase0.Epoch `json:"latest_balances_epoch"` } -// oldUpstreamMetadata mirrors upstreamMetadata for the pre-0.8.8 unquoted-int -// JSON format the validators service may have written. +// oldUpstreamMetadata is the pre-0.8.8 unquoted-int format. type oldUpstreamMetadata struct { LatestBalancesEpoch uint64 `json:"latest_balances_epoch"` } -// getUpstreamMetadata reads the validators.standard metadata row used to -// compute the epoch-pipeline lag. Returns a zero-valued struct (not an error) -// when the row is absent — bootstrap state where the validators service has -// not yet written metadata is a normal condition, and the cursor diff in -// setLagGauges naturally clamps to 0 in that case. +// getUpstreamMetadata reads validators.standard metadata. Returns a zero +// struct when the row is absent (bootstrap state). func (s *Service) getUpstreamMetadata(ctx context.Context) (*upstreamMetadata, error) { md := &upstreamMetadata{} mdJSON, err := s.chainDB.Metadata(ctx, upstreamMetadataKey) @@ -117,7 +110,7 @@ func (s *Service) getUpstreamMetadata(ctx context.Context) (*upstreamMetadata, e return md, nil } if err := json.Unmarshal(mdJSON, md); err != nil { - // Try the old format. Same dual-path pattern as getMetadata above. + // Try the old format. omd := &oldUpstreamMetadata{} if err := json.Unmarshal(mdJSON, omd); err != nil { return nil, errors.Wrap(err, "failed to unmarshal upstream validators metadata") diff --git a/services/summarizer/standard/metrics.go b/services/summarizer/standard/metrics.go index 76e81f2..17b69ca 100644 --- a/services/summarizer/standard/metrics.go +++ b/services/summarizer/standard/metrics.go @@ -41,13 +41,9 @@ var ( lastBalancePrune prometheus.Gauge ) -// summarizerLagEpochs tracks how many epochs each summarizer pipeline is behind -// the finality target. Pipelines that are disabled never call WithLabelValues -// so their series are absent from /metrics. -// -// Alert annotation contract: this gauge is referenced by the lag-based alert -// rule deployed alongside chaind; do not rename without coordinating the alert -// migration. +// summarizerLagEpochs records how many epochs each pipeline trails its +// upstream cursor. Disabled pipelines never call WithLabelValues, so their +// series are absent from /metrics. Alert-rule contract — do not rename. var summarizerLagEpochs *prometheus.GaugeVec func registerMetrics(_ context.Context, monitor metrics.Service) error { diff --git a/services/summarizer/standard/validatorday.go b/services/summarizer/standard/validatorday.go index e264aac..ae4db6d 100644 --- a/services/summarizer/standard/validatorday.go +++ b/services/summarizer/standard/validatorday.go @@ -226,19 +226,9 @@ func (s *Service) addValidatorBalanceSummaries(ctx context.Context, return false, errors.Wrap(err, "failed to obtain validator start epoch balances") } if !validatorBalancesPresent(startBalances) { - // Bootstrap path (len == 0): t_validators not yet populated, so the - // LEFT JOIN in ValidatorBalancesByEpoch returns no rows. - // Production-state path (all-zero rows): t_validators is populated - // but no t_validator_balances rows exist for this epoch, so the - // LEFT JOIN returns one COALESCE(f_balance, 0) row per validator — - // len() cannot detect this case. Both branches share one warn - // message so the alert annotation contract from issue 2 / issue 10 - // is preserved. The "epoch" wording in the message is preserved - // verbatim to match the alert rule; structured start_time/end_time - // fields carry the day-layer identity so operators can disambiguate - // from epoch-layer trips. - // Alert annotation contract: this stable message text is matched by the lag-based alert - // rule; do not change without coordinating the alert update. + // Same dual-shape guard as epoch.go: bootstrap empty slice or LEFT + // JOIN all-zero rows. Keep the "epoch" wording verbatim for the + // alert rule; start_time/end_time disambiguate the day layer. log.Warn().Time("start_time", startTime).Time("end_time", endTime).Msg("No validator balances available; cannot summarize epoch (will retry on next finality tick)") return false, nil } @@ -295,16 +285,7 @@ func (s *Service) addValidatorBalanceSummaries(ctx context.Context, return false, errors.Wrap(err, "failed to obtain validator end epoch balances") } if !validatorBalancesPresent(endBalances) { - // Same dual-branch guard as the startBalances fetch above: - // len(endBalances) == 0 covers the bootstrap path; the all-zero - // case covers the production-state LEFT JOIN shape that defeats - // the naive length check. Reuse the exact warn-log text so the - // alert annotation contract stays single-sourced. The "epoch" - // wording in the message is preserved verbatim to match the alert - // rule; structured start_time/end_time fields carry the day-layer - // identity so operators can disambiguate from epoch-layer trips. - // Alert annotation contract: this stable message text is matched by the lag-based alert - // rule; do not change without coordinating the alert update. + // Same dual-shape guard as startBalances above. log.Warn().Time("start_time", startTime).Time("end_time", endTime).Msg("No validator balances available; cannot summarize epoch (will retry on next finality tick)") return false, nil } @@ -443,13 +424,8 @@ func (s *Service) syncCommitteesForEpochs(ctx context.Context, } // validatorBalancesPresent reports whether the slice contains at least one -// row with a non-zero balance — the inverse of the LEFT-JOIN-all-zeros -// shape produced by ValidatorBalancesByEpoch when t_validator_balances has -// no rows for the queried epoch but t_validators is populated. An empty -// slice (the bootstrap path before t_validators exists) also reports -// false, so callers get a single guard for both shapes. See -// services/summarizer/standard/epoch.go for the equivalent epoch-layer -// guard pattern from commit 1edcb45. +// non-zero row. An empty slice and a LEFT-JOIN-all-zero slice both report +// false, collapsing the bootstrap and production-state shapes into one guard. func validatorBalancesPresent(balances []*chaindb.ValidatorBalance) bool { for _, b := range balances { if b.Balance != 0 || b.EffectiveBalance != 0 { diff --git a/services/summarizer/standard/validatorday_internal_test.go b/services/summarizer/standard/validatorday_internal_test.go index 340efaa..32a0dc9 100644 --- a/services/summarizer/standard/validatorday_internal_test.go +++ b/services/summarizer/standard/validatorday_internal_test.go @@ -27,14 +27,9 @@ import ( mockchaintime "github.com/wealdtech/chaind/services/chaintime/mock" ) -// stubBalanceChainDB is a chaindb.Service that also implements the four -// interfaces addValidatorBalanceSummaries type-asserts against: -// ValidatorsProvider, DepositsProvider, WithdrawalsProvider, and -// ValidatorDaySummariesSetter. The balance responses are popped from a -// queue so a single fixture can drive the start-balances and end-balances -// trip paths from one slice of test inputs. daySummariesSet records every -// SetValidatorDaySummaries call so the test can assert that the corrupt- -// summary guard prevents writes. +// stubBalanceChainDB pops balance responses from a queue so one fixture can +// drive both the start-balances and end-balances guard paths. daySummariesSet +// records writes so tests can assert the guard short-circuits. type stubBalanceChainDB struct { balancesResponses [][]*chaindb.ValidatorBalance balancesCallCount int @@ -100,13 +95,7 @@ func (c *stubBalanceChainDB) Withdrawals(_ context.Context, _ *chaindb.Withdrawa return nil, nil } -// chaindb.ValidatorDaySummariesSetter — addValidatorBalanceSummaries does -// not call these directly; the parent summarizeValidatorsInDay does, after -// addValidatorBalanceSummaries returns (true, nil). Recording these calls -// here is a structural-impossibility witness: the test asserts the -// recording slice stays empty, which is the test's way of documenting the -// guard contract (false, nil) → caller short-circuits → no day summary -// written. Mirrors the recordingChainDB pattern in handler_internal_test.go. +// chaindb.ValidatorDaySummariesSetter. func (c *stubBalanceChainDB) SetValidatorDaySummary(_ context.Context, _ *chaindb.ValidatorDaySummary) error { return nil } @@ -116,15 +105,9 @@ func (c *stubBalanceChainDB) SetValidatorDaySummaries(_ context.Context, summari return nil } -// TestAddValidatorBalanceSummariesZeroBalanceAssertion drives -// addValidatorBalanceSummaries with the LEFT-JOIN-all-zero shape that -// ValidatorBalancesByEpoch returns when no t_validator_balances rows -// exist for an epoch but t_validators is populated, mirroring the -// epoch-layer guard from commit 1edcb45 (issue 10). Both call sites -// (startBalances at validatorday.go:224 and endBalances at :280) must -// refuse to advance: return (false, nil), emit the alert-contract warn -// log with the same stable message string, and write nothing to -// t_validator_day_summaries. +// TestAddValidatorBalanceSummariesZeroBalanceAssertion drives the LEFT-JOIN- +// all-zero shape into both startBalances and endBalances guard sites and +// asserts neither writes to t_validator_day_summaries. func TestAddValidatorBalanceSummariesZeroBalanceAssertion(t *testing.T) { const numValidators = 1000 @@ -157,17 +140,12 @@ func TestAddValidatorBalanceSummariesZeroBalanceAssertion(t *testing.T) { balancesResponses [][]*chaindb.ValidatorBalance }{ { - // Bootstrap-or-production-state path 1: the very first - // ValidatorBalancesByEpoch call returns all-zero rows; the - // startBalances guard must trip immediately. name: "startBalances all zero", balancesResponses: [][]*chaindb.ValidatorBalance{ makeZeroBalances(), }, }, { - // Path 2: startBalances are valid, function proceeds past - // deposits/withdrawals, then endBalances trips the guard. name: "endBalances all zero", balancesResponses: [][]*chaindb.ValidatorBalance{ makeNonZeroBalances(), @@ -180,8 +158,7 @@ func TestAddValidatorBalanceSummariesZeroBalanceAssertion(t *testing.T) { t.Run(test.name, func(t *testing.T) { ctx := context.Background() - // main_test.go sets the global level to Disabled; locally - // raise it so the buffer logger below actually emits. + // main_test.go sets GlobalLevel to Disabled; raise it so the buffer captures warns. originalGlobalLevel := zerolog.GlobalLevel() zerolog.SetGlobalLevel(zerolog.TraceLevel) defer zerolog.SetGlobalLevel(originalGlobalLevel) diff --git a/services/validators/standard/handler.go b/services/validators/standard/handler.go index 91c3a2e..5078883 100644 --- a/services/validators/standard/handler.go +++ b/services/validators/standard/handler.go @@ -28,12 +28,9 @@ import ( "go.opentelemetry.io/otel/trace" ) -// noValidatorBalancesMsg is the stable warn-log text emitted when the -// validator-balance fetcher rejects an empty or all-zero beacon response. -// Both rejection sites use this exact string so an operator runbook or -// Prometheus alert rule keyed off it matches uniformly. See -// docs/adr/0002-validator-balance-fetcher-recovery-model.md (claim 2: -// no-empty-write). +// noValidatorBalancesMsg is the warn-log text emitted when the fetcher +// rejects an empty or all-zero beacon response. Alert-rule contract — both +// rejection sites must emit this exact string. See ADR 0002. const noValidatorBalancesMsg = "Beacon returned no validator balances; cannot persist epoch (will retry on next finality tick)" // OnBeaconChainHeadUpdated receives beacon chain head updated notifications. @@ -200,10 +197,8 @@ func (s *Service) onEpochTransitionValidatorBalancesForEpoch(ctx context.Context } validators := validatorsResponse.Data - // Refuse to advance the cursor on an empty 200-OK validators response. - // ADR 0002 records the no-empty-write contract this guard enforces: - // when the beacon returns zero validators the cursor must stay put so - // the next finality tick re-fetches the same epoch. + // Empty 200-OK response: refuse to advance so the next tick re-fetches. + // See ADR 0002 (no-empty-write contract). if len(validators) == 0 { log.Warn().Msg(noValidatorBalancesMsg) return errors.New("beacon returned no validator balances") @@ -226,11 +221,8 @@ func (s *Service) onEpochTransitionValidatorBalancesForEpoch(ctx context.Context EffectiveBalance: validator.Validator.EffectiveBalance, }) } - // Refuse to advance the cursor when the "do not store 0 balances" - // filter above amplifies an all-zero-balance beacon response into a - // zero-row insert. No transaction is open at this point, so the - // rejection short-circuits cleanly; same warn-log contract as the - // empty-response guard. + // All-zero beacon response amplified by the filter above into a zero-row + // insert. Refuse to advance. if len(dbValidatorBalances) == 0 { log.Warn().Msg(noValidatorBalancesMsg) return errors.New("beacon returned no validator balances") diff --git a/services/validators/standard/handler_internal_test.go b/services/validators/standard/handler_internal_test.go index 0f93f89..c829686 100644 --- a/services/validators/standard/handler_internal_test.go +++ b/services/validators/standard/handler_internal_test.go @@ -29,16 +29,8 @@ import ( "github.com/wealdtech/chaind/services/chaindb" ) -// captureWarnLog redirects the package-level zerolog.Logger to a byte buffer -// at WarnLevel and returns the buffer plus a restore func. Mirrors the -// pattern in services/summarizer/standard/validatorday_internal_test.go so -// the two zero-balance guard sites (epoch summarizer, day summarizer, -// validator-balance fetcher) all assert their warn-log contracts the same -// way. The GlobalLevel save/restore is defensive: the validators/standard -// package has no main_test.go today, so GlobalLevel defaults to TraceLevel -// and the buffer would capture warns regardless — but if a future -// main_test.go sets Disabled to silence noise, this guard keeps the assertion -// load-bearing instead of silently always-passing. +// captureWarnLog redirects the package logger to a byte buffer at WarnLevel +// and returns the buffer plus a restore func. func captureWarnLog(t *testing.T) (*bytes.Buffer, func()) { t.Helper() originalGlobalLevel := zerolog.GlobalLevel() @@ -54,12 +46,8 @@ func captureWarnLog(t *testing.T) (*bytes.Buffer, func()) { } } -// stubEth2Client is a hand-built eth2client.Service + ValidatorsProvider used -// to drive onEpochTransitionValidatorBalancesForEpoch. handler.go:186 type- -// asserts s.eth2Client to eth2client.ValidatorsProvider, so this stub -// implements both interfaces directly. Only Validators is exercised; the -// base Service methods satisfy the type assertion but are not invoked along -// the tested path. +// stubEth2Client implements eth2client.Service + ValidatorsProvider; only +// Validators is exercised. type stubEth2Client struct { response *api.Response[map[phase0.ValidatorIndex]*apiv1.Validator] err error @@ -82,9 +70,7 @@ func (s *stubEth2Client) Validators(_ context.Context, _ *api.ValidatorsOpts) ( } // recordingChainDB captures BeginTx/CommitTx ordering and SetMetadata payloads -// so a test can assert the cursor advanced and the transaction committed. The -// tested function does not exercise read-only transactions; BeginROTx/CommitROTx -// satisfy the interface but are never called along the tested path. +// so tests can assert the cursor advanced and the transaction committed. type recordingChainDB struct { beginTxCalls int commitTxCalls int @@ -120,9 +106,7 @@ func (c *recordingChainDB) Metadata(_ context.Context, _ string) ([]byte, error) } // recordingValidatorsSetter captures every SetValidatorBalances/SetValidatorBalance -// call so the test can assert exactly what was persisted (or that nothing was -// persisted at all). SetValidator is part of the chaindb.ValidatorsSetter -// interface but is not invoked along the tested path. +// call so tests can assert what was persisted. type recordingValidatorsSetter struct { bulkCalls [][]*chaindb.ValidatorBalance individualBalances []*chaindb.ValidatorBalance @@ -144,9 +128,7 @@ func (r *recordingValidatorsSetter) SetValidatorBalances(_ context.Context, bala return nil } -// stubChainTime satisfies chaintime.Service. Only FirstSlotOfEpoch is invoked -// along the tested path (handler.go:184/195); the other 18 methods return zero -// values so the stub remains type-correct without simulating chain timing. +// stubChainTime satisfies chaintime.Service; only FirstSlotOfEpoch is invoked. type stubChainTime struct{} func (stubChainTime) GenesisTime() time.Time { return time.Time{} } @@ -172,10 +154,8 @@ func (stubChainTime) AltairInitialSyncCommitteePeriod() uint64 { return 0 } func (stubChainTime) BellatrixInitialEpoch() phase0.Epoch { return 0 } func (stubChainTime) CapellaInitialEpoch() phase0.Epoch { return 0 } -// makeService wires the four stubs above into a *Service ready to drive the -// onEpochTransitionValidatorBalancesForEpoch path. s.balances is set to true -// so the tested branch is entered; s.activitySem is left nil because the -// tested function does not touch it. +// makeService wires the stubs into a *Service for the +// onEpochTransitionValidatorBalancesForEpoch path. func makeService(eth2 *stubEth2Client, db *recordingChainDB, setter *recordingValidatorsSetter) *Service { return &Service{ eth2Client: eth2, @@ -186,19 +166,8 @@ func makeService(eth2 *stubEth2Client, db *recordingChainDB, setter *recordingVa } } -// TestOnEpochTransitionValidatorBalancesForEpoch_EmptyValidatorsRejectedAndCursorUnchanged -// pins the post-fix contract for guard 1 (the empty 200-OK validators map): -// the function refuses to advance md.LatestBalancesEpoch, never opens a -// transaction (the guard fires before BeginTx), never calls -// SetValidatorBalances, returns an error so the parent -// onEpochTransitionValidatorBalances loop surfaces it to OnBeaconChainHeadUpdated, -// and emits the alert-contract warn log at WarnLevel. This is the regression -// boundary for the silent-cursor-advance defect addressed by ADR 0002 -// (docs/adr/0002-validator-balance-fetcher-recovery-model.md). -// -// Pre-fix this test asserted the BUGGY shape (cursor advanced, transaction -// committed); the inversion landed alongside the production-code guard in -// the same commit. +// Empty 200-OK response: cursor stays put, no transaction opens, warn-log +// fires. Regression boundary for ADR 0002. func TestOnEpochTransitionValidatorBalancesForEpoch_EmptyValidatorsRejectedAndCursorUnchanged(t *testing.T) { const epoch = phase0.Epoch(86823) const startCursor = phase0.Epoch(86822) @@ -251,17 +220,9 @@ func TestOnEpochTransitionValidatorBalancesForEpoch_EmptyValidatorsRejectedAndCu "warn log missing structured epoch field for operator disambiguation; got: %s", logged) } -// TestOnEpochTransitionValidatorBalancesForEpoch_AllZeroBalancesRejectedAndCursorUnchanged -// pins the post-fix contract for guard 2 (the all-zero-balances filter -// amplifier). The beacon returns 1000 validators with Balance == 0; the -// "Do not store 0 balances" filter collapses them into a zero-row insert, -// and the guard refuses to advance. Like guard 1, this path fires before -// any transaction is opened: BeginTx now sits below the slice build so the -// rejection short-circuits with no transaction to cancel; the transaction -// must not commit; and the cursor must stay put. Both empty-response -// shapes (this and EmptyValidatorsRejectedAndCursorUnchanged) emit the -// same warn-log alert-contract substring so an alert rule matches them -// uniformly. +// All-zero beacon response amplified by the "Do not store 0 balances" filter +// into a zero-row insert. Cursor stays put, no transaction opens, warn-log +// fires. func TestOnEpochTransitionValidatorBalancesForEpoch_AllZeroBalancesRejectedAndCursorUnchanged(t *testing.T) { const epoch = phase0.Epoch(86824) const startCursor = phase0.Epoch(86823) @@ -328,13 +289,8 @@ func TestOnEpochTransitionValidatorBalancesForEpoch_AllZeroBalancesRejectedAndCu "warn log missing structured epoch field for operator disambiguation; got: %s", logged) } -// TestOnEpochTransitionValidatorBalancesForEpoch_MixedBalancesPersistsNonZero -// pins the partial-persist behavior with a mix of zero and non-zero balances. -// The "Do not store 0 balances" filter drops the zero half; the function -// persists the non-zero half and advances the cursor. Validator indices are -// constructed so the assertion does not depend on Go's non-deterministic map -// iteration order: the bulkCalls[0] slice should contain exactly numNonZero -// entries, each with Balance == nonZeroBalance. +// Mixed zero/non-zero balances: filter drops zeros, function persists the +// non-zero half and advances the cursor. func TestOnEpochTransitionValidatorBalancesForEpoch_MixedBalancesPersistsNonZero(t *testing.T) { const epoch = phase0.Epoch(100) const numValidators = 1000 @@ -395,13 +351,8 @@ func TestOnEpochTransitionValidatorBalancesForEpoch_MixedBalancesPersistsNonZero "cursor advances when at least some rows persist (expected today)") } -// TestOnEpochTransitionValidatorBalancesForEpoch_BeaconErrorLeavesCursorUnchanged -// is the negative control: when the beacon Validators call returns an error, -// the function returns the wrapped error, never opens a transaction, never -// calls SetValidatorBalances, and md.LatestBalancesEpoch stays put. This case -// confirms the silent-advance bug is gated specifically on the empty/zero- -// response path (the previous three cases) — not a blanket "errors are -// swallowed" regression. +// Negative control: beacon error surfaces, no transaction opens, cursor +// stays put. func TestOnEpochTransitionValidatorBalancesForEpoch_BeaconErrorLeavesCursorUnchanged(t *testing.T) { const epoch = phase0.Epoch(50) const startCursor = phase0.Epoch(49) diff --git a/services/validators/standard/metadata.go b/services/validators/standard/metadata.go index 04c628a..ca8e910 100644 --- a/services/validators/standard/metadata.go +++ b/services/validators/standard/metadata.go @@ -25,16 +25,9 @@ import ( type metadata struct { LatestEpoch phase0.Epoch `json:"latest_epoch"` LatestBalancesEpoch phase0.Epoch `json:"latest_balances_epoch"` - // Deprecated: never populated by any code path in this service. Residue - // from an abandoned gap-tracking design (the same residue exists in - // services/finalizer/standard/metadata.go and - // services/proposerduties/standard/metadata.go). Retained for JSON - // backward-compatibility with t_metadata rows persisted by older builds. - // See ADR docs/adr/0002-validator-balance-fetcher-recovery-model.md - // "Considered Options" § Option 4 for the historical design intent and - // why the current recovery model chose a different shape. Do not add - // new readers or writers — file an issue to remove the field if you - // find one. + // Deprecated: never populated. Residue from an abandoned gap-tracking + // design. Retained for JSON backward-compatibility with t_metadata rows + // persisted by older builds. MissedEpochs []phase0.Epoch `json:"missed_epochs,omitempty"` } From 50c6a7a2e3a6bc84399f2b69c2ee7f9e494473f6 Mon Sep 17 00:00:00 2001 From: AntiD2ta Date: Fri, 8 May 2026 12:54:56 +0200 Subject: [PATCH 23/25] Bump golangci-lint-action to @v8 and disable wsl_v5 + noinlineerr MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The v1->v2 schema migration in commits ddbb6c4 / 32e48c1 made .golangci.yml v2-only (`version: "2"`, top-level `formatters`, `linters.default`, `linters.exclusions.presets`), but the workflow still pinned `golangci/golangci-lint-action@v6` which is locked to v1.x binaries. CI's lint job consequently failed at config-verify time before any linter ran, with the binary rejecting four v2-schema fields: jsonschema: "output.formats" does not validate ... got object, want array jsonschema: "linters.exclusions" ... additional properties 'presets' not allowed jsonschema: "linters" ... additional properties 'default', 'settings' not allowed jsonschema: "" ... additional properties 'formatters', 'version' not allowed Bumping to `@v8` switches the action to a v2-default binary (`latest` now resolves to the most recent v2.x release). The action's input schema for `version`, `args`, `only-new-issues`, and `skip-cache` is unchanged from v6. Once v2 actually runs, two newly-default-enabled linters fire on PR-touched lines: * `wsl_v5` — the next major version of the whitespace linter. `wsl` is already in the disable list (the codebase doesn't enforce wsl-style spacing rules); `wsl_v5` is a separate linter under v2's namespace and needs its own entry. * `noinlineerr` — flags `if err := f(); err != nil` against plain assignment. The codebase uses inline error handling pervasively; this is idiomatic Go style chaind has always leaned on, so adding the linter to the disable list reflects actual policy. Both additions match the existing pattern set by 32e48c1 (disable `gomodguard` rather than configure it; the codebase has no module-restriction policy to encode). Inserted alphabetically. Verification: golangci-lint config verify → no output (clean) golangci-lint run --new-from-rev=master ./... → 0 issues go build ./... → clean gosilent test ./services/... → ok all packages Files: .github/workflows/golangci-lint.yml .golangci.yml --- .github/workflows/golangci-lint.yml | 2 +- .golangci.yml | 2 ++ 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/.github/workflows/golangci-lint.yml b/.github/workflows/golangci-lint.yml index be3622d..d8bf604 100644 --- a/.github/workflows/golangci-lint.yml +++ b/.github/workflows/golangci-lint.yml @@ -21,7 +21,7 @@ jobs: go-version: '1.22' - uses: 'actions/checkout@v4' - name: 'golangci-lint' - uses: 'golangci/golangci-lint-action@v6' + uses: 'golangci/golangci-lint-action@v8' with: version: 'latest' args: '--timeout=60m' diff --git a/.golangci.yml b/.golangci.yml index 207435e..d14151f 100644 --- a/.golangci.yml +++ b/.golangci.yml @@ -29,11 +29,13 @@ linters: - nestif - nilnil - nlreturn + - noinlineerr - perfsprint - promlinter - varnamelen - wrapcheck - wsl + - wsl_v5 settings: gosec: excludes: From 723b098c2101331b0c2facbd4d4ffb24086ce29d Mon Sep 17 00:00:00 2001 From: AntiD2ta Date: Fri, 8 May 2026 13:21:33 +0200 Subject: [PATCH 24/25] Use errNoValidatorBalances sentinel for fetcher rejection sites Replace two inline errors.New("beacon returned no validator balances") allocations with a package-level sentinel. Callers can now identify the rejection via errors.Is rather than substring matching, and the two sites are guaranteed to return the same value. Pairs with the existing noValidatorBalancesMsg constant for the operator-facing log line. --- services/validators/standard/handler.go | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/services/validators/standard/handler.go b/services/validators/standard/handler.go index 5078883..1c1f527 100644 --- a/services/validators/standard/handler.go +++ b/services/validators/standard/handler.go @@ -33,6 +33,8 @@ import ( // rejection sites must emit this exact string. See ADR 0002. const noValidatorBalancesMsg = "Beacon returned no validator balances; cannot persist epoch (will retry on next finality tick)" +var errNoValidatorBalances = errors.New("beacon returned no validator balances") + // OnBeaconChainHeadUpdated receives beacon chain head updated notifications. func (s *Service) OnBeaconChainHeadUpdated( ctx context.Context, @@ -201,7 +203,7 @@ func (s *Service) onEpochTransitionValidatorBalancesForEpoch(ctx context.Context // See ADR 0002 (no-empty-write contract). if len(validators) == 0 { log.Warn().Msg(noValidatorBalancesMsg) - return errors.New("beacon returned no validator balances") + return errNoValidatorBalances } span.AddEvent("Obtained validators", trace.WithAttributes( @@ -225,7 +227,7 @@ func (s *Service) onEpochTransitionValidatorBalancesForEpoch(ctx context.Context // insert. Refuse to advance. if len(dbValidatorBalances) == 0 { log.Warn().Msg(noValidatorBalancesMsg) - return errors.New("beacon returned no validator balances") + return errNoValidatorBalances } dbCtx, cancel, err := s.chainDB.BeginTx(ctx) From df4db9e8e7de1bbdd435e76ee9062a49c9d20ff1 Mon Sep 17 00:00:00 2001 From: AntiD2ta Date: Fri, 8 May 2026 13:21:40 +0200 Subject: [PATCH 25/25] Note global-state mutation in three new internal-package test files The summarizer- and validators-side internal tests added on this branch swap the package-level log variable (and, for the summarizer handler test, the summarizerLagEpochs gauge) and restore the originals on teardown. Add a top-of-file note in each file warning future contributors not to add t.Parallel() to any test here, since the global mutation would race across goroutines. No behavioural change. --- services/summarizer/standard/handler_internal_test.go | 5 +++++ services/summarizer/standard/validatorday_internal_test.go | 4 ++++ services/validators/standard/handler_internal_test.go | 4 ++++ 3 files changed, 13 insertions(+) diff --git a/services/summarizer/standard/handler_internal_test.go b/services/summarizer/standard/handler_internal_test.go index 5966fbd..16ccebc 100644 --- a/services/summarizer/standard/handler_internal_test.go +++ b/services/summarizer/standard/handler_internal_test.go @@ -27,6 +27,11 @@ import ( "github.com/wealdtech/chaind/services/chaindb" ) +// Tests in this file mutate package-level state — the unexported `log` variable +// and the `summarizerLagEpochs` gauge — and restore the originals on teardown. +// Do NOT add t.Parallel() to any test here; the global mutation would race +// across goroutines. + // stubValidatorsProvider drives summarizeEpoch into the silent-skip and // zero-balance branches. type stubValidatorsProvider struct { diff --git a/services/summarizer/standard/validatorday_internal_test.go b/services/summarizer/standard/validatorday_internal_test.go index 32a0dc9..5504eca 100644 --- a/services/summarizer/standard/validatorday_internal_test.go +++ b/services/summarizer/standard/validatorday_internal_test.go @@ -27,6 +27,10 @@ import ( mockchaintime "github.com/wealdtech/chaind/services/chaintime/mock" ) +// Tests in this file mutate the package-level `log` variable and restore it on +// teardown. Do NOT add t.Parallel() to any test here — the global mutation +// would race across goroutines. + // stubBalanceChainDB pops balance responses from a queue so one fixture can // drive both the start-balances and end-balances guard paths. daySummariesSet // records writes so tests can assert the guard short-circuits. diff --git a/services/validators/standard/handler_internal_test.go b/services/validators/standard/handler_internal_test.go index c829686..ab29d6c 100644 --- a/services/validators/standard/handler_internal_test.go +++ b/services/validators/standard/handler_internal_test.go @@ -29,6 +29,10 @@ import ( "github.com/wealdtech/chaind/services/chaindb" ) +// Tests in this file mutate the package-level `log` variable via captureWarnLog +// and restore it on teardown. Do NOT add t.Parallel() to any test here — the +// global mutation would race across goroutines. + // captureWarnLog redirects the package logger to a byte buffer at WarnLevel // and returns the buffer plus a restore func. func captureWarnLog(t *testing.T) (*bytes.Buffer, func()) {