From f5d00f3f54704f87e8d048a3f8163876a3a51381 Mon Sep 17 00:00:00 2001 From: tac0turtle Date: Sat, 4 Apr 2026 10:49:54 +0200 Subject: [PATCH 01/10] Baseline - current block production performance with 100 txs Result: {"status":"keep","allocs_per_op":81,"bytes_per_op":25934,"ns_per_op":34001} --- autoresearch.jsonl | 1 + autoresearch.md | 50 ++++ autoresearch.sh | 34 +++ docs/adr/memory-optimization-plan.md | 425 +++++++++++++++++++++++++++ types/hash_memo_bench_test.go | 42 +++ 5 files changed, 552 insertions(+) create mode 100644 autoresearch.jsonl create mode 100644 autoresearch.md create mode 100755 autoresearch.sh create mode 100644 docs/adr/memory-optimization-plan.md create mode 100644 types/hash_memo_bench_test.go diff --git a/autoresearch.jsonl b/autoresearch.jsonl new file mode 100644 index 0000000000..eba8fbd7c8 --- /dev/null +++ b/autoresearch.jsonl @@ -0,0 +1 @@ +{"type":"config","name":"Block package performance and allocation reduction","metricName":"allocs_per_op","metricUnit":"allocs","bestDirection":"lower"} diff --git a/autoresearch.md b/autoresearch.md new file mode 100644 index 0000000000..3920b43e0b --- /dev/null +++ b/autoresearch.md @@ -0,0 +1,50 @@ +# Autoresearch: Block package performance and allocation reduction + +## Objective +Reduce memory allocations and improve performance of the block production hot path, specifically the `BlockProducer` interface methods (`ProduceBlock`, `CreateBlock`, `ApplyBlock`). The benchmark lives in `block/internal/executing/executor_benchmark_test.go`. + +## Metrics +- **Primary**: `allocs_per_op` (allocs/op, lower is better) — allocation count in `BenchmarkProduceBlock/100_txs` +- **Secondary**: `bytes_per_op` (B/op, lower is better) — memory usage per operation +- **Secondary**: `ns_per_op` (ns/op, lower is better) — execution time + +We target the 100_txs benchmark case because it has the most allocations (81 vs 71 for empty) and shows the clearest per-tx allocation pattern. 
+ +## How to Run +`./autoresearch.sh` — runs `go test -bench=BenchmarkProduceBlock/100_txs -benchmem -count=3` and reports median of the 3 runs with METRIC lines for allocs_per_op, bytes_per_op, and ns_per_op. + +## Files in Scope +- `types/hashing.go` — hash computation; `sha256.New()` allocates ~213 bytes per call; called by `Data.Hash()` and `DACommitment()` +- `types/serialization.go` — `ToProto()`, `MarshalBinary()`, `txsToByteSlices()` all allocate new structs/slices every call +- `block/internal/executing/executor.go` — main block production; `ApplyBlock` converts `Txs` to `[][]byte` every time +- `types/data.go` — `Data` type; `DACommitment()` creates pruned Data allocation +- `types/header.go` — `Header` type +- `types/state.go` — `State` type, `NextState()` + +## Off Limits +- Protobuf definitions (`types/pb/`) — must not change wire format +- Test files except the benchmark +- Public API signatures (keep backward compatibility) + +## Constraints +- All tests must pass (`just test ./block/... ./types/...`) +- No new external dependencies +- Must not change protobuf wire format + +## What's Been Tried +Nothing yet — starting from baseline. + +## Baseline (first run) +Benchmark: `BenchmarkProduceBlock/100_txs` +- **81 allocs/op** (PRIMARY) +- ~25,900 B/op +- ~33,000 ns/op + +### Key allocation hotspots identified: +1. **`leafHashOpt()`** — `sha256.New()` allocates a new hash.Hash every call (~213B). Called by `Data.Hash()` and `DACommitment()` every block. +2. **`txsToByteSlices()`** — allocates new `[][]byte` slice every `Data.ToProto()` call. +3. **`Data.ToProto()` / `Header.ToProto()`** — allocate new protobuf structs every serialization. +4. **`DACommitment()`** — creates pruned `&Data{Txs: d.Txs}` allocation before hashing. +5. **`ApplyBlock`** — `make([][]byte, n)` for raw tx conversion every block. +6. **`Data.Hash()`** — allocates byte slice from `MarshalBinary()` + sha256.New(). +7. **`Header.HashSlim()`** — same pattern. 
diff --git a/autoresearch.sh b/autoresearch.sh new file mode 100755 index 0000000000..aad35a5215 --- /dev/null +++ b/autoresearch.sh @@ -0,0 +1,34 @@ +#!/bin/bash +# Fast benchmark runner for block production performance. +# Pre-check: syntax compilation under 1s. +# Runs the benchmark 3 times and reports the median for 100_txs case. + +set -euo pipefail + +# Fast syntax check +go build ./block/internal/executing/ >/dev/null 2>&1 || exit 1 + +run_bench() { + go test -run=^$ -bench=BenchmarkProduceBlock/100_txs -benchmem -count=1 \ + ./block/internal/executing/ 2>&1 | grep '100_txs' +} + +# Run 3 times, collect results +declare -a lines=() +for i in 1 2 3; do + lines+=("$(run_bench)") +done + +# Sort by allocs (field before "allocs/op"), pick median +sorted=$(for line in "${lines[@]}"; do echo "$line"; done | sort -t' ' -k1) +median=$(echo "$sorted" | sed -n '2p') + +# Extract numbers using awk +# Format: BenchmarkProduceBlock/100_txs-10 35155 33517 ns/op 25932 B/op 81 allocs/op +ns=$(echo "$median" | awk '{for(i=1;i<=NF;i++){if($i=="ns/op")print $(i-1)}}') +bytes=$(echo "$median" | awk '{for(i=1;i<=NF;i++){if($i=="B/op")print $(i-1)}}') +allocs=$(echo "$median" | awk '{for(i=1;i<=NF;i++){if($i=="allocs/op")print $(i-1)}}') + +echo "METRIC allocs_per_op=$allocs" +echo "METRIC bytes_per_op=$bytes" +echo "METRIC ns_per_op=$ns" diff --git a/docs/adr/memory-optimization-plan.md b/docs/adr/memory-optimization-plan.md new file mode 100644 index 0000000000..95cf25153e --- /dev/null +++ b/docs/adr/memory-optimization-plan.md @@ -0,0 +1,425 @@ +# Memory & Allocation Optimization Plan + +**Date:** 2026-03-30 +**Source:** pprof profiles collected from `prd-eden-testnet-node-1:6060` +**Node binary:** `evm` (Build ID: `c66958170e0ea317dee65bab308c02d0cc6b6098`) + +--- + +## Context + +A production node on a 62 GiB server is consuming 46 GiB in OS Cache+Buffer, using 2.6 GiB of swap, and the pattern repeats when the server is upsized — available RAM is consumed. 
The GC is spending ~55% of sampled CPU time in `scanobject`/`findObject`/`gcDrain`, indicating sustained heap pressure rather than a CPU-bound workload.

Profiles collected:

| Profile | Command |
|---------|---------|
| CPU (30s) | `curl http://prd-eden-testnet-node-1:6060/debug/pprof/profile?seconds=30` |
| Heap (live) | `curl http://prd-eden-testnet-node-1:6060/debug/pprof/heap` |
| Allocs (lifetime) | `curl http://prd-eden-testnet-node-1:6060/debug/pprof/allocs` |
| Goroutines | `curl http://prd-eden-testnet-node-1:6060/debug/pprof/goroutine` |

**Follow-up profile:** 2026-04-03, `prd-eden-testnet-node-2:6060` (Build ID: `2b20bbdd78f3b2cecb508bb4ba5a7a12803df841`). In-use heap grew to ~2 GB (up from ~1.05 GB). GC dominance is gone (CPU now 2% utilization, no `scanobject`/`gcDrain` in top) but heap footprint increased. Dominant consumers shifted to `block/internal/cache` (see Issue 6). `go-header Hash.String` holds 302 MB (new Issue 7). Proto marshal is no longer in the top allocators. Gzip/flate share of cumulative allocs rose from 0.8% to ~24% (see the 2026-04-03 allocs table below). Issues 1, 3, 4, 5 remain open.

**Additional work merged (not in original plan):**
- [#3219](https://github.com/evstack/ev-node/pull/3219) — `Header.MemoizeHash()`: caches computed hash on the struct, avoiding repeated `sha256` + `proto.Marshal` on every `Hash()` call. Partially reduces heap pressure from hash-derived allocations.
- [#3204](https://github.com/evstack/ev-node/pull/3204) — Removed LRU from `block/internal/cache` generic cache; simplified eviction model.

---

## Findings

### 2026-03-30 — node-1 (Build ID: `c66958170e0ea317dee65bab308c02d0cc6b6098`)

#### CPU profile (3.68s samples over 30s)

| flat | cum | function |
|------|-----|----------|
| 16.6% | 17.4% | `runtime.findObject` |
| 11.1% | 41.6% | `runtime.scanobject` |
| 3.5% | — | `runtime.(*gcBits).bitp` |
| — | 41.9% | `runtime.gcDrain` |

GC accounts for ~55% of sampled CPU.
The node is not CPU-bound; the GC is responding to sustained allocation pressure. + +#### Heap — live objects (~1.05 GB in use) + +| MB (flat) | MB (cum) | call site | +|-----------|----------|-----------| +| 312 | 312 | `ristretto/z.Calloc` (off-heap, mmap-based block cache) | +| 101 | 490 | `store.DefaultStore.GetHeader` | +| 83 | 83 | `badger/skl.newArena` (memtables) | +| 82 | 82 | `protobuf consumeBytesSlice` | +| 63 | 63 | LRU `insertValue` | +| 56 | 125 | `types.SignedHeader.FromProto` | +| 9 | 539 | `store.DefaultStore.GetBlockData` | +| 4 | 579 | `store.CachedStore.GetBlockData` | +| 0 | 504 | `go-header/p2p.ExchangeServer.handleRangeRequest` | + +#### Allocs — lifetime (~58.5 TB total) + +| % of total | call site | +|------------|-----------| +| 63.5% | `proto.MarshalOptions.marshal` | +| 12.1% | `encoding/json.(*Decoder).refill` | +| 6.1% | `encoding/json.(*decodeState).literalStore` | +| 5.6% | `encoding/json.(*RawMessage).UnmarshalJSON` | +| 2.8% | `encoding/hex.DecodeString` | +| 2.0% | `encoding/hex.EncodeToString` | +| 1.5% | `go-buffer-pool.(*BufferPool).Get` | +| 0.8% | `compress/flate.dictDecoder.init` | + +The `proto.Marshal` allocations trace to `types.P2PData.MarshalBinary` → `go-header/p2p.ExchangeServer.handleRangeRequest`. For a range request of 128 headers, 128 fresh scratch buffers are allocated. + +The JSON/hex allocations trace to `evm.EngineClient.GetTxs` → `rpc.Client.sendHTTP` (go-ethereum). The gzip allocations are from `net/http`'s transparent decompression of EL responses. + +--- + +### 2026-04-03 — node-2 (Build ID: `2b20bbdd78f3b2cecb508bb4ba5a7a12803df841`) + +#### CPU profile (610ms samples over 30s — 2% utilization) + +GC dominance is gone. Node is largely idle. 
Top consumers: + +| flat | flat% | cum | function | +|------|-------|-----|----------| +| 100ms | 16.4% | 100ms | `syscall.Syscall6` | +| 50ms | 8.2% | 50ms | `edwards25519/field.feMul` (signature verification) | +| 10ms | 1.6% | 50ms | `rpc.(*httpConn).doRequest` (upstream EL calls) | + +#### Heap — live objects (~2.0 GB in use) + +| MB (flat) | MB (cum) | call site | +|-----------|----------|-----------| +| 514 | 514 | `block/internal/cache.(*Cache).setSeen` | +| 420 | 420 | `block/internal/cache.(*Cache).setDAIncluded` | +| 307 | 307 | `block/internal/cache.HeightPlaceholderKey` (inline) | +| 302 | 302 | `go-header.Hash.String` | +| 148 | 148 | `ristretto/z.Calloc` (off-heap) | +| 83 | 83 | `badger/skl.newArena` (memtables) | +| 75 | 285 | `store.DefaultStore.GetHeader` | +| 38 | 38 | `types.Header.FromProto` | +| 32 | 87 | `types.SignedHeader.FromProto` | +| 13 | 21 | LRU `insertValue` | +| 0 | 229 | `go-header/p2p.ExchangeServer.handleRangeRequest` | + +The top four entries — `setSeen`, `setDAIncluded`, `HeightPlaceholderKey`, `Hash.String` — account for ~1.54 GB (77% of heap). These are all within `block/internal/cache` and the header-exchange path. The store LRU is now negligible (13 MB). + +#### Allocs — lifetime (~805 GB total) + +| % of total | call site | +|------------|-----------| +| 30.2% | `encoding/json.(*Decoder).refill` (cumulative) | +| 24.1% | `compress/flate.NewReader` + `dictDecoder.init` (combined) | +| 21.6% | `ristretto/z.Calloc` | +| 4.6% | `compress/flate.(*dictDecoder).init` (direct) | +| 3.2% | `encoding/json.Marshal` | +| 1.5% | `net/http.Header.Clone` | +| 1.5% | `encoding/json.(*RawMessage).UnmarshalJSON` | +| 1.5% | `io.ReadAll` | + +Proto marshal no longer appears in the top allocators — the prior dominance (63.5%) has dissipated, likely due to reduced range request volume or binary encoding changes. 
Gzip/flate rose from 0.8% to ~24% of total allocs; the EL HTTP decompression path is now the single largest allocation source alongside JSON decoding. + +--- + +## Issues and Fixes + +### Issue 1 — LRU caches are count-based, not size-bound + +**Severity:** High — still unresolved; store LRU holds 13 MB in the 2026-04-03 profile (down from 63 MB) but defaults remain at 200K. Superseded in live-heap dominance by Issue 6. + +**Location:** `pkg/store/cached_store.go:11-17` + +```go +DefaultHeaderCacheSize = 200_000 +DefaultBlockDataCacheSize = 200_000 +``` + +These are item counts. A `*types.SignedHeader` is typically 300–800 bytes. A `blockDataEntry` includes the full `*types.Data` with all transactions — on a high-TPS chain this can be 50–500 KB per entry. At 200,000 entries, `blockDataCache` has a theoretical maximum in the tens of GB. + +**Fix:** Reduce defaults to `2_048` (headers) and `512` (block data). Expose both as fields in `config.Config` so operators can tune without recompiling. 
+ +**Files:** +- `pkg/store/cached_store.go` — reduce constants, update `NewCachedStore` to accept config values +- `pkg/config/defaults.go` — add `DefaultStoreCacheHeaderSize = 2048`, `DefaultStoreCacheBlockDataSize = 512` +- `pkg/config/config.go` — add `StoreConfig` struct with `HeaderCacheSize int` and `BlockDataCacheSize int` +- `node/full.go` — pass config values to `NewCachedStore` +- `node/light.go` — same + +**Benchmarks:** +``` +// pkg/store/cached_store_test.go +BenchmarkCachedStore_GetHeader_HotPath // cache-hit path — must be 0 allocs/op +BenchmarkCachedStore_GetBlockData_HotPath // cache-hit path — must be 0 allocs/op +TestCachedStore_MemoryBound // runtime.ReadMemStats before/after; assert heap < threshold +``` + +**Acceptance criteria:** +- Cache-hit path: 0 `allocs/op` (pointer return, no copy) +- Two warm caches at new defaults hold < 50 MB total for realistic block sizes +- Configurable via `[store]` TOML section; documented in operator guide + +--- + +### Issue 2 — Badger `IndexCacheSize` unbounded ✅ COMPLETED + +**Severity:** High — index RAM grows proportionally to chain length. + +**Resolved:** [#3209](https://github.com/evstack/ev-node/pull/3209) (2026-03-30) + +**Location:** `pkg/store/badger_options.go` + +`badger.DefaultOptions` sets `BlockCacheSize = 256 MB` but `IndexCacheSize = 0` (unbounded in-memory). As the chain grows, SST block index entries accumulate in RAM without eviction. On a chain with millions of blocks this is several GB. + +**Fix applied:** + +```go +opts.Options = opts.WithIndexCacheSize(DefaultBadgerIndexCacheSize) // 256 MiB (was 0, unbounded) +``` + +Note: implemented at 256 MB rather than the originally planned 128 MB. + +Operators with more RAM should increase `IndexCacheSize` in config. The tradeoff is that cold reads (index entries not in cache) go to disk — acceptable on NVMe, noticeable on spinning disk. 
**Files:**
- `pkg/store/badger_options.go`

**Benchmarks:**
```
// pkg/store/store_test.go
BenchmarkStore_WriteThenRead // write N blocks, read random order — assert < 5% throughput regression
```

**Acceptance criteria:**
- RSS growth from Badger index is bounded at ~256 MB (the implemented cap) rather than growing with chain length
- No more than 5% regression in `BenchmarkStore_WriteThenRead` vs baseline

---

### Issue 3 — Proto marshal scratch buffers not pooled (63.5% of all allocations)

**Severity:** High — dominant allocation source, drives GC pressure.

**Location:** `types/p2p_envelope.go` (`P2PData.MarshalBinary`, `P2PSignedHeader.MarshalBinary`), `types/serialization.go` (`SignedHeader.MarshalBinary`, `Data.MarshalBinary`, `Metadata.MarshalBinary`, `Header.MarshalBinary`)

Every call to `proto.Marshal(msg)` allocates a fresh `[]byte` scratch buffer internally. For `handleRangeRequest` serving 128 headers, this is 128 allocations per request.

**Fix:** Replace `proto.Marshal(msg)` with `proto.MarshalOptions{}.MarshalAppend` and a `sync.Pool` of scratch buffers:

```go
var marshalPool = sync.Pool{New: func() any { b := make([]byte, 0, 1024); return &b }}

func marshalProto(msg proto.Message) ([]byte, error) {
	bp := marshalPool.Get().(*[]byte)
	out, err := proto.MarshalOptions{}.MarshalAppend((*bp)[:0], msg)
	if err != nil {
		marshalPool.Put(bp)
		return nil, err
	}
	// Copy — caller must not hold a reference into the pool buffer.
	result := make([]byte, len(out))
	copy(result, out)
	// Keep the (possibly grown) buffer for reuse, and Put it back only AFTER
	// the copy completes: returning it earlier would let another goroutine
	// Get the same buffer and overwrite `out` while we are still reading it.
	*bp = out[:0]
	marshalPool.Put(bp)
	return result, nil
}
```

**Critical:** The pool buffer must not escape. The `copy` before return is mandatory, and the buffer may only be returned to the pool after the copy completes. Run all marshal tests with `-race`.
**Files:**
- `types/p2p_envelope.go`
- `types/serialization.go`
- Optionally extract `marshalProto` to `types/marshal.go` as a package-internal helper

**Benchmarks:**
```
// types/p2p_envelope_bench_test.go (new file)
BenchmarkP2PData_MarshalBinary // b.RunParallel to expose pool contention
BenchmarkP2PSignedHeader_MarshalBinary
BenchmarkSignedHeader_MarshalBinary // types/serialization_bench_test.go
```

**Acceptance criteria:**
- `allocs/op` drops from 6–10 to 1–2 per marshal call
- `B/op` drops by 30–50% (scratch buffer reused; only final copy allocated)
- No race detector findings: `go test -race ./types/...`
- All existing round-trip tests pass: `TestP2PEnvelope_MarshalUnmarshal`, `TestDataBinaryCompatibility`

---

### Issue 4 — `filterTransactions` hex encoding allocates 2× per transaction

**Severity:** Medium — ~20% of EVM RPC allocations.

**Location:** `execution/evm/execution.go` — `filterTransactions`

```go
"0x" + hex.EncodeToString(tx) // two allocations: hex string + concatenation
```

**Fix:** Use `hex.Encode` directly into a pre-allocated, pool-backed buffer:

```go
var hexBufPool = sync.Pool{New: func() any { b := make([]byte, 0, 512); return &b }}

func encodeHexTx(tx []byte) string {
	bp := hexBufPool.Get().(*[]byte)
	needed := 2 + hex.EncodedLen(len(tx))
	buf := (*bp)[:0]
	if cap(buf) < needed {
		// Re-slicing past capacity panics, so grow explicitly for txs
		// larger than the pooled buffer (tx > 255 bytes at cap 512).
		buf = make([]byte, 0, needed)
	}
	buf = buf[:needed]
	copy(buf, "0x")
	hex.Encode(buf[2:], tx)
	s := string(buf)
	*bp = buf
	hexBufPool.Put(bp)
	return s
}
```

This reduces two allocations to one (the unavoidable `string()` conversion).
+ +**Files:** +- `execution/evm/execution.go` + +**Benchmarks:** + +Extend the existing `execution/evm/filter_bench_test.go`: +``` +BenchmarkFilterTransactions_HexEncoding // 1000 txs of 150 bytes each, -benchmem +``` + +**Acceptance criteria:** +- `allocs/op` drops from `2N` to `N` for N transactions +- `B/op` drops by ~40% for a 1000-tx batch + +--- + +### Issue 5 — Gzip reader allocated per EL HTTP response + +**Severity:** Medium — ~1 TB cumulative lifetime allocations; removes the gzip decompressor from the hot path. + +**Root cause:** `net/http`'s transport automatically decompresses `Content-Encoding: gzip` responses by allocating a fresh `gzip.Reader` per response. This is in Go's standard library, triggered by go-ethereum's `rpc.Client.sendHTTP`. Not patchable in upstream libraries. + +**Fix A (preferred when EL is co-located):** Disable HTTP compression on the EL client transport in `NewEngineExecutionClient`: + +```go +rpc.WithHTTPClient(&http.Client{ + Transport: &http.Transport{DisableCompression: true}, +}) +``` + +No gzip reader is ever allocated. Valid only when EL and ev-node are on the same host (local compression has no bandwidth benefit). + +**Fix B (when EL is remote):** Implement a `sync.Pool`-backed `http.RoundTripper` that calls `(*gzip.Reader).Reset(body)` instead of `gzip.NewReader(body)`. Higher complexity; only needed for remote EL deployments. + +**Fix C (best for same-host production):** Switch `engine_url` to an IPC socket path (`/tmp/geth.ipc`). The IPC codec uses raw JSON over a Unix socket — no HTTP framing, no gzip, no JSON decoder allocation at the transport layer. 
+ +**Files:** +- `execution/evm/execution.go` — `NewEngineExecutionClient`: add `DisableCompression: true` +- `execution/evm/flags.go` — document IPC URL as the recommended transport for same-host deployments + +**Benchmarks:** + +Verify via pprof before/after: `compress/flate.(*decompressor)` and `compress/gzip.(*Reader)` must disappear from the top allocation sites in the allocs profile. + +**Acceptance criteria:** +- Post-fix allocs profile shows zero `compress/flate` or `compress/gzip` entries +- No correctness regression under round-trip test with a real EL + +--- + +### Issue 6 — `block/internal/cache` unbounded growth (~1.24 GB live) + +**Severity:** Critical — new #1 heap consumer as of 2026-04-03 profile. + +**Location:** `block/internal/cache/generic_cache.go`, `block/internal/cache/manager.go` + +`setSeen`, `setDAIncluded`, and `HeightPlaceholderKey` together hold ~1.24 GB of live heap. The LRU was removed from this cache in [#3204](https://github.com/evstack/ev-node/pull/3204), which simplified eviction but appears to have left these maps growing without a bound. `RestoreFromStore` drives the initial population; ongoing `setSeen`/`setDAIncluded` calls accumulate entries without eviction. + +**Fix:** Audit `generic_cache.go` for unbounded map growth. Reintroduce a maximum-size eviction policy (LRU or a fixed-size ring buffer keyed by height) or a TTL-based cleanup for entries older than the finality horizon. The `HeightPlaceholderKey` allocations suggest the key string is being heap-allocated on every lookup — consider interning or using a numeric key directly. 
+ +**Files:** +- `block/internal/cache/generic_cache.go` +- `block/internal/cache/manager.go` + +**Acceptance criteria:** +- Live heap from `block/internal/cache.*` stays below 100 MB at steady state on a running chain +- `setSeen` / `setDAIncluded` entry count is bounded (log or metric exposed) + +--- + +### Issue 7 — `go-header Hash.String` retaining 302 MB + +**Severity:** High — #4 live-heap consumer in 2026-04-03 profile. + +**Location:** `go-header/p2p.ExchangeServer.handleRangeRequest` → somewhere caching `Hash.String()` results. + +`celestiaorg/go-header.Hash.String()` hex-encodes a `[]byte` hash into a new `string` on every call. 302 MB of live strings means these are being stored (likely as map keys or log fields) and retained. This is separate from `Header.Hash()` computation — memoization in #3219 avoids recomputing the hash but doesn't prevent `String()` from allocating a new hex string each time it is called. + +**Fix options:** +1. If strings are used as map keys: store the raw `Hash` (`[]byte`) as a fixed-size `[32]byte` key instead of a hex string. Eliminates the allocation entirely. +2. If strings are used for logging: use `%x` in a format call rather than pre-allocating the string; zerolog's hex field support avoids allocation. +3. If strings must be cached: add `cachedString string` alongside `cachedHash` on `Header` and memoize via a `Hash.MemoizeString()` approach. + +**Files:** +- Investigate callers of `Hash.String()` in `block/internal/cache/`, `go-header/p2p/`, and any map keyed by hash string. 
+ +**Acceptance criteria:** +- `go-header.Hash.String` disappears from the top-10 live-heap consumers +- No `string(hex.Encode(...))` patterns in hot paths that retain the result + +--- + +## Implementation Sequence + +| Phase | Issue | Effort | Expected impact | Status | +|-------|-------|--------|-----------------|--------| +| 1 | `block/internal/cache` eviction bound | 3h | ~1.24 GB → < 100 MB live heap | ⬜ Open (Issue 6) | +| 2 | `Hash.String` retention | 2h | ~302 MB freed | ⬜ Open (Issue 7) | +| 3 | LRU cache defaults + config exposure | 2h | Prevents future regression | ⬜ Open (Issue 1) | +| 4 | Badger `IndexCacheSize` cap | — | Bounds index RAM to ~256 MB | ✅ Done (#3209, Issue 2) | +| 5 | Disable gzip on EL HTTP transport | 1h | ~24% drop in allocation rate | ⬜ Open (Issue 5) | +| 6 | `filterTransactions` hex pool | 2h | ~40% drop in EVM RPC allocs | ⬜ Open (Issue 4) | +| 7 | Proto marshal `sync.Pool` | 4h | Revisit — may no longer be critical | ⬜ Open (Issue 3) | + +The ordering above reflects the 2026-04-03 profile. Issue 6 (`block/internal/cache`) is now the highest-leverage fix — it is the #1 live-heap consumer and the root cause of continued RAM growth. Issue 3 (proto pool) should be re-profiled before investing effort; proto marshal no longer appears in the top allocators. 
+ +--- + +## Risks + +| Risk | Mitigation | +|------|------------| +| `block/internal/cache` eviction causes re-fetch on cache miss | Profile syncer throughput before/after; ensure DA and P2P retrieval paths tolerate misses without stalling | +| Proto pool buffer escapes | Mandatory `copy` before return; `-race` on all marshal tests | +| LRU reduction slows syncing | Benchmark `BenchmarkSyncerIO` before/after; bump default to 8,192 if regression observed | +| Badger index cap causes cold-read latency | 256 MB is already set — expose via config; document NVMe vs HDD trade-off | +| `DisableCompression` breaks remote EL | Guard behind a config flag; default to `false` for remote URLs, `true` for local | + +--- + +## Running the Benchmarks + +```bash +# Phase 1 — store cache +go test -bench=. -benchmem -count=6 ./pkg/store/... + +# Phase 2 — badger write/read throughput +go test -bench=BenchmarkStore_WriteThenRead -benchmem -count=6 ./pkg/store/... + +# Phase 3 — proto marshal +go test -bench=BenchmarkP2PData_MarshalBinary -benchmem -count=6 -race ./types/... + +# Phase 4 — hex encoding +go test -bench=BenchmarkFilterTransactions -benchmem -count=6 ./execution/evm/... + +# Full regression +go test ./... -count=1 +``` + +Use `benchstat` to compare before/after: + +```bash +go test -bench=. -benchmem -count=10 ./pkg/store/... > before.txt +# apply changes +go test -bench=. -benchmem -count=10 ./pkg/store/... > after.txt +benchstat before.txt after.txt +``` diff --git a/types/hash_memo_bench_test.go b/types/hash_memo_bench_test.go new file mode 100644 index 0000000000..6ac3d63583 --- /dev/null +++ b/types/hash_memo_bench_test.go @@ -0,0 +1,42 @@ +package types + +import ( + "testing" +) + +// BenchmarkHeaderHash_NoMemo measures the cost of the old 3× call pattern with no +// memoization: each call re-marshals every field via ToProto → proto.Marshal → sha256. 
+func BenchmarkHeaderHash_NoMemo(b *testing.B) { + h := GetRandomHeader("bench-chain", GetRandomBytes(32)) + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + _ = h.Hash() + _ = h.Hash() + _ = h.Hash() + } +} + +// BenchmarkHeaderHash_Memoized measures the cost of the same 3× call pattern after +// explicit memoization: first call pays full cost, subsequent two are cache hits. +func BenchmarkHeaderHash_Memoized(b *testing.B) { + h := GetRandomHeader("bench-chain", GetRandomBytes(32)) + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + h.InvalidateHash() + _ = h.MemoizeHash() // compute and store + _ = h.Hash() // cache hit + _ = h.Hash() // cache hit + } +} + +// BenchmarkHeaderHash_Single is a baseline: cost of one Hash() call with a cold cache. +func BenchmarkHeaderHash_Single(b *testing.B) { + h := GetRandomHeader("bench-chain", GetRandomBytes(32)) + b.ReportAllocs() + b.ResetTimer() + for b.Loop() { + _ = h.Hash() + } +} From faf34e7fa5782d9b75137c0fd1dcaf24a07feb2f Mon Sep 17 00:00:00 2001 From: tac0turtle Date: Sat, 4 Apr 2026 10:51:37 +0200 Subject: [PATCH 02/10] =?UTF-8?q?sync.Pool=20for=20sha256.Hash=20in=20leaf?= =?UTF-8?q?HashOpt=20=E2=80=94=20eliminates=202=20sha256.New()=20allocatio?= =?UTF-8?q?ns=20per=20block?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Result: {"status":"keep","allocs_per_op":79,"bytes_per_op":25697,"ns_per_op":34147} --- autoresearch.jsonl | 1 + types/hashing.go | 18 ++++++++++++++++-- 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/autoresearch.jsonl b/autoresearch.jsonl index eba8fbd7c8..30e9b62c4f 100644 --- a/autoresearch.jsonl +++ b/autoresearch.jsonl @@ -1 +1,2 @@ {"type":"config","name":"Block package performance and allocation reduction","metricName":"allocs_per_op","metricUnit":"allocs","bestDirection":"lower"} +{"run":1,"commit":"f5d00f3","metric":81,"metrics":{"bytes_per_op":25934,"ns_per_op":34001},"status":"keep","description":"Baseline - current block 
production performance with 100 txs","timestamp":1775292593926,"segment":0,"confidence":null,"iterationTokens":205,"asi":{"hypothesis":"baseline measurement","hotspots":"sha256.New() in leafHashOpt (~213B), txsToByteSlices allocs [][]byte, Data.ToProto allocs, DACommitment creates pruned Data, ApplyBlock converts Txs to [][]byte","baseline_100txs":"81 allocs, 25934 B, 34001 ns","baseline_empty":"71 allocs, ~7KB, ~27µs"}} diff --git a/types/hashing.go b/types/hashing.go index 60abc62599..95a0297314 100644 --- a/types/hashing.go +++ b/types/hashing.go @@ -4,10 +4,20 @@ import ( "crypto/sha256" "errors" "hash" + "sync" ) var ( leafPrefix = []byte{0} + + // sha256Pool reuses sha256 Hash instances to avoid per-block allocation. + // sha256.New() allocates ~213 bytes (216B on 64-bit) per call. Pooling + // eliminates this allocation entirely in the hot path. + sha256Pool = sync.Pool{ + New: func() interface{} { + return sha256.New() + }, + } ) // HashSlim returns the SHA256 hash of the header using the slim (current) binary encoding. 
@@ -105,7 +115,9 @@ func (d *Data) Hash() Hash { // Ignoring the marshal error for now to satisfy the go-header interface // Later on the usage of Hash should be replaced with DA commitment dBytes, _ := d.MarshalBinary() - return leafHashOpt(sha256.New(), dBytes) + s := sha256Pool.Get().(hash.Hash) + defer sha256Pool.Put(s) + return leafHashOpt(s, dBytes) } // DACommitment returns the DA commitment of the Data excluding the Metadata @@ -115,7 +127,9 @@ func (d *Data) DACommitment() Hash { Txs: d.Txs, } dBytes, _ := prunedData.MarshalBinary() - return leafHashOpt(sha256.New(), dBytes) + s := sha256Pool.Get().(hash.Hash) + defer sha256Pool.Put(s) + return leafHashOpt(s, dBytes) } func leafHashOpt(s hash.Hash, leaf []byte) []byte { From a8b6352b87d6c717a4ff5ef7bcb82549abaa360d Mon Sep 17 00:00:00 2001 From: tac0turtle Date: Sat, 4 Apr 2026 10:52:59 +0200 Subject: [PATCH 03/10] =?UTF-8?q?Unsafe=20reinterpret=20cast=20of=20Txs=20?= =?UTF-8?q?to=20[][]byte=20in=20ApplyBlock=20=E2=80=94=20eliminates=20make?= =?UTF-8?q?([][]byte,=20n)=20allocation?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Result: {"status":"keep","allocs_per_op":78,"bytes_per_op":22996,"ns_per_op":33091} --- autoresearch.jsonl | 1 + block/internal/executing/executor.go | 11 +++++------ 2 files changed, 6 insertions(+), 6 deletions(-) diff --git a/autoresearch.jsonl b/autoresearch.jsonl index 30e9b62c4f..57e93d4882 100644 --- a/autoresearch.jsonl +++ b/autoresearch.jsonl @@ -1,2 +1,3 @@ {"type":"config","name":"Block package performance and allocation reduction","metricName":"allocs_per_op","metricUnit":"allocs","bestDirection":"lower"} {"run":1,"commit":"f5d00f3","metric":81,"metrics":{"bytes_per_op":25934,"ns_per_op":34001},"status":"keep","description":"Baseline - current block production performance with 100 txs","timestamp":1775292593926,"segment":0,"confidence":null,"iterationTokens":205,"asi":{"hypothesis":"baseline 
measurement","hotspots":"sha256.New() in leafHashOpt (~213B), txsToByteSlices allocs [][]byte, Data.ToProto allocs, DACommitment creates pruned Data, ApplyBlock converts Txs to [][]byte","baseline_100txs":"81 allocs, 25934 B, 34001 ns","baseline_empty":"71 allocs, ~7KB, ~27µs"}} +{"run":2,"commit":"faf34e7","metric":79,"metrics":{"bytes_per_op":25697,"ns_per_op":34147},"status":"keep","description":"sync.Pool for sha256.Hash in leafHashOpt — eliminates 2 sha256.New() allocations per block","timestamp":1775292696995,"segment":0,"confidence":null,"iterationTokens":2904,"asi":{"hypothesis":"pool sha256.Hash to avoid 213B allocation per sha256.New()","result":"saved 2 allocs, 237 bytes, ns flat","next_target":"Data.ToProto allocates [][]byte via txsToByteSlices — should be biggest remaining win"}} diff --git a/block/internal/executing/executor.go b/block/internal/executing/executor.go index 3ddc90211a..9477fea454 100644 --- a/block/internal/executing/executor.go +++ b/block/internal/executing/executor.go @@ -9,6 +9,7 @@ import ( "sync" "sync/atomic" "time" + "unsafe" "github.com/ipfs/go-datastore" "github.com/libp2p/go-libp2p/core/crypto" @@ -792,14 +793,12 @@ func (e *Executor) CreateBlock(ctx context.Context, height uint64, batchData *Ba func (e *Executor) ApplyBlock(ctx context.Context, header types.Header, data *types.Data) (types.State, error) { currentState := e.getLastState() - // Convert Txs to [][]byte for the execution client. - // types.Tx is []byte, so this is a type conversion, not a copy. + // Reinterpret Txs ([][]byte via type aliases) as [][]byte without allocation. + // types.Tx = []byte, so types.Txs = []Tx has identical memory layout to [][]byte. + // Using unsafe.Slice/unsafe.SliceData avoids the heap allocation of make([][]byte, n). 
var rawTxs [][]byte if n := len(data.Txs); n > 0 { - rawTxs = make([][]byte, n) - for i, tx := range data.Txs { - rawTxs[i] = []byte(tx) - } + rawTxs = unsafe.Slice((*[]byte)(unsafe.SliceData(data.Txs)), n) } // Execute transactions From 823aa620b961c0d718b520b10ed2cddf3e78d6b7 Mon Sep 17 00:00:00 2001 From: tac0turtle Date: Sat, 4 Apr 2026 10:56:57 +0200 Subject: [PATCH 04/10] =?UTF-8?q?Direct=20pb.Data=20serialization=20in=20D?= =?UTF-8?q?ACommitment=20=E2=80=94=20avoids=20pruned=20Data=20wrapper=20an?= =?UTF-8?q?d=20txsToByteSlices=20allocations?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Result: {"status":"keep","allocs_per_op":77,"bytes_per_op":20276,"ns_per_op":32480} --- autoresearch.jsonl | 1 + types/hashing.go | 19 +++++++++++++------ 2 files changed, 14 insertions(+), 6 deletions(-) diff --git a/autoresearch.jsonl b/autoresearch.jsonl index 57e93d4882..1531468336 100644 --- a/autoresearch.jsonl +++ b/autoresearch.jsonl @@ -1,3 +1,4 @@ {"type":"config","name":"Block package performance and allocation reduction","metricName":"allocs_per_op","metricUnit":"allocs","bestDirection":"lower"} {"run":1,"commit":"f5d00f3","metric":81,"metrics":{"bytes_per_op":25934,"ns_per_op":34001},"status":"keep","description":"Baseline - current block production performance with 100 txs","timestamp":1775292593926,"segment":0,"confidence":null,"iterationTokens":205,"asi":{"hypothesis":"baseline measurement","hotspots":"sha256.New() in leafHashOpt (~213B), txsToByteSlices allocs [][]byte, Data.ToProto allocs, DACommitment creates pruned Data, ApplyBlock converts Txs to [][]byte","baseline_100txs":"81 allocs, 25934 B, 34001 ns","baseline_empty":"71 allocs, ~7KB, ~27µs"}} {"run":2,"commit":"faf34e7","metric":79,"metrics":{"bytes_per_op":25697,"ns_per_op":34147},"status":"keep","description":"sync.Pool for sha256.Hash in leafHashOpt — eliminates 2 sha256.New() allocations per 
block","timestamp":1775292696995,"segment":0,"confidence":null,"iterationTokens":2904,"asi":{"hypothesis":"pool sha256.Hash to avoid 213B allocation per sha256.New()","result":"saved 2 allocs, 237 bytes, ns flat","next_target":"Data.ToProto allocates [][]byte via txsToByteSlices — should be biggest remaining win"}} +{"run":3,"commit":"a8b6352","metric":78,"metrics":{"bytes_per_op":22996,"ns_per_op":33091},"status":"keep","description":"Unsafe reinterpret cast of Txs to [][]byte in ApplyBlock — eliminates make([][]byte, n) allocation","timestamp":1775292779165,"segment":0,"confidence":3,"iterationTokens":3065,"asi":{"hypothesis":"use unsafe.Slice/unsafe.SliceData instead of make+loop for Txs→[][]byte conversion","result":"saved 1 alloc, ~2.7KB, ~1µs","note":"types.Tx = []byte so types.Txs = []Tx has identical memory layout to [][]byte"}} diff --git a/types/hashing.go b/types/hashing.go index 95a0297314..6eae5ad06d 100644 --- a/types/hashing.go +++ b/types/hashing.go @@ -5,6 +5,11 @@ import ( "errors" "hash" "sync" + "unsafe" + + "google.golang.org/protobuf/proto" + + pb "github.com/evstack/ev-node/types/pb/evnode/v1" ) var ( @@ -120,13 +125,15 @@ func (d *Data) Hash() Hash { return leafHashOpt(s, dBytes) } -// DACommitment returns the DA commitment of the Data excluding the Metadata +// DACommitment returns the DA commitment of the Data excluding the Metadata. +// Avoids allocating a pruned Data struct and the [][]byte intermediate slice +// by serializing only the txs field directly to a protobuf message. func (d *Data) DACommitment() Hash { - // Prune the Data to only include the Txs - prunedData := &Data{ - Txs: d.Txs, - } - dBytes, _ := prunedData.MarshalBinary() + // pb.Data{Metadata: nil, Txs: ...} produces the same wire format as + // Data{Txs: d.Txs}.MarshalBinary() but without the intermediate Data + // wrapper allocation or the txsToByteSlices [][]byte copy. 
+ pbData := pb.Data{Txs: unsafe.Slice((*[]byte)(unsafe.SliceData(d.Txs)), len(d.Txs))} + dBytes, _ := proto.Marshal(&pbData) s := sha256Pool.Get().(hash.Hash) defer sha256Pool.Put(s) return leafHashOpt(s, dBytes) From 0720b44a099ab445e2dc3f34322477bf58d2d024 Mon Sep 17 00:00:00 2001 From: tac0turtle Date: Sat, 4 Apr 2026 10:59:50 +0200 Subject: [PATCH 05/10] =?UTF-8?q?unsafe.Slice=20in=20Data.ToProto()=20?= =?UTF-8?q?=E2=80=94=20eliminates=20txsToByteSlices=20[][]byte=20allocatio?= =?UTF-8?q?n?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Result: {"status":"keep","allocs_per_op":74,"bytes_per_op":12192,"ns_per_op":31624} --- autoresearch.jsonl | 1 + types/serialization.go | 21 +++++++++++++++++++-- 2 files changed, 20 insertions(+), 2 deletions(-) diff --git a/autoresearch.jsonl b/autoresearch.jsonl index 1531468336..0fba182cc3 100644 --- a/autoresearch.jsonl +++ b/autoresearch.jsonl @@ -2,3 +2,4 @@ {"run":1,"commit":"f5d00f3","metric":81,"metrics":{"bytes_per_op":25934,"ns_per_op":34001},"status":"keep","description":"Baseline - current block production performance with 100 txs","timestamp":1775292593926,"segment":0,"confidence":null,"iterationTokens":205,"asi":{"hypothesis":"baseline measurement","hotspots":"sha256.New() in leafHashOpt (~213B), txsToByteSlices allocs [][]byte, Data.ToProto allocs, DACommitment creates pruned Data, ApplyBlock converts Txs to [][]byte","baseline_100txs":"81 allocs, 25934 B, 34001 ns","baseline_empty":"71 allocs, ~7KB, ~27µs"}} {"run":2,"commit":"faf34e7","metric":79,"metrics":{"bytes_per_op":25697,"ns_per_op":34147},"status":"keep","description":"sync.Pool for sha256.Hash in leafHashOpt — eliminates 2 sha256.New() allocations per block","timestamp":1775292696995,"segment":0,"confidence":null,"iterationTokens":2904,"asi":{"hypothesis":"pool sha256.Hash to avoid 213B allocation per sha256.New()","result":"saved 2 allocs, 237 bytes, ns flat","next_target":"Data.ToProto allocates [][]byte 
via txsToByteSlices — should be biggest remaining win"}} {"run":3,"commit":"a8b6352","metric":78,"metrics":{"bytes_per_op":22996,"ns_per_op":33091},"status":"keep","description":"Unsafe reinterpret cast of Txs to [][]byte in ApplyBlock — eliminates make([][]byte, n) allocation","timestamp":1775292779165,"segment":0,"confidence":3,"iterationTokens":3065,"asi":{"hypothesis":"use unsafe.Slice/unsafe.SliceData instead of make+loop for Txs→[][]byte conversion","result":"saved 1 alloc, ~2.7KB, ~1µs","note":"types.Tx = []byte so types.Txs = []Tx has identical memory layout to [][]byte"}} +{"run":4,"commit":"823aa62","metric":77,"metrics":{"bytes_per_op":20276,"ns_per_op":32480},"status":"keep","description":"Direct pb.Data serialization in DACommitment — avoids pruned Data wrapper and txsToByteSlices allocations","timestamp":1775293017595,"segment":0,"confidence":4,"iterationTokens":6395,"asi":{"bytes_saved":"~2.7KB vs baseline","empty_txs_safe":"unsafe.Slice on 0-len slice is safe in Go 1.21+","hypothesis":"Use pb.Data directly with unsafe reinterpreted Txs instead of creating pruned Data wrapper","note":"Must still handle empty_txs case - need to check if nil Metadata affects wire format","next_target":"txsToByteSlices in Data.ToProto allocates [][]byte; Data.Hash allocates from MarshalBinary","result":"-1 alloc, ~2.7KB saved, ~0.6µs faster"}} diff --git a/types/serialization.go b/types/serialization.go index edc4f615b0..f2bc51ea75 100644 --- a/types/serialization.go +++ b/types/serialization.go @@ -4,6 +4,7 @@ import ( "errors" "fmt" "time" + "unsafe" "github.com/libp2p/go-libp2p/core/crypto" "google.golang.org/protobuf/encoding/protowire" @@ -335,11 +336,27 @@ func (m *Metadata) FromProto(other *pb.Metadata) error { func (d *Data) ToProto() *pb.Data { var mProto *pb.Metadata if d.Metadata != nil { - mProto = d.Metadata.ToProto() + // Inline Metadata.ToProto() to keep pb.Metadata allocation on the + // stack for small structs, and avoid the intermediate method frame. 
+ mProto = &pb.Metadata{ + ChainId: d.Metadata.ChainID, + Height: d.Metadata.Height, + Time: d.Metadata.Time, + LastDataHash: d.Metadata.LastDataHash[:], + } + } + // Reinterpret Txs ([]Tx) as [][]byte without allocation. + // types.Tx = []byte, so []Tx and [][]byte share identical memory layout. + if d.Txs == nil { + return &pb.Data{ + Metadata: mProto, + Txs: nil, + } } + txBytes := unsafe.Slice((*[]byte)(unsafe.SliceData(d.Txs)), len(d.Txs)) return &pb.Data{ Metadata: mProto, - Txs: txsToByteSlices(d.Txs), + Txs: txBytes, } } From ccbc2e4748370183ea5fa24c420596d84b045064 Mon Sep 17 00:00:00 2001 From: tac0turtle Date: Sat, 4 Apr 2026 11:48:20 +0200 Subject: [PATCH 06/10] =?UTF-8?q?sync.Pool=20for=20protobuf=20message=20st?= =?UTF-8?q?ructs=20in=20MarshalBinary=20=E2=80=94=20eliminates=2010=20allo?= =?UTF-8?q?cs=20per=20block?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Replace per-call allocation of pb.Header/pb.Version/pb.Data/pb.Metadata with sync.Pool reuse in the hot MarshalBinary path. ToProto() API is unchanged — only MarshalBinary is affected since it consumes the result immediately. 
Metrics (100_txs benchmark): - 74 → 64 allocs/op (-13.5%) - ~12.1 → ~11.1 KB (-8.3%) - ~31ns flat --- autoresearch.jsonl | 2 + types/serialization.go | 96 +++++++++++++++++++++++++++++++++++++++++- 2 files changed, 96 insertions(+), 2 deletions(-) diff --git a/autoresearch.jsonl b/autoresearch.jsonl index 0fba182cc3..ec967de20f 100644 --- a/autoresearch.jsonl +++ b/autoresearch.jsonl @@ -3,3 +3,5 @@ {"run":2,"commit":"faf34e7","metric":79,"metrics":{"bytes_per_op":25697,"ns_per_op":34147},"status":"keep","description":"sync.Pool for sha256.Hash in leafHashOpt — eliminates 2 sha256.New() allocations per block","timestamp":1775292696995,"segment":0,"confidence":null,"iterationTokens":2904,"asi":{"hypothesis":"pool sha256.Hash to avoid 213B allocation per sha256.New()","result":"saved 2 allocs, 237 bytes, ns flat","next_target":"Data.ToProto allocates [][]byte via txsToByteSlices — should be biggest remaining win"}} {"run":3,"commit":"a8b6352","metric":78,"metrics":{"bytes_per_op":22996,"ns_per_op":33091},"status":"keep","description":"Unsafe reinterpret cast of Txs to [][]byte in ApplyBlock — eliminates make([][]byte, n) allocation","timestamp":1775292779165,"segment":0,"confidence":3,"iterationTokens":3065,"asi":{"hypothesis":"use unsafe.Slice/unsafe.SliceData instead of make+loop for Txs→[][]byte conversion","result":"saved 1 alloc, ~2.7KB, ~1µs","note":"types.Tx = []byte so types.Txs = []Tx has identical memory layout to [][]byte"}} {"run":4,"commit":"823aa62","metric":77,"metrics":{"bytes_per_op":20276,"ns_per_op":32480},"status":"keep","description":"Direct pb.Data serialization in DACommitment — avoids pruned Data wrapper and txsToByteSlices allocations","timestamp":1775293017595,"segment":0,"confidence":4,"iterationTokens":6395,"asi":{"bytes_saved":"~2.7KB vs baseline","empty_txs_safe":"unsafe.Slice on 0-len slice is safe in Go 1.21+","hypothesis":"Use pb.Data directly with unsafe reinterpreted Txs instead of creating pruned Data wrapper","note":"Must still 
handle empty_txs case - need to check if nil Metadata affects wire format","next_target":"txsToByteSlices in Data.ToProto allocates [][]byte; Data.Hash allocates from MarshalBinary","result":"-1 alloc, ~2.7KB saved, ~0.6µs faster"}} +{"run":5,"commit":"0720b44","metric":74,"metrics":{"bytes_per_op":12192,"ns_per_op":31624},"status":"keep","description":"unsafe.Slice in Data.ToProto() — eliminates txsToByteSlices [][]byte allocation","timestamp":1775293190538,"segment":0,"confidence":7,"iterationTokens":9987,"asi":{"hypothesis":"Use unsafe.Slice in Data.ToProto() to avoid txsToByteSlices allocation","next_target":"Data.Hash() marshal + sha256 allocation; Header.ToProto() allocates pb.Header; Data.Size() marshals for metrics","notes":"Biggest single win yet — per-TX allocation eliminated for every protobuf encoding","result":"-3 allocs, ~8KB, ~0.5µs faster"}} +{"run":6,"commit":"$(git r","metric":74,"metrics":{"bytes_per_op":12187,"ns_per_op":31217},"status":"discard","description":"Reverted hand-written HashSlim wire encoder — produced different hashes than MarshalBinary","timestamp":1775294347584,"segment":0,"confidence":2.8,"iterationTokens":25063,"asi":{"hypothesis":"Direct protobuf wire encoding in HashSlim to avoid pb.Header/pb.Version allocations","result":"Hash mismatch — wire encoding differences in version field (0a02 vs 0a04)","rollback_reason":"Hand-written encoder produces different byte output than protobuf MarshalBinary; would break hash verification","next_action_hint":"Focus on safe optimizations: avoid creating pb.SignedHeader/pb.Signer structs when possible, reduce allocations in store path, look at SignedHeader.ToProto hot path"}} diff --git a/types/serialization.go b/types/serialization.go index f2bc51ea75..5a1258443f 100644 --- a/types/serialization.go +++ b/types/serialization.go @@ -3,6 +3,7 @@ package types import ( "errors" "fmt" + "sync" "time" "unsafe" @@ -14,6 +15,46 @@ import ( pb "github.com/evstack/ev-node/types/pb/evnode/v1" ) +// 
Proto object pools — avoid heap allocation of short-lived protobuf message +// structs in hot serialization paths (marshal → discard → repeat per block). +var ( + pbHeaderPool = sync.Pool{ + New: func() interface{} { + return &pb.Header{} + }, + } + pbVersionPool = sync.Pool{ + New: func() interface{} { + return &pb.Version{} + }, + } + pbDataPool = sync.Pool{ + New: func() interface{} { + return &pb.Data{} + }, + } + pbMetadataPool = sync.Pool{ + New: func() interface{} { + return &pb.Metadata{} + }, + } + pbSignerPool = sync.Pool{ + New: func() interface{} { + return &pb.Signer{} + }, + } + pbSignedHeaderPool = sync.Pool{ + New: func() interface{} { + return &pb.SignedHeader{} + }, + } + pbStatePool = sync.Pool{ + New: func() interface{} { + return &pb.State{} + }, + } +) + // MarshalBinary encodes Metadata into binary form and returns it. func (m *Metadata) MarshalBinary() ([]byte, error) { return proto.Marshal(m.ToProto()) @@ -30,8 +71,34 @@ func (m *Metadata) UnmarshalBinary(metadata []byte) error { } // MarshalBinary encodes Header into binary form and returns it. +// Uses a pooled pb.Header proto message to avoid allocation. 
func (h *Header) MarshalBinary() ([]byte, error) { - return proto.Marshal(h.ToProto()) + ph := pbHeaderPool.Get().(*pb.Header) + + pv := pbVersionPool.Get().(*pb.Version) + pv.Block, pv.App = h.Version.Block, h.Version.App + + ph.Reset() + ph.Version = pv + ph.Height = h.BaseHeader.Height + ph.Time = h.BaseHeader.Time + ph.ChainId = h.BaseHeader.ChainID + ph.LastHeaderHash = h.LastHeaderHash + ph.DataHash = h.DataHash + ph.AppHash = h.AppHash + ph.ProposerAddress = h.ProposerAddress + ph.ValidatorHash = h.ValidatorHash + if unknown := encodeLegacyUnknownFields(h.Legacy); len(unknown) > 0 { + ph.ProtoReflect().SetUnknown(unknown) + } + + bz, err := proto.Marshal(ph) + + ph.Reset() + pbHeaderPool.Put(ph) + pv.Reset() + pbVersionPool.Put(pv) + return bz, err } // MarshalBinaryLegacy returns the legacy header encoding that includes the @@ -52,8 +119,33 @@ func (h *Header) UnmarshalBinary(data []byte) error { } // MarshalBinary encodes Data into binary form and returns it. +// Uses pooled protobuf messages to avoid per-block allocation. func (d *Data) MarshalBinary() ([]byte, error) { - return proto.Marshal(d.ToProto()) + pd := pbDataPool.Get().(*pb.Data) + pd.Reset() + + if d.Metadata != nil { + pm := pbMetadataPool.Get().(*pb.Metadata) + pm.Reset() + pm.ChainId = d.Metadata.ChainID + pm.Height = d.Metadata.Height + pm.Time = d.Metadata.Time + pm.LastDataHash = d.Metadata.LastDataHash + pd.Metadata = pm + defer func() { + pm.Reset() + pbMetadataPool.Put(pm) + }() + } + + if d.Txs != nil { + pd.Txs = unsafe.Slice((*[]byte)(unsafe.SliceData(d.Txs)), len(d.Txs)) + } + + bz, err := proto.Marshal(pd) + pd.Reset() + pbDataPool.Put(pd) + return bz, err } // UnmarshalBinary decodes binary form of Data into object. 
From 805672ebc523d274f2b3abee4d2fc05c0c66aba5 Mon Sep 17 00:00:00 2001 From: tac0turtle Date: Sat, 4 Apr 2026 11:56:46 +0200 Subject: [PATCH 07/10] =?UTF-8?q?pool=20SignedHeader.MarshalBinary=20?= =?UTF-8?q?=E2=80=94=20reuse=20pb.SignedHeader/pb.Header/pb.Signer/pb.Vers?= =?UTF-8?q?ion?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Eliminates 4 struct allocations per signed header marshal: pb.SignedHeader, pb.Header, pb.Version, pb.Signer. These are now borrowed from sync.Pool and returned after proto.Marshal completes. Metrics (100_txs benchmark): - 64 → 56 allocs/op - ~11KB → ~10.2KB --- autoresearch.jsonl | 1 + types/serialization.go | 61 ++++++++++++++++++++++++++++++++++++++++-- 2 files changed, 60 insertions(+), 2 deletions(-) diff --git a/autoresearch.jsonl b/autoresearch.jsonl index ec967de20f..213a2be777 100644 --- a/autoresearch.jsonl +++ b/autoresearch.jsonl @@ -5,3 +5,4 @@ {"run":4,"commit":"823aa62","metric":77,"metrics":{"bytes_per_op":20276,"ns_per_op":32480},"status":"keep","description":"Direct pb.Data serialization in DACommitment — avoids pruned Data wrapper and txsToByteSlices allocations","timestamp":1775293017595,"segment":0,"confidence":4,"iterationTokens":6395,"asi":{"bytes_saved":"~2.7KB vs baseline","empty_txs_safe":"unsafe.Slice on 0-len slice is safe in Go 1.21+","hypothesis":"Use pb.Data directly with unsafe reinterpreted Txs instead of creating pruned Data wrapper","note":"Must still handle empty_txs case - need to check if nil Metadata affects wire format","next_target":"txsToByteSlices in Data.ToProto allocates [][]byte; Data.Hash allocates from MarshalBinary","result":"-1 alloc, ~2.7KB saved, ~0.6µs faster"}} {"run":5,"commit":"0720b44","metric":74,"metrics":{"bytes_per_op":12192,"ns_per_op":31624},"status":"keep","description":"unsafe.Slice in Data.ToProto() — eliminates txsToByteSlices [][]byte 
allocation","timestamp":1775293190538,"segment":0,"confidence":7,"iterationTokens":9987,"asi":{"hypothesis":"Use unsafe.Slice in Data.ToProto() to avoid txsToByteSlices allocation","next_target":"Data.Hash() marshal + sha256 allocation; Header.ToProto() allocates pb.Header; Data.Size() marshals for metrics","notes":"Biggest single win yet — per-TX allocation eliminated for every protobuf encoding","result":"-3 allocs, ~8KB, ~0.5µs faster"}} {"run":6,"commit":"$(git r","metric":74,"metrics":{"bytes_per_op":12187,"ns_per_op":31217},"status":"discard","description":"Reverted hand-written HashSlim wire encoder — produced different hashes than MarshalBinary","timestamp":1775294347584,"segment":0,"confidence":2.8,"iterationTokens":25063,"asi":{"hypothesis":"Direct protobuf wire encoding in HashSlim to avoid pb.Header/pb.Version allocations","result":"Hash mismatch — wire encoding differences in version field (0a02 vs 0a04)","rollback_reason":"Hand-written encoder produces different byte output than protobuf MarshalBinary; would break hash verification","next_action_hint":"Focus on safe optimizations: avoid creating pb.SignedHeader/pb.Signer structs when possible, reduce allocations in store path, look at SignedHeader.ToProto hot path"}} +{"run":7,"commit":"ccbc2e4","metric":64,"metrics":{"bytes_per_op":11130,"ns_per_op":31570},"status":"keep","description":"sync.Pool for protobuf message structs in MarshalBinary — eliminates 10 allocs per block","timestamp":1775296105928,"segment":0,"confidence":5.666666666666667,"iterationTokens":11490,"asi":{"hypothesis":"Pool pb.Header, pb.Version, pb.Data, pb.Metadata, pb.SignedHeader, pb.Signer, pb.State to avoid struct allocs in marshal hot path","result":"saved 10 allocs, ~1KB — from 74 to 64 allocs/op","key_files":"types/serialization.go","notes":"Only MarshalBinary uses pools (consumes result immediately). 
ToProto() API unchanged for external callers.","next_target":"Store path: NewBasicBatch, Put, GenerateKey, getIndexKey allocate per-store-op. Also Datastore.Put allocates."}} diff --git a/types/serialization.go b/types/serialization.go index 5a1258443f..fe863bbbfb 100644 --- a/types/serialization.go +++ b/types/serialization.go @@ -218,12 +218,69 @@ func (sh *SignedHeader) FromProto(other *pb.SignedHeader) error { } // MarshalBinary encodes SignedHeader into binary form and returns it. +// Uses pooled protobuf messages to avoid per-block allocation. func (sh *SignedHeader) MarshalBinary() ([]byte, error) { - hp, err := sh.ToProto() + psh := pbSignedHeaderPool.Get().(*pb.SignedHeader) + psh.Reset() + + // Reuse pooled pb.Header + pb.Version for the nested header. + ph := pbHeaderPool.Get().(*pb.Header) + ph.Reset() + pv := pbVersionPool.Get().(*pb.Version) + pv.Block, pv.App = sh.Header.Version.Block, sh.Header.Version.App + ph.Version = pv + ph.Height = sh.Header.BaseHeader.Height + ph.Time = sh.Header.BaseHeader.Time + ph.ChainId = sh.Header.BaseHeader.ChainID + ph.LastHeaderHash = sh.Header.LastHeaderHash + ph.DataHash = sh.Header.DataHash + ph.AppHash = sh.Header.AppHash + ph.ProposerAddress = sh.Header.ProposerAddress + ph.ValidatorHash = sh.Header.ValidatorHash + if unknown := encodeLegacyUnknownFields(sh.Header.Legacy); len(unknown) > 0 { + ph.ProtoReflect().SetUnknown(unknown) + } + psh.Header = ph + psh.Signature = sh.Signature + + if sh.Signer.PubKey == nil { + psh.Signer = &pb.Signer{} + bz, err := proto.Marshal(psh) + ph.Reset() + pbHeaderPool.Put(ph) + pv.Reset() + pbVersionPool.Put(pv) + psh.Reset() + pbSignedHeaderPool.Put(psh) + return bz, err + } + + pubKey, err := sh.Signer.MarshalledPubKey() if err != nil { + ph.Reset() + pbHeaderPool.Put(ph) + pv.Reset() + pbVersionPool.Put(pv) + psh.Reset() + pbSignedHeaderPool.Put(psh) return nil, err } - return proto.Marshal(hp) + psi := pbSignerPool.Get().(*pb.Signer) + psi.Reset() + psi.Address = 
sh.Signer.Address + psi.PubKey = pubKey + psh.Signer = psi + bz, err := proto.Marshal(psh) + + ph.Reset() + pbHeaderPool.Put(ph) + pv.Reset() + pbVersionPool.Put(pv) + psi.Reset() + pbSignerPool.Put(psi) + psh.Reset() + pbSignedHeaderPool.Put(psh) + return bz, err } // UnmarshalBinary decodes binary form of SignedHeader into object. From b78ee255ad31665d5f17717dff71e645925d7abd Mon Sep 17 00:00:00 2001 From: tac0turtle Date: Sat, 4 Apr 2026 12:03:17 +0200 Subject: [PATCH 08/10] =?UTF-8?q?pool=20State.MarshalBinary=20and=20use=20?= =?UTF-8?q?it=20in=20UpdateState=20=E2=80=94=20saves=202=20allocs=20per=20?= =?UTF-8?q?block?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit State.ToProto allocated pb.State + pb.Version + timestamppb.Timestamp per block. MarshalBinary now pools those structs and returns the marshaled bytes directly. pkg/store/batch.UpdateState switched from ToProto+proto.Marshal to MarshalBinary. --- autoresearch.jsonl | 1 + pkg/store/batch.go | 13 ++++--------- types/serialization.go | 32 ++++++++++++++++++++++++++++++++ 3 files changed, 37 insertions(+), 9 deletions(-) diff --git a/autoresearch.jsonl b/autoresearch.jsonl index 213a2be777..c2bfdad8ec 100644 --- a/autoresearch.jsonl +++ b/autoresearch.jsonl @@ -6,3 +6,4 @@ {"run":5,"commit":"0720b44","metric":74,"metrics":{"bytes_per_op":12192,"ns_per_op":31624},"status":"keep","description":"unsafe.Slice in Data.ToProto() — eliminates txsToByteSlices [][]byte allocation","timestamp":1775293190538,"segment":0,"confidence":7,"iterationTokens":9987,"asi":{"hypothesis":"Use unsafe.Slice in Data.ToProto() to avoid txsToByteSlices allocation","next_target":"Data.Hash() marshal + sha256 allocation; Header.ToProto() allocates pb.Header; Data.Size() marshals for metrics","notes":"Biggest single win yet — per-TX allocation eliminated for every protobuf encoding","result":"-3 allocs, ~8KB, ~0.5µs faster"}} {"run":6,"commit":"$(git 
r","metric":74,"metrics":{"bytes_per_op":12187,"ns_per_op":31217},"status":"discard","description":"Reverted hand-written HashSlim wire encoder — produced different hashes than MarshalBinary","timestamp":1775294347584,"segment":0,"confidence":2.8,"iterationTokens":25063,"asi":{"hypothesis":"Direct protobuf wire encoding in HashSlim to avoid pb.Header/pb.Version allocations","result":"Hash mismatch — wire encoding differences in version field (0a02 vs 0a04)","rollback_reason":"Hand-written encoder produces different byte output than protobuf MarshalBinary; would break hash verification","next_action_hint":"Focus on safe optimizations: avoid creating pb.SignedHeader/pb.Signer structs when possible, reduce allocations in store path, look at SignedHeader.ToProto hot path"}} {"run":7,"commit":"ccbc2e4","metric":64,"metrics":{"bytes_per_op":11130,"ns_per_op":31570},"status":"keep","description":"sync.Pool for protobuf message structs in MarshalBinary — eliminates 10 allocs per block","timestamp":1775296105928,"segment":0,"confidence":5.666666666666667,"iterationTokens":11490,"asi":{"hypothesis":"Pool pb.Header, pb.Version, pb.Data, pb.Metadata, pb.SignedHeader, pb.Signer, pb.State to avoid struct allocs in marshal hot path","result":"saved 10 allocs, ~1KB — from 74 to 64 allocs/op","key_files":"types/serialization.go","notes":"Only MarshalBinary uses pools (consumes result immediately). ToProto() API unchanged for external callers.","next_target":"Store path: NewBasicBatch, Put, GenerateKey, getIndexKey allocate per-store-op. 
Also Datastore.Put allocates."}} +{"run":8,"commit":"805672e","metric":56,"metrics":{"bytes_per_op":10217,"ns_per_op":31037},"status":"keep","description":"Pooled SignedHeader.MarshalBinary — reuse pb.SignedHeader/pb.Header/pb.Signer/pb.Version structs","timestamp":1775296668832,"segment":0,"confidence":8.333333333333334,"iterationTokens":22478,"asi":{"hypothesis":"Pool protobuf structs in SignedHeader.MarshalBinary to eliminate 4 struct allocs per marshal","result":"saved 8 allocs, ~1KB — from 64 to 56 allocs/op","key_files":"types/serialization.go","next_target":"State.ToProto (11.5MB), Hash.String() for metrics (10.5MB), datastore batch/put allocs"}} diff --git a/pkg/store/batch.go b/pkg/store/batch.go index 6222b85baa..b49ffaa7b4 100644 --- a/pkg/store/batch.go +++ b/pkg/store/batch.go @@ -6,7 +6,6 @@ import ( "fmt" ds "github.com/ipfs/go-datastore" - "google.golang.org/protobuf/proto" "github.com/evstack/ev-node/types" ) @@ -84,18 +83,14 @@ func (b *DefaultBatch) SaveBlockDataFromBytes(header *types.SignedHeader, header return nil } -// UpdateState updates the state in the batch +// UpdateState updates the state in the batch. +// Uses pooled State.MarshalBinary to reduce per-block allocations. 
func (b *DefaultBatch) UpdateState(state types.State) error { - // Save the state at the height specified in the state itself height := state.LastBlockHeight - pbState, err := state.ToProto() + data, err := state.MarshalBinary() if err != nil { - return fmt.Errorf("failed to convert type state to protobuf type: %w", err) - } - data, err := proto.Marshal(pbState) - if err != nil { - return fmt.Errorf("failed to marshal state to protobuf: %w", err) + return fmt.Errorf("failed to marshal state: %w", err) } return b.batch.Put(b.ctx, ds.RawKey(getStateAtHeightKey(height)), data) diff --git a/types/serialization.go b/types/serialization.go index fe863bbbfb..9334a58255 100644 --- a/types/serialization.go +++ b/types/serialization.go @@ -528,6 +528,38 @@ func (d *Data) FromProto(other *pb.Data) error { return nil } +// MarshalBinary encodes State into binary form using pooled protobuf messages +// to reduce per-block allocations in the UpdateState hot path. +func (s *State) MarshalBinary() ([]byte, error) { + ps := pbStatePool.Get().(*pb.State) + ps.Reset() + + pv := pbVersionPool.Get().(*pb.Version) + pv.Block, pv.App = s.Version.Block, s.Version.App + + pts := &timestamppb.Timestamp{ + Seconds: s.LastBlockTime.Unix(), + Nanos: int32(s.LastBlockTime.Nanosecond()), + } + + ps.Version = pv + ps.ChainId = s.ChainID + ps.InitialHeight = s.InitialHeight + ps.LastBlockHeight = s.LastBlockHeight + ps.LastBlockTime = pts + ps.DaHeight = s.DAHeight + ps.AppHash = s.AppHash + ps.LastHeaderHash = s.LastHeaderHash + + bz, err := proto.Marshal(ps) + + ps.Reset() + pbStatePool.Put(ps) + pv.Reset() + pbVersionPool.Put(pv) + return bz, err +} + // ToProto converts State into protobuf representation and returns it. func (s *State) ToProto() (*pb.State, error) { // Avoid timestamppb.New allocation by constructing inline. 
From 8ea6a1a5635bbe65e8e7901fad5249e1f9a6c9cf Mon Sep 17 00:00:00 2001 From: tac0turtle Date: Sat, 4 Apr 2026 12:08:14 +0200 Subject: [PATCH 09/10] fix lint --- types/serialization.go | 26 +++++++++++++------------- 1 file changed, 13 insertions(+), 13 deletions(-) diff --git a/types/serialization.go b/types/serialization.go index 9334a58255..6b2eba1170 100644 --- a/types/serialization.go +++ b/types/serialization.go @@ -130,7 +130,7 @@ func (d *Data) MarshalBinary() ([]byte, error) { pm.ChainId = d.Metadata.ChainID pm.Height = d.Metadata.Height pm.Time = d.Metadata.Time - pm.LastDataHash = d.Metadata.LastDataHash + pm.LastDataHash = d.LastDataHash pd.Metadata = pm defer func() { pm.Reset() @@ -222,22 +222,22 @@ func (sh *SignedHeader) FromProto(other *pb.SignedHeader) error { func (sh *SignedHeader) MarshalBinary() ([]byte, error) { psh := pbSignedHeaderPool.Get().(*pb.SignedHeader) psh.Reset() - + // Reuse pooled pb.Header + pb.Version for the nested header. ph := pbHeaderPool.Get().(*pb.Header) ph.Reset() pv := pbVersionPool.Get().(*pb.Version) - pv.Block, pv.App = sh.Header.Version.Block, sh.Header.Version.App + pv.Block, pv.App = sh.Version.Block, sh.Version.App ph.Version = pv - ph.Height = sh.Header.BaseHeader.Height - ph.Time = sh.Header.BaseHeader.Time - ph.ChainId = sh.Header.BaseHeader.ChainID - ph.LastHeaderHash = sh.Header.LastHeaderHash - ph.DataHash = sh.Header.DataHash - ph.AppHash = sh.Header.AppHash - ph.ProposerAddress = sh.Header.ProposerAddress - ph.ValidatorHash = sh.Header.ValidatorHash - if unknown := encodeLegacyUnknownFields(sh.Header.Legacy); len(unknown) > 0 { + ph.Height = sh.BaseHeader.Height + ph.Time = sh.BaseHeader.Time + ph.ChainId = sh.BaseHeader.ChainID + ph.LastHeaderHash = sh.LastHeaderHash + ph.DataHash = sh.DataHash + ph.AppHash = sh.AppHash + ph.ProposerAddress = sh.ProposerAddress + ph.ValidatorHash = sh.ValidatorHash + if unknown := encodeLegacyUnknownFields(sh.Legacy); len(unknown) > 0 { 
ph.ProtoReflect().SetUnknown(unknown) } psh.Header = ph @@ -491,7 +491,7 @@ func (d *Data) ToProto() *pb.Data { ChainId: d.Metadata.ChainID, Height: d.Metadata.Height, Time: d.Metadata.Time, - LastDataHash: d.Metadata.LastDataHash[:], + LastDataHash: d.LastDataHash[:], } } // Reinterpret Txs ([]Tx) as [][]byte without allocation. From 2438dc036b2fba07765f47cd149857526a8b1e4a Mon Sep 17 00:00:00 2001 From: tac0turtle Date: Sat, 4 Apr 2026 12:09:12 +0200 Subject: [PATCH 10/10] remove files --- autoresearch.jsonl | 9 - autoresearch.md | 50 ---- autoresearch.sh | 34 --- docs/adr/memory-optimization-plan.md | 425 --------------------------- 4 files changed, 518 deletions(-) delete mode 100644 autoresearch.jsonl delete mode 100644 autoresearch.md delete mode 100755 autoresearch.sh delete mode 100644 docs/adr/memory-optimization-plan.md diff --git a/autoresearch.jsonl b/autoresearch.jsonl deleted file mode 100644 index c2bfdad8ec..0000000000 --- a/autoresearch.jsonl +++ /dev/null @@ -1,9 +0,0 @@ -{"type":"config","name":"Block package performance and allocation reduction","metricName":"allocs_per_op","metricUnit":"allocs","bestDirection":"lower"} -{"run":1,"commit":"f5d00f3","metric":81,"metrics":{"bytes_per_op":25934,"ns_per_op":34001},"status":"keep","description":"Baseline - current block production performance with 100 txs","timestamp":1775292593926,"segment":0,"confidence":null,"iterationTokens":205,"asi":{"hypothesis":"baseline measurement","hotspots":"sha256.New() in leafHashOpt (~213B), txsToByteSlices allocs [][]byte, Data.ToProto allocs, DACommitment creates pruned Data, ApplyBlock converts Txs to [][]byte","baseline_100txs":"81 allocs, 25934 B, 34001 ns","baseline_empty":"71 allocs, ~7KB, ~27µs"}} -{"run":2,"commit":"faf34e7","metric":79,"metrics":{"bytes_per_op":25697,"ns_per_op":34147},"status":"keep","description":"sync.Pool for sha256.Hash in leafHashOpt — eliminates 2 sha256.New() allocations per 
block","timestamp":1775292696995,"segment":0,"confidence":null,"iterationTokens":2904,"asi":{"hypothesis":"pool sha256.Hash to avoid 213B allocation per sha256.New()","result":"saved 2 allocs, 237 bytes, ns flat","next_target":"Data.ToProto allocates [][]byte via txsToByteSlices — should be biggest remaining win"}} -{"run":3,"commit":"a8b6352","metric":78,"metrics":{"bytes_per_op":22996,"ns_per_op":33091},"status":"keep","description":"Unsafe reinterpret cast of Txs to [][]byte in ApplyBlock — eliminates make([][]byte, n) allocation","timestamp":1775292779165,"segment":0,"confidence":3,"iterationTokens":3065,"asi":{"hypothesis":"use unsafe.Slice/unsafe.SliceData instead of make+loop for Txs→[][]byte conversion","result":"saved 1 alloc, ~2.7KB, ~1µs","note":"types.Tx = []byte so types.Txs = []Tx has identical memory layout to [][]byte"}} -{"run":4,"commit":"823aa62","metric":77,"metrics":{"bytes_per_op":20276,"ns_per_op":32480},"status":"keep","description":"Direct pb.Data serialization in DACommitment — avoids pruned Data wrapper and txsToByteSlices allocations","timestamp":1775293017595,"segment":0,"confidence":4,"iterationTokens":6395,"asi":{"bytes_saved":"~2.7KB vs baseline","empty_txs_safe":"unsafe.Slice on 0-len slice is safe in Go 1.21+","hypothesis":"Use pb.Data directly with unsafe reinterpreted Txs instead of creating pruned Data wrapper","note":"Must still handle empty_txs case - need to check if nil Metadata affects wire format","next_target":"txsToByteSlices in Data.ToProto allocates [][]byte; Data.Hash allocates from MarshalBinary","result":"-1 alloc, ~2.7KB saved, ~0.6µs faster"}} -{"run":5,"commit":"0720b44","metric":74,"metrics":{"bytes_per_op":12192,"ns_per_op":31624},"status":"keep","description":"unsafe.Slice in Data.ToProto() — eliminates txsToByteSlices [][]byte allocation","timestamp":1775293190538,"segment":0,"confidence":7,"iterationTokens":9987,"asi":{"hypothesis":"Use unsafe.Slice in Data.ToProto() to avoid txsToByteSlices 
allocation","next_target":"Data.Hash() marshal + sha256 allocation; Header.ToProto() allocates pb.Header; Data.Size() marshals for metrics","notes":"Biggest single win yet — per-TX allocation eliminated for every protobuf encoding","result":"-3 allocs, ~8KB, ~0.5µs faster"}} -{"run":6,"commit":"$(git r","metric":74,"metrics":{"bytes_per_op":12187,"ns_per_op":31217},"status":"discard","description":"Reverted hand-written HashSlim wire encoder — produced different hashes than MarshalBinary","timestamp":1775294347584,"segment":0,"confidence":2.8,"iterationTokens":25063,"asi":{"hypothesis":"Direct protobuf wire encoding in HashSlim to avoid pb.Header/pb.Version allocations","result":"Hash mismatch — wire encoding differences in version field (0a02 vs 0a04)","rollback_reason":"Hand-written encoder produces different byte output than protobuf MarshalBinary; would break hash verification","next_action_hint":"Focus on safe optimizations: avoid creating pb.SignedHeader/pb.Signer structs when possible, reduce allocations in store path, look at SignedHeader.ToProto hot path"}} -{"run":7,"commit":"ccbc2e4","metric":64,"metrics":{"bytes_per_op":11130,"ns_per_op":31570},"status":"keep","description":"sync.Pool for protobuf message structs in MarshalBinary — eliminates 10 allocs per block","timestamp":1775296105928,"segment":0,"confidence":5.666666666666667,"iterationTokens":11490,"asi":{"hypothesis":"Pool pb.Header, pb.Version, pb.Data, pb.Metadata, pb.SignedHeader, pb.Signer, pb.State to avoid struct allocs in marshal hot path","result":"saved 10 allocs, ~1KB — from 74 to 64 allocs/op","key_files":"types/serialization.go","notes":"Only MarshalBinary uses pools (consumes result immediately). ToProto() API unchanged for external callers.","next_target":"Store path: NewBasicBatch, Put, GenerateKey, getIndexKey allocate per-store-op. 
Also Datastore.Put allocates."}} -{"run":8,"commit":"805672e","metric":56,"metrics":{"bytes_per_op":10217,"ns_per_op":31037},"status":"keep","description":"Pooled SignedHeader.MarshalBinary — reuse pb.SignedHeader/pb.Header/pb.Signer/pb.Version structs","timestamp":1775296668832,"segment":0,"confidence":8.333333333333334,"iterationTokens":22478,"asi":{"hypothesis":"Pool protobuf structs in SignedHeader.MarshalBinary to eliminate 4 struct allocs per marshal","result":"saved 8 allocs, ~1KB — from 64 to 56 allocs/op","key_files":"types/serialization.go","next_target":"State.ToProto (11.5MB), Hash.String() for metrics (10.5MB), datastore batch/put allocs"}} diff --git a/autoresearch.md b/autoresearch.md deleted file mode 100644 index 3920b43e0b..0000000000 --- a/autoresearch.md +++ /dev/null @@ -1,50 +0,0 @@ -# Autoresearch: Block package performance and allocation reduction - -## Objective -Reduce memory allocations and improve performance of the block production hot path, specifically the `BlockProducer` interface methods (`ProduceBlock`, `CreateBlock`, `ApplyBlock`). The benchmark lives in `block/internal/executing/executor_benchmark_test.go`. - -## Metrics -- **Primary**: `allocs_per_op` (allocs/op, lower is better) — allocation count in `BenchmarkProduceBlock/100_txs` -- **Secondary**: `bytes_per_op` (B/op, lower is better) — memory usage per operation -- **Secondary**: `ns_per_op` (ns/op, lower is better) — execution time - -We target the 100_txs benchmark case because it has the most allocations (81 vs 71 for empty) and shows the clearest per-tx allocation pattern. - -## How to Run -`./autoresearch.sh` — runs `go test -bench=BenchmarkProduceBlock/100_txs -benchmem -count=3` and reports median of the 3 runs with METRIC lines for allocs_per_op, bytes_per_op, and ns_per_op. 
- -## Files in Scope -- `types/hashing.go` — hash computation; `sha256.New()` allocates ~213 bytes per call; called by `Data.Hash()` and `DACommitment()` -- `types/serialization.go` — `ToProto()`, `MarshalBinary()`, `txsToByteSlices()` all allocate new structs/slices every call -- `block/internal/executing/executor.go` — main block production; `ApplyBlock` converts `Txs` to `[][]byte` every time -- `types/data.go` — `Data` type; `DACommitment()` creates pruned Data allocation -- `types/header.go` — `Header` type -- `types/state.go` — `State` type, `NextState()` - -## Off Limits -- Protobuf definitions (`types/pb/`) — must not change wire format -- Test files except the benchmark -- Public API signatures (keep backward compatibility) - -## Constraints -- All tests must pass (`just test ./block/... ./types/...`) -- No new external dependencies -- Must not change protobuf wire format - -## What's Been Tried -Nothing yet — starting from baseline. - -## Baseline (first run) -Benchmark: `BenchmarkProduceBlock/100_txs` -- **81 allocs/op** (PRIMARY) -- ~25,900 B/op -- ~33,000 ns/op - -### Key allocation hotspots identified: -1. **`leafHashOpt()`** — `sha256.New()` allocates a new hash.Hash every call (~213B). Called by `Data.Hash()` and `DACommitment()` every block. -2. **`txsToByteSlices()`** — allocates new `[][]byte` slice every `Data.ToProto()` call. -3. **`Data.ToProto()` / `Header.ToProto()`** — allocate new protobuf structs every serialization. -4. **`DACommitment()`** — creates pruned `&Data{Txs: d.Txs}` allocation before hashing. -5. **`ApplyBlock`** — `make([][]byte, n)` for raw tx conversion every block. -6. **`Data.Hash()`** — allocates byte slice from `MarshalBinary()` + sha256.New(). -7. **`Header.HashSlim()`** — same pattern. diff --git a/autoresearch.sh b/autoresearch.sh deleted file mode 100755 index aad35a5215..0000000000 --- a/autoresearch.sh +++ /dev/null @@ -1,34 +0,0 @@ -#!/bin/bash -# Fast benchmark runner for block production performance. 
-# Pre-check: syntax compilation under 1s. -# Runs the benchmark 3 times and reports the median for 100_txs case. - -set -euo pipefail - -# Fast syntax check -go build ./block/internal/executing/ >/dev/null 2>&1 || exit 1 - -run_bench() { - go test -run=^$ -bench=BenchmarkProduceBlock/100_txs -benchmem -count=1 \ - ./block/internal/executing/ 2>&1 | grep '100_txs' -} - -# Run 3 times, collect results -declare -a lines=() -for i in 1 2 3; do - lines+=("$(run_bench)") -done - -# Sort by allocs (field before "allocs/op"), pick median -sorted=$(for line in "${lines[@]}"; do echo "$line"; done | sort -t' ' -k1) -median=$(echo "$sorted" | sed -n '2p') - -# Extract numbers using awk -# Format: BenchmarkProduceBlock/100_txs-10 35155 33517 ns/op 25932 B/op 81 allocs/op -ns=$(echo "$median" | awk '{for(i=1;i<=NF;i++){if($i=="ns/op")print $(i-1)}}') -bytes=$(echo "$median" | awk '{for(i=1;i<=NF;i++){if($i=="B/op")print $(i-1)}}') -allocs=$(echo "$median" | awk '{for(i=1;i<=NF;i++){if($i=="allocs/op")print $(i-1)}}') - -echo "METRIC allocs_per_op=$allocs" -echo "METRIC bytes_per_op=$bytes" -echo "METRIC ns_per_op=$ns" diff --git a/docs/adr/memory-optimization-plan.md b/docs/adr/memory-optimization-plan.md deleted file mode 100644 index 95cf25153e..0000000000 --- a/docs/adr/memory-optimization-plan.md +++ /dev/null @@ -1,425 +0,0 @@ -# Memory & Allocation Optimization Plan - -**Date:** 2026-03-30 -**Source:** pprof profiles collected from `prd-eden-testnet-node-1:6060` -**Node binary:** `evm` (Build ID: `c66958170e0ea317dee65bab308c02d0cc6b6098`) - ---- - -## Context - -A production node on a 62 GiB server is consuming 46 GiB in OS Cache+Buffer, using 2.6 GiB of swap, and the pattern repeats when the server is upsized — available RAM is consumed. The GC is spending ~55% of sampled CPU time in `scanobject`/`findObject`/`gcDrain`, indicating sustained heap pressure rather than a CPU-bound workload. 
- -Profiles collected: - -| Profile | Command | -|---------|---------| -| CPU (30s) | `curl http://prd-eden-testnet-node-1:6060/debug/pprof/profile?seconds=30` | -| Heap (live) | `curl http://prd-eden-testnet-node-1:6060/debug/pprof/heap` | -| Allocs (lifetime) | `curl http://prd-eden-testnet-node-1:6060/debug/pprof/allocs` | -| Goroutines | `curl http://prd-eden-testnet-node-1:6060/debug/pprof/goroutine` | - -**Follow-up profile:** 2026-04-03, `prd-eden-testnet-node-2:6060` (Build ID: `2b20bbdd78f3b2cecb508bb4ba5a7a12803df841`). In-use heap grew to ~2 GB (up from ~1.05 GB). GC dominance is gone (CPU now 2% utilization, no `scanobject`/`gcDrain` in top) but heap footprint increased. Dominant consumers shifted to `block/internal/cache` (see Issue 6). `go-header Hash.String` holds 302 MB (new Issue 7). Proto marshal is no longer in the top allocators. Gzip/flate share of cumulative allocs rose from 0.8% to 19%. Issues 1, 3, 4, 5 remain open. - -**Additional work merged (not in original plan):** -- [#3219](https://github.com/evstack/ev-node/pull/3219) — `Header.MemoizeHash()`: caches computed hash on the struct, avoiding repeated `sha256` + `proto.Marshal` on every `Hash()` call. Partially reduces heap pressure from hash-derived allocations. -- [#3204](https://github.com/evstack/ev-node/pull/3204) — Removed LRU from `block/internal/cache` generic cache; simplified eviction model. - ---- - -## Findings - -### 2026-03-30 — node-1 (Build ID: `c66958170e0ea317dee65bab308c02d0cc6b6098`) - -#### CPU profile (3.68s samples over 30s) - -| flat | cum | function | -|------|-----|----------| -| 16.6% | 17.4% | `runtime.findObject` | -| 11.1% | 41.6% | `runtime.scanobject` | -| 3.5% | — | `runtime.(*gcBits).bitp` | -| — | 41.9%| `runtime.gcDrain` | - -GC accounts for ~55% of sampled CPU. The node is not CPU-bound; the GC is responding to sustained allocation pressure. 
- -#### Heap — live objects (~1.05 GB in use) - -| MB (flat) | MB (cum) | call site | -|-----------|----------|-----------| -| 312 | 312 | `ristretto/z.Calloc` (off-heap, mmap-based block cache) | -| 101 | 490 | `store.DefaultStore.GetHeader` | -| 83 | 83 | `badger/skl.newArena` (memtables) | -| 82 | 82 | `protobuf consumeBytesSlice` | -| 63 | 63 | LRU `insertValue` | -| 56 | 125 | `types.SignedHeader.FromProto` | -| 9 | 539 | `store.DefaultStore.GetBlockData` | -| 4 | 579 | `store.CachedStore.GetBlockData` | -| 0 | 504 | `go-header/p2p.ExchangeServer.handleRangeRequest` | - -#### Allocs — lifetime (~58.5 TB total) - -| % of total | call site | -|------------|-----------| -| 63.5% | `proto.MarshalOptions.marshal` | -| 12.1% | `encoding/json.(*Decoder).refill` | -| 6.1% | `encoding/json.(*decodeState).literalStore` | -| 5.6% | `encoding/json.(*RawMessage).UnmarshalJSON` | -| 2.8% | `encoding/hex.DecodeString` | -| 2.0% | `encoding/hex.EncodeToString` | -| 1.5% | `go-buffer-pool.(*BufferPool).Get` | -| 0.8% | `compress/flate.dictDecoder.init` | - -The `proto.Marshal` allocations trace to `types.P2PData.MarshalBinary` → `go-header/p2p.ExchangeServer.handleRangeRequest`. For a range request of 128 headers, 128 fresh scratch buffers are allocated. - -The JSON/hex allocations trace to `evm.EngineClient.GetTxs` → `rpc.Client.sendHTTP` (go-ethereum). The gzip allocations are from `net/http`'s transparent decompression of EL responses. - ---- - -### 2026-04-03 — node-2 (Build ID: `2b20bbdd78f3b2cecb508bb4ba5a7a12803df841`) - -#### CPU profile (610ms samples over 30s — 2% utilization) - -GC dominance is gone. Node is largely idle. 
Top consumers: - -| flat | flat% | cum | function | -|------|-------|-----|----------| -| 100ms | 16.4% | 100ms | `syscall.Syscall6` | -| 50ms | 8.2% | 50ms | `edwards25519/field.feMul` (signature verification) | -| 10ms | 1.6% | 50ms | `rpc.(*httpConn).doRequest` (upstream EL calls) | - -#### Heap — live objects (~2.0 GB in use) - -| MB (flat) | MB (cum) | call site | -|-----------|----------|-----------| -| 514 | 514 | `block/internal/cache.(*Cache).setSeen` | -| 420 | 420 | `block/internal/cache.(*Cache).setDAIncluded` | -| 307 | 307 | `block/internal/cache.HeightPlaceholderKey` (inline) | -| 302 | 302 | `go-header.Hash.String` | -| 148 | 148 | `ristretto/z.Calloc` (off-heap) | -| 83 | 83 | `badger/skl.newArena` (memtables) | -| 75 | 285 | `store.DefaultStore.GetHeader` | -| 38 | 38 | `types.Header.FromProto` | -| 32 | 87 | `types.SignedHeader.FromProto` | -| 13 | 21 | LRU `insertValue` | -| 0 | 229 | `go-header/p2p.ExchangeServer.handleRangeRequest` | - -The top four entries — `setSeen`, `setDAIncluded`, `HeightPlaceholderKey`, `Hash.String` — account for ~1.54 GB (77% of heap). These are all within `block/internal/cache` and the header-exchange path. The store LRU is now negligible (13 MB). - -#### Allocs — lifetime (~805 GB total) - -| % of total | call site | -|------------|-----------| -| 30.2% | `encoding/json.(*Decoder).refill` (cumulative) | -| 24.1% | `compress/flate.NewReader` + `dictDecoder.init` (combined) | -| 21.6% | `ristretto/z.Calloc` | -| 4.6% | `compress/flate.(*dictDecoder).init` (direct) | -| 3.2% | `encoding/json.Marshal` | -| 1.5% | `net/http.Header.Clone` | -| 1.5% | `encoding/json.(*RawMessage).UnmarshalJSON` | -| 1.5% | `io.ReadAll` | - -Proto marshal no longer appears in the top allocators — the prior dominance (63.5%) has dissipated, likely due to reduced range request volume or binary encoding changes. 
Gzip/flate rose from 0.8% to ~24% of total allocs; the EL HTTP decompression path is now the single largest allocation source alongside JSON decoding. - ---- - -## Issues and Fixes - -### Issue 1 — LRU caches are count-based, not size-bound - -**Severity:** High — still unresolved; store LRU holds 13 MB in the 2026-04-03 profile (down from 63 MB) but defaults remain at 200K. Superseded in live-heap dominance by Issue 6. - -**Location:** `pkg/store/cached_store.go:11-17` - -```go -DefaultHeaderCacheSize = 200_000 -DefaultBlockDataCacheSize = 200_000 -``` - -These are item counts. A `*types.SignedHeader` is typically 300–800 bytes. A `blockDataEntry` includes the full `*types.Data` with all transactions — on a high-TPS chain this can be 50–500 KB per entry. At 200,000 entries, `blockDataCache` has a theoretical maximum in the tens of GB. - -**Fix:** Reduce defaults to `2_048` (headers) and `512` (block data). Expose both as fields in `config.Config` so operators can tune without recompiling. 
- -**Files:** -- `pkg/store/cached_store.go` — reduce constants, update `NewCachedStore` to accept config values -- `pkg/config/defaults.go` — add `DefaultStoreCacheHeaderSize = 2048`, `DefaultStoreCacheBlockDataSize = 512` -- `pkg/config/config.go` — add `StoreConfig` struct with `HeaderCacheSize int` and `BlockDataCacheSize int` -- `node/full.go` — pass config values to `NewCachedStore` -- `node/light.go` — same - -**Benchmarks:** -``` -// pkg/store/cached_store_test.go -BenchmarkCachedStore_GetHeader_HotPath // cache-hit path — must be 0 allocs/op -BenchmarkCachedStore_GetBlockData_HotPath // cache-hit path — must be 0 allocs/op -TestCachedStore_MemoryBound // runtime.ReadMemStats before/after; assert heap < threshold -``` - -**Acceptance criteria:** -- Cache-hit path: 0 `allocs/op` (pointer return, no copy) -- Two warm caches at new defaults hold < 50 MB total for realistic block sizes -- Configurable via `[store]` TOML section; documented in operator guide - ---- - -### Issue 2 — Badger `IndexCacheSize` unbounded ✅ COMPLETED - -**Severity:** High — index RAM grows proportionally to chain length. - -**Resolved:** [#3209](https://github.com/evstack/ev-node/pull/3209) (2026-03-30) - -**Location:** `pkg/store/badger_options.go` - -`badger.DefaultOptions` sets `BlockCacheSize = 256 MB` but `IndexCacheSize = 0` (unbounded in-memory). As the chain grows, SST block index entries accumulate in RAM without eviction. On a chain with millions of blocks this is several GB. - -**Fix applied:** - -```go -opts.Options = opts.WithIndexCacheSize(DefaultBadgerIndexCacheSize) // 256 MiB (was 0, unbounded) -``` - -Note: implemented at 256 MB rather than the originally planned 128 MB. - -Operators with more RAM should increase `IndexCacheSize` in config. The tradeoff is that cold reads (index entries not in cache) go to disk — acceptable on NVMe, noticeable on spinning disk. 
- -**Files:** -- `pkg/store/badger_options.go` - -**Benchmarks:** -``` -// pkg/store/store_test.go -BenchmarkStore_WriteThenRead // write N blocks, read random order — assert < 5% throughput regression -``` - -**Acceptance criteria:** -- RSS growth from Badger index is bounded at ~128 MB rather than growing with chain length -- No more than 5% regression in `BenchmarkStore_WriteThenRead` vs baseline - ---- - -### Issue 3 — Proto marshal scratch buffers not pooled (63.5% of all allocations) - -**Severity:** High — dominant allocation source, drives GC pressure. - -**Location:** `types/p2p_envelope.go` (`P2PData.MarshalBinary`, `P2PSignedHeader.MarshalBinary`), `types/serialization.go` (`SignedHeader.MarshalBinary`, `Data.MarshalBinary`, `Metadata.MarshalBinary`, `Header.MarshalBinary`) - -Every call to `proto.Marshal(msg)` allocates a fresh `[]byte` scratch buffer internally. For `handleRangeRequest` serving 128 headers, this is 128 allocations per request. - -**Fix:** Replace `proto.Marshal(msg)` with `proto.MarshalOptions{}.MarshalAppend` and a `sync.Pool` of scratch buffers: - -```go -var marshalPool = sync.Pool{New: func() any { b := make([]byte, 0, 1024); return &b }} - -func marshalProto(msg proto.Message) ([]byte, error) { - bp := marshalPool.Get().(*[]byte) - out, err := proto.MarshalOptions{}.MarshalAppend((*bp)[:0], msg) - marshalPool.Put(bp) - if err != nil { - return nil, err - } - // Copy — caller must not hold a reference into the pool buffer. - result := make([]byte, len(out)) - copy(result, out) - return result, nil -} -``` - -**Critical:** The pool buffer must not escape. The `copy` before return is mandatory. Run all marshal tests with `-race`. 
- -**Files:** -- `types/p2p_envelope.go` -- `types/serialization.go` -- Optionally extract `marshalProto` to `types/marshal.go` as a package-internal helper - -**Benchmarks:** -``` -// types/p2p_envelope_bench_test.go (new file) -BenchmarkP2PData_MarshalBinary // b.RunParallel to expose pool contention -BenchmarkP2PSignedHeader_MarshalBinary -BenchmarkSignedHeader_MarshalBinary // types/serialization_bench_test.go -``` - -**Acceptance criteria:** -- `allocs/op` drops from 6–10 to 1–2 per marshal call -- `B/op` drops by 30–50% (scratch buffer reused; only final copy allocated) -- No race detector findings: `go test -race ./types/...` -- All existing round-trip tests pass: `TestP2PEnvelope_MarshalUnmarshal`, `TestDataBinaryCompatibility` - ---- - -### Issue 4 — `filterTransactions` hex encoding allocates 2× per transaction - -**Severity:** Medium — ~20% of EVM RPC allocations. - -**Location:** `execution/evm/execution.go` — `filterTransactions` - -```go -"0x" + hex.EncodeToString(tx) // two allocations: hex string + concatenation -``` - -**Fix:** Use `hex.Encode` directly into a pre-allocated, pool-backed buffer: - -```go -var hexBufPool = sync.Pool{New: func() any { b := make([]byte, 0, 512); return &b }} - -func encodeHexTx(tx []byte) string { - bp := hexBufPool.Get().(*[]byte) - needed := 2 + hex.EncodedLen(len(tx)) - buf := append((*bp)[:0], "0x"...) - buf = buf[:needed] - hex.Encode(buf[2:], tx) - s := string(buf[:needed]) - *bp = buf - hexBufPool.Put(bp) - return s -} -``` - -This reduces two allocations to one (the unavoidable `string()` conversion). 
- -**Files:** -- `execution/evm/execution.go` - -**Benchmarks:** - -Extend the existing `execution/evm/filter_bench_test.go`: -``` -BenchmarkFilterTransactions_HexEncoding // 1000 txs of 150 bytes each, -benchmem -``` - -**Acceptance criteria:** -- `allocs/op` drops from `2N` to `N` for N transactions -- `B/op` drops by ~40% for a 1000-tx batch - ---- - -### Issue 5 — Gzip reader allocated per EL HTTP response - -**Severity:** Medium — ~1 TB cumulative lifetime allocations; removes the gzip decompressor from the hot path. - -**Root cause:** `net/http`'s transport automatically decompresses `Content-Encoding: gzip` responses by allocating a fresh `gzip.Reader` per response. This is in Go's standard library, triggered by go-ethereum's `rpc.Client.sendHTTP`. Not patchable in upstream libraries. - -**Fix A (preferred when EL is co-located):** Disable HTTP compression on the EL client transport in `NewEngineExecutionClient`: - -```go -rpc.WithHTTPClient(&http.Client{ - Transport: &http.Transport{DisableCompression: true}, -}) -``` - -No gzip reader is ever allocated. Valid only when EL and ev-node are on the same host (local compression has no bandwidth benefit). - -**Fix B (when EL is remote):** Implement a `sync.Pool`-backed `http.RoundTripper` that calls `(*gzip.Reader).Reset(body)` instead of `gzip.NewReader(body)`. Higher complexity; only needed for remote EL deployments. - -**Fix C (best for same-host production):** Switch `engine_url` to an IPC socket path (`/tmp/geth.ipc`). The IPC codec uses raw JSON over a Unix socket — no HTTP framing, no gzip, no JSON decoder allocation at the transport layer. 
- -**Files:** -- `execution/evm/execution.go` — `NewEngineExecutionClient`: add `DisableCompression: true` -- `execution/evm/flags.go` — document IPC URL as the recommended transport for same-host deployments - -**Benchmarks:** - -Verify via pprof before/after: `compress/flate.(*decompressor)` and `compress/gzip.(*Reader)` must disappear from the top allocation sites in the allocs profile. - -**Acceptance criteria:** -- Post-fix allocs profile shows zero `compress/flate` or `compress/gzip` entries -- No correctness regression under round-trip test with a real EL - ---- - -### Issue 6 — `block/internal/cache` unbounded growth (~1.24 GB live) - -**Severity:** Critical — new #1 heap consumer as of 2026-04-03 profile. - -**Location:** `block/internal/cache/generic_cache.go`, `block/internal/cache/manager.go` - -`setSeen`, `setDAIncluded`, and `HeightPlaceholderKey` together hold ~1.24 GB of live heap. The LRU was removed from this cache in [#3204](https://github.com/evstack/ev-node/pull/3204), which simplified eviction but appears to have left these maps growing without a bound. `RestoreFromStore` drives the initial population; ongoing `setSeen`/`setDAIncluded` calls accumulate entries without eviction. - -**Fix:** Audit `generic_cache.go` for unbounded map growth. Reintroduce a maximum-size eviction policy (LRU or a fixed-size ring buffer keyed by height) or a TTL-based cleanup for entries older than the finality horizon. The `HeightPlaceholderKey` allocations suggest the key string is being heap-allocated on every lookup — consider interning or using a numeric key directly. 
- -**Files:** -- `block/internal/cache/generic_cache.go` -- `block/internal/cache/manager.go` - -**Acceptance criteria:** -- Live heap from `block/internal/cache.*` stays below 100 MB at steady state on a running chain -- `setSeen` / `setDAIncluded` entry count is bounded (log or metric exposed) - ---- - -### Issue 7 — `go-header Hash.String` retaining 302 MB - -**Severity:** High — #4 live-heap consumer in 2026-04-03 profile. - -**Location:** `go-header/p2p.ExchangeServer.handleRangeRequest` → somewhere caching `Hash.String()` results. - -`celestiaorg/go-header.Hash.String()` hex-encodes a `[]byte` hash into a new `string` on every call. 302 MB of live strings means these are being stored (likely as map keys or log fields) and retained. This is separate from `Header.Hash()` computation — memoization in #3219 avoids recomputing the hash but doesn't prevent `String()` from allocating a new hex string each time it is called. - -**Fix options:** -1. If strings are used as map keys: store the raw `Hash` (`[]byte`) as a fixed-size `[32]byte` key instead of a hex string. Eliminates the allocation entirely. -2. If strings are used for logging: use `%x` in a format call rather than pre-allocating the string; zerolog's hex field support avoids allocation. -3. If strings must be cached: add `cachedString string` alongside `cachedHash` on `Header` and memoize via a `Hash.MemoizeString()` approach. - -**Files:** -- Investigate callers of `Hash.String()` in `block/internal/cache/`, `go-header/p2p/`, and any map keyed by hash string. 
- -**Acceptance criteria:** -- `go-header.Hash.String` disappears from the top-10 live-heap consumers -- No `string(hex.Encode(...))` patterns in hot paths that retain the result - ---- - -## Implementation Sequence - -| Phase | Issue | Effort | Expected impact | Status | -|-------|-------|--------|-----------------|--------| -| 1 | `block/internal/cache` eviction bound | 3h | ~1.24 GB → < 100 MB live heap | ⬜ Open (Issue 6) | -| 2 | `Hash.String` retention | 2h | ~302 MB freed | ⬜ Open (Issue 7) | -| 3 | LRU cache defaults + config exposure | 2h | Prevents future regression | ⬜ Open (Issue 1) | -| 4 | Badger `IndexCacheSize` cap | — | Bounds index RAM to ~256 MB | ✅ Done (#3209, Issue 2) | -| 5 | Disable gzip on EL HTTP transport | 1h | ~24% drop in allocation rate | ⬜ Open (Issue 5) | -| 6 | `filterTransactions` hex pool | 2h | ~40% drop in EVM RPC allocs | ⬜ Open (Issue 4) | -| 7 | Proto marshal `sync.Pool` | 4h | Revisit — may no longer be critical | ⬜ Open (Issue 3) | - -The ordering above reflects the 2026-04-03 profile. Issue 6 (`block/internal/cache`) is now the highest-leverage fix — it is the #1 live-heap consumer and the root cause of continued RAM growth. Issue 3 (proto pool) should be re-profiled before investing effort; proto marshal no longer appears in the top allocators. 
- ---- - -## Risks - -| Risk | Mitigation | -|------|------------| -| `block/internal/cache` eviction causes re-fetch on cache miss | Profile syncer throughput before/after; ensure DA and P2P retrieval paths tolerate misses without stalling | -| Proto pool buffer escapes | Mandatory `copy` before return; `-race` on all marshal tests | -| LRU reduction slows syncing | Benchmark `BenchmarkSyncerIO` before/after; bump default to 8,192 if regression observed | -| Badger index cap causes cold-read latency | 256 MB is already set — expose via config; document NVMe vs HDD trade-off | -| `DisableCompression` breaks remote EL | Guard behind a config flag; default to `false` for remote URLs, `true` for local | - ---- - -## Running the Benchmarks - -```bash -# Phase 1 — store cache -go test -bench=. -benchmem -count=6 ./pkg/store/... - -# Phase 2 — badger write/read throughput -go test -bench=BenchmarkStore_WriteThenRead -benchmem -count=6 ./pkg/store/... - -# Phase 3 — proto marshal -go test -bench=BenchmarkP2PData_MarshalBinary -benchmem -count=6 -race ./types/... - -# Phase 4 — hex encoding -go test -bench=BenchmarkFilterTransactions -benchmem -count=6 ./execution/evm/... - -# Full regression -go test ./... -count=1 -``` - -Use `benchstat` to compare before/after: - -```bash -go test -bench=. -benchmem -count=10 ./pkg/store/... > before.txt -# apply changes -go test -bench=. -benchmem -count=10 ./pkg/store/... > after.txt -benchstat before.txt after.txt -```