diff --git a/.gitignore b/.gitignore index 71e0d97..cf8ff87 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ build/ +zz_fw_update_e2e_identity.go .vscode/settings.json diff --git a/main.go b/main.go index 74b705d..978f3c8 100644 --- a/main.go +++ b/main.go @@ -7,14 +7,25 @@ import ( "devicecode-go/bus" "devicecode-go/services/hal" "devicecode-go/services/reactor" + "devicecode-go/services/updater" "devicecode-go/types" "devicecode-go/utilities" ) // HAL const halTimeout = 5 * time.Second + var halReadiness = bus.T("hal", "state") +// Firmware identity is set by host build tooling before main runs. The e2e +// harness generates a same-package init file because TinyGo's -X support is +// narrower than the standard Go linker's support. +var ( + FirmwareVersion = "0.0.0-dev" + FirmwareBuild = "local" + FirmwareImageID = "img-dev" +) + // ----------------------------------------------------------------------------- // Main // ----------------------------------------------------------------------------- @@ -47,6 +58,15 @@ func main() { } } + // boot_id (master R3 / fabric-update W6): generate AFTER HAL ready + // and BEFORE the reactor opens fabric. RAM-only — never persisted. 
+ bootID := updater.GenerateBootID() + log.Println("[main] boot_id =", bootID) + + reactor.FirmwareVersion = FirmwareVersion + reactor.FirmwareBuild = FirmwareBuild + reactor.FirmwareImageID = FirmwareImageID + // Reactor r := reactor.NewReactor(b, uiConn) r.Run(ctx) @@ -76,4 +96,4 @@ func waitHALReady(ctx context.Context, c *bus.Connection, d time.Duration) bool } // Global logger instance -var log = utilities.Logger{LineStart: true} \ No newline at end of file +var log = utilities.Logger{LineStart: true} diff --git a/services/fabric/config.go b/services/fabric/config.go index 28b7cb1..ac8dff6 100644 --- a/services/fabric/config.go +++ b/services/fabric/config.go @@ -1,88 +1,15 @@ package fabric -import ( - "encoding/json" - - "devicecode-go/types" -) - -// decodeHALConfig extracts a HALConfig from an arbitrary payload, -// normalizing Lua empty-table encoding ({} → []) for known slice fields. -func decodeHALConfig(payload any) (types.HALConfig, string) { - switch v := payload.(type) { - case types.HALConfig: - return v, "" - case *types.HALConfig: - if v == nil { - return types.HALConfig{}, "nil_hal_config" - } - return *v, "" - case json.RawMessage: - return decodeHALConfigBytes(v) - case []byte: - return decodeHALConfigBytes(v) - default: - b, err := json.Marshal(v) - if err != nil { - return types.HALConfig{}, "payload_marshal_failed: " + err.Error() - } - return decodeHALConfigBytes(b) - } -} - -func decodeHALConfigBytes(b []byte) (types.HALConfig, string) { - var probe map[string]json.RawMessage - if err := json.Unmarshal(b, &probe); err != nil { - return types.HALConfig{}, "json_unmarshal_failed: " + err.Error() + "; raw=" + truncateRawJSON(b) - } - if _, ok := probe["devices"]; !ok { - return types.HALConfig{}, "missing_devices_field; raw=" + truncateRawJSON(b) - } - - // Lua encodes empty tables as {} (object) not [] (array). - // Normalize known slice fields so Go unmarshal accepts them. 
- for _, key := range []string{"devices", "pollers"} { - if raw, ok := probe[key]; ok && len(raw) == 2 && raw[0] == '{' && raw[1] == '}' { - probe[key] = json.RawMessage("[]") - } - } - fixed, err := json.Marshal(probe) - if err != nil { - return types.HALConfig{}, "normalize_failed: " + err.Error() - } - - var out types.HALConfig - if err := json.Unmarshal(fixed, &out); err != nil { - return types.HALConfig{}, "hal_config_unmarshal_failed: " + err.Error() + "; raw=" + truncateRawJSON(fixed) - } - return out, "" -} - -func decodeHALState(payload any) (types.HALState, bool) { - switch v := payload.(type) { - case types.HALState: - return v, true - case *types.HALState: - if v == nil { - return types.HALState{}, false - } - return *v, true - case json.RawMessage: - var out types.HALState - return out, json.Unmarshal(v, &out) == nil - case []byte: - var out types.HALState - return out, json.Unmarshal(v, &out) == nil - default: - b, err := json.Marshal(v) - if err != nil { - return types.HALState{}, false - } - var out types.HALState - return out, json.Unmarshal(b, &out) == nil - } -} - +import "encoding/json" + +// decodePayload normalises whatever shape the bus delivered into a +// reasonable Go value for the reply path. The wire delivers +// json.RawMessage; in-process callers may pass already-typed values. +// Used by session.onReply when forwarding RPC replies onto the +// originating Request's reply path. +// +// This file intentionally contains only reply-payload decoding; legacy +// config/device and rpc/hal/dump glue is no longer part of the MCU contract. func decodePayload(payload any) any { switch v := payload.(type) { case nil: @@ -111,14 +38,3 @@ func decodePayload(payload any) any { return v } } - -func truncateRawJSON(b []byte) string { - if len(b) == 0 { - return "" - } - const max = 160 - if len(b) <= max { - return string(b) - } - return string(b[:max]) + "..." 
-} diff --git a/services/fabric/fabric.go b/services/fabric/fabric.go index 0a2ab70..91cf908 100644 --- a/services/fabric/fabric.go +++ b/services/fabric/fabric.go @@ -6,6 +6,7 @@ import ( "time" "devicecode-go/bus" + "devicecode-go/services/updater" "devicecode-go/x/strconvx" ) @@ -16,7 +17,6 @@ type Transport interface { Close() error } -const protoVersion = 1 const defaultLinkID = "mcu0" // LinkConfig carries the fabric link parameters that the CM5 publishes @@ -95,7 +95,11 @@ func (c *LinkConfig) applyDefaults() { var nextSessionID atomic.Uint64 func newLocalSID() string { - return "mcu-sid-" + strconvx.Utoa64(nextSessionID.Add(1)) + bootID := updater.BootID() + if bootID == "" { + bootID = updater.GenerateBootID() + } + return "mcu-sid-" + bootID + "-" + strconvx.Utoa64(nextSessionID.Add(1)) } // Run starts the fabric session. Blocks until ctx is cancelled or the diff --git a/services/fabric/fabric_test.go b/services/fabric/fabric_test.go index 361495c..705cb15 100644 --- a/services/fabric/fabric_test.go +++ b/services/fabric/fabric_test.go @@ -11,7 +11,6 @@ import ( "time" "devicecode-go/bus" - "devicecode-go/types" "devicecode-go/x/shmring" ) @@ -67,10 +66,10 @@ const testCM5SID = "s1" func bringUp(t *testing.T, cm5 Transport) protoHelloAck { t.Helper() sendMsg(t, cm5, protoHello{ - Type: "hello", Node: "cm5-local", Peer: "mcu-1", SID: testCM5SID, Proto: protoVersion, + Type: "hello", Proto: protocolName, Node: "cm5-local", SID: testCM5SID, }) ack := readMsg[protoHelloAck](t, cm5) - if !ack.OK || ack.Node != "mcu-1" || ack.SID == "" || ack.Proto != protoVersion { + if ack.Node != "mcu-1" || ack.SID == "" || ack.Proto != protocolName { t.Fatalf("bad hello_ack: %+v", ack) } time.Sleep(50 * time.Millisecond) @@ -89,7 +88,7 @@ func unlockExports(t *testing.T, cm5 Transport) { // ---- codec ---- func TestCodecRoundTrip(t *testing.T) { - orig := protoHello{Type: "hello", Node: "mcu-1", Peer: "cm5-local", SID: "abc", Proto: protoVersion} + orig := protoHello{Type: 
"hello", Proto: protocolName, Node: "cm5-local", SID: "abc"} data := marshal(orig) if !bytes.HasSuffix(data, []byte("\n")) { t.Error("marshal should end with newline") @@ -103,7 +102,7 @@ func TestCodecRoundTrip(t *testing.T) { } var dec protoHello json.Unmarshal(jsonPart, &dec) - if dec != orig { + if dec.Type != orig.Type || dec.Proto != orig.Proto || dec.Node != orig.Node || dec.SID != orig.SID { t.Errorf("round-trip: %+v vs %+v", dec, orig) } } @@ -349,10 +348,10 @@ func TestHandshake(t *testing.T) { go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) sendMsg(t, cm5, protoHello{ - Type: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "s1", Proto: protoVersion, + Type: "hello", Proto: protocolName, Node: "cm5-local", SID: "s1", }) ack := readMsg[protoHelloAck](t, cm5) - if !ack.OK || ack.Node != "mcu-1" || ack.SID == "" || ack.Proto != protoVersion { + if ack.Node != "mcu-1" || ack.SID == "" || ack.Proto != protocolName { t.Errorf("bad ack: %+v", ack) } time.Sleep(50 * time.Millisecond) @@ -371,10 +370,10 @@ func TestSessionReset(t *testing.T) { go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) bringUp(t, cm5) - sendMsg(t, cm5, protoHello{Type: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "s2", Proto: protoVersion}) + sendMsg(t, cm5, protoHello{Type: "hello", Proto: protocolName, Node: "cm5-local", SID: "s2"}) ack := readMsg[protoHelloAck](t, cm5) - if !ack.OK || ack.SID == "" || ack.Proto != protoVersion { - t.Error("hello_ack.OK = false") + if ack.SID == "" || ack.Proto != protocolName { + t.Errorf("bad hello_ack: %+v", ack) } sendMsg(t, cm5, protoPing{Type: "ping", TS: 55, SID: "s2"}) pong := readMsg[protoPong](t, cm5) @@ -383,14 +382,14 @@ func TestSessionReset(t *testing.T) { } } -func TestRejectsWrongPeer(t *testing.T) { +func TestRejectsWrongNode(t *testing.T) { mcu, cm5 := pipePair() b := newBus() ctx, cancel := context.WithCancel(context.Background()) defer cancel() go Run(ctx, 
mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) - sendMsg(t, cm5, protoHello{Type: "hello", Node: "cm5-local", Peer: "mcu-999", SID: "s1", Proto: protoVersion}) + sendMsg(t, cm5, protoHello{Type: "hello", Proto: protocolName, Node: "cm5-wrong", SID: "s1"}) gotLine := make(chan readResult, 1) go func() { line, err := cm5.ReadLine() @@ -398,10 +397,10 @@ func TestRejectsWrongPeer(t *testing.T) { }() select { case <-gotLine: - t.Fatal("got response to wrong-peer hello") + t.Fatal("got response to wrong-node hello") case <-time.After(200 * time.Millisecond): } - sendMsg(t, cm5, protoHello{Type: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "s2", Proto: protoVersion}) + sendMsg(t, cm5, protoHello{Type: "hello", Proto: protocolName, Node: "cm5-local", SID: "s2"}) select { case res := <-gotLine: if res.err != nil { @@ -411,8 +410,8 @@ func TestRejectsWrongPeer(t *testing.T) { if err := json.Unmarshal(res.line, &ack); err != nil { t.Fatalf("expected hello_ack: %v", err) } - if !ack.OK { - t.Fatal("hello_ack.OK = false") + if ack.Proto != protocolName { + t.Fatalf("bad hello_ack: %+v", ack) } case <-time.After(2 * time.Second): t.Fatal("no hello_ack for correct peer") @@ -432,14 +431,14 @@ func TestRejectsMissingNodeWhenPeerPinned(t *testing.T) { gotLine <- readResult{line: line, err: err} }() - sendMsg(t, cm5, protoHello{Type: "hello", Peer: "mcu-1", SID: "s1", Proto: protoVersion}) + sendMsg(t, cm5, protoHello{Type: "hello", Proto: protocolName, SID: "s1"}) select { case <-gotLine: t.Fatal("got response to hello without node") case <-time.After(200 * time.Millisecond): } - sendMsg(t, cm5, protoHello{Type: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "s2", Proto: protoVersion}) + sendMsg(t, cm5, protoHello{Type: "hello", Proto: protocolName, Node: "cm5-local", SID: "s2"}) select { case res := <-gotLine: if res.err != nil { @@ -449,8 +448,8 @@ func TestRejectsMissingNodeWhenPeerPinned(t *testing.T) { if err := json.Unmarshal(res.line, &ack); 
err != nil { t.Fatalf("expected hello_ack: %v", err) } - if !ack.OK { - t.Fatal("hello_ack.OK = false") + if ack.Proto != protocolName { + t.Fatalf("bad hello_ack: %+v", ack) } case <-time.After(2 * time.Second): t.Fatal("no hello_ack for correct peer") @@ -471,6 +470,40 @@ func TestPingPong(t *testing.T) { } } +func TestEchoedPingIgnored(t *testing.T) { + mcu, cm5 := pipePair() + b := newBus() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) + ack := bringUp(t, cm5) + + sendMsg(t, cm5, protoPing{Type: "ping", TS: 41, SID: ack.SID}) + sendMsg(t, cm5, protoPing{Type: "ping", TS: 42, SID: testCM5SID}) + + pong := readMsg[protoPong](t, cm5) + if pong.TS != 42 || pong.SID != ack.SID { + t.Errorf("bad pong after echoed ping: %+v ack=%+v", pong, ack) + } +} + +func TestEchoedTransferControlIgnored(t *testing.T) { + mcu, cm5 := pipePair() + b := newBus() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) + bringUp(t, cm5) + + sendMsg(t, cm5, protoXferNeed{Type: msgXferNeed, XferID: "echoed", Next: 0}) + sendMsg(t, cm5, protoPing{Type: "ping", TS: 42, SID: testCM5SID}) + + pong := readMsg[protoPong](t, cm5) + if pong.TS != 42 { + t.Errorf("bad pong after echoed transfer control: %+v", pong) + } +} + func TestMCUNeverInitiates(t *testing.T) { // Pre-handshake the MCU is silent; tickPing only fires once the link // is up. Active outbound pings post-handshake are covered by @@ -554,10 +587,22 @@ func TestSessionResetUnretainsImports(t *testing.T) { // in promoteLink/teardownImportedRetained: each tracked local topic // gets a nil-payload retained publish that clears the bus's retain // store, so consumers don't see stale CM5-session data. + // importPublishRules is empty in the production contract, so this test + // installs a scoped temp rule. 
The mechanism under test is the generic + // retain-tracking + session-reset teardown chain, not the specific topic. + prev := importPublishRules + importPublishRules = append([]importRule{}, prev...) + importPublishRules = append(importPublishRules, importRule{ + wire: []string{"test", "wire", "config"}, + local: []string{"test", "local", "config"}, + }) + t.Cleanup(func() { importPublishRules = prev }) + cfgTopic := bus.T("test", "local", "config") + mcu, cm5 := pipePair() b := newBus() observer := b.NewConnection("observer") - cfgSub := observer.Subscribe(tConfigHAL) + cfgSub := observer.Subscribe(cfgTopic) defer observer.Unsubscribe(cfgSub) ctx, cancel := context.WithCancel(context.Background()) @@ -565,12 +610,12 @@ func TestSessionResetUnretainsImports(t *testing.T) { go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) bringUp(t, cm5) - // Push a config via the import pub path so config/hal becomes a - // tracked imported retain. + // Push a payload via the temp import path so the local topic + // becomes a tracked imported retain. sendMsg(t, cm5, protoPub{ Type: msgPub, - Topic: []string{"config", "device"}, - Payload: json.RawMessage(`{"devices":[]}`), + Topic: []string{"test", "wire", "config"}, + Payload: json.RawMessage(`{"hello":"world"}`), Retain: true, }) @@ -584,7 +629,7 @@ func TestSessionResetUnretainsImports(t *testing.T) { gotInitial = true } case <-deadline: - t.Fatal("timeout waiting for initial config/hal retain") + t.Fatal("timeout waiting for initial imported retain") } } @@ -595,13 +640,13 @@ func TestSessionResetUnretainsImports(t *testing.T) { // not run. go func() { _ = readMsg[protoHelloAck](t, cm5) }() sendMsg(t, cm5, protoHello{ - Type: msgHello, - Node: "cm5-local", - Peer: "mcu-1", - SID: "cm5-sid-new", + Type: msgHello, + Proto: protocolName, + Node: "cm5-local", + SID: "cm5-sid-new", }) - // Expect a nil-payload retained publish on config/hal. 
+ // Expect a nil-payload retained publish on the imported topic. deadline = time.After(2 * time.Second) for { select { @@ -683,10 +728,10 @@ func TestSessionResetUnretainsImportsAfterTransientPub(t *testing.T) { // 3) Session reset → expect the original retain to be cleared. go func() { _ = readMsg[protoHelloAck](t, cm5) }() sendMsg(t, cm5, protoHello{ - Type: msgHello, - Node: "cm5-local", - Peer: "mcu-1", - SID: "cm5-sid-new", + Type: msgHello, + Proto: protocolName, + Node: "cm5-local", + SID: "cm5-sid-new", }) deadline = time.After(2 * time.Second) @@ -907,28 +952,29 @@ func topicString(t bus.Topic) string { } func TestImportPublishTopic(t *testing.T) { - for _, tc := range []struct { - wire []string - want string - }{ - {[]string{"config", "device"}, "config/hal"}, - {[]string{"config", "other"}, ""}, - {[]string{"unknown", "x"}, ""}, - {nil, ""}, + // importPublishRules is empty. Anything queried returns nil. + for _, tc := range [][]string{ + {"config", "device"}, // legacy gone + {"config", "other"}, + {"unknown", "x"}, + nil, } { - got := importPublishTopic(tc.wire) - if gotStr := topicString(got); gotStr != tc.want { - t.Errorf("importPublishTopic(%v) = %q, want %q", tc.wire, gotStr, tc.want) + if got := importPublishTopic(tc); got != nil { + t.Errorf("importPublishTopic(%v) = %v, want nil", tc, got) } } } func TestImportCallTopic(t *testing.T) { + // The current wire surface keeps cmd/self/updater/{prepare,commit}; the + // legacy rpc/hal/dump inline handler has no route. for _, tc := range []struct { wire []string want string }{ - // rpc/hal/dump is handled directly by onCall, not via import rules. 
+ {[]string{"cmd", "self", "updater", "prepare"}, "rpc/updater/prepare"}, + {[]string{"cmd", "self", "updater", "commit"}, "rpc/updater/commit"}, + {[]string{"rpc", "hal", "dump"}, ""}, {[]string{"rpc", "hal", "other"}, ""}, {[]string{"config", "device"}, ""}, {nil, ""}, @@ -941,14 +987,16 @@ func TestImportCallTopic(t *testing.T) { } func TestExportTopic(t *testing.T) { + // The current wire surface exports state/self/* and event/self/* only. for _, tc := range []struct { bus bus.Topic want []string }{ - {bus.T("hal", "cap", "env", "temperature", "core", "value"), []string{"state", "env", "temperature", "core", "value"}}, - {bus.T("hal", "cap", "power", "battery", "internal", "value"), []string{"state", "power", "battery", "internal", "value"}}, - {bus.T("hal", "state"), []string{"state", "hal"}}, - {bus.T("hal", "cap", "gpio", "fan", "value"), nil}, + {bus.T("state", "self", "software"), []string{"state", "self", "software"}}, + {bus.T("state", "self", "power", "battery"), []string{"state", "self", "power", "battery"}}, + {bus.T("event", "self", "power", "charger", "alert"), []string{"event", "self", "power", "charger", "alert"}}, + {bus.T("hal", "cap", "env", "temperature", "core", "value"), nil}, // legacy gone + {bus.T("hal", "state"), nil}, // legacy gone {bus.T("other", "topic"), nil}, } { got := exportTopic(tc.bus) @@ -956,41 +1004,23 @@ func TestExportTopic(t *testing.T) { if got != nil { t.Errorf("exportTopic(%v) = %v, want nil", tc.bus, got) } - } else { - if !slicesEqual(got, tc.want) { - t.Errorf("exportTopic(%v) = %v, want %v", tc.bus, got, tc.want) - } + } else if !slicesEqual(got, tc.want) { + t.Errorf("exportTopic(%v) = %v, want %v", tc.bus, got, tc.want) } } } func TestExportCallTopic(t *testing.T) { - for _, tc := range []struct { - bus bus.Topic - want []string - }{ - {bus.T("fabric", "out", "rpc", "hal", "dump"), []string{"rpc", "hal", "dump"}}, - {bus.T("fabric", "out", "rpc", "hal"), nil}, - {bus.T("other", "topic"), nil}, - } { - got := 
exportCallTopic(tc.bus) - if tc.want == nil { - if got != nil { - t.Errorf("exportCallTopic(%v) = %v, want nil", tc.bus, got) - } - } else if !slicesEqual(got, tc.want) { - t.Errorf("exportCallTopic(%v) = %v, want %v", tc.bus, got, tc.want) - } + // exportCallRules is empty; the MCU does not originate outbound RPC calls. + if got := exportCallTopic(bus.T("fabric", "out", "rpc", "hal", "dump")); got != nil { + t.Errorf("exportCallTopic(legacy dump path) = %v, want nil", got) } } func TestExportCallPatterns(t *testing.T) { patterns := exportCallPatterns() - if len(patterns) != 1 { - t.Fatalf("len(exportCallPatterns()) = %d, want 1", len(patterns)) - } - if got := topicString(patterns[0]); got != "fabric/out/rpc/hal/dump" { - t.Fatalf("exportCallPatterns()[0] = %q, want fabric/out/rpc/hal/dump", got) + if len(patterns) != 0 { + t.Fatalf("len(exportCallPatterns()) = %d, want 0", len(patterns)) } } @@ -1006,107 +1036,8 @@ func slicesEqual(a, b []string) bool { return true } -// ---- pub import ---- - -func TestPubImport(t *testing.T) { - mcu, cm5 := pipePair() - b := newBus() - conn := b.NewConnection("fabric") - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - go Run(ctx, mcu, conn, "mcu-1", "cm5-local", DefaultLinkConfig()) - bringUp(t, cm5) - - reader := b.NewConnection("test") - sub := reader.Subscribe(bus.T("config", "hal")) - - sendMsg(t, cm5, protoPub{ - Type: "pub", - Topic: []string{"config", "device"}, - Payload: json.RawMessage(`{"devices":[],"pollers":[]}`), - Retain: true, - }) - - select { - case m := <-sub.Channel(): - if m == nil { - t.Fatal("nil message") - } - case <-time.After(2 * time.Second): - t.Fatal("timeout waiting for imported config on config/hal") - } -} - // ---- pub export ---- -func TestPubExport(t *testing.T) { - mcu, cm5 := pipePair() - b := newBus() - fabricConn := b.NewConnection("fabric") - publishConn := b.NewConnection("hal") - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - go 
Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local", DefaultLinkConfig()) - bringUp(t, cm5) - unlockExports(t, cm5) - - publishConn.Publish(publishConn.NewMessage( - bus.T("hal", "cap", "env", "temperature", "core", "value"), - map[string]int{"deci_c": 412}, - true, - )) - - msg := readMsg[protoPub](t, cm5) - if msg.Type != "pub" { - t.Fatalf("expected pub, got %q", msg.Type) - } - want := []string{"state", "env", "temperature", "core", "value"} - if !slicesEqual(msg.Topic, want) { - t.Errorf("topic = %v, want %v", msg.Topic, want) - } - if !msg.Retain { - t.Error("expected retain=true") - } -} - -func TestUnretainExport(t *testing.T) { - mcu, cm5 := pipePair() - b := newBus() - fabricConn := b.NewConnection("fabric") - publishConn := b.NewConnection("hal") - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local", DefaultLinkConfig()) - bringUp(t, cm5) - unlockExports(t, cm5) - - // Publish retained value first. - publishConn.Publish(publishConn.NewMessage( - bus.T("hal", "cap", "env", "temperature", "core", "value"), - map[string]int{"deci_c": 412}, - true, - )) - pub := readMsg[protoPub](t, cm5) - if pub.Type != "pub" || !pub.Retain { - t.Fatalf("expected retained pub, got t=%q retain=%v", pub.Type, pub.Retain) - } - - // Clear retained state (retain=true, payload=nil). 
- publishConn.Publish(publishConn.NewMessage( - bus.T("hal", "cap", "env", "temperature", "core", "value"), - nil, - true, - )) - unr := readMsg[protoUnretain](t, cm5) - if unr.Type != "unretain" { - t.Fatalf("expected unretain, got %q", unr.Type) - } - want := []string{"state", "env", "temperature", "core", "value"} - if !slicesEqual(unr.Topic, want) { - t.Errorf("topic = %v, want %v", unr.Topic, want) - } -} - func TestDrainExportsReturnsWhenSubscriptionClosed(t *testing.T) { b := newBus() conn := b.NewConnection("fabric") @@ -1166,6 +1097,125 @@ func TestDrainExportsWaitsForStartupHoldoff(t *testing.T) { } } +func TestDrainExportsPausesDuringIncomingTransfer(t *testing.T) { + b := newBus() + fabricConn := b.NewConnection("fabric") + pubConn := b.NewConnection("publisher") + tr := &captureTransport{} + s := session{ + conn: fabricConn, + tr: tr, + link: linkUp, + exportsEnabled: true, + incomingTransfer: &incomingTransfer{}, + } + + s.setupExports() + defer s.teardownExports() + + pubConn.Publish(pubConn.NewMessage( + bus.T("state", "self", "runtime", "memory"), + map[string]int{"alloc_bytes": 241376}, + true, + )) + s.drainExports() + + if len(tr.writes) != 0 { + t.Fatalf("writes during transfer = %d, want 0", len(tr.writes)) + } + + s.incomingTransfer = nil + s.drainExports() + + if len(tr.writes) != 1 { + t.Fatalf("writes after transfer = %d, want 1", len(tr.writes)) + } +} + +func TestDrainExportsPausesAfterPrepareCall(t *testing.T) { + b := newBus() + fabricConn := b.NewConnection("fabric") + pubConn := b.NewConnection("publisher") + tr := &captureTransport{} + cfg := DefaultLinkConfig() + s := session{ + conn: fabricConn, + tr: tr, + cfg: cfg, + link: linkUp, + exportsEnabled: true, + exportReadyAt: time.Now().Add(-time.Second), + } + + s.setupExports() + defer s.teardownExports() + defer s.teardownInbound() + + s.onCall(&protoCall{ + Type: msgCall, + ID: "prepare-1", + Topic: []string{"cmd", "self", "updater", "prepare"}, + }) + + 
pubConn.Publish(pubConn.NewMessage( + bus.T("state", "self", "updater"), + map[string]any{ + "state": "ready", + "pending_image_id": "mcu-dev-13.0", + "job_id": "job-1", + }, + true, + )) + s.drainExports() + + if len(tr.writes) != 0 { + t.Fatalf("writes during prepare quiet = %d, want 0", len(tr.writes)) + } + + s.transferQuietUntil = time.Time{} + s.transferQuietReason = "" + s.drainExports() + + if len(tr.writes) != 1 { + t.Fatalf("writes after prepare quiet = %d, want 1", len(tr.writes)) + } +} + +func TestPongSuppressedDuringIncomingTransfer(t *testing.T) { + tr := &captureTransport{} + s := session{ + tr: tr, + link: linkUp, + localSID: "mcu-sid-test", + incomingTransfer: &incomingTransfer{ + meta: transferMeta{ID: "xfer-1"}, + }, + } + + s.onPing(&protoPing{Type: msgPing, TS: 42, SID: "cm5-sid"}) + + if len(tr.writes) != 0 { + t.Fatalf("pong writes during transfer = %d, want 0", len(tr.writes)) + } +} + +func TestPongSuppressedDuringPrepareQuiet(t *testing.T) { + tr := &captureTransport{} + s := session{ + tr: tr, + link: linkUp, + localSID: "mcu-sid-test", + transferQuietUntil: time.Now().Add(time.Second), + transferQuietReason: "prepare_call_rx", + } + + s.onPing(&protoPing{Type: msgPing, TS: 42, SID: "cm5-sid"}) + + if len(tr.writes) != 0 { + t.Fatalf("pong writes during prepare quiet = %d, want 0", len(tr.writes)) + } +} + // ---- unretain ---- func TestPubIgnoredBeforeHandshake(t *testing.T) { @@ -1221,34 +1271,6 @@ func TestUnretainIgnoredBeforeHandshake(t *testing.T) { } } -func TestUnretain(t *testing.T) { - mcu, cm5 := pipePair() - b := newBus() - conn := b.NewConnection("fabric") - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - go Run(ctx, mcu, conn, "mcu-1", "cm5-local", DefaultLinkConfig()) - bringUp(t, cm5) - - sendMsg(t, cm5, protoPub{ - Type: "pub", Topic: []string{"config", "device"}, - Payload: json.RawMessage(`{"v":1}`), Retain: true, - }) - time.Sleep(50 * time.Millisecond) - sendMsg(t, cm5, protoUnretain{Type: 
"unretain", Topic: []string{"config", "device"}}) - time.Sleep(50 * time.Millisecond) - - reader := b.NewConnection("test") - sub := reader.Subscribe(bus.T("config", "device")) - select { - case m := <-sub.Channel(): - if m != nil && m.Payload != nil { - t.Errorf("expected no retained message, got %+v", m) - } - case <-time.After(100 * time.Millisecond): - } -} - // ---- call import ---- func TestCallIgnoredBeforeHandshake(t *testing.T) { @@ -1276,6 +1298,8 @@ func TestCallIgnoredBeforeHandshake(t *testing.T) { } func TestCallImport(t *testing.T) { + // Test the canonical inbound call route: cmd/self/updater/prepare maps to + // local rpc/updater/prepare where services/updater binds. mcu, cm5 := pipePair() b := newBus() fabricConn := b.NewConnection("fabric") @@ -1285,7 +1309,7 @@ func TestCallImport(t *testing.T) { bringUp(t, cm5) handler := b.NewConnection("handler") - sub := handler.Subscribe(bus.T("rpc", "hal", "dump")) + sub := handler.Subscribe(bus.T("rpc", "updater", "prepare")) go func() { for m := range sub.Channel() { handler.Reply(m, map[string]string{"result": "ok"}, false) @@ -1293,7 +1317,7 @@ func TestCallImport(t *testing.T) { }() sendMsg(t, cm5, protoCall{ - Type: "call", ID: "test-corr-1", Topic: []string{"rpc", "hal", "dump"}, + Type: "call", ID: "test-corr-1", Topic: []string{"cmd", "self", "updater", "prepare"}, Payload: json.RawMessage(`{}`), TimeoutMs: 5000, }) @@ -1331,239 +1355,6 @@ func TestCallNoRoute(t *testing.T) { } } -func TestDumpCallReturnsConfigState(t *testing.T) { - mcu, cm5 := pipePair() - b := newBus() - fabricConn := b.NewConnection("fabric") - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local", DefaultLinkConfig()) - bringUp(t, cm5) - - // Send config first so the session has state. 
- sendMsg(t, cm5, protoPub{ - Type: "pub", - Topic: []string{"config", "device"}, - Payload: json.RawMessage(`{"devices":[],"pollers":[]}`), - Retain: true, - }) - time.Sleep(100 * time.Millisecond) - - // Call dump. - sendMsg(t, cm5, protoCall{ - Type: "call", ID: "dump-1", Topic: []string{"rpc", "hal", "dump"}, - Payload: json.RawMessage(`{"ask":"status"}`), TimeoutMs: 5000, - }) - - reply := readMsg[protoReply](t, cm5) - if reply.Corr != "dump-1" { - t.Errorf("corr = %q", reply.Corr) - } - if !reply.OK { - t.Errorf("expected ok=true, got err=%q", reply.Err) - } - var dump dumpReply - if err := json.Unmarshal(reply.Value, &dump); err != nil { - t.Fatalf("unmarshal dump reply: %v", err) - } - if !dump.Applied { - t.Error("expected applied=true") - } - if dump.ConfigCount != 1 { - t.Errorf("config_count = %d, want 1", dump.ConfigCount) - } -} - -func TestDumpCallDoesNotBlockPing(t *testing.T) { - mcu, cm5 := pipePair() - b := newBus() - fabricConn := b.NewConnection("fabric") - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local", DefaultLinkConfig()) - bringUp(t, cm5) - - // Send dump call and ping back-to-back. 
- sendMsg(t, cm5, protoCall{ - Type: "call", ID: "dump-1", Topic: []string{"rpc", "hal", "dump"}, - Payload: json.RawMessage(`{}`), TimeoutMs: 1000, - }) - sendMsg(t, cm5, protoPing{Type: "ping", TS: 77, SID: testCM5SID}) - - type readResult struct { - line []byte - err error - } - type wireHeader struct { - Type string `json:"type"` - } - var gotReply, gotPong bool - for i := 0; i < 2; i++ { - msg := readMsg[wireHeader](t, cm5) - switch msg.Type { - case msgReply: - gotReply = true - case msgPong: - gotPong = true - default: - t.Fatalf("unexpected message type %q", msg.Type) - } - } - if !gotReply { - t.Error("missing dump reply") - } - if !gotPong { - t.Error("missing pong") - } -} - -func TestCallExport(t *testing.T) { - mcu, cm5 := pipePair() - b := newBus() - fabricConn := b.NewConnection("fabric") - reqConn := b.NewConnection("caller") - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local", DefaultLinkConfig()) - bringUp(t, cm5) - unlockExports(t, cm5) - - type result struct { - msg *bus.Message - err error - } - done := make(chan result, 1) - go func() { - msg, err := reqConn.RequestWait(context.Background(), reqConn.NewMessage( - bus.T("fabric", "out", "rpc", "hal", "dump"), - map[string]string{"ask": "status"}, - false, - )) - done <- result{msg: msg, err: err} - }() - - call := readMsg[protoCall](t, cm5) - if call.Type != "call" { - t.Fatalf("expected call, got %q", call.Type) - } - want := []string{"rpc", "hal", "dump"} - if !slicesEqual(call.Topic, want) { - t.Fatalf("topic = %v, want %v", call.Topic, want) - } - var payload map[string]string - if err := json.Unmarshal(call.Payload, &payload); err != nil { - t.Fatalf("Unmarshal payload: %v", err) - } - if payload["ask"] != "status" { - t.Fatalf("payload.ask = %q, want status", payload["ask"]) - } - - sendMsg(t, cm5, protoReply{ - Type: "reply", - Corr: call.ID, - OK: true, - Value: json.RawMessage(`{"ok":true,"remote":"cm5"}`), - }) 
- - select { - case res := <-done: - if res.err != nil { - t.Fatalf("RequestWait: %v", res.err) - } - if res.msg == nil { - t.Fatal("nil bus reply") - } - reply, ok := res.msg.Payload.(map[string]any) - if !ok { - t.Fatalf("payload type = %T, want map[string]any", res.msg.Payload) - } - if reply["remote"] != "cm5" { - t.Fatalf("reply.remote = %#v", reply["remote"]) - } - if reply["ok"] != true { - t.Fatalf("reply.ok = %#v", reply["ok"]) - } - case <-time.After(2 * time.Second): - t.Fatal("timeout waiting for local reply") - } -} - -func TestCallExportOnlyConfiguredRule(t *testing.T) { - mcu, cm5 := pipePair() - b := newBus() - fabricConn := b.NewConnection("fabric") - reqConn := b.NewConnection("caller") - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local", DefaultLinkConfig()) - bringUp(t, cm5) - unlockExports(t, cm5) - - // Use an unconfigured topic — only fabric/out/rpc/hal/dump is routed. - reqCtx, reqCancel := context.WithTimeout(context.Background(), 250*time.Millisecond) - defer reqCancel() - go func() { - _, _ = reqConn.RequestWait(reqCtx, reqConn.NewMessage( - bus.T("fabric", "out", "rpc", "hal", "not_configured"), - map[string]string{"ask": "status"}, - false, - )) - }() - - gotLine := make(chan struct{}) - go func() { - _, _ = cm5.ReadLine() - close(gotLine) - }() - - select { - case <-gotLine: - t.Fatal("got wire call for unconfigured export rule") - case <-time.After(200 * time.Millisecond): - } -} - -func TestPendingWireCallsTimeout(t *testing.T) { - b := newBus() - fabricConn := b.NewConnection("fabric") - reqConn := b.NewConnection("caller") - msg := reqConn.NewMessage( - bus.T("fabric", "out", "rpc", "hal", "dump"), - map[string]string{"ask": "status"}, - false, - ) - sub := reqConn.Request(msg) - defer reqConn.Unsubscribe(sub) - - s := session{ - conn: fabricConn, - outboundCalls: []*outboundCall{ - {id: "wire-1", req: msg, deadline: time.Now().Add(-time.Millisecond)}, - }, 
- } - - s.drainOutbound(time.Now()) - - select { - case reply := <-sub.Channel(): - if reply == nil { - t.Fatal("nil timeout reply") - } - out, ok := reply.Payload.(types.ErrorReply) - if !ok { - t.Fatalf("payload type = %T, want types.ErrorReply", reply.Payload) - } - if out.OK { - t.Fatal("expected ok=false") - } - if out.Error != "timeout" { - t.Fatalf("error = %q, want timeout", out.Error) - } - case <-time.After(2 * time.Second): - t.Fatal("timeout waiting for timeout reply") - } -} - func TestDrainExportsDropsUnmarshalablePayload(t *testing.T) { b := newBus() fabricConn := b.NewConnection("fabric") @@ -1637,225 +1428,3 @@ func TestDrainPendingCallsReportsMarshalFailure(t *testing.T) { t.Fatalf("err = %q, want %q", reply.Err, errPayloadMarshal) } } - -func TestDrainOutgoingWireCallsReportsMarshalFailure(t *testing.T) { - b := newBus() - fabricConn := b.NewConnection("fabric") - reqConn := b.NewConnection("caller") - tr := &captureTransport{} - s := session{ - conn: fabricConn, - tr: tr, - link: linkUp, - } - - s.setupExports() - defer s.teardownExports() - - msg := reqConn.NewMessage( - bus.T("fabric", "out", "rpc", "hal", "dump"), - make(chan int), - false, - ) - replySub := reqConn.Request(msg) - defer reqConn.Unsubscribe(replySub) - - s.drainOutbound(time.Now()) - - if len(tr.writes) != 0 { - t.Fatalf("writes = %d, want 0", len(tr.writes)) - } - if len(s.outboundCalls) != 0 { - t.Fatalf("outboundCalls = %d, want 0", len(s.outboundCalls)) - } - - select { - case reply := <-replySub.Channel(): - if reply == nil { - t.Fatal("nil reply") - } - out, ok := reply.Payload.(types.ErrorReply) - if !ok { - t.Fatalf("payload type = %T, want types.ErrorReply", reply.Payload) - } - if out.OK { - t.Fatal("expected ok=false") - } - if out.Error != errPayloadMarshal { - t.Fatalf("error = %q, want %q", out.Error, errPayloadMarshal) - } - case <-time.After(2 * time.Second): - t.Fatal("timeout waiting for marshal failure reply") - } -} - -func 
TestDrainOutgoingWireCallsReportsWriteFailure(t *testing.T) { - b := newBus() - fabricConn := b.NewConnection("fabric") - reqConn := b.NewConnection("caller") - tr := &captureTransport{writeErr: errors.New("boom")} - s := session{ - conn: fabricConn, - tr: tr, - link: linkUp, - } - - s.setupExports() - defer s.teardownExports() - - msg := reqConn.NewMessage( - bus.T("fabric", "out", "rpc", "hal", "dump"), - map[string]string{"ask": "status"}, - false, - ) - replySub := reqConn.Request(msg) - defer reqConn.Unsubscribe(replySub) - - s.drainOutbound(time.Now()) - - if s.link != linkDown { - t.Fatalf("link = %v, want %v", s.link, linkDown) - } - if len(s.outboundCalls) != 0 { - t.Fatalf("outboundCalls = %d, want 0", len(s.outboundCalls)) - } - - select { - case reply := <-replySub.Channel(): - if reply == nil { - t.Fatal("nil reply") - } - out, ok := reply.Payload.(types.ErrorReply) - if !ok { - t.Fatalf("payload type = %T, want types.ErrorReply", reply.Payload) - } - if out.OK { - t.Fatal("expected ok=false") - } - if out.Error != "transport_write_failed" { - t.Fatalf("error = %q, want transport_write_failed", out.Error) - } - case <-time.After(2 * time.Second): - t.Fatal("timeout waiting for write failure reply") - } -} - -func TestCallExportPeerReset(t *testing.T) { - mcu, cm5 := pipePair() - b := newBus() - fabricConn := b.NewConnection("fabric") - reqConn := b.NewConnection("caller") - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local", DefaultLinkConfig()) - bringUp(t, cm5) - unlockExports(t, cm5) - - type result struct { - msg *bus.Message - err error - } - done := make(chan result, 1) - go func() { - msg, err := reqConn.RequestWait(context.Background(), reqConn.NewMessage( - bus.T("fabric", "out", "rpc", "hal", "dump"), - map[string]string{"ask": "status"}, - false, - )) - done <- result{msg: msg, err: err} - }() - - call := readMsg[protoCall](t, cm5) - if call.Type != "call" { - 
t.Fatalf("expected call, got %q", call.Type) - } - - sendMsg(t, cm5, protoHello{ - Type: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "fresh-session", Proto: protoVersion, - }) - _ = readMsg[protoHelloAck](t, cm5) - - select { - case res := <-done: - if res.err != nil { - t.Fatalf("RequestWait: %v", res.err) - } - if res.msg == nil { - t.Fatal("nil bus reply") - } - out, ok := res.msg.Payload.(types.ErrorReply) - if !ok { - t.Fatalf("payload type = %T, want types.ErrorReply", res.msg.Payload) - } - if out.OK { - t.Fatal("expected ok=false") - } - if out.Error != "session_reset" { - t.Fatalf("error = %q, want session_reset", out.Error) - } - case <-time.After(2 * time.Second): - t.Fatal("timeout waiting for peer-reset reply") - } -} - -func TestEchoedHelloAckIgnoredDuringOutgoingCall(t *testing.T) { - mcu, cm5 := pipePair() - b := newBus() - fabricConn := b.NewConnection("fabric") - reqConn := b.NewConnection("caller") - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local", DefaultLinkConfig()) - ack := bringUp(t, cm5) - unlockExports(t, cm5) - - type result struct { - msg *bus.Message - err error - } - done := make(chan result, 1) - go func() { - msg, err := reqConn.RequestWait(context.Background(), reqConn.NewMessage( - bus.T("fabric", "out", "rpc", "hal", "dump"), - map[string]string{"ask": "status"}, - false, - )) - done <- result{msg: msg, err: err} - }() - - call := readMsg[protoCall](t, cm5) - if call.Type != "call" { - t.Fatalf("expected call, got %q", call.Type) - } - - // Send an echoed hello_ack (our own SID) — should be ignored. 
- sendMsg(t, cm5, protoHelloAck{ - Type: "hello_ack", Node: "mcu-1", SID: ack.SID, Proto: protoVersion, OK: true, - }) - - sendMsg(t, cm5, protoReply{ - Type: "reply", - Corr: call.ID, - OK: true, - Value: json.RawMessage(`{"ok":true,"remote":"cm5"}`), - }) - - select { - case res := <-done: - if res.err != nil { - t.Fatalf("RequestWait: %v", res.err) - } - if res.msg == nil { - t.Fatal("nil bus reply") - } - reply, ok := res.msg.Payload.(map[string]any) - if !ok { - t.Fatalf("payload type = %T, want map[string]any", res.msg.Payload) - } - if reply["remote"] != "cm5" || reply["ok"] != true { - t.Fatalf("unexpected reply payload: %#v", reply) - } - case <-time.After(2 * time.Second): - t.Fatal("timeout waiting for local reply after echoed hello_ack") - } -} diff --git a/services/fabric/protocol.go b/services/fabric/protocol.go index 023b0ec..a5d3bca 100644 --- a/services/fabric/protocol.go +++ b/services/fabric/protocol.go @@ -4,14 +4,15 @@ import "encoding/json" // ---- Wire message type identifiers ---- // -// Wire schema mirrors devicecode-lua/src/services/fabric/protocol.lua at -// update-migration tip (commit 2c88090). The frame discriminator field is -// "type" (not "t"). Reply frames carry {id, ok, value, err}. Transfer frames -// use xfer_id/offset/checksum/data with a minimal xfer_chunk shape and -// xxHash32 hex wire integrity (no algorithm field; Lua source treats checksum -// as opaque hex). +// Wire schema mirrors ../docs/updating.md and +// devicecode-lua/src/services/fabric/protocol.lua. The frame discriminator is +// "type". Replies carry {id, ok, payload, err}. Transfers use explicit +// digest_alg/digest fields and required per-chunk chunk_digest. const ( + protocolName = "fabric-jsonl/1" + digestAlg = "xxhash32" + msgHello = "hello" msgHelloAck = "hello_ack" msgPing = "ping" @@ -31,28 +32,22 @@ const ( // ---- Wire message structs ---- -// protoCaps is carried in hello for forward compatibility. 
The Lua side -// sends caps but neither side enforces them in v1. -type protoCaps struct { - Pub bool `json:"pub,omitempty"` - Call bool `json:"call,omitempty"` -} - type protoHello struct { - Type string `json:"type"` - Node string `json:"node"` - Peer string `json:"peer"` - SID string `json:"sid"` - Proto int `json:"proto,omitempty"` - Caps *protoCaps `json:"caps,omitempty"` + Type string `json:"type"` + Proto string `json:"proto"` + SID string `json:"sid"` + Node string `json:"node"` + Identity json.RawMessage `json:"identity,omitempty"` + Auth json.RawMessage `json:"auth,omitempty"` } type protoHelloAck struct { - Type string `json:"type"` - Node string `json:"node"` - SID string `json:"sid,omitempty"` - Proto int `json:"proto,omitempty"` - OK bool `json:"ok"` + Type string `json:"type"` + Proto string `json:"proto"` + SID string `json:"sid"` + Node string `json:"node"` + Identity json.RawMessage `json:"identity,omitempty"` + Auth json.RawMessage `json:"auth,omitempty"` } type protoPing struct { @@ -87,28 +82,27 @@ type protoCall struct { TimeoutMs int `json:"timeout_ms"` } -// protoReply mirrors Lua's reply frame: {type, id, ok, value, err}. The Go +// protoReply mirrors Lua's reply frame: {type, id, ok, payload, err}. The Go // field for the correlation id keeps the name "Corr" for readability — the // wire spelling is "id" because the reply correlates to a prior call.id. type protoReply struct { - Type string `json:"type"` - Corr string `json:"id"` - OK bool `json:"ok"` - Value json.RawMessage `json:"value,omitempty"` - Err string `json:"err,omitempty"` + Type string `json:"type"` + Corr string `json:"id"` + OK bool `json:"ok"` + Payload json.RawMessage `json:"payload,omitempty"` + Err string `json:"err,omitempty"` } -// protoXferBegin (control lane) — required fields per protocol.lua -// validate_control: xfer_id, size, checksum (xxHash32 hex). 
meta is -// optional but source-used: transfer_mgr.lua sends it on xfer_begin and -// later does conn:call(meta.receiver, …) before xfer_done. Preserve the -// blob opaquely so fabric-update's receiver can pull meta.receiver out. +// protoXferBegin starts an incoming transfer to a named target. The only +// supported digest for fabric-jsonl/1 is xxhash32 seed 0, lower-hex. type protoXferBegin struct { - Type string `json:"type"` - XferID string `json:"xfer_id"` - Size uint32 `json:"size"` - Checksum string `json:"checksum"` - Meta json.RawMessage `json:"meta,omitempty"` + Type string `json:"type"` + XferID string `json:"xfer_id"` + Target string `json:"target"` + Size uint32 `json:"size"` + DigestAlg string `json:"digest_alg"` + Digest string `json:"digest"` + Meta json.RawMessage `json:"meta,omitempty"` } // protoXferReady (control) carries only xfer_id; success/failure is implicit @@ -118,29 +112,31 @@ type protoXferReady struct { XferID string `json:"xfer_id"` } -// protoXferChunk (bulk) — minimal {xfer_id, offset, data}. No chunk-level -// checksum, no sequence number; ack is by byte offset via xfer_need.next. +// protoXferChunk carries unpadded base64url data plus a required xxhash32 +// digest over the raw decoded chunk bytes. type protoXferChunk struct { - Type string `json:"type"` - XferID string `json:"xfer_id"` - Offset uint32 `json:"offset"` - Data string `json:"data"` + Type string `json:"type"` + XferID string `json:"xfer_id"` + Offset uint32 `json:"offset"` + Data string `json:"data"` + ChunkDigest string `json:"chunk_digest"` } -// protoXferNeed (control) acks the receiver's expected next byte offset. +// protoXferNeed (control) acks the MCU's expected next byte offset. type protoXferNeed struct { Type string `json:"type"` XferID string `json:"xfer_id"` Next uint32 `json:"next"` } -// protoXferCommit (control) carries the same wire-integrity shape as -// xfer_begin: xfer_id, size, checksum (xxHash32 hex over the payload bytes). 
+// protoXferCommit repeats the whole-object digest so begin/commit/streamed +// content can be reconciled before the target accepts the object. type protoXferCommit struct { - Type string `json:"type"` - XferID string `json:"xfer_id"` - Size uint32 `json:"size"` - Checksum string `json:"checksum"` + Type string `json:"type"` + XferID string `json:"xfer_id"` + Size uint32 `json:"size"` + DigestAlg string `json:"digest_alg"` + Digest string `json:"digest"` } // protoXferDone (control) carries only xfer_id; failure is signalled via diff --git a/services/fabric/remap.go b/services/fabric/remap.go index 175c44d..ba65ffa 100644 --- a/services/fabric/remap.go +++ b/services/fabric/remap.go @@ -9,16 +9,20 @@ import "devicecode-go/bus" // of routes. If new routes are required, add them here and on the Lua // config side. // -// CM5 -> MCU wire publish: -// ["config","device"] -> config/hal (with Lua empty-table normalization) +// The legacy MCU surface (config/device -> config/hal import, rpc/hal/dump +// inline handler, hal/cap/env/* and hal/cap/power/* exports, hal/state -> +// state/hal export, fabric/out/rpc/hal/dump call export) has been removed. The +// canonical surface is now: // // CM5 -> MCU wire call: -// ["rpc","hal","dump"] -> handled directly by session (not via import rules) +// ["cmd","self","updater","prepare"] -> rpc/updater/prepare +// ["cmd","self","updater","commit"] -> rpc/updater/commit +// xfer_begin target="updater/main" is handled by the transfer path +// and routed to the local updater staging RPC after xfer_commit. // // MCU local bus publish -> wire: -// hal/cap/env/# -> ["state","env",...] -// hal/cap/power/# -> ["state","power",...] -// hal/state -> ["state","hal"] +// state/self/# -> state/self/... (identity, telemetry, update facts) +// event/self/# -> event/self/... 
(sparse charger alerts) type importRule struct { wire []string @@ -31,39 +35,47 @@ type busExportRule struct { suffix bool } -var importPublishRules = []importRule{ +// importPublishRules is empty. Config-like data flows through +// cmd/self/updater/prepare's metadata field instead of retained publishes. +var importPublishRules = []importRule{} + +// importCallRules maps inbound wire call topics to local bus RPC topics. +// +// cmd/self/updater/{prepare,commit} land here from the wire and are +// routed to local rpc/updater/{prepare,commit} where the updater +// service binds. The updater package re-uses the same local topic +// strings (services/updater.TopicPrepareRPC / TopicCommitRPC) so +// callers stay consistent. +var importCallRules = []importRule{ { - wire: []string{"config", "device"}, - local: []string{"config", "hal"}, + wire: []string{"cmd", "self", "updater", "prepare"}, + local: []string{"rpc", "updater", "prepare"}, + }, + { + wire: []string{"cmd", "self", "updater", "commit"}, + local: []string{"rpc", "updater", "commit"}, }, } -// rpc/hal/dump is handled directly by onCall, not via import rules. -var importCallRules = []importRule{} - +// exportPublishRules is the minimal surface: local `state/self/*` retains and +// `event/self/*` events flow to the wire under the same name. Legacy HAL export +// topics are replaced by telemetry publishers under state/self/*.
var exportPublishRules = []busExportRule{ { - localPrefix: []string{"hal", "cap", "env"}, - remotePrefix: []string{"state", "env"}, + localPrefix: []string{"state", "self"}, + remotePrefix: []string{"state", "self"}, suffix: true, }, { - localPrefix: []string{"hal", "cap", "power"}, - remotePrefix: []string{"state", "power"}, + localPrefix: []string{"event", "self"}, + remotePrefix: []string{"event", "self"}, suffix: true, }, - { - localPrefix: []string{"hal", "state"}, - remotePrefix: []string{"state", "hal"}, - }, } -var exportCallRules = []busExportRule{ - { - localPrefix: []string{"fabric", "out", "rpc", "hal", "dump"}, - remotePrefix: []string{"rpc", "hal", "dump"}, - }, -} +// exportCallRules is empty. The MCU does not originate outbound RPC calls for +// the current Fabric/update contract. +var exportCallRules = []busExportRule{} func importPublishTopic(wire []string) bus.Topic { return importMatch(wire, importPublishRules) diff --git a/services/fabric/session.go b/services/fabric/session.go index 91a4908..80bbd23 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -4,6 +4,7 @@ import ( "context" "encoding/json" "errors" + "strings" "time" "devicecode-go/bus" @@ -48,6 +49,17 @@ const ( exportMaxPerTick = 1 exportTickInterval = 50 * time.Millisecond errPayloadMarshal = "payload_marshal_failed" + + // The USB/UART path used during OTA echoes MCU-originated JSONL back into + // the MCU receiver. If exported retained state is in flight while CM5 starts + // an OTA transfer, the echoed line can contain CM5's xfer_begin spliced into + // the middle of the state pub. Hold exports quiet from prepare until either + // xfer_begin arrives or this window expires. + transferPrepareQuiet = 10 * time.Second + // Keep telemetry/state exports quiet long enough for the host to send the + // follow-up updater commit call after xfer_done. On echo-prone UART links, + // retained export backlog can otherwise splice into the commit JSONL frame. 
+ transferCompleteQuiet = 10 * time.Second ) // ---- link reasons and error strings ---- @@ -65,29 +77,14 @@ const ( reasonTimeout = "timeout" ) -// ---- bus topics for config handling ---- - -var ( - tConfigHAL = bus.T("config", "hal") - dumpCallTopic = []string{"rpc", "hal", "dump"} -) - // ---- types ---- -type dumpReply struct { - OK bool `json:"ok"` - Method string `json:"method"` - Echo any `json:"echo,omitempty"` - HAL *types.HALState `json:"hal,omitempty"` - Applied bool `json:"applied"` - ConfigCount int `json:"config_count,omitempty"` - ConfigError string `json:"config_error,omitempty"` -} - type inboundCall struct { - id string - sub *bus.Subscription - deadline time.Time + id string + topic []string + sub *bus.Subscription + deadline time.Time + transferPrepare bool } type outboundCall struct { @@ -110,7 +107,7 @@ type linkStatePayload struct { LocalSID string `json:"local_sid"` PeerSID string `json:"peer_sid,omitempty"` PeerNode string `json:"peer_node,omitempty"` - PeerProto int `json:"peer_proto,omitempty"` + PeerProto string `json:"peer_proto,omitempty"` LastRxUnixMilli int64 `json:"last_rx_unix_ms,omitempty"` LastTxUnixMilli int64 `json:"last_tx_unix_ms,omitempty"` LastPongUnixMilli int64 `json:"last_pong_unix_ms,omitempty"` @@ -137,38 +134,41 @@ type session struct { link linkState peerNode string peerSID string - peerProto int + peerProto string lastRxAt time.Time lastTxAt time.Time lastPongAt time.Time exportReadyAt time.Time exportsEnabled bool - exportSubs []*bus.Subscription - exportCallSubs []*bus.Subscription - inboundCalls []*inboundCall - outboundCalls []*outboundCall - nextOutboundID uint64 - nextPingAt time.Time - txControl txLane - txRPC txLane - txBulk txLane - importedRetained []bus.Topic // local topics currently retained on the bus due to wire imports - rpcReady bool // bridge replay complete; gates linkStatePayload.Ready - incomingTransfer *incomingTransfer - beginTransfer func(transferMeta) (transferSink, error) - - // Config 
state — tracks config/device → config/hal translation. - configApplied bool - configCount int - lastConfigErr string + exportSubs []*bus.Subscription + exportCallSubs []*bus.Subscription + inboundCalls []*inboundCall + outboundCalls []*outboundCall + nextOutboundID uint64 + nextPingAt time.Time + txControl txLane + txRPC txLane + txBulk txLane + importedRetained []bus.Topic // local topics currently retained on the bus due to wire imports + rpcReady bool // bridge replay complete; gates linkStatePayload.Ready + incomingTransfer *incomingTransfer + transferQuietUntil time.Time + transferQuietReason string + beginTransfer func(transferMeta) (transferSink, error) } func (s *session) log(msg string) { + if !fabricTraceEnabled { + return + } println("[fabric]", "sid", s.localSID, msg) } func (s *session) logKV(msg, key, value string) { + if !fabricTraceEnabled { + return + } println("[fabric]", "sid", s.localSID, msg, key, value) } @@ -334,10 +334,12 @@ func (s *session) handleLinkDown(reason, err string) { s.link = linkDown s.peerNode = "" s.peerSID = "" - s.peerProto = 0 + s.peerProto = "" s.exportReadyAt = time.Time{} s.exportsEnabled = false s.rpcReady = false + s.transferQuietUntil = time.Time{} + s.transferQuietReason = "" s.teardownExports() s.teardownInbound() s.teardownOutbound(pendingReason) @@ -468,6 +470,8 @@ func (s *session) dispatch(line []byte) { typedDispatch(s, line, s.onTransferCommit) case msgXferAbort: typedDispatch(s, line, s.onTransferAbort) + case msgXferReady, msgXferNeed, msgXferDone: + s.logKV("echoed transfer control ignored", "type", t) default: s.logKV("unknown message type dropped", "type", t) } @@ -495,21 +499,38 @@ func (s *session) logMalformed(line []byte, err error) { if err != nil { errStr = err.Error() } - println( - "[fabric]", "sid", s.localSID, - "malformed frame dropped", - "line_len", strconvx.Itoa(len(line)), - "line_head", tracePreview(line), - "err", errStr, - ) + if fabricTraceEnabled { + println( + "[fabric]", "sid", 
s.localSID, + "malformed frame dropped", + "line_len", strconvx.Itoa(len(line)), + "line_head", tracePreview(line), + "err", errStr, + ) + } + + // If a transfer is in flight, the dropped frame was very likely a + // corrupted xfer_chunk. Without an explicit signal CM5 keeps + // streaming chunks past the gap and the receiver silently drops + // them as out-of-order; the transfer eventually fails on the + // phase timeout. Re-request the next expected byte so CM5 + // retransmits from the gap. Cheap if it wasn't actually a chunk + // (the sender just gets one stale need frame and ignores it once + // it has caught up). + if cur := s.incomingTransfer; cur != nil { + s.sendTransferNeed(cur.meta.ID, cur.bytesWritten) + // Refresh the idle-chunk deadline so a stream of malformed frames can + // recover instead of tripping phase_timeout mid-retry. + cur.deadline = time.Now().Add(s.cfg.PhaseTimeout) + } } -// notePeerIdentity records the remote peer's node, SID, and proto version. +// notePeerIdentity records the remote peer's node, SID, and protocol name. // If the SID changes mid-session, the returned reason triggers a full // teardown of exports and pending calls on the Go side. Note: the Lua // side only tears down pending calls on SID change, not exports — this // asymmetry is intentional since the CM5 re-subscribes on reconnect. 
-func (s *session) notePeerIdentity(node, sid string, proto int) string { +func (s *session) notePeerIdentity(node, sid string, proto string) string { reason := "" if s.link == linkUp && s.peerSID != "" && sid != "" && s.peerSID != sid { reason = reasonSessionReset @@ -520,7 +541,7 @@ func (s *session) notePeerIdentity(node, sid string, proto int) string { if sid != "" { s.peerSID = sid } - if proto > 0 { + if proto != "" { s.peerProto = proto } return reason @@ -548,9 +569,55 @@ func hasWirePrefix(topic, prefix []string) bool { return true } +func wireTopicEquals(topic, want []string) bool { + if len(topic) != len(want) { + return false + } + for i := range want { + if topic[i] != want[i] { + return false + } + } + return true +} + +func wireTopicString(topic []string) string { + if len(topic) == 0 { + return "" + } + return strings.Join(topic, "/") +} + +func (s *session) extendTransferQuiet(reason string, d time.Duration) { + now := time.Now() + until := now.Add(d) + if until.After(s.transferQuietUntil) { + s.transferQuietUntil = until + s.transferQuietReason = reason + } +} + +func (s *session) transferQuiet(now time.Time) (bool, string) { + if cur := s.incomingTransfer; cur != nil { + return true, "incoming_transfer:" + cur.meta.ID + } + if !s.transferQuietUntil.IsZero() && now.Before(s.transferQuietUntil) { + reason := s.transferQuietReason + if reason == "" { + reason = "quiet_window" + } + return true, reason + } + return false, "" +} + func (s *session) onHello(msg *protoHello) { - if msg.Peer != "" && msg.Peer != s.nodeID { - s.log("hello dropped: wrong peer") + if msg.Proto != protocolName { + s.log("hello dropped: unsupported proto") + return + } + if msg.SID == "" || msg.Node == "" { + s.log("hello dropped: missing identity") return } if s.peerID != "" && msg.Node != s.peerID { @@ -562,10 +629,9 @@ func (s *session) onHello(msg *protoHello) { if !s.sendControl(marshal(protoHelloAck{ Type: msgHelloAck, - Node: s.nodeID, + Proto: protocolName, SID: 
s.localSID, - Proto: protoVersion, - OK: true, + Node: s.nodeID, })) { return } @@ -578,9 +644,12 @@ func (s *session) onHelloAck(msg *protoHelloAck) { s.log("echoed hello_ack ignored") return } - if !msg.OK { - s.log("hello_ack rejected by peer") - s.handleLinkDown(reasonHelloRejected, "") + if msg.Proto != protocolName { + s.log("hello_ack dropped: unsupported proto") + return + } + if msg.SID == "" || msg.Node == "" { + s.log("hello_ack dropped: missing identity") return } reason := s.notePeerIdentity(msg.Node, msg.SID, msg.Proto) @@ -589,6 +658,13 @@ func (s *session) onHelloAck(msg *protoHelloAck) { } func (s *session) onPing(msg *protoPing) { + if s.isSelfControlFrame("", msg.SID) { + s.log("echoed ping ignored") + return + } + if quiet, _ := s.transferQuiet(time.Now()); quiet { + return + } s.logKV("ping rx", "peer_sid", msg.SID) if !s.sendControl(marshal(protoPong{Type: msgPong, TS: msg.TS, SID: s.localSID})) { return @@ -603,6 +679,13 @@ func (s *session) tickPing(now time.Time) { if s.link != linkUp { return } + if quiet, _ := s.transferQuiet(now); quiet { + // Keep the UART quiet while CM5 is preparing or streaming a firmware + // image; chunk recovery depends on xfer_need being the only periodic + // MCU-originated frame on the fabric link. + s.nextPingAt = now.Add(s.cfg.PingInterval) + return + } if s.nextPingAt.IsZero() || now.Before(s.nextPingAt) { return } @@ -631,23 +714,6 @@ func (s *session) onPub(msg *protoPub) { return } - // config/device → config/hal: normalize and track. 
- if topicEquals(localTopic, tConfigHAL) { - cfg, err := decodeHALConfig(msg.Payload) - if err != "" { - s.lastConfigErr = err - s.log("config/device rejected: " + err) - return - } - s.configApplied = true - s.configCount++ - s.lastConfigErr = "" - s.log("config/device applied to config/hal") - s.conn.Publish(s.conn.NewMessage(localTopic, cfg, true)) - s.trackImportedRetain(localTopic) - return - } - s.conn.Publish(s.conn.NewMessage(localTopic, msg.Payload, msg.Retain)) if msg.Retain { s.trackImportedRetain(localTopic) @@ -670,34 +736,6 @@ func (s *session) onUnretain(msg *protoUnretain) { } func (s *session) onCall(msg *protoCall) { - // rpc/hal/dump: handle directly — reply with config and HAL state. - if slicesEqualStrings(msg.Topic, dumpCallTopic) { - var halState *types.HALState - sub := s.conn.Subscribe(bus.T("hal", "state")) - select { - case m := <-sub.Channel(): - if m != nil { - if st, ok := decodeHALState(m.Payload); ok { - halState = &st - } - } - default: - } - s.conn.Unsubscribe(sub) - - reply := dumpReply{ - OK: true, - Method: "dump", - Echo: decodePayload(msg.Payload), - HAL: halState, - Applied: s.configApplied, - ConfigCount: s.configCount, - ConfigError: s.lastConfigErr, - } - s.sendRPC(marshal(protoReply{Type: msgReply, Corr: msg.ID, OK: true, Value: mustMarshal(reply)})) - return - } - if len(s.inboundCalls) >= s.cfg.MaxInboundHelpers { s.log("incoming call dropped: busy") s.sendRPC(marshal(protoReply{Type: msgReply, Corr: msg.ID, OK: false, Err: reasonBusy})) @@ -711,16 +749,24 @@ func (s *session) onCall(msg *protoCall) { return } + isTransferPrepare := wireTopicEquals(msg.Topic, []string{"cmd", "self", "updater", "prepare"}) + if isTransferPrepare { + s.extendTransferQuiet("prepare_call_rx", transferPrepareQuiet) + } + timeout := callTimeoutDef if msg.TimeoutMs > 0 { timeout = time.Duration(msg.TimeoutMs) * time.Millisecond } busMsg := s.conn.NewMessage(localTopic, msg.Payload, false) sub := s.conn.Request(busMsg) + topicCopy := 
append([]string(nil), msg.Topic...) s.inboundCalls = append(s.inboundCalls, &inboundCall{ - id: msg.ID, - sub: sub, - deadline: time.Now().Add(timeout), + id: msg.ID, + topic: topicCopy, + sub: sub, + deadline: time.Now().Add(timeout), + transferPrepare: isTransferPrepare, }) } @@ -737,7 +783,7 @@ func (s *session) onReply(msg *protoReply) { s.conn.Reply(call.req, types.ErrorReply{OK: false, Error: msg.Err}, false) return } - s.conn.Reply(call.req, decodePayload(msg.Value), false) + s.conn.Reply(call.req, decodePayload(msg.Payload), false) return } @@ -846,10 +892,17 @@ func (s *session) drainExports() { if s.link != linkUp { return } + now := time.Now() + if quiet, _ := s.transferQuiet(now); quiet { + // Avoid colliding telemetry/state exports with prepare/xfer traffic on + // echo-prone links. Queued retained state can be exported after the OTA + // control/data path has gone quiet. + return + } if !s.exportsEnabled { return } - if !s.exportReadyAt.IsZero() && time.Now().Before(s.exportReadyAt) { + if !s.exportReadyAt.IsZero() && now.Before(s.exportReadyAt) { return } total := 0 @@ -882,6 +935,15 @@ func (s *session) drainExports() { s.logKV("export payload dropped", "err", err.Error()) continue } + if fabricTraceEnabled { + println( + "[fabric]", "sid", s.localSID, + "export pub tx", + "topic", wireTopicString(wire), + "retain", m.Retained, + "payload_len", strconvx.Itoa(len(payload)), + ) + } if !s.sendRPC(marshal(protoPub{ Type: msgPub, Topic: wire, @@ -911,12 +973,18 @@ func (s *session) drainInbound(now time.Time) { s.conn.Unsubscribe(call.sub) call.sub = nil // prevent double-unsubscribe in teardownInbound if !ok || reply == nil { + if call.transferPrepare { + s.extendTransferQuiet("prepare_reply_timeout", transferPrepareQuiet) + } if !s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: reasonTimeout})) { return } continue } if errStr := checkBusError(reply.Payload); errStr != "" { + if call.transferPrepare { + 
s.extendTransferQuiet("prepare_reply_error", transferPrepareQuiet) + } if !s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: errStr})) { return } @@ -924,12 +992,18 @@ func (s *session) drainInbound(now time.Time) { } payload, err := marshalPayload(reply.Payload) if err != nil { + if call.transferPrepare { + s.extendTransferQuiet("prepare_reply_marshal_failed", transferPrepareQuiet) + } if !s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: errPayloadMarshal})) { return } continue } - if !s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: true, Value: payload})) { + if call.transferPrepare { + s.extendTransferQuiet("prepare_reply_ok", transferPrepareQuiet) + } + if !s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: true, Payload: payload})) { return } continue @@ -939,6 +1013,9 @@ func (s *session) drainInbound(now time.Time) { if !now.Before(call.deadline) { s.conn.Unsubscribe(call.sub) call.sub = nil + if call.transferPrepare { + s.extendTransferQuiet("prepare_call_timeout", transferPrepareQuiet) + } if !s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: reasonTimeout})) { return } diff --git a/services/fabric/transfer.go b/services/fabric/transfer.go index 59919b8..dd4155a 100644 --- a/services/fabric/transfer.go +++ b/services/fabric/transfer.go @@ -7,28 +7,28 @@ import ( "strings" "time" + "devicecode-go/services/updater" "devicecode-go/x/strconvx" "devicecode-go/x/xxhash" ) -const postTransferDoneSettle = 250 * time.Millisecond -const transferProgressLogEvery = 32 +const transferTargetUpdaterMain = "updater/main" -// transferMeta captures xfer_begin contents. The required Lua wire shape is -// {xfer_id, size, checksum}; meta is optional but source-used (transfer_mgr -// passes it through to the receiver, where meta.receiver names a local -// endpoint to call after xfer_commit and before xfer_done). 
Preserve meta -// as an opaque blob — interpretation lives in fabric-update. +// transferMeta captures xfer_begin contents. The transfer target is explicit +// on the wire; firmware update uses target="updater/main". meta remains opaque +// and informational to Fabric. type transferMeta struct { - ID string - Size uint32 - Checksum string // xxHash32 hex (8 lower-case hex chars), no algorithm field - Meta json.RawMessage + ID string + Target string + Size uint32 + DigestAlg string + Digest string // xxHash32 hex (8 lower-case hex chars), seed 0 + Meta json.RawMessage } // transferInfo is internal-only state returned by the sink on Commit. It is // no longer wire-visible — xfer_done carries only xfer_id in the canonical -// schema; size/checksum reconciliation lives on xfer_commit. +// schema; size/digest reconciliation lives on xfer_commit. type transferInfo struct { BytesWritten uint32 SlotXIPAddr uint32 @@ -38,11 +38,18 @@ type transferInfo struct { // WriteChunk receives bytes at the given byte offset (matching xfer_chunk's // canonical wire fields). No sequence number is passed — the caller has // already validated offset against expected progress. +// +// Bytes() returns the committed payload bytes for target invocation. +// Only valid after Commit() has succeeded. May return nil if the sink +// streamed the bytes elsewhere (e.g. the RP2350 sink writes directly to +// flash and doesn't keep a RAM copy); updater/main consumes that staged +// stream from the updater package. 
type transferSink interface { WriteChunk(offset uint32, data []byte) error Commit() (transferInfo, error) Apply() error Abort(reason string) error + Bytes() []byte } type incomingTransfer struct { @@ -61,10 +68,39 @@ func lowerHex(s string) string { return strings.ToLower(strings.TrimSpace(s)) } +func canonicalXXHash32Hex(s string) (string, bool) { + digest := lowerHex(s) + return digest, s == digest && validXXHash32Hex(digest) +} + +func validXXHash32Hex(s string) bool { + if len(s) != 8 { + return false + } + for i := 0; i < len(s); i++ { + c := s[i] + if !((c >= '0' && c <= '9') || (c >= 'a' && c <= 'f')) { + return false + } + } + return true +} + func u32s(v uint32) string { return strconvx.Itoa(int(v)) } +func decodeChunkData(encoded string) ([]byte, string) { + raw, err := base64.RawURLEncoding.DecodeString(encoded) + if err != nil { + return nil, "invalid_chunk_encoding" + } + if base64.RawURLEncoding.EncodeToString(raw) != encoded { + return nil, "invalid_chunk_encoding" + } + return raw, "" +} + func (s *session) sendTransferReady(id string) bool { return s.sendControl(marshal(protoXferReady{ Type: msgXferReady, @@ -125,8 +161,6 @@ func (s *session) checkTransferTimeout(now time.Time) { return } id := cur.meta.ID - println("[fabric]", "sid", s.localSID, "xfer_phase_timeout", - "id", id, "phase_s", u32s(uint32(s.cfg.PhaseTimeout/time.Second))) s.abortTransfer("timeout") s.sendTransferAbort(id, "timeout") } @@ -135,21 +169,34 @@ func validateTransferBegin(msg *protoXferBegin) (transferMeta, string) { if msg.XferID == "" { return transferMeta{}, "xfer_begin.xfer_id" } + if msg.Target == "" { + return transferMeta{}, "missing_target" + } + if msg.Target != transferTargetUpdaterMain { + return transferMeta{}, "unsupported_target" + } if msg.Size == 0 { return transferMeta{}, "xfer_begin.size" } - if msg.Checksum == "" { - return transferMeta{}, "xfer_begin.checksum" + if msg.DigestAlg != digestAlg { + return transferMeta{}, "unsupported_digest_alg" + } + 
digest, ok := canonicalXXHash32Hex(msg.Digest) + if !ok { + return transferMeta{}, "invalid_digest" } return transferMeta{ - ID: msg.XferID, - Size: msg.Size, - Checksum: lowerHex(msg.Checksum), - Meta: append(json.RawMessage(nil), msg.Meta...), + ID: msg.XferID, + Target: msg.Target, + Size: msg.Size, + DigestAlg: msg.DigestAlg, + Digest: digest, + Meta: append(json.RawMessage(nil), msg.Meta...), }, "" } func (s *session) onTransferBegin(msg *protoXferBegin) { + s.extendTransferQuiet("xfer_begin_rx", transferPrepareQuiet) meta, errStr := validateTransferBegin(msg) if errStr != "" { if msg.XferID != "" { @@ -159,6 +206,19 @@ func (s *session) onTransferBegin(msg *protoXferBegin) { return } if s.incomingTransfer != nil { + cur := s.incomingTransfer + if cur.meta.ID == meta.ID && + cur.meta.Size == meta.Size && + cur.meta.Target == meta.Target && + cur.meta.DigestAlg == meta.DigestAlg && + cur.meta.Digest == meta.Digest { + s.logKV("xfer_begin duplicate", "id", meta.ID) + if s.sendTransferReady(meta.ID) { + s.sendTransferNeed(meta.ID, cur.bytesWritten) + } + cur.deadline = time.Now().Add(s.cfg.PhaseTimeout) + return + } s.sendTransferAbort(meta.ID, "busy") return } @@ -177,14 +237,9 @@ func (s *session) onTransferBegin(msg *protoXferBegin) { hasher: xxhash.New(0), deadline: time.Now().Add(s.cfg.PhaseTimeout), } - println( - "[fabric]", "sid", s.localSID, - "xfer_begin accepted", - "id", meta.ID, - "size", u32s(meta.Size), - "checksum", meta.Checksum, - ) - s.sendTransferReady(meta.ID) + if s.sendTransferReady(meta.ID) { + s.sendTransferNeed(meta.ID, 0) + } } func (s *session) onTransferChunk(msg *protoXferChunk) { @@ -198,37 +253,49 @@ func (s *session) onTransferChunk(msg *protoXferChunk) { // Match that — do not send xfer_need + keep alive. 
id := cur.meta.ID if msg.Offset != cur.bytesWritten { - println("[fabric]", "sid", s.localSID, "xfer_chunk aborted", - "id", id, "err", "unexpected_offset", - "off", u32s(msg.Offset), "want_off", u32s(cur.bytesWritten)) s.abortTransfer("unexpected_offset") s.sendTransferAbort(id, "unexpected_offset") return } - raw, err := base64.RawURLEncoding.DecodeString(msg.Data) - if err != nil { - println("[fabric]", "sid", s.localSID, "xfer_chunk aborted", - "id", id, "err", "decode_failed", - "off", u32s(msg.Offset), "data_len", u32s(uint32(len(msg.Data)))) - s.abortTransfer("decode_failed") - s.sendTransferAbort(id, "decode_failed") + raw, errStr := decodeChunkData(msg.Data) + if errStr != "" { + s.abortTransfer(errStr) + s.sendTransferAbort(id, errStr) return } if len(raw) == 0 { - println("[fabric]", "sid", s.localSID, "xfer_chunk aborted", - "id", id, "err", "empty_chunk", "off", u32s(msg.Offset)) s.abortTransfer("empty_chunk") s.sendTransferAbort(id, "empty_chunk") return } if cur.bytesWritten+uint32(len(raw)) > cur.meta.Size { - println("[fabric]", "sid", s.localSID, "xfer_chunk aborted", - "id", id, "err", "size_overflow", - "bytes_written", u32s(cur.bytesWritten), - "raw_len", u32s(uint32(len(raw))), - "total", u32s(cur.meta.Size)) - s.abortTransfer("size_overflow") - s.sendTransferAbort(id, "size_overflow") + reason := "size_too_large" + s.abortTransfer(reason) + s.sendTransferAbort(id, reason) + return + } + // Per-chunk integrity is required by the current MCU contract. + // JSON parsing alone misses single-byte UART corruption inside the + // base64url data string: the bytes still decode, just to the wrong + // values. On mismatch we ask the sender to resume at the current + // byte offset instead of clearing the transfer. 
+ want, ok := canonicalXXHash32Hex(msg.ChunkDigest) + if !ok { + reason := "invalid_chunk_digest" + if msg.ChunkDigest == "" { + reason = "missing_chunk_digest" + } + s.abortTransfer(reason) + s.sendTransferAbort(id, reason) + return + } + got := xxhashHex(xxhash.Sum32(raw, 0)) + if got != want { + s.sendTransferNeed(cur.meta.ID, cur.bytesWritten) + // Recovery counts as progress — bump the deadline so a burst + // of digest-mismatched chunks doesn't trip the idle watchdog + // mid-recovery. + cur.deadline = time.Now().Add(s.cfg.PhaseTimeout) return } if err := cur.sink.WriteChunk(msg.Offset, raw); err != nil { @@ -242,16 +309,6 @@ func (s *session) onTransferChunk(msg *protoXferChunk) { cur.bytesWritten += uint32(len(raw)) cur.chunksSeen++ cur.deadline = time.Now().Add(s.cfg.PhaseTimeout) - if cur.chunksSeen == 1 || (cur.chunksSeen%transferProgressLogEvery) == 0 { - println( - "[fabric]", "sid", s.localSID, - "xfer_chunk accepted", - "id", cur.meta.ID, - "off", u32s(msg.Offset), - "data_len", u32s(uint32(len(raw))), - "bytes_written", u32s(cur.bytesWritten), - ) - } raw = nil // Forced GC after each absorbed chunk eliminates firmware-transfer byte // drops on the safe-window allocator. 
Do NOT remove this without @@ -268,28 +325,29 @@ func (s *session) onTransferCommit(msg *protoXferCommit) { } id := cur.meta.ID if msg.Size != cur.meta.Size || cur.bytesWritten != cur.meta.Size { - println("[fabric]", "sid", s.localSID, "xfer_commit failed", - "id", id, "err", "size_mismatch", - "bytes_written", u32s(cur.bytesWritten), - "msg_size", u32s(msg.Size), "meta_size", u32s(cur.meta.Size)) - s.abortTransfer("size_mismatch") - s.sendTransferAbort(id, "size_mismatch") + reason := "short_transfer" + s.abortTransfer(reason) + s.sendTransferAbort(id, reason) + return + } + if msg.DigestAlg != digestAlg { + s.abortTransfer("unsupported_digest_alg") + s.sendTransferAbort(id, "unsupported_digest_alg") + return + } + commitDigest, ok := canonicalXXHash32Hex(msg.Digest) + if !ok { + s.abortTransfer("invalid_digest") + s.sendTransferAbort(id, "invalid_digest") return } streamedHex := xxhashHex(cur.hasher.Sum32()) - commitChecksum := lowerHex(msg.Checksum) - if commitChecksum != cur.meta.Checksum || streamedHex != cur.meta.Checksum { - println("[fabric]", "sid", s.localSID, "xfer_commit failed", - "id", id, "err", "checksum_mismatch", - "begin", cur.meta.Checksum, - "commit", commitChecksum, - "streamed", streamedHex, - ) - s.abortTransfer("checksum_mismatch") - s.sendTransferAbort(id, "checksum_mismatch") + if commitDigest != cur.meta.Digest || streamedHex != cur.meta.Digest { + s.abortTransfer("digest_mismatch") + s.sendTransferAbort(id, "digest_mismatch") return } - info, err := cur.sink.Commit() + _, err := cur.sink.Commit() if err != nil { s.logKV("transfer commit failed", "err", err.Error()) reason := err.Error() @@ -298,22 +356,112 @@ func (s *session) onTransferCommit(msg *protoXferCommit) { return } sink := cur.sink + meta := cur.meta + s.extendTransferQuiet("xfer_commit_target", transferCompleteQuiet) s.clearTransfer() - println( - "[fabric]", "sid", s.localSID, - "xfer_commit accepted", - "id", id, - "bytes_written", u32s(info.BytesWritten), - ) - if 
!s.sendTransferDone(id) { + + bytesPayload := sink.Bytes() + ok, reason := s.invokeTransferTarget(meta, id, bytesPayload) + if !ok { + s.extendTransferQuiet("xfer_target_rejected", transferCompleteQuiet) + s.sendTransferAbort(id, reason) return } - time.Sleep(postTransferDoneSettle) - if err := sink.Apply(); err != nil { - s.logKV("transfer apply failed", "err", err.Error()) - return + s.extendTransferQuiet("xfer_done", transferCompleteQuiet) + s.sendTransferDone(id) +} + +const targetCallTimeout = 5 * time.Second + +// invokeTransferTarget calls the local updater staging RPC named by +// xfer_begin.target. The wire no longer carries raw/member receiver topics; +// target="updater/main" maps to an internal bus RPC owned by the updater +// service. The reply gates whether fabric sends xfer_done or xfer_abort. +func (s *session) invokeTransferTarget(meta transferMeta, xferID string, artefact []byte) (bool, string) { + if meta.Target != transferTargetUpdaterMain { + return false, "unsupported_target" + } + payload := updater.StagePayload{ + LinkID: s.linkID, + XferID: xferID, + Target: meta.Target, + Size: meta.Size, + DigestAlg: meta.DigestAlg, + Digest: meta.Digest, + Meta: meta.Meta, + Artefact: artefact, + } + msg := s.conn.NewMessage(updater.TopicStageRPC, payload, false) + replySub := s.conn.Request(msg) + defer s.conn.Unsubscribe(replySub) + + select { + case rep, ok := <-replySub.Channel(): + if !ok || rep == nil { + return false, "stage_no_reply" + } + ok, reason := decodeStageReply(rep.Payload) + if !ok { + return false, reason + } + return true, "" + case <-time.After(targetCallTimeout): + return false, "stage_timeout" + } +} + +func decodeStageReply(payload any) (bool, string) { + switch v := payload.(type) { + case nil: + return false, "stage_nil_payload" + case updater.StageReply: + if !v.OK { + if v.Err == "" { + return false, "stage_rejected" + } + return false, v.Err + } + return true, "" + case *updater.StageReply: + if v == nil { + return false, 
"stage_nil_payload" + } + if !v.OK { + if v.Err == "" { + return false, "stage_rejected" + } + return false, v.Err + } + return true, "" + case map[string]any: + ok, _ := v["ok"].(bool) + if !ok { + err, _ := v["err"].(string) + if err == "" { + err = "stage_rejected" + } + return false, err + } + return true, "" + } + b, err := json.Marshal(payload) + if err != nil { + return false, "stage_marshal_failed" + } + var probe struct { + OK bool `json:"ok"` + Err string `json:"err"` + } + if err := json.Unmarshal(b, &probe); err != nil { + return false, "stage_unmarshal_failed" + } + if !probe.OK { + if probe.Err == "" { + return false, "stage_rejected" + } + return false, probe.Err } - println("[fabric]", "sid", s.localSID, "transfer apply ok", "id", id) + return true, "" } func (s *session) onTransferAbort(msg *protoXferAbort) { @@ -326,7 +474,6 @@ func (s *session) onTransferAbort(msg *protoXferAbort) { if reason == "" { reason = "remote_abort" } - println("[fabric]", "sid", s.localSID, "xfer_abort received", "id", cur.meta.ID, "reason", reason) s.abortTransfer(reason) } diff --git a/services/fabric/transfer_sink_buffer.go b/services/fabric/transfer_sink_buffer.go new file mode 100644 index 0000000..bcdb01a --- /dev/null +++ b/services/fabric/transfer_sink_buffer.go @@ -0,0 +1,80 @@ +package fabric + +import "errors" + +// bufferSink is the default transferSink for the fabric-update branch: +// it buffers the verified-by-wire (xxHash32) artefact in RAM and exposes +// the bytes via Bytes() so onTransferCommit can hand them to the +// updater/main staging RPC. The updater is responsible for signed-image +// verification and staging. +// +// Size cap is deliberately conservative: the smoke tests target small +// artefacts and large firmware images need a streaming-into-flash +// sink, which is fabric-security's job. Hitting the cap aborts the +// transfer cleanly via WriteChunk -> ErrArtefactTooLarge. 
+const maxArtefactBytes = 64 * 1024 + +var ErrArtefactTooLarge = errors.New("artefact_too_large") + +type bufferSink struct { + meta transferMeta + buf []byte + closed bool + committed bool +} + +func newBufferSink(meta transferMeta) *bufferSink { + return &bufferSink{ + meta: meta, + buf: make([]byte, 0, sizeHint(meta.Size)), + } +} + +func sizeHint(announced uint32) int { + if announced == 0 || announced > maxArtefactBytes { + return maxArtefactBytes + } + return int(announced) +} + +func (s *bufferSink) WriteChunk(off uint32, data []byte) error { + if s.closed { + return errors.New("sink_closed") + } + if int(off) != len(s.buf) { + return errors.New("unexpected_offset") + } + if len(s.buf)+len(data) > maxArtefactBytes { + return ErrArtefactTooLarge + } + s.buf = append(s.buf, data...) + return nil +} + +func (s *bufferSink) Commit() (transferInfo, error) { + if s.closed { + return transferInfo{}, errors.New("sink_closed") + } + s.committed = true + return transferInfo{BytesWritten: uint32(len(s.buf))}, nil +} + +// Apply is a no-op for the buffer sink — the staged-image apply +// (slot switch + reboot) belongs to the updater's commit RPC, not to +// fabric's transfer state machine. fabric-security wires the real +// apply path through `cmd/self/updater/commit`. +func (s *bufferSink) Apply() error { return nil } + +func (s *bufferSink) Abort(reason string) error { + _ = reason + s.buf = nil + s.closed = true + return nil +} + +func (s *bufferSink) Bytes() []byte { + if !s.committed { + return nil + } + return s.buf +} diff --git a/services/fabric/transfer_sink_rp2350.go b/services/fabric/transfer_sink_rp2350.go index 3360aa6..d8b1d3f 100644 --- a/services/fabric/transfer_sink_rp2350.go +++ b/services/fabric/transfer_sink_rp2350.go @@ -1,19 +1,61 @@ //go:build tinygo && rp2350 -// Default RP2350 transfer sink for the fabric-protocol baseline. 
Rejects all -// transfers at xfer_begin: signed-image verification and staged flash writes -// land in fabric-update via the receiver topic -// `raw/member/mcu/cap/updater/main/rpc/receive` and `pico2-a-b/imagev1/`. Until -// that path lands, the safe default is to refuse incoming transfers rather -// than flash unverified bytes directly into the inactive slot. - package fabric -import "errors" +import ( + "errors" + + "devicecode-go/services/updater" +) -var errTransferUnsupported = errors.New("staging_unavailable: signed-image receiver not present in this build") +type streamedStageSink struct { + accepted uint32 + closed bool +} func beginTransfer(meta transferMeta) (transferSink, error) { - _ = meta - return nil, errTransferUnsupported + if err := updater.BeginStreamedStage(meta.Size); err != nil { + return nil, err + } + return &streamedStageSink{}, nil } + +func (s *streamedStageSink) WriteChunk(off uint32, data []byte) error { + if s.closed { + return errors.New("sink_closed") + } + if s.accepted != off { + return errors.New("unexpected_offset") + } + if err := updater.WriteStreamedStage(data); err != nil { + return err + } + s.accepted += uint32(len(data)) + return nil +} + +func (s *streamedStageSink) Commit() (transferInfo, error) { + if s.closed { + return transferInfo{}, errors.New("sink_closed") + } + written, err := updater.CommitStreamedStage() + if err != nil { + return transferInfo{}, err + } + s.closed = true + return transferInfo{BytesWritten: written}, nil +} + +func (s *streamedStageSink) Apply() error { return nil } + +func (s *streamedStageSink) Abort(reason string) error { + _ = reason + updater.AbortStreamedStage() + s.closed = true + return nil +} + +// Bytes returns nil because the TinyGo RP2350 default path streams directly +// into the inactive slot. fabric still calls updater/main staging; the +// updater consumes the pre-staged descriptor instead of an in-RAM artefact. 
+func (s *streamedStageSink) Bytes() []byte { return nil } diff --git a/services/fabric/transfer_sink_stub.go b/services/fabric/transfer_sink_stub.go index 6386f0a..f07a074 100644 --- a/services/fabric/transfer_sink_stub.go +++ b/services/fabric/transfer_sink_stub.go @@ -1,11 +1,11 @@ //go:build !(tinygo && rp2350) -package fabric - -import "errors" +// Host build (tests, dev tooling): same buffer-sink behaviour as the +// default RP2350 build. Lets unit tests exercise updater/main staging +// without firmware stubs in the way. -var errTransferUnsupported = errors.New("unsupported") +package fabric func beginTransfer(meta transferMeta) (transferSink, error) { - return nil, errTransferUnsupported + return newBufferSink(meta), nil } diff --git a/services/fabric/transfer_test.go b/services/fabric/transfer_test.go index 7837980..872566a 100644 --- a/services/fabric/transfer_test.go +++ b/services/fabric/transfer_test.go @@ -9,6 +9,7 @@ import ( "time" "devicecode-go/bus" + "devicecode-go/services/updater" "devicecode-go/x/xxhash" ) @@ -51,6 +52,10 @@ func (s *fakeTransferSink) Abort(reason string) error { return nil } +// Bytes returns nil because the test fake doesn't retain a RAM copy +// of the transferred bytes — it tracks per-chunk writes instead. +func (s *fakeTransferSink) Bytes() []byte { return nil } + func runSessionWithSink(ctx context.Context, tr Transport, conn *bus.Connection, sink *fakeTransferSink) { s := session{ linkID: defaultLinkID, @@ -70,15 +75,82 @@ func rawURL(data []byte) string { return base64.RawURLEncoding.EncodeToString(data) } -// xxhashStr is the wire-format checksum: lower-case hex, 8 chars, no algorithm -// field. Mirrors the Lua reference's M.digest_hex. +// xxhashStr is the wire-format digest: lower-case hex, 8 chars. Mirrors +// the Lua reference's M.digest_hex. 
func xxhashStr(data []byte) string { return xxhashHex(xxhash.Sum32(data, 0)) } +func xferBegin(id string, payload []byte, meta json.RawMessage) protoXferBegin { + return protoXferBegin{ + Type: msgXferBegin, + XferID: id, + Target: updater.TargetUpdaterMain, + Size: uint32(len(payload)), + DigestAlg: updater.DigestAlgXXHash32, + Digest: xxhashStr(payload), + Meta: meta, + } +} + +func xferChunk(id string, off uint32, payload []byte) protoXferChunk { + return protoXferChunk{ + Type: msgXferChunk, + XferID: id, + Offset: off, + Data: rawURL(payload), + ChunkDigest: xxhashStr(payload), + } +} + +func xferCommit(id string, payload []byte) protoXferCommit { + return protoXferCommit{ + Type: msgXferCommit, + XferID: id, + Size: uint32(len(payload)), + DigestAlg: updater.DigestAlgXXHash32, + Digest: xxhashStr(payload), + } +} + +func installStageResponder(t *testing.T, b *bus.Bus, reply updater.StageReply) <-chan updater.StagePayload { + t.Helper() + conn := b.NewConnection("test-stage") + sub := conn.Subscribe(updater.TopicStageRPC) + t.Cleanup(func() { conn.Unsubscribe(sub) }) + got := make(chan updater.StagePayload, 4) + go func() { + for msg := range sub.Channel() { + if msg == nil { + continue + } + if payload, ok := msg.Payload.(updater.StagePayload); ok { + select { + case got <- payload: + default: + } + } + conn.Reply(msg, reply, false) + } + }() + return got +} + +func readTransferReady(t *testing.T, tr Transport, id string, next uint32) { + t.Helper() + ready := readMsg[protoXferReady](t, tr) + if ready.Type != msgXferReady || ready.XferID != id { + t.Fatalf("bad xfer_ready: %+v", ready) + } + need := readMsg[protoXferNeed](t, tr) + if need.Type != msgXferNeed || need.XferID != id || need.Next != next { + t.Fatalf("bad initial xfer_need: %+v, want id=%s next=%d", need, id, next) + } +} + func TestTransferBeginPreservesMeta(t *testing.T) { // xfer_begin's meta is opaque to fabric-protocol but must be preserved - // for fabric-update's receiver, which pulls 
meta.receiver out of it. + // for updater/main staging diagnostics. b := newBus() cm5, mcu := pipePair() ctx, cancel := context.WithCancel(context.Background()) @@ -102,16 +174,10 @@ func TestTransferBeginPreservesMeta(t *testing.T) { bringUp(t, cm5) payload := []byte("abcd") - metaBlob := json.RawMessage(`{"receiver":["raw","member","mcu","cap","updater","main","rpc","receive"],"version":"1.2.3"}`) + metaBlob := json.RawMessage(`{"version":"1.2.3"}`) - sendMsg(t, cm5, protoXferBegin{ - Type: msgXferBegin, - XferID: "xfer-meta", - Size: uint32(len(payload)), - Checksum: xxhashStr(payload), - Meta: metaBlob, - }) - _ = readMsg[protoXferReady](t, cm5) + sendMsg(t, cm5, xferBegin("xfer-meta", payload, metaBlob)) + readTransferReady(t, cm5, "xfer-meta", 0) if string(captured.Meta) != string(metaBlob) { t.Fatalf("transferMeta.Meta = %q, want %q", captured.Meta, metaBlob) @@ -119,6 +185,29 @@ func TestTransferBeginPreservesMeta(t *testing.T) { if captured.ID != "xfer-meta" || captured.Size != uint32(len(payload)) { t.Fatalf("transferMeta basic fields wrong: %+v", captured) } + if captured.Target != updater.TargetUpdaterMain || captured.DigestAlg != updater.DigestAlgXXHash32 || captured.Digest != xxhashStr(payload) { + t.Fatalf("transferMeta contract fields wrong: %+v", captured) + } +} + +func TestTransferDuplicateBeginResendsReady(t *testing.T) { + b := newBus() + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + sink := &fakeTransferSink{} + go runSessionWithSink(ctx, mcu, b.NewConnection("fabric"), sink) + bringUp(t, cm5) + + payload := []byte("abcd") + begin := xferBegin("xfer-dup", payload, nil) + + sendMsg(t, cm5, begin) + readTransferReady(t, cm5, "xfer-dup", 0) + + sendMsg(t, cm5, begin) + readTransferReady(t, cm5, "xfer-dup", 0) } func TestTransferReceiveSuccess(t *testing.T) { @@ -133,34 +222,19 @@ func TestTransferReceiveSuccess(t *testing.T) { SlotXIPAddr: 0x10280000, }, } + stageCalls := 
installStageResponder(t, b, updater.StageReply{OK: true, Stage: "staged"}) go runSessionWithSink(ctx, mcu, b.NewConnection("fabric"), sink) bringUp(t, cm5) payload := []byte("abcdefghij") - checksum := xxhashStr(payload) - - sendMsg(t, cm5, protoXferBegin{ - Type: msgXferBegin, - XferID: "xfer-2", - Size: uint32(len(payload)), - Checksum: checksum, - }) - - ready := readMsg[protoXferReady](t, cm5) - if ready.Type != msgXferReady || ready.XferID != "xfer-2" { - t.Fatalf("bad xfer_ready: %+v", ready) - } + sendMsg(t, cm5, xferBegin("xfer-2", payload, nil)) + readTransferReady(t, cm5, "xfer-2", 0) parts := [][]byte{payload[:4], payload[4:8], payload[8:]} off := uint32(0) for i, part := range parts { - sendMsg(t, cm5, protoXferChunk{ - Type: msgXferChunk, - XferID: "xfer-2", - Offset: off, - Data: rawURL(part), - }) + sendMsg(t, cm5, xferChunk("xfer-2", off, part)) need := readMsg[protoXferNeed](t, cm5) want := off + uint32(len(part)) if need.Next != want { @@ -169,19 +243,21 @@ func TestTransferReceiveSuccess(t *testing.T) { off = want } - sendMsg(t, cm5, protoXferCommit{ - Type: msgXferCommit, - XferID: "xfer-2", - Size: uint32(len(payload)), - Checksum: checksum, - }) + sendMsg(t, cm5, xferCommit("xfer-2", payload)) done := readMsg[protoXferDone](t, cm5) if done.Type != msgXferDone || done.XferID != "xfer-2" { t.Fatalf("bad xfer_done: %+v", done) } - time.Sleep(postTransferDoneSettle + 50*time.Millisecond) + select { + case call := <-stageCalls: + if call.XferID != "xfer-2" || call.Target != updater.TargetUpdaterMain || call.Digest != xxhashStr(payload) { + t.Fatalf("stage payload wrong: %+v", call) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for stage call") + } if got := string(sink.writes[0]) + string(sink.writes[1]) + string(sink.writes[2]); got != string(payload) { t.Fatalf("sink writes = %q, want %q", got, payload) @@ -189,14 +265,14 @@ func TestTransferReceiveSuccess(t *testing.T) { if !sink.committed { t.Fatal("sink.Commit was not 
called") } - if !sink.applied { - t.Fatal("sink.Apply was not called") + if sink.applied { + t.Fatal("sink.Apply should not be called by strict target staging") } } func TestTransferChunkBadOffsetAborts(t *testing.T) { // Lua transfer_mgr aborts and clears the active transfer on chunk faults - // (unexpected_offset, decode_failed, size_overflow). Match that — do not + // (unexpected_offset, invalid_chunk_encoding, size_too_large). Match that — do not // keep the transfer alive with an xfer_need. b := newBus() cm5, mcu := pipePair() @@ -208,21 +284,17 @@ func TestTransferChunkBadOffsetAborts(t *testing.T) { bringUp(t, cm5) payload := []byte("abcd") - sendMsg(t, cm5, protoXferBegin{ - Type: msgXferBegin, - XferID: "xfer-3", - Size: uint32(len(payload)), - Checksum: xxhashStr(payload), - }) - _ = readMsg[protoXferReady](t, cm5) + sendMsg(t, cm5, xferBegin("xfer-3", payload, nil)) + readTransferReady(t, cm5, "xfer-3", 0) // Send a chunk at the wrong byte offset; expect xfer_abort and // sink.Abort, not an xfer_need retry. sendMsg(t, cm5, protoXferChunk{ - Type: msgXferChunk, - XferID: "xfer-3", - Offset: 7, - Data: rawURL(payload), + Type: msgXferChunk, + XferID: "xfer-3", + Offset: 7, + Data: rawURL(payload), + ChunkDigest: xxhashStr(payload), }) abort := readMsg[protoXferAbort](t, cm5) @@ -248,28 +320,90 @@ func TestTransferChunkDecodeFailureAborts(t *testing.T) { bringUp(t, cm5) payload := []byte("abcd") - sendMsg(t, cm5, protoXferBegin{ - Type: msgXferBegin, - XferID: "xfer-d1", - Size: uint32(len(payload)), - Checksum: xxhashStr(payload), - }) - _ = readMsg[protoXferReady](t, cm5) + sendMsg(t, cm5, xferBegin("xfer-d1", payload, nil)) + readTransferReady(t, cm5, "xfer-d1", 0) // Bogus base64 (uses non-base64url chars). 
+ sendMsg(t, cm5, protoXferChunk{ + Type: msgXferChunk, + XferID: "xfer-d1", + Offset: 0, + Data: "!!!not-base64!!!", + ChunkDigest: xxhashStr(payload), + }) + + abort := readMsg[protoXferAbort](t, cm5) + if abort.Err != "invalid_chunk_encoding" { + t.Fatalf("bad xfer_abort: %+v", abort) + } + if len(sink.abortReasons) == 0 { + t.Fatal("expected sink.Abort on decode failure") + } +} + +func TestTransferChunkMissingDigestAborts(t *testing.T) { + b := newBus() + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + sink := &fakeTransferSink{} + go runSessionWithSink(ctx, mcu, b.NewConnection("fabric"), sink) + bringUp(t, cm5) + + payload := []byte("abcd") + sendMsg(t, cm5, xferBegin("xfer-missing-digest", payload, nil)) + readTransferReady(t, cm5, "xfer-missing-digest", 0) + sendMsg(t, cm5, protoXferChunk{ Type: msgXferChunk, - XferID: "xfer-d1", + XferID: "xfer-missing-digest", Offset: 0, - Data: "!!!not-base64!!!", + Data: rawURL(payload), }) abort := readMsg[protoXferAbort](t, cm5) - if abort.Err != "decode_failed" { + if abort.Err != "missing_chunk_digest" { t.Fatalf("bad xfer_abort: %+v", abort) } if len(sink.abortReasons) == 0 { - t.Fatal("expected sink.Abort on decode failure") + t.Fatal("expected sink.Abort on missing chunk digest") + } +} + +func TestTransferChunkDigestMismatchRequestsSameOffset(t *testing.T) { + b := newBus() + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + sink := &fakeTransferSink{} + go runSessionWithSink(ctx, mcu, b.NewConnection("fabric"), sink) + bringUp(t, cm5) + + payload := []byte("abcd") + sendMsg(t, cm5, xferBegin("xfer-bad-chunk-digest", payload, nil)) + readTransferReady(t, cm5, "xfer-bad-chunk-digest", 0) + + sendMsg(t, cm5, protoXferChunk{ + Type: msgXferChunk, + XferID: "xfer-bad-chunk-digest", + Offset: 0, + Data: rawURL(payload), + ChunkDigest: "00000000", + }) + need := readMsg[protoXferNeed](t, cm5) + if need.Next != 0 
{ + t.Fatalf("retry xfer_need.next = %d, want 0", need.Next) + } + if len(sink.writes) != 0 { + t.Fatalf("sink received %d writes before digest passed", len(sink.writes)) + } + + sendMsg(t, cm5, xferChunk("xfer-bad-chunk-digest", 0, payload)) + need = readMsg[protoXferNeed](t, cm5) + if need.Next != uint32(len(payload)) { + t.Fatalf("xfer_need.next after retry = %d, want %d", need.Next, len(payload)) } } @@ -285,28 +419,24 @@ func TestTransferChunkSizeOverflowAborts(t *testing.T) { payload := []byte("abcd") // Advertise size=4 but send 6 bytes in the first chunk. - sendMsg(t, cm5, protoXferBegin{ - Type: msgXferBegin, - XferID: "xfer-d2", - Size: uint32(len(payload)), - Checksum: xxhashStr(payload), - }) - _ = readMsg[protoXferReady](t, cm5) + sendMsg(t, cm5, xferBegin("xfer-d2", payload, nil)) + readTransferReady(t, cm5, "xfer-d2", 0) sendMsg(t, cm5, protoXferChunk{ - Type: msgXferChunk, - XferID: "xfer-d2", - Offset: 0, - Data: rawURL([]byte("abcdef")), + Type: msgXferChunk, + XferID: "xfer-d2", + Offset: 0, + Data: rawURL([]byte("abcdef")), + ChunkDigest: xxhashStr([]byte("abcdef")), }) abort := readMsg[protoXferAbort](t, cm5) - if abort.Err != "size_overflow" { + if abort.Err != "size_too_large" { t.Fatalf("bad xfer_abort: %+v", abort) } } -func TestTransferCommitChecksumMismatchAborts(t *testing.T) { +func TestTransferCommitDigestMismatchAborts(t *testing.T) { b := newBus() cm5, mcu := pipePair() ctx, cancel := context.WithCancel(context.Background()) @@ -317,40 +447,152 @@ func TestTransferCommitChecksumMismatchAborts(t *testing.T) { bringUp(t, cm5) payload := []byte("abcd") - // Begin with the wrong-checksum advertised. The only way to surface a - // commit-time mismatch is for begin/commit checksums to disagree, OR for - // the streamed bytes to disagree with the begin checksum. Use the - // latter: claim a bogus begin/commit checksum but stream the real bytes. + // Begin with the wrong digest advertised. 
The streamed bytes disagree + // with the begin/commit digest even though the frames agree. bogus := strings.Repeat("0", 8) sendMsg(t, cm5, protoXferBegin{ - Type: msgXferBegin, - XferID: "xfer-4", - Size: uint32(len(payload)), - Checksum: bogus, + Type: msgXferBegin, + XferID: "xfer-4", + Target: updater.TargetUpdaterMain, + Size: uint32(len(payload)), + DigestAlg: updater.DigestAlgXXHash32, + Digest: bogus, }) - _ = readMsg[protoXferReady](t, cm5) + readTransferReady(t, cm5, "xfer-4", 0) - sendMsg(t, cm5, protoXferChunk{ - Type: msgXferChunk, - XferID: "xfer-4", - Offset: 0, - Data: rawURL(payload), - }) + sendMsg(t, cm5, xferChunk("xfer-4", 0, payload)) _ = readMsg[protoXferNeed](t, cm5) sendMsg(t, cm5, protoXferCommit{ - Type: msgXferCommit, - XferID: "xfer-4", - Size: uint32(len(payload)), - Checksum: bogus, + Type: msgXferCommit, + XferID: "xfer-4", + Size: uint32(len(payload)), + DigestAlg: updater.DigestAlgXXHash32, + Digest: bogus, }) abort := readMsg[protoXferAbort](t, cm5) - if abort.Type != msgXferAbort || abort.Err != "checksum_mismatch" { + if abort.Type != msgXferAbort || abort.Err != "digest_mismatch" { t.Fatalf("bad xfer_abort: %+v", abort) } if len(sink.abortReasons) == 0 { - t.Fatal("expected sink abort on checksum mismatch") + t.Fatal("expected sink abort on digest mismatch") + } +} + +// bufferingSinkAdapter wraps the production bufferSink so transfer tests +// can assert the bytes passed to updater/main staging. +type bufferingSinkAdapter struct { + *bufferSink + abortReasons []string +} + +func (b *bufferingSinkAdapter) Abort(reason string) error { + b.abortReasons = append(b.abortReasons, reason) + return b.bufferSink.Abort(reason) +} + +func TestTransferTargetInvokedAfterCommit(t *testing.T) { + // With target=updater/main, fabric calls the local updater stage RPC + // after xfer_commit and before xfer_done. The wire never names a + // raw/member receiver topic. 
+ b := newBus() + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + gotPayload := installStageResponder(t, b, updater.StageReply{OK: true, Stage: "staged"}) + + sink := &bufferingSinkAdapter{bufferSink: newBufferSink(transferMeta{Size: 4})} + s := session{ + linkID: defaultLinkID, + nodeID: "mcu-1", + peerID: "cm5-local", + localSID: "mcu-sid-test", + tr: mcu, + conn: b.NewConnection("fabric"), + beginTransfer: func(meta transferMeta) (transferSink, error) { + sink.bufferSink.meta = meta + return sink, nil + }, + } + go s.run(ctx) + bringUp(t, cm5) + + payload := []byte("abcd") + metaBlob := json.RawMessage(`{"version":"1.2.3"}`) + + sendMsg(t, cm5, xferBegin("xfer-stage", payload, metaBlob)) + readTransferReady(t, cm5, "xfer-stage", 0) + + sendMsg(t, cm5, xferChunk("xfer-stage", 0, payload)) + _ = readMsg[protoXferNeed](t, cm5) + + sendMsg(t, cm5, xferCommit("xfer-stage", payload)) + + select { + case p := <-gotPayload: + if p.XferID != "xfer-stage" { + t.Fatalf("stage xfer_id = %v, want xfer-stage", p.XferID) + } + if p.LinkID != defaultLinkID { + t.Fatalf("stage link_id = %q, want %q", p.LinkID, defaultLinkID) + } + if p.Target != updater.TargetUpdaterMain || p.DigestAlg != updater.DigestAlgXXHash32 || p.Digest != xxhashStr(payload) { + t.Fatalf("stage contract fields wrong: %+v", p) + } + if string(p.Artefact) != string(payload) { + t.Fatalf("stage artefact = %v, want %q", p.Artefact, payload) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for stage call") + } + + done := readMsg[protoXferDone](t, cm5) + if done.XferID != "xfer-stage" { + t.Fatalf("xfer_done xfer_id = %q, want xfer-stage", done.XferID) + } +} + +func TestTransferTargetRejectAbortsTransfer(t *testing.T) { + // updater/main stage replies {ok=false, err=...}. fabric must send + // xfer_abort with the stage reason rather than xfer_done. 
+ b := newBus() + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + _ = installStageResponder(t, b, updater.StageReply{OK: false, Err: "manifest_check_failed"}) + + sink := &bufferingSinkAdapter{bufferSink: newBufferSink(transferMeta{Size: 4})} + s := session{ + linkID: defaultLinkID, + nodeID: "mcu-1", + peerID: "cm5-local", + localSID: "mcu-sid-test", + tr: mcu, + conn: b.NewConnection("fabric"), + beginTransfer: func(meta transferMeta) (transferSink, error) { + sink.bufferSink.meta = meta + return sink, nil + }, + } + go s.run(ctx) + bringUp(t, cm5) + + payload := []byte("abcd") + sendMsg(t, cm5, xferBegin("xfer-rej", payload, nil)) + readTransferReady(t, cm5, "xfer-rej", 0) + sendMsg(t, cm5, xferChunk("xfer-rej", 0, payload)) + _ = readMsg[protoXferNeed](t, cm5) + sendMsg(t, cm5, xferCommit("xfer-rej", payload)) + + abort := readMsg[protoXferAbort](t, cm5) + if abort.XferID != "xfer-rej" { + t.Fatalf("xfer_abort xfer_id = %q, want xfer-rej", abort.XferID) + } + if abort.Err != "manifest_check_failed" { + t.Fatalf("xfer_abort err = %q, want manifest_check_failed", abort.Err) } } @@ -381,13 +623,8 @@ func TestTransferIdleChunkWatchdog(t *testing.T) { bringUp(t, cm5) payload := []byte("abcd") - sendMsg(t, cm5, protoXferBegin{ - Type: msgXferBegin, - XferID: "xfer-wd", - Size: uint32(len(payload)), - Checksum: xxhashStr(payload), - }) - _ = readMsg[protoXferReady](t, cm5) + sendMsg(t, cm5, xferBegin("xfer-wd", payload, nil)) + readTransferReady(t, cm5, "xfer-wd", 0) // Stop sending chunks; watchdog should fire within ~PhaseTimeout + // one exportTickInterval (50ms). @@ -400,8 +637,8 @@ func TestTransferIdleChunkWatchdog(t *testing.T) { } } -func TestTransferCommitChecksumMismatchOnCommitFrameAborts(t *testing.T) { - // xfer_begin and xfer_commit must agree on the checksum. If they +func TestTransferCommitDigestMismatchOnCommitFrameAborts(t *testing.T) { + // xfer_begin and xfer_commit must agree on the digest. 
If they // disagree (even when the streamed bytes match begin), commit aborts. b := newBus() cm5, mcu := pipePair() @@ -413,33 +650,22 @@ func TestTransferCommitChecksumMismatchOnCommitFrameAborts(t *testing.T) { bringUp(t, cm5) payload := []byte("abcd") - good := xxhashStr(payload) - sendMsg(t, cm5, protoXferBegin{ - Type: msgXferBegin, - XferID: "xfer-5", - Size: uint32(len(payload)), - Checksum: good, - }) - _ = readMsg[protoXferReady](t, cm5) + sendMsg(t, cm5, xferBegin("xfer-5", payload, nil)) + readTransferReady(t, cm5, "xfer-5", 0) - sendMsg(t, cm5, protoXferChunk{ - Type: msgXferChunk, - XferID: "xfer-5", - Offset: 0, - Data: rawURL(payload), - }) + sendMsg(t, cm5, xferChunk("xfer-5", 0, payload)) _ = readMsg[protoXferNeed](t, cm5) - // Commit advertises a different checksum than begin: must abort. + // Commit advertises a different digest than begin: must abort. sendMsg(t, cm5, protoXferCommit{ - Type: msgXferCommit, - XferID: "xfer-5", - Size: uint32(len(payload)), - Checksum: strings.Repeat("0", 8), + Type: msgXferCommit, + XferID: "xfer-5", + Size: uint32(len(payload)), + DigestAlg: updater.DigestAlgXXHash32, + Digest: strings.Repeat("0", 8), }) - abort := readMsg[protoXferAbort](t, cm5) - if abort.Type != msgXferAbort || abort.Err != "checksum_mismatch" { + if abort.Type != msgXferAbort || abort.Err != "digest_mismatch" { t.Fatalf("bad xfer_abort: %+v", abort) } } diff --git a/services/fabric/writer.go b/services/fabric/writer.go index 5d3f596..4ca94df 100644 --- a/services/fabric/writer.go +++ b/services/fabric/writer.go @@ -43,6 +43,18 @@ func (l *txLane) pop() []byte { // (e.g. drainExports + drainOutbound generating frames back-to-back). 
func (s *session) enqueueFrame(l lane, data []byte) bool { s.lane(l).push(data) + if fabricTraceEnabled { + println( + "[fabric]", "sid", s.localSID, + "enqueue_frame", + "lane", laneName(l), + "type", protoType(data), + "len", len(data), + "q_control", s.txControl.len(), + "q_rpc", s.txRPC.len(), + "q_bulk", s.txBulk.len(), + ) + } return s.flushWriter() } @@ -59,6 +71,19 @@ func (s *session) lane(l lane) *txLane { } } +func laneName(l lane) string { + switch l { + case laneControl: + return "control" + case laneRPC: + return "rpc" + case laneBulk: + return "bulk" + default: + return "unknown" + } +} + // flushWriter writes queued frames to the transport in priority order: // 1. drain controlQ fully (no fairness), // 2. weighted RR between rpcQ and bulkQ until both empty. @@ -75,18 +100,18 @@ func (s *session) flushWriter() bool { bulkQ = 1 } for s.txControl.len() > 0 { - if !s.writeFrame(s.txControl.pop()) { + if !s.writeFrame(laneControl, s.txControl.pop()) { return false } } for s.txRPC.len() > 0 || s.txBulk.len() > 0 { for i := 0; i < rpcQ && s.txRPC.len() > 0; i++ { - if !s.writeFrame(s.txRPC.pop()) { + if !s.writeFrame(laneRPC, s.txRPC.pop()) { return false } } for i := 0; i < bulkQ && s.txBulk.len() > 0; i++ { - if !s.writeFrame(s.txBulk.pop()) { + if !s.writeFrame(laneBulk, s.txBulk.pop()) { return false } } @@ -96,10 +121,20 @@ func (s *session) flushWriter() bool { // writeFrame is the actual transport write. Mirrors what the prior // sendFrame did inline; isolated so flushWriter can call it per-frame. 
-func (s *session) writeFrame(data []byte) bool { +func (s *session) writeFrame(l lane, data []byte) bool { if len(data) > 0 && data[len(data)-1] == '\n' { data = data[:len(data)-1] } + if fabricTraceEnabled { + println( + "[fabric]", "sid", s.localSID, + "tx_frame", + "lane", laneName(l), + "type", protoType(data), + "len", len(data), + "line", tracePreview(data), + ) + } if err := s.tr.WriteLine(data); err != nil { if errors.Is(err, ErrLineTooLong) { s.log("oversized write dropped") diff --git a/services/reactor/reactor.go b/services/reactor/reactor.go index 9429b2c..8fac7c8 100644 --- a/services/reactor/reactor.go +++ b/services/reactor/reactor.go @@ -9,12 +9,31 @@ import ( "devicecode-go/bus" "devicecode-go/services/fabric" + "devicecode-go/services/telemetry" + "devicecode-go/services/updater" "devicecode-go/types" "devicecode-go/utilities" "devicecode-go/x/shmring" "devicecode-go/x/strconvx" ) +// FirmwareVersion/FirmwareBuild/FirmwareImageID are the stamps the updater +// publishes via state/self/software. main may override them before the reactor +// starts; defaults are development sentinels. +var ( + FirmwareVersion = "0.0.0-dev" + FirmwareBuild = "local" + FirmwareImageID = "img-dev" +) + +func firmwareIdentity() updater.Identity { + return updater.Identity{ + Version: FirmwareVersion, + Build: FirmwareBuild, + ImageID: FirmwareImageID, + } +} + const fabricWaitLogInterval = 2 * time.Second // ----------------------------------------------------------------------------- @@ -162,6 +181,9 @@ type Reactor struct { // misc now time.Time + + // updater service handle used by the post-hello_ack republish hook. 
+ updater *updater.Service } func NewReactor(b *bus.Bus, uiConn *bus.Connection) *Reactor { @@ -398,6 +420,31 @@ func (r *Reactor) emitMemSnapshot() { } func (r *Reactor) Run(ctx context.Context) { + // Updater service: state machine + cmd/self/updater/{prepare,commit} + // RPC handlers + updater/main staging + retained state/self/{software, + // updater, health} facts. Started early so the initial fact retains + // land before fabric establishes — that way the first hello_ack + // observer sees a populated retain store. + updaterConn := r.bus.NewConnection("updater") + identity := firmwareIdentity() + updaterSvc := updater.New(updater.Options{ + Conn: updaterConn, + Verifier: updater.PassthroughVerifier(identity), + Applier: updater.ProductionApplier(), + Identity: identity, + }) + go updaterSvc.Run(ctx) + r.updater = updaterSvc + + // Telemetry service: subscribes to HAL value topics and republishes + // at state/self/* with integer engineering units; runs the charger + // alert FSM and emits event/self/power/charger/alert on bit-set + // transitions. Started after the updater so the initial software/ + // updater retains land first. + telemetryConn := r.bus.NewConnection("telemetry") + telemetrySvc := telemetry.New(telemetryConn) + go telemetrySvc.Run(ctx) + // Subscriptions (env + power) log.Println("[main] subscribing env + power …") tempSub := r.uiConn.Subscribe(tTempValue) diff --git a/services/telemetry/alerts.go b/services/telemetry/alerts.go new file mode 100644 index 0000000..ffb4982 --- /dev/null +++ b/services/telemetry/alerts.go @@ -0,0 +1,199 @@ +package telemetry + +import "devicecode-go/types" + +// chargerAlertFSM implements W8 from docs/firmware-alignment-update.md: +// hold previous bitfield state; on bit-set transition for a kind, emit +// one normal event with the canonical kind name. 
The 14 canonical kinds +// split into: +// - 11 bit-driven kinds (state[] + status[]), compared against the +// previous ChargerValue snapshot +// - 3 analog kinds (vin_lo / vin_hi / bsr_high), compared against +// the thresholds carried on state/self/power/charger/config. +// vin_lo + vin_hi observe ChargerValue.VIN_mV; bsr_high observes +// BatteryValue.BSR_uOhmPerCell. +// +// Each kind fires only on the boundary-crossing edge. While a value +// stays past its threshold (or a bit stays set), no further alerts. +type chargerAlertFSM struct { + prev types.ChargerValue + prevBSR uint32 + seen bool + seenBSR bool +} + +// AlertKind is the canonical alert kind name (snake_case) sent on +// the wire as event/self/power/charger/alert.kind. The 14 values +// are frozen by the spec; new kinds must be added here AND on the +// CM5 import side. +type AlertKind string + +const ( + AlertVinLo AlertKind = "vin_lo" + AlertVinHi AlertKind = "vin_hi" + AlertBsrHigh AlertKind = "bsr_high" + AlertBatMissing AlertKind = "bat_missing" + AlertBatShort AlertKind = "bat_short" + AlertMaxChargeTimeFault AlertKind = "max_charge_time_fault" + AlertAbsorb AlertKind = "absorb" + AlertEqualize AlertKind = "equalize" + AlertCccv AlertKind = "cccv" + AlertPrecharge AlertKind = "precharge" + AlertIinLimited AlertKind = "iin_limited" + AlertUvclActive AlertKind = "uvcl_active" + AlertCcPhase AlertKind = "cc_phase" + AlertCvPhase AlertKind = "cv_phase" +) + +// AllAlertKinds enumerates every canonical kind. Tests assert this is +// exactly 14 entries and that publishing rejects anything outside the +// set. +var AllAlertKinds = []AlertKind{ + AlertVinLo, AlertVinHi, AlertBsrHigh, + AlertBatMissing, AlertBatShort, AlertMaxChargeTimeFault, + AlertAbsorb, AlertEqualize, AlertCccv, AlertPrecharge, + AlertIinLimited, AlertUvclActive, AlertCcPhase, AlertCvPhase, +} + +// AlertEvent is the payload at event/self/power/charger/alert. 
Not +// retained — the publisher uses retained=false so subscribers only +// see live transitions, not stale alerts on reconnect. +type AlertEvent struct { + Kind AlertKind `json:"kind"` + Severity string `json:"severity"` + Source string `json:"source"` + StateBits uint16 `json:"state_bits"` + StatusBits uint16 `json:"status_bits"` + SystemBits uint16 `json:"system_bits"` + Seq uint32 `json:"seq"` + UptimeMs int64 `json:"uptime_ms"` +} + +// alertSeverity returns the canonical severity for a kind. Faults +// surface as "warning"; charge-phase / control-loop transitions are +// "info". Splitting it out keeps the FSM's emit loop tiny and gives +// the spec one place to grow if severity refines later. +func alertSeverity(k AlertKind) string { + switch k { + case AlertBatMissing, AlertBatShort, AlertMaxChargeTimeFault, AlertBsrHigh: + return "warning" + default: + return "info" + } +} + +// observe runs one tick of the FSM against an incoming ChargerValue. +// On every bit-set transition we emit one event. Cleared bits do +// nothing (sparse stream — no clear-events). +func (f *chargerAlertFSM) observe(s *Service, v types.ChargerValue) { + if !f.seen { + f.prev = v + f.seen = true + return + } + + // State bits (CHARGER_STATE_ALERTS): 7 of the 11 bits map to + // canonical kinds. Bits with display name "suspended", "ntc_pause", + // "timer_term", "c_over_x_term" don't map to alert kinds in the + // spec — they're informational only.
+ f.fireOnSet(s, v, uint16(types.BatMissingFault), AlertBatMissing, + uint16(v.State), uint16(f.prev.State)) + f.fireOnSet(s, v, uint16(types.BatShortFault), AlertBatShort, + uint16(v.State), uint16(f.prev.State)) + f.fireOnSet(s, v, uint16(types.MaxChargeTimeFault), AlertMaxChargeTimeFault, + uint16(v.State), uint16(f.prev.State)) + f.fireOnSet(s, v, uint16(types.AbsorbCharge), AlertAbsorb, + uint16(v.State), uint16(f.prev.State)) + f.fireOnSet(s, v, uint16(types.EqualizeCharge), AlertEqualize, + uint16(v.State), uint16(f.prev.State)) + f.fireOnSet(s, v, uint16(types.CCCVCharge), AlertCccv, + uint16(v.State), uint16(f.prev.State)) + f.fireOnSet(s, v, uint16(types.Precharge), AlertPrecharge, + uint16(v.State), uint16(f.prev.State)) + + // Status bits (CHARGE_STATUS): all 4 map to kinds. + f.fireOnSet(s, v, uint16(types.IinLimitActive), AlertIinLimited, + uint16(v.Status), uint16(f.prev.Status)) + f.fireOnSet(s, v, uint16(types.VinUvclActive), AlertUvclActive, + uint16(v.Status), uint16(f.prev.Status)) + f.fireOnSet(s, v, uint16(types.ConstCurrent), AlertCcPhase, + uint16(v.Status), uint16(f.prev.Status)) + f.fireOnSet(s, v, uint16(types.ConstVoltage), AlertCvPhase, + uint16(v.Status), uint16(f.prev.Status)) + + // Analog kinds — vin_lo and vin_hi compare ChargerValue.VIN_mV + // against the published thresholds on state/self/power/charger/ + // config. bsr_high comes from BatteryValue and is handled in + // observeBattery below. + th := s.chargerThresholds() + if th.VinLoMV > 0 { + // Edge from "vin >= threshold" to "vin < threshold". + if f.prev.VIN_mV >= th.VinLoMV && v.VIN_mV < th.VinLoMV { + s.emitAlert(v, AlertVinLo) + } + } + if th.VinHiMV > 0 { + // Edge from "vin <= threshold" to "vin > threshold". + if f.prev.VIN_mV <= th.VinHiMV && v.VIN_mV > th.VinHiMV { + s.emitAlert(v, AlertVinHi) + } + } + + f.prev = v +} + +// observeBattery feeds the bsr_high analog kind. 
BSR +// (battery-source-resistance) lives on BatteryValue, not +// ChargerValue, so it gets its own observer entry point. +func (f *chargerAlertFSM) observeBattery(s *Service, b types.BatteryValue) { + if !f.seenBSR { + f.prevBSR = b.BSR_uOhmPerCell + f.seenBSR = true + return + } + th := s.chargerThresholds() + if th.BSRHighUohmPerCell > 0 { + if f.prevBSR <= th.BSRHighUohmPerCell && b.BSR_uOhmPerCell > th.BSRHighUohmPerCell { + // bsr_high carries the latest charger snapshot for context; + // state/status/system bits are the most recent we saw. + s.emitAlert(f.prev, AlertBsrHigh) + } + } + f.prevBSR = b.BSR_uOhmPerCell +} + +// fireOnSet emits an alert if the bit went from clear to set between +// prev and curr. Bit is passed as a uint16 mask — call sites convert +// from their typed bit-flag (types.ChargerStateBits etc.) at the +// callsite to keep this helper free of generics overhead. +func (f *chargerAlertFSM) fireOnSet( + s *Service, + v types.ChargerValue, + mask uint16, + kind AlertKind, + curr, prev uint16, +) { + if mask == 0 { + return + } + wasSet := (prev & mask) != 0 + isSet := (curr & mask) != 0 + if !wasSet && isSet { + s.emitAlert(v, kind) + } +} + +func (s *Service) emitAlert(v types.ChargerValue, kind AlertKind) { + ev := AlertEvent{ + Kind: kind, + Severity: alertSeverity(kind), + Source: "ltc4015", + StateBits: v.State, + StatusBits: v.Status, + SystemBits: v.Sys, + Seq: s.seqChargerAlert.Add(1), + UptimeMs: s.uptimeMs(), + } + // Sparse alerts: NOT retained. + s.conn.Publish(s.conn.NewMessage(TopicChargerAlert, ev, false)) +} diff --git a/services/telemetry/telemetry.go b/services/telemetry/telemetry.go new file mode 100644 index 0000000..c298d28 --- /dev/null +++ b/services/telemetry/telemetry.go @@ -0,0 +1,521 @@ +// Package telemetry implements the W7/W8 retained-state and sparse- +// alert publishers from docs/firmware-alignment-update.md. It +// subscribes to the existing HAL value topics (hal/cap/env/..., +// hal/cap/power/...) 
and republishes them under the canonical +// state/self/* surface using integer engineering units, plus runs the +// charger alert FSM that emits event/self/power/charger/alert with +// 14 canonical kinds. +// +// Boundary: telemetry does NOT touch the updater state machine — it +// only consumes HAL data and produces fact retains + alert events. +// The fabric service then exports them onto the wire via the +// state/self/* + event/self/* export rules in services/fabric/remap.go. +package telemetry + +import ( + "context" + "encoding/json" + "runtime" + "sync/atomic" + "time" + + "devicecode-go/bus" + "devicecode-go/types" +) + +// Topic constants. Mirrors the canonical fact schema in +// docs/firmware-alignment-update.md §"Telemetry/state facts". +var ( + TopicBattery = bus.T("state", "self", "power", "battery") + TopicCharger = bus.T("state", "self", "power", "charger") + TopicChargerCfg = bus.T("state", "self", "power", "charger", "config") + TopicEnvTemp = bus.T("state", "self", "environment", "temperature") + TopicEnvHumidity = bus.T("state", "self", "environment", "humidity") + TopicRuntimeMem = bus.T("state", "self", "runtime", "memory") + + TopicChargerAlert = bus.T("event", "self", "power", "charger", "alert") + + // topicFabricLink mirrors the updater's W10 watcher — telemetry + // republishes the charger config retain on every link-ready edge + // so the CM5 sees a fresh config fact on every newly established + // session, warm or cold. (Per-value retains like + // state/self/power/battery refresh naturally on the next HAL + // publish; the static-ish config fact needs an explicit re-emit.) + topicFabricLink = bus.T("state", "fabric", "link", "+") +) + +// HAL source topics — single point of truth for what we subscribe to.
+var ( + halEnvTemp = bus.T("hal", "cap", "env", string(types.KindTemperature), "core", "value") + halEnvHum = bus.T("hal", "cap", "env", string(types.KindHumidity), "core", "value") + halPwrAny = bus.T("hal", "cap", "power", "+", "internal", "value") +) + +// MemSnapshotInterval is how often the runtime/memory fact republishes. +// Keep it on the order of the existing reactor mem-stat cadence to +// avoid burning UART bandwidth on changes that don't affect anything. +const MemSnapshotInterval = 3 * time.Second + +// ChargerThresholds carries the analog comparison thresholds used by +// both the state/self/power/charger/config retained fact (W7 finish) +// and the charger alert FSM's analog kinds (W8 finish — vin_lo, +// vin_hi, bsr_high). +// +// These ARE the LTC4015 effective config in production; on this +// branch they default to conservative bring-up values. +type ChargerThresholds struct { + VinLoMV int32 `json:"vin_lo_mV"` + VinHiMV int32 `json:"vin_hi_mV"` + BSRHighUohmPerCell uint32 `json:"bsr_high_uohm_per_cell"` +} + +// ChargerAlertMask is the 14-bool mask matching the 14 canonical alert +// kinds. Pre-fabric-security the mask is informational only — the +// alert FSM ignores it for emission. Once the LTC4015 driver +// programs the chip's alert-enable register from this and reports it +// back, masking can flow through to the FSM. Names here are +// spec-frozen to match docs/firmware-alignment-update.md. 
+type ChargerAlertMask struct { + VinLo bool `json:"vin_lo"` + VinHi bool `json:"vin_hi"` + BSRHigh bool `json:"bsr_high"` + BatMissing bool `json:"bat_missing"` + BatShort bool `json:"bat_short"` + MaxChargeTimeFault bool `json:"max_charge_time_fault"` + Absorb bool `json:"absorb"` + Equalize bool `json:"equalize"` + CCCV bool `json:"cccv"` + Precharge bool `json:"precharge"` + IinLimited bool `json:"iin_limited"` + UvclActive bool `json:"uvcl_active"` + CcPhase bool `json:"cc_phase"` + CvPhase bool `json:"cv_phase"` +} + +// ChargerConfig is the typed input into the publisher; the runtime +// fact wraps it inside ChargerConfigFact with seq + uptime_ms. +// +// Source is the value emitted on the wire as the fact's "source" +// field. Use "ltc4015" when the driver has read the effective +// programmed register state; use "ltc4015-default" (the +// DefaultChargerConfig value) to make it visible on the wire that +// these are fallback bring-up values, not what the chip is actually +// programmed with. The source string tracks the data's provenance so defaults +// are not presented as values read back from the chip. +type ChargerConfig struct { + Source string + Thresholds ChargerThresholds + AlertMaskBits uint16 + AlertMask ChargerAlertMask +} + +// DefaultChargerConfig returns conservative bring-up values labelled +// source="ltc4015-default" so consumers can spot that the LTC4015 +// driver has not supplied effective programmed values. VinLoMV / +// VinHiMV bracket a healthy USB-C / 12 V input; BSRHigh targets a +// typical lead-acid pack BSR. +func DefaultChargerConfig() ChargerConfig { + return ChargerConfig{ + Source: "ltc4015-default", + Thresholds: ChargerThresholds{ + VinLoMV: 10500, + VinHiMV: 17000, + BSRHighUohmPerCell: 5000, + }, + // Mask bits + booleans both zero — alerts unmasked at the + // chip level by default. The FSM emits regardless on this + // branch (informational mask only). 
+ } +} + +// Service runs the telemetry publishers + charger alert FSM. Started +// from the reactor in its own goroutine. +type Service struct { + conn *bus.Connection + + // monotonic seq counters per topic — keeps the CM5 import side + // able to spot stalls without reading payload contents. + seqBattery atomic.Uint32 + seqCharger atomic.Uint32 + seqChargerCfg atomic.Uint32 + seqEnvTemp atomic.Uint32 + seqEnvHum atomic.Uint32 + seqRuntimeMem atomic.Uint32 + seqChargerAlert atomic.Uint32 + + startedAt time.Time + + // chargerCfg carries the analog thresholds the alert FSM uses for + // the vin_lo / vin_hi / bsr_high kinds, plus the alert mask the + // charger config fact retains to the wire. + chargerCfg ChargerConfig + + // alert FSM previous-bitfield state. Compared against incoming + // values to detect bit-set transitions. + alertFSM chargerAlertFSM +} + +// New constructs the service. conn must be a fresh bus connection +// dedicated to telemetry (not shared with the updater or fabric). +func New(conn *bus.Connection) *Service { + return &Service{ + conn: conn, + startedAt: time.Now(), + chargerCfg: DefaultChargerConfig(), + } +} + +func (s *Service) chargerThresholds() ChargerThresholds { + return s.chargerCfg.Thresholds +} + +// Run subscribes to HAL inputs and runs the publish loop. Blocks +// until ctx is cancelled. +func (s *Service) Run(ctx context.Context) { + tempSub := s.conn.Subscribe(halEnvTemp) + defer s.conn.Unsubscribe(tempSub) + humSub := s.conn.Subscribe(halEnvHum) + defer s.conn.Unsubscribe(humSub) + pwrSub := s.conn.Subscribe(halPwrAny) + defer s.conn.Unsubscribe(pwrSub) + + // Charger config retain at startup — the CM5-side import keys + // off this for the `update.components.mcu.charger_*` view. The + // retained fact carries the same thresholds the alert FSM's + // analog kinds (vin_lo / vin_hi / bsr_high) compare against via + // chargerThresholds().
+ s.publishChargerConfig() + + linkSub := s.conn.Subscribe(topicFabricLink) + defer s.conn.Unsubscribe(linkSub) + + memTick := time.NewTicker(MemSnapshotInterval) + defer memTick.Stop() + + prevReady := map[string]bool{} + + for { + select { + case <-ctx.Done(): + return + case msg, ok := <-tempSub.Channel(): + if !ok || msg == nil { + continue + } + if v, ok := msg.Payload.(types.TemperatureValue); ok { + s.publishEnvTemp(v) + } + case msg, ok := <-humSub.Channel(): + if !ok || msg == nil { + continue + } + if v, ok := msg.Payload.(types.HumidityValue); ok { + s.publishEnvHum(v) + } + case msg, ok := <-pwrSub.Channel(): + if !ok || msg == nil { + continue + } + s.dispatchPower(msg) + case msg, ok := <-linkSub.Channel(): + if !ok || msg == nil { + continue + } + linkID, ready := decodeLinkReady(msg) + if linkID == "" { + continue + } + was := prevReady[linkID] + if ready && !was { + s.publishChargerConfig() + } + prevReady[linkID] = ready + case <-memTick.C: + s.publishRuntimeMem() + } + } +} + +// decodeLinkReady mirrors services/updater's helper but local to the +// telemetry package — kept duplicated rather than reaching into +// updater (cleaner package boundary). +func decodeLinkReady(msg *bus.Message) (string, bool) { + if msg == nil { + return "", false + } + t := msg.Topic + if t == nil || t.Len() < 4 { + return "", false + } + id, _ := t.At(t.Len() - 1).(string) + if id == "" { + return "", false + } + switch p := msg.Payload.(type) { + case nil: + return id, false + case map[string]any: + ready, _ := p["ready"].(bool) + return id, ready + } + // Probe via JSON for the typed-struct payload fabric publishes. + b, err := json.Marshal(msg.Payload) + if err != nil { + return id, false + } + var probe struct { + Ready bool `json:"ready"` + } + if err := json.Unmarshal(b, &probe); err != nil { + return id, false + } + return id, probe.Ready +} + +// dispatchPower splits the power-domain wildcard into per-kind +// publish paths. 
Kept tiny: BatteryValue and ChargerValue are the only +// shapes we consume on this branch (TemperatureValue from +// power/temperature/internal is intentionally NOT republished — the +// canonical contract puts thermal info under environment/temperature). +func (s *Service) dispatchPower(msg *bus.Message) { + switch v := msg.Payload.(type) { + case types.BatteryValue: + s.publishBattery(v) + s.alertFSM.observeBattery(s, v) + case types.ChargerValue: + s.publishCharger(v) + s.alertFSM.observe(s, v) + } +} + +// uptimeMs returns a service-monotonic uptime — close enough to a +// boot-uptime for the consumers' purposes (within a few HAL-init ms). +func (s *Service) uptimeMs() int64 { + return time.Since(s.startedAt).Milliseconds() +} + +// ---- W7: retained-state publishers --------------------------------- + +// BatteryFact is the retained payload at state/self/power/battery. +// All units are integer engineering units per the spec. +type BatteryFact struct { + PackMV int32 `json:"pack_mV"` + PerCellMV int32 `json:"per_cell_mV"` + IBatMA int32 `json:"ibat_mA"` + TempMC int32 `json:"temp_mC"` + BSRUOhmPerCell uint32 `json:"bsr_uohm_per_cell"` + Seq uint32 `json:"seq"` + UptimeMs int64 `json:"uptime_ms"` +} + +func (s *Service) publishBattery(v types.BatteryValue) { + fact := BatteryFact{ + PackMV: v.PackMilliV, + PerCellMV: v.PerCellMilliV, + IBatMA: v.IBatMilliA, + TempMC: v.TempMilliC, + BSRUOhmPerCell: v.BSR_uOhmPerCell, + Seq: s.seqBattery.Add(1), + UptimeMs: s.uptimeMs(), + } + s.conn.Publish(s.conn.NewMessage(TopicBattery, fact, true)) +} + +// ChargerFact is the retained payload at state/self/power/charger. +// Carries raw bitfields AND 3 decoded boolean objects. +// +// The canonical key names below come from +// docs/firmware-alignment-update.md §"Telemetry/state facts" — they +// are NOT the existing display names in types.ChargerStateTable etc. +// (those drop the `_charge` / `_active` / `_fault` suffixes for +// log-line brevity). 
The wire-canonical names are spec-frozen because +// the Lua import side keys off them; renaming any of these is a +// wire-break. +type ChargerFact struct { + VinMV int32 `json:"vin_mV"` + VsysMV int32 `json:"vsys_mV"` + IinMA int32 `json:"iin_mA"` + StateBits uint16 `json:"state_bits"` + StatusBits uint16 `json:"status_bits"` + SystemBits uint16 `json:"system_bits"` + State map[string]bool `json:"state"` + Status map[string]bool `json:"status"` + System map[string]bool `json:"system"` + Seq uint32 `json:"seq"` + UptimeMs int64 `json:"uptime_ms"` +} + +// Canonical name tables. Each entry is a (bit, canonical-name) pair. +// Counts match the spec's "27 booleans total: 11 + 4 + 12". +var chargerStateNames = []struct { + bit types.ChargerStateBits + name string +}{ + {types.EqualizeCharge, "equalize_charge"}, + {types.AbsorbCharge, "absorb_charge"}, + {types.ChargerSuspended, "charger_suspended"}, + {types.Precharge, "precharge"}, + {types.CCCVCharge, "cccv_charge"}, + {types.NTCPause, "ntc_pause"}, + {types.TimerTerm, "timer_term"}, + {types.COverXTerm, "c_over_x_term"}, + {types.MaxChargeTimeFault, "max_charge_time_fault"}, + {types.BatMissingFault, "bat_missing_fault"}, + {types.BatShortFault, "bat_short_fault"}, +} + +var chargerStatusNames = []struct { + bit types.ChargeStatusBits + name string +}{ + {types.VinUvclActive, "vin_uvcl_active"}, + {types.IinLimitActive, "iin_limit_active"}, + {types.ConstCurrent, "const_current"}, + {types.ConstVoltage, "const_voltage"}, +} + +var chargerSystemNames = []struct { + bit types.SystemStatus + name string +}{ + {types.ChargerEnabled, "charger_enabled"}, + {types.MpptEnPin, "mppt_en_pin"}, + {types.EqualizeReq, "equalize_req"}, + {types.DrvccGood, "drvcc_good"}, + {types.CellCountError, "cell_count_error"}, + {types.OkToCharge, "ok_to_charge"}, + {types.NoRt, "no_rt"}, + {types.ThermalShutdown, "thermal_shutdown"}, + {types.VinOvlo, "vin_ovlo"}, + {types.VinGtVbat, "vin_gt_vbat"}, + {types.IntvccGt4p3V, 
"intvcc_gt_4p3v"}, + {types.IntvccGt2p8V, "intvcc_gt_2p8v"}, +} + +func decodeChargerState(v uint16) map[string]bool { + out := make(map[string]bool, len(chargerStateNames)) + for _, e := range chargerStateNames { + out[e.name] = (v & uint16(e.bit)) != 0 + } + return out +} + +func decodeChargerStatus(v uint16) map[string]bool { + out := make(map[string]bool, len(chargerStatusNames)) + for _, e := range chargerStatusNames { + out[e.name] = (v & uint16(e.bit)) != 0 + } + return out +} + +func decodeChargerSystem(v uint16) map[string]bool { + out := make(map[string]bool, len(chargerSystemNames)) + for _, e := range chargerSystemNames { + out[e.name] = (v & uint16(e.bit)) != 0 + } + return out +} + +func (s *Service) publishCharger(v types.ChargerValue) { + fact := ChargerFact{ + VinMV: v.VIN_mV, + VsysMV: v.VSYS_mV, + IinMA: v.IIn_mA, + StateBits: v.State, + StatusBits: v.Status, + SystemBits: v.Sys, + State: decodeChargerState(v.State), + Status: decodeChargerStatus(v.Status), + System: decodeChargerSystem(v.Sys), + Seq: s.seqCharger.Add(1), + UptimeMs: s.uptimeMs(), + } + s.conn.Publish(s.conn.NewMessage(TopicCharger, fact, true)) +} + +// ChargerConfigFact — state/self/power/charger/config. Effective +// LTC4015 configuration. Strictly forbidden from carrying +// operating-state booleans (charger_enabled, ok_to_charge, etc.) — +// those live on state/self/power/charger. +type ChargerConfigFact struct { + Schema int `json:"schema"` + Source string `json:"source"` + Thresholds ChargerThresholds `json:"thresholds"` + AlertMaskBits uint16 `json:"alert_mask_bits"` + AlertMask ChargerAlertMask `json:"alert_mask"` + Seq uint32 `json:"seq"` + UptimeMs int64 `json:"uptime_ms"` +} + +func (s *Service) publishChargerConfig() { + cfg := s.chargerCfg + source := cfg.Source + if source == "" { + // Defensive: a caller that constructed ChargerConfig without + // going through DefaultChargerConfig may have left this empty. 
+ // Make the gap visible on the wire rather than misreporting. + source = "ltc4015-default" + } + fact := ChargerConfigFact{ + Schema: 1, + Source: source, + Thresholds: cfg.Thresholds, + AlertMaskBits: cfg.AlertMaskBits, + AlertMask: cfg.AlertMask, + Seq: s.seqChargerCfg.Add(1), + UptimeMs: s.uptimeMs(), + } + s.conn.Publish(s.conn.NewMessage(TopicChargerCfg, fact, true)) +} + +// EnvTempFact — state/self/environment/temperature. +type EnvTempFact struct { + DeciC int32 `json:"deci_c"` + Seq uint32 `json:"seq"` + UptimeMs int64 `json:"uptime_ms"` +} + +func (s *Service) publishEnvTemp(v types.TemperatureValue) { + fact := EnvTempFact{ + DeciC: int32(v.DeciC), + Seq: s.seqEnvTemp.Add(1), + UptimeMs: s.uptimeMs(), + } + s.conn.Publish(s.conn.NewMessage(TopicEnvTemp, fact, true)) +} + +// EnvHumFact — state/self/environment/humidity. +type EnvHumFact struct { + RHx100 int32 `json:"rh_x100"` + Seq uint32 `json:"seq"` + UptimeMs int64 `json:"uptime_ms"` +} + +func (s *Service) publishEnvHum(v types.HumidityValue) { + fact := EnvHumFact{ + RHx100: int32(v.RHx100), + Seq: s.seqEnvHum.Add(1), + UptimeMs: s.uptimeMs(), + } + s.conn.Publish(s.conn.NewMessage(TopicEnvHumidity, fact, true)) +} + +// RuntimeMemFact — state/self/runtime/memory. Sourced from +// runtime.MemStats.Alloc; sufficient for the retained-fact +// "memory pressure" signal Lua consumers expect. 
+type RuntimeMemFact struct { + AllocBytes uint64 `json:"alloc_bytes"` + Seq uint32 `json:"seq"` + UptimeMs int64 `json:"uptime_ms"` +} + +func (s *Service) publishRuntimeMem() { + var ms runtime.MemStats + runtime.ReadMemStats(&ms) + fact := RuntimeMemFact{ + AllocBytes: ms.Alloc, + Seq: s.seqRuntimeMem.Add(1), + UptimeMs: s.uptimeMs(), + } + s.conn.Publish(s.conn.NewMessage(TopicRuntimeMem, fact, true)) +} diff --git a/services/telemetry/telemetry_test.go b/services/telemetry/telemetry_test.go new file mode 100644 index 0000000..2c40985 --- /dev/null +++ b/services/telemetry/telemetry_test.go @@ -0,0 +1,410 @@ +package telemetry + +import ( + "context" + "testing" + "time" + + "devicecode-go/bus" + "devicecode-go/types" +) + +func newTestBus() *bus.Bus { return bus.NewBus(8, "+", "#") } + +// runService is the same kind of subscribe-then-start helper used in +// services/updater_test.go: a fresh probe subscription on a bus +// connection guarantees we capture the first publish without racing +// the goroutine's Subscribe calls. +func runService(t *testing.T, b *bus.Bus) (*bus.Connection, context.CancelFunc) { + t.Helper() + conn := b.NewConnection("telemetry") + svc := New(conn) + ctx, cancel := context.WithCancel(context.Background()) + go svc.Run(ctx) + // Settle delay: give Run time to register its HAL subscriptions + // before the caller publishes HAL data. (Run also retains the + // charger config at startup and publishes runtime/memory on a tick.)
+ time.Sleep(10 * time.Millisecond) + return conn, cancel +} + +func TestPublishesBatteryFact(t *testing.T) { + b := newTestBus() + observer := b.NewConnection("observer") + sub := observer.Subscribe(TopicBattery) + defer observer.Unsubscribe(sub) + + _, cancel := runService(t, b) + defer cancel() + + hal := b.NewConnection("hal") + hal.Publish(hal.NewMessage(halPwrAny, types.BatteryValue{ + PackMilliV: 12000, + PerCellMilliV: 3000, + IBatMilliA: -500, + TempMilliC: 24500, + BSR_uOhmPerCell: 1200, + }, true)) + + select { + case msg := <-sub.Channel(): + fact, ok := msg.Payload.(BatteryFact) + if !ok { + t.Fatalf("payload type = %T", msg.Payload) + } + if fact.PackMV != 12000 || fact.IBatMA != -500 || fact.BSRUOhmPerCell != 1200 { + t.Fatalf("battery fact wrong: %+v", fact) + } + if fact.Seq != 1 { + t.Fatalf("seq = %d, want 1", fact.Seq) + } + if fact.UptimeMs < 0 { + t.Fatalf("uptime_ms = %d, want >= 0", fact.UptimeMs) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for battery fact") + } +} + +func TestPublishesChargerWithDecodedBooleans(t *testing.T) { + b := newTestBus() + observer := b.NewConnection("observer") + sub := observer.Subscribe(TopicCharger) + defer observer.Unsubscribe(sub) + + _, cancel := runService(t, b) + defer cancel() + + hal := b.NewConnection("hal") + hal.Publish(hal.NewMessage(halPwrAny, types.ChargerValue{ + VIN_mV: 18000, + VSYS_mV: 12200, + IIn_mA: 500, + State: uint16(types.AbsorbCharge | types.CCCVCharge), + Status: uint16(types.IinLimitActive), + Sys: uint16(types.ChargerEnabled | types.OkToCharge), + }, true)) + + select { + case msg := <-sub.Channel(): + fact, ok := msg.Payload.(ChargerFact) + if !ok { + t.Fatalf("payload type = %T", msg.Payload) + } + if fact.VinMV != 18000 || fact.VsysMV != 12200 || fact.IinMA != 500 { + t.Fatalf("analog values wrong: %+v", fact) + } + if fact.StateBits != uint16(types.AbsorbCharge|types.CCCVCharge) { + t.Fatalf("state_bits = 0x%x", fact.StateBits) + } + // Decoded booleans 
use the canonical names from + // docs/firmware-alignment-update.md §6.2 — these are the + // wire-frozen names the Lua side keys off. + if !fact.State["absorb_charge"] || !fact.State["cccv_charge"] { + t.Fatalf("decoded state booleans wrong: %+v", fact.State) + } + if fact.State["bat_short_fault"] || fact.State["bat_missing_fault"] { + t.Fatalf("unset state bits decoded as true: %+v", fact.State) + } + if !fact.Status["iin_limit_active"] { + t.Fatalf("status iin_limit_active not decoded: %+v", fact.Status) + } + if !fact.System["charger_enabled"] || !fact.System["ok_to_charge"] { + t.Fatalf("system booleans wrong: %+v", fact.System) + } + // All three maps must be exactly the spec sizes. + if got := len(fact.State); got != 11 { + t.Fatalf("state map size = %d, want 11", got) + } + if got := len(fact.Status); got != 4 { + t.Fatalf("status map size = %d, want 4", got) + } + if got := len(fact.System); got != 12 { + t.Fatalf("system map size = %d, want 12", got) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for charger fact") + } +} + +func TestPublishesEnvironmentFacts(t *testing.T) { + b := newTestBus() + observer := b.NewConnection("observer") + tSub := observer.Subscribe(TopicEnvTemp) + defer observer.Unsubscribe(tSub) + hSub := observer.Subscribe(TopicEnvHumidity) + defer observer.Unsubscribe(hSub) + + _, cancel := runService(t, b) + defer cancel() + + hal := b.NewConnection("hal") + hal.Publish(hal.NewMessage(halEnvTemp, types.TemperatureValue{DeciC: 235}, true)) + hal.Publish(hal.NewMessage(halEnvHum, types.HumidityValue{RHx100: 4530}, true)) + + select { + case msg := <-tSub.Channel(): + fact, ok := msg.Payload.(EnvTempFact) + if !ok || fact.DeciC != 235 { + t.Fatalf("env temp fact = %+v ok=%v", msg.Payload, ok) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for env temp fact") + } + select { + case msg := <-hSub.Channel(): + fact, ok := msg.Payload.(EnvHumFact) + if !ok || fact.RHx100 != 4530 { + t.Fatalf("env hum 
fact = %+v ok=%v", msg.Payload, ok) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for env hum fact") + } +} + +func TestAllAlertKindsCount(t *testing.T) { + if got := len(AllAlertKinds); got != 14 { + t.Fatalf("AllAlertKinds has %d entries, want 14", got) + } + // Spec-frozen names — typo in the kind enum is a wire-break, so + // guard the canonical strings explicitly. + want := []string{ + "vin_lo", "vin_hi", "bsr_high", + "bat_missing", "bat_short", "max_charge_time_fault", + "absorb", "equalize", "cccv", "precharge", + "iin_limited", "uvcl_active", "cc_phase", "cv_phase", + } + for i, k := range AllAlertKinds { + if string(k) != want[i] { + t.Fatalf("AllAlertKinds[%d] = %q, want %q", i, string(k), want[i]) + } + } +} + +func TestChargerAlertFSMEdgeOnly(t *testing.T) { + // Spec: "On bit-set transition for a kind, emit one normal event." + // Subsequent retains that keep the bit set must NOT re-emit. + b := newTestBus() + observer := b.NewConnection("observer") + sub := observer.Subscribe(TopicChargerAlert) + defer observer.Unsubscribe(sub) + + _, cancel := runService(t, b) + defer cancel() + + hal := b.NewConnection("hal") + // First publish primes the FSM (no alerts emitted on init). + hal.Publish(hal.NewMessage(halPwrAny, types.ChargerValue{}, true)) + time.Sleep(20 * time.Millisecond) + + // Bit goes from clear to set: one alert emitted. + hal.Publish(hal.NewMessage(halPwrAny, types.ChargerValue{ + Status: uint16(types.IinLimitActive), + }, true)) + + select { + case msg := <-sub.Channel(): + ev, ok := msg.Payload.(AlertEvent) + if !ok { + t.Fatalf("payload type = %T", msg.Payload) + } + if ev.Kind != AlertIinLimited { + t.Fatalf("kind = %q, want %q", ev.Kind, AlertIinLimited) + } + if ev.Source != "ltc4015" { + t.Fatalf("source = %q", ev.Source) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for first alert") + } + + // Second publish keeps the bit set — no new alert. 
+ hal.Publish(hal.NewMessage(halPwrAny, types.ChargerValue{ + Status: uint16(types.IinLimitActive), + }, true)) + select { + case msg := <-sub.Channel(): + t.Fatalf("unexpected duplicate alert: %+v", msg.Payload) + case <-time.After(150 * time.Millisecond): + } + + // Bit clears, then sets again: one more alert. + hal.Publish(hal.NewMessage(halPwrAny, types.ChargerValue{}, true)) + hal.Publish(hal.NewMessage(halPwrAny, types.ChargerValue{ + Status: uint16(types.IinLimitActive), + }, true)) + + select { + case msg := <-sub.Channel(): + ev, _ := msg.Payload.(AlertEvent) + if ev.Kind != AlertIinLimited { + t.Fatalf("re-edge alert kind = %q", ev.Kind) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for re-edge alert") + } +} + +func TestPublishesChargerConfigAtStartup(t *testing.T) { + // W7 finish: state/self/power/charger/config retains at startup + // with the conservative defaults from DefaultChargerConfig(). + b := newTestBus() + observer := b.NewConnection("observer") + sub := observer.Subscribe(TopicChargerCfg) + defer observer.Unsubscribe(sub) + + _, cancel := runService(t, b) + defer cancel() + + select { + case msg := <-sub.Channel(): + fact, ok := msg.Payload.(ChargerConfigFact) + if !ok { + t.Fatalf("payload type = %T", msg.Payload) + } + if fact.Schema != 1 || fact.Source != "ltc4015-default" { + t.Fatalf("schema/source wrong: %+v", fact) + } + if fact.Thresholds.VinLoMV == 0 || fact.Thresholds.VinHiMV == 0 || fact.Thresholds.BSRHighUohmPerCell == 0 { + t.Fatalf("default thresholds not populated: %+v", fact.Thresholds) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for charger config fact") + } +} + +func TestChargerAlertFSMVinLoEdge(t *testing.T) { + // W8 finish: vin_lo fires on ChargerValue.VIN_mV crossing below + // the configured threshold. Subsequent observations below the + // threshold do NOT re-fire. 
+ b := newTestBus() + observer := b.NewConnection("observer") + sub := observer.Subscribe(TopicChargerAlert) + defer observer.Unsubscribe(sub) + + _, cancel := runService(t, b) + defer cancel() + + hal := b.NewConnection("hal") + // Prime the FSM with vin above threshold (default vin_lo = 10500). + hal.Publish(hal.NewMessage(halPwrAny, types.ChargerValue{VIN_mV: 12000}, true)) + time.Sleep(20 * time.Millisecond) + + // vin drops below threshold. + hal.Publish(hal.NewMessage(halPwrAny, types.ChargerValue{VIN_mV: 10000}, true)) + + select { + case msg := <-sub.Channel(): + ev, _ := msg.Payload.(AlertEvent) + if ev.Kind != AlertVinLo { + t.Fatalf("kind = %q, want vin_lo", ev.Kind) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for vin_lo alert") + } + + // Stays below — no re-emit. + hal.Publish(hal.NewMessage(halPwrAny, types.ChargerValue{VIN_mV: 9500}, true)) + select { + case msg := <-sub.Channel(): + t.Fatalf("unexpected duplicate vin_lo: %+v", msg.Payload) + case <-time.After(150 * time.Millisecond): + } +} + +func TestChargerAlertFSMVinHiEdge(t *testing.T) { + b := newTestBus() + observer := b.NewConnection("observer") + sub := observer.Subscribe(TopicChargerAlert) + defer observer.Unsubscribe(sub) + + _, cancel := runService(t, b) + defer cancel() + + hal := b.NewConnection("hal") + // Default vin_hi = 17000; prime below threshold. + hal.Publish(hal.NewMessage(halPwrAny, types.ChargerValue{VIN_mV: 12000}, true)) + time.Sleep(20 * time.Millisecond) + // Cross above. 
+ hal.Publish(hal.NewMessage(halPwrAny, types.ChargerValue{VIN_mV: 18000}, true)) + + select { + case msg := <-sub.Channel(): + ev, _ := msg.Payload.(AlertEvent) + if ev.Kind != AlertVinHi { + t.Fatalf("kind = %q, want vin_hi", ev.Kind) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for vin_hi alert") + } +} + +func TestChargerAlertFSMBSRHighEdge(t *testing.T) { + // bsr_high observes BatteryValue.BSR_uOhmPerCell against the + // threshold from charger config (default 5000 uohm/cell). + b := newTestBus() + observer := b.NewConnection("observer") + sub := observer.Subscribe(TopicChargerAlert) + defer observer.Unsubscribe(sub) + + _, cancel := runService(t, b) + defer cancel() + + hal := b.NewConnection("hal") + // Prime with healthy BSR (below threshold). + hal.Publish(hal.NewMessage(halPwrAny, types.BatteryValue{BSR_uOhmPerCell: 2000}, true)) + time.Sleep(20 * time.Millisecond) + // Crosses threshold. + hal.Publish(hal.NewMessage(halPwrAny, types.BatteryValue{BSR_uOhmPerCell: 6000}, true)) + + select { + case msg := <-sub.Channel(): + ev, _ := msg.Payload.(AlertEvent) + if ev.Kind != AlertBsrHigh { + t.Fatalf("kind = %q, want bsr_high", ev.Kind) + } + if ev.Severity != "warning" { + t.Fatalf("severity = %q, want warning", ev.Severity) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for bsr_high alert") + } +} + +func TestChargerAlertFSMMultipleBitsTransitionTogether(t *testing.T) { + // Two state bits flip in the same publish — both alerts should + // fire. The order is deterministic per the FSM's switch order. 
// ProductionApplier returns the applier the reactor wires by default.
// On host builds (tests, dev environments without a flash slot to
// reboot into) this stays the safe-default RefusingApplier — commit
// returns apply_unavailable. Real reboot wiring lives in
// applier_tinygo.go.
func ProductionApplier() Applier { return RefusingApplier() }

// scheduleArmReboot is the host-build variant: it calls the applier's
// ArmReboot directly on the caller's goroutine, with no delay. The
// tinygo+rp2350 build provides its own scheduleArmReboot in
// applier_tinygo.go.
func scheduleArmReboot(a Applier, d StagedDescriptor) {
	a.ArmReboot(d)
}
+// CanApply requires that newSlotSink has previously initialised the +// shared updater (i.e. updater/main staging wrote a staged image); without +// that, the inactive slot still holds the previous image and rebooting +// would either roll back or fail at the bootloader. +type abupdateApplier struct{} + +// ProductionApplier returns the abupdate-backed applier. CanApply +// validates that a staging cycle ran; ArmReboot calls +// abupdate.RebootIntoSlot which does not return on success. +func ProductionApplier() Applier { return abupdateApplier{} } + +const postCommitReplyFlushDelay = 750 * time.Millisecond + +func (abupdateApplier) CanApply(d StagedDescriptor) error { + _ = d + if !sharedUpdaterInit { + return errFromRC("apply_unavailable_uninited", 0) + } + return nil +} + +func (abupdateApplier) ArmReboot(d StagedDescriptor) { + _ = d + if !sharedUpdaterInit { + return + } + // Does not return on success. + _ = sharedUpdater.RebootIntoSlot() +} + +func scheduleArmReboot(a Applier, d StagedDescriptor) { + go func() { + // handleCommit has only replied on the local bus. The fabric + // session still needs a scheduler turn to marshal and write the + // wire reply (and the state=rebooting retain) back to CM5 before + // RebootIntoSlot stops the process. + time.Sleep(postCommitReplyFlushDelay) + a.ArmReboot(d) + }() +} diff --git a/services/updater/boot_id.go b/services/updater/boot_id.go new file mode 100644 index 0000000..295ed80 --- /dev/null +++ b/services/updater/boot_id.go @@ -0,0 +1,132 @@ +package updater + +import ( + "encoding/hex" + "runtime" + "sync/atomic" + "time" +) + +// boot_id contract per master plan R3 / docs/firmware-alignment-update.md §W6: +// - Opaque 16-character lower-hex marker that must change on every +// successful boot. +// - Generated from 8 bytes of crypto/rand AFTER HAL init succeeds and +// BEFORE fabric opens, so it's available to the first software-fact +// publish on hello_ack. +// - Held in RAM only. Not persisted to flash. 
Not added to the +// abupdate metadata block (the regression guard test in +// fabric-update tests checks that abupdate metadata never grows a +// boot_id field). +// +// The fallback path on rand failure is documented inline; this branch +// drops to a process-startup counter rather than panicking, with a +// clear log so the failure-mode test suite (master R3) can assert it. + +var ( + cachedBootID atomic.Pointer[string] + fallbackTick uint64 +) + +// GenerateBootID populates the cached value. Call exactly once during +// boot — main.go invokes it between HAL ready and fabric.Run. Subsequent +// calls return the existing value (idempotent so reactor reinit doesn't +// regenerate). +func GenerateBootID() string { + if existing := cachedBootID.Load(); existing != nil { + return *existing + } + id := generate() + if cachedBootID.CompareAndSwap(nil, &id) { + return id + } + // Lost the race; return whatever the winner stored. + return *cachedBootID.Load() +} + +// BootID returns the cached value generated at boot. Returns "" if +// GenerateBootID has not yet been called — which would indicate a +// boot-order bug, since the spec says it must run before fabric opens. +func BootID() string { + if existing := cachedBootID.Load(); existing != nil { + return *existing + } + return "" +} + +func generate() string { + var buf [8]byte + if tryCryptoRand(buf[:]) { + return hex.EncodeToString(buf[:]) + } + // Fallback: triggered only when crypto/rand is unavailable or returns + // all-zero. This is best-effort per-boot jitter, not contract-grade entropy. + // The log line below is the failure-mode signal for tests and diagnostics. + tick := atomic.AddUint64(&fallbackTick, 1) + println("[updater] boot_id fallback engaged tick=", itoaU64(tick)) + + var ms runtime.MemStats + runtime.ReadMemStats(&ms) + + // Best-effort fallback when crypto/rand is unavailable. 
Mixes: + // - monotonic clock at generation time (UnixNano), which varies + // with HAL init duration across cold boots; + // - runtime.MemStats Alloc / Mallocs / HeapInuse / Frees, which + // vary with allocation timing inside HAL bringup; + // - the per-call counter so multiple GenerateBootID calls within + // one process boot don't collide. + // Followed by a 3-stage shift mix so every output byte depends on + // every input bit. + // + // NOT contract-grade: the mix is non-cryptographic and depends on runtime + // jitter rather than a hardware entropy source. If crypto/rand is broken on + // the target, use the RP2350 hardware RNG or a persisted boot counter. + mix := tick + mix ^= uint64(time.Now().UnixNano()) + mix ^= ms.Alloc + mix ^= uint64(ms.Mallocs) + mix ^= uint64(ms.HeapInuse) + mix ^= uint64(ms.Frees) << 32 + mix ^= mix >> 11 + mix ^= mix << 17 + mix ^= mix >> 5 + for i := 7; i >= 0; i-- { + buf[i] = byte(mix & 0xff) + mix >>= 8 + } + return hex.EncodeToString(buf[:]) +} + +// tryCryptoRand is split per build: +// - host (!tinygo) — boot_id_host.go reads from crypto/rand +// - tinygo (RP2350 et al.) — boot_id_tinygo.go skips crypto/rand +// entirely and returns false, so the firmware always falls +// through to the deterministic mix below. TinyGo on RP2350 +// panics with "no rng" inside crypto/rand.Read, and pulling in +// defer/recover to catch it grew the binary by ~110 KB. Until +// TinyGo wires the RP2350 hardware-RNG (rosc) into its +// crypto/rand backend or we route a HAL-supplied RNG into +// services/updater, the safe-by-default path on the firmware +// is to never call crypto/rand. 
// allZero reports whether every byte of b is zero. An empty or nil slice
// counts as all-zero.
func allZero(b []byte) bool {
	for i := 0; i < len(b); i++ {
		if b[i] != 0 {
			return false
		}
	}
	return true
}

// itoaU64 formats v as a base-10 string without going through fmt or strconv.
func itoaU64(v uint64) string {
	// 20 digits is enough for the largest uint64 (18446744073709551615).
	var digits [20]byte
	i := len(digits) - 1
	// Fill from the right; the final (most significant) digit is written
	// after the loop, which also covers v == 0.
	for ; v >= 10; v /= 10 {
		digits[i] = '0' + byte(v%10)
		i--
	}
	digits[i] = '0' + byte(v)
	return string(digits[i:])
}
// strPtrOrNil maps the empty string to nil and any other string to a pointer
// to a copy of it, so optional fact fields can distinguish "unset" from a
// value.
func strPtrOrNil(v string) *string {
	var out *string
	if len(v) > 0 {
		out = &v
	}
	return out
}
+func (s *Service) PublishUpdater() { + s.mu.Lock() + fact := UpdaterFact{ + State: s.state, + LastError: strPtrOrNil(s.lastError), + PendingVersion: strPtrOrNil(s.pendingVersion), + PendingImageID: strPtrOrNil(s.pendingImageID), + StagedImageID: strPtrOrNil(s.stagedImageID), + JobID: strPtrOrNil(s.jobID), + } + s.mu.Unlock() + s.conn.Publish(s.conn.NewMessage(TopicUpdaterFact, fact, true)) +} + +// PublishHealth emits the retained state/self/health fact. Reason is +// optional; "" is dropped via the omitempty tag. +func (s *Service) PublishHealth(state, reason string) { + fact := HealthFact{State: state, Reason: reason} + s.conn.Publish(s.conn.NewMessage(TopicHealthFact, fact, true)) +} diff --git a/services/updater/prestage_host.go b/services/updater/prestage_host.go new file mode 100644 index 0000000..c78c83a --- /dev/null +++ b/services/updater/prestage_host.go @@ -0,0 +1,12 @@ +//go:build !tinygo || !rp2350 + +package updater + +type streamedStage struct { + Length uint32 + PayloadSHA256 string +} + +func consumeStreamedStage() (streamedStage, bool) { + return streamedStage{}, false +} diff --git a/services/updater/prestage_tinygo.go b/services/updater/prestage_tinygo.go new file mode 100644 index 0000000..bef919d --- /dev/null +++ b/services/updater/prestage_tinygo.go @@ -0,0 +1,101 @@ +//go:build tinygo && rp2350 + +package updater + +import ( + "crypto/sha256" + "encoding/hex" + "errors" + + "pico2-a-b/abupdate" +) + +// streamedStage tracks a raw transfer that fabric has already streamed into +// the inactive A/B slot. It is the TinyGo bring-up path used before imagev1 +// verification can stream directly from the transfer source. +type streamedStage struct { + Length uint32 + PayloadSHA256 string +} + +var ( + streamedStageDesc streamedStage + streamedStageOK bool + streamedStageHash = sha256.New() + streamedStageLen uint32 +) + +// BeginStreamedStage prepares the inactive slot for a raw incoming transfer. 
+// The caller must subsequently call WriteStreamedStage and CommitStreamedStage +// or AbortStreamedStage. +func BeginStreamedStage(size uint32) error { + // A fresh prepare invalidates any prior stage, and retrying an update in + // the same boot must not inherit abupdate's previous writing/complete + // state. Recreate the updater before resolving the inactive slot. + sharedUpdater = abupdate.Updater{} + sharedUpdaterInit = false + + u, err := ensureUpdaterInited() + if err != nil { + return err + } + if rc := u.BeginUpdate(size); rc != 0 { + return errFromRC("begin_update", rc) + } + streamedStageHash.Reset() + streamedStageLen = 0 + streamedStageDesc = streamedStage{} + streamedStageOK = false + return nil +} + +func WriteStreamedStage(data []byte) error { + if len(data) == 0 { + return errors.New("empty_chunk") + } + u, err := ensureUpdaterInited() + if err != nil { + return err + } + if rc := u.WriteChunk(data); rc != 0 { + return errFromRC("write_chunk", rc) + } + _, _ = streamedStageHash.Write(data) + streamedStageLen += uint32(len(data)) + return nil +} + +func CommitStreamedStage() (uint32, error) { + u, err := ensureUpdaterInited() + if err != nil { + return 0, err + } + if rc := u.FlushFinal(); rc != 0 { + return 0, errFromRC("flush_final", rc) + } + streamedStageDesc = streamedStage{ + Length: streamedStageLen, + PayloadSHA256: hex.EncodeToString(streamedStageHash.Sum(nil)), + } + streamedStageOK = true + return u.BytesWritten(), nil +} + +func AbortStreamedStage() { + streamedStageDesc = streamedStage{} + streamedStageOK = false + streamedStageLen = 0 + streamedStageHash.Reset() +} + +func consumeStreamedStage() (streamedStage, bool) { + if !streamedStageOK { + return streamedStage{}, false + } + out := streamedStageDesc + streamedStageDesc = streamedStage{} + streamedStageOK = false + streamedStageLen = 0 + streamedStageHash.Reset() + return out, true +} diff --git a/services/updater/receiver.go b/services/updater/receiver.go new file mode 100644 
// handleStage runs the staging path. Triggered by fabric after
// xfer_commit; the reply gates whether fabric sends xfer_done or
// xfer_abort.
//
// Requests are rejected up front (without a state change) when the payload
// cannot be decoded, the target is not TargetUpdaterMain, or a digest
// algorithm other than DigestAlgXXHash32 is named.
//
// Two staging paths follow, selected by whether the payload carries an
// artefact:
//
//   - Empty artefact: the image is expected to have been streamed into the
//     slot already. The result is taken from consumeStreamedStage and the
//     descriptor identity comes from payload.Meta, falling back to the
//     service's own identity for fields the metadata does not supply.
//   - Inline artefact: the bytes are passed through s.verifier into a sink
//     from newSlotSink, the sink is committed, and the descriptor is built
//     from the verified manifest.
//
// On success: write the staged descriptor, publish state=staged with the
// image version as pending_version, reply ok=true.
//
// On failure: publish state=failed with the error in last_error and reply
// ok=false.
func (s *Service) handleStage(msg *bus.Message) {
	payload, ok := jsonDecode[StagePayload](msg.Payload)
	if !ok {
		s.reply(msg, StageReply{OK: false, Err: "bad_payload"})
		return
	}
	if payload.Target != TargetUpdaterMain {
		s.reply(msg, StageReply{OK: false, Err: "unsupported_target"})
		return
	}
	// An empty DigestAlg is accepted; only an explicit, different
	// algorithm is refused.
	if payload.DigestAlg != "" && payload.DigestAlg != DigestAlgXXHash32 {
		s.reply(msg, StageReply{OK: false, Err: "unsupported_digest_alg"})
		return
	}
	s.transitionTo(StateReceiving, "", "")

	// Streamed path: no inline artefact, so the bytes must already have
	// been written by the streamed-stage functions.
	if len(payload.Artefact) == 0 {
		staged, ok := consumeStreamedStage()
		if !ok {
			s.clearStagedImage()
			s.transitionTo(StateFailed, "artefact_missing", "")
			s.reply(msg, StageReply{OK: false, Err: "artefact_missing"})
			return
		}
		// NOTE(review): the "applied" result is discarded, so when the
		// metadata names no identity the descriptor carries the running
		// image's identity — confirm this fallback is intended.
		stageIdentity, _ := identityFromStageMeta(s.identity, payload.Meta)
		desc := StagedDescriptor{
			Version:       stageIdentity.Version,
			BuildID:       stageIdentity.Build,
			ImageID:       stageIdentity.ImageID,
			Length:        staged.Length,
			Slot:          0,
			PayloadSHA256: staged.PayloadSHA256,
		}
		if err := s.metadataWrite.WriteStagedDescriptor(desc); err != nil {
			// Best-effort cleanup; the write failure is what gets reported.
			_ = s.metadataWrite.ClearStagedDescriptor()
			s.clearStagedImage()
			s.transitionTo(StateFailed, "metadata_write_failed:"+err.Error(), "")
			s.reply(msg, StageReply{OK: false, Err: "metadata_write_failed"})
			return
		}
		s.setStagedImage(desc.ImageID, desc.Version)
		s.transitionTo(StateStaged, "", desc.Version)
		s.reply(msg, StageReply{OK: true, Stage: "staged"})
		return
	}

	// Inline path: verify the artefact while writing it into the sink.
	sink, err := newSlotSink(uint32(len(payload.Artefact)))
	if err != nil {
		s.clearStagedImage()
		s.transitionTo(StateFailed, "sink_init_failed:"+err.Error(), "")
		s.reply(msg, StageReply{OK: false, Err: "sink_init_failed"})
		return
	}
	manifest, err := s.verifier.Verify(bytes.NewReader(payload.Artefact), sink)
	if err != nil {
		// Verifier rejected the artefact. Clear any prior descriptor so a
		// following commit cannot apply stale firmware from an older stage.
		_ = s.metadataWrite.ClearStagedDescriptor()
		s.clearStagedImage()
		s.transitionTo(StateFailed, err.Error(), "")
		s.reply(msg, StageReply{OK: false, Err: err.Error()})
		return
	}

	// On verifier success the sink holds the verified payload bytes.
	// Commit the sink first, then persist the staged descriptor via the
	// abupdate metadata writer (W11) so the next prepare/commit RPC and
	// the next boot's software fact see payload_sha256 + descriptor. The
	// fabric-update branch ships an in-memory writer; fabric-security
	// replaces it with a flash-backed implementation that survives reboots.
	if err := sink.Commit(); err != nil {
		_ = s.metadataWrite.ClearStagedDescriptor()
		s.clearStagedImage()
		s.transitionTo(StateFailed, "sink_commit_failed:"+err.Error(), "")
		s.reply(msg, StageReply{OK: false, Err: "sink_commit_failed"})
		return
	}
	desc := StagedDescriptor{
		Version:       manifest.Version,
		BuildID:       manifest.BuildID,
		ImageID:       manifest.ImageID,
		Length:        manifest.PayloadLength,
		Slot:          0, // slot-pick comes from abupdate when fabric-security wires it
		PayloadSHA256: manifest.PayloadSHA256,
	}
	if err := s.metadataWrite.WriteStagedDescriptor(desc); err != nil {
		_ = s.metadataWrite.ClearStagedDescriptor()
		s.clearStagedImage()
		s.transitionTo(StateFailed, "metadata_write_failed:"+err.Error(), "")
		s.reply(msg, StageReply{OK: false, Err: "metadata_write_failed"})
		return
	}

	s.setStagedImage(desc.ImageID, manifest.Version)
	s.transitionTo(StateStaged, "", manifest.Version)
	// Do not republish the software fact here: PayloadSHA256 describes the
	// running image, while this descriptor describes the staged image.
	s.reply(msg, StageReply{OK: true, Stage: "staged"})
}
+ applied = applyStageMetadata(&ident, env.Request.Meta) || applied + if !applied { + return ident, false + } + return ident, true +} diff --git a/services/updater/rpc.go b/services/updater/rpc.go new file mode 100644 index 0000000..1c94732 --- /dev/null +++ b/services/updater/rpc.go @@ -0,0 +1,90 @@ +package updater + +import "devicecode-go/bus" + +// handlePrepare processes cmd/self/updater/prepare. Success returns the +// current contract's prepare acknowledgement, including the required +// transfer target and maximum raw chunk size. +func (s *Service) handlePrepare(msg *bus.Message) { + req, ok := jsonDecode[PrepareRequest](msg.Payload) + if !ok { + s.reply(msg, Reply{OK: false, Error: "bad_request"}) + return + } + if req.Target != "" && req.Target != PrepareTargetMCU { + s.reply(msg, Reply{OK: false, Error: ErrTargetMismatch}) + return + } + + s.mu.Lock() + if s.preparing || s.state == StateCommitting || s.state == StateRebooting { + s.mu.Unlock() + s.reply(msg, Reply{OK: false, Error: ErrBusy}) + return + } + s.preparing = true + s.mu.Unlock() + s.setJobContext(req.JobID, req.ExpectedImageID) + s.transitionTo(StatePreparing, "", "") + + // Clear any persisted staged descriptor from a previous successful + // stage. Without this, a flow of (stage A) -> (prepare for B) -> + // (stage B fails) leaves descriptor A persisted and committable — + // which would be a real safety bug since the user-intent on + // prepare(B) is "I want to stage B, throw away A". + if err := s.metadataWrite.ClearStagedDescriptor(); err != nil { + s.markPrepareDone() + s.reply(msg, Reply{OK: false, Error: "metadata_clear_failed:" + err.Error()}) + return + } + + s.transitionTo(StateReady, "", "") + s.markPrepareDone() + s.reply(msg, PrepareReply{ + Ready: true, + Target: TargetUpdaterMain, + MaxChunkSize: DefaultMaxChunkSize, + }) +} + +// handleCommit processes cmd/self/updater/commit. It only accepts a valid +// staged descriptor matching the requested/remembered expected image. 
+func (s *Service) handleCommit(msg *bus.Message) { + req, ok := jsonDecode[CommitRequest](msg.Payload) + if !ok { + s.reply(msg, Reply{OK: false, Error: "bad_request"}) + return + } + + desc, present := s.metadata.StagedDescriptor() + s.mu.Lock() + stagedInState := s.state == StateStaged + pendingImageID := s.pendingImageID + s.mu.Unlock() + + if !present || !stagedInState { + s.reply(msg, Reply{OK: false, Error: ErrNothingStaged}) + return + } + expectedImageID := req.ExpectedImageID + if expectedImageID == "" { + expectedImageID = pendingImageID + } + if expectedImageID != "" && desc.ImageID != expectedImageID { + s.reply(msg, Reply{OK: false, Error: ErrTargetMismatch}) + return + } + + // Validate the apply path before publishing committing/rebooting or + // replying accepted. The default Applier refuses in non-hardware tests. + if err := s.applier.CanApply(desc); err != nil { + s.reply(msg, Reply{OK: false, Error: err.Error()}) + return + } + + s.transitionTo(StateCommitting, "", desc.Version) + s.reply(msg, CommitReply{Accepted: true, RebootRequired: true}) + s.transitionTo(StateRebooting, "", desc.Version) + + scheduleArmReboot(s.applier, desc) +} diff --git a/services/updater/sink_host.go b/services/updater/sink_host.go new file mode 100644 index 0000000..7c2f78c --- /dev/null +++ b/services/updater/sink_host.go @@ -0,0 +1,44 @@ +//go:build !tinygo + +package updater + +import ( + "bytes" + "io" +) + +// memorySink buffers verified payload bytes in RAM. Used on host builds +// (tests, dev builds) where there's no flash slot to write into. Any +// build that needs to stage to actual flash uses the tinygo+rp2350 +// sink in sink_tinygo.go. 
// memorySink accumulates written bytes in an in-memory buffer and refuses
// further writes once Commit or Abort has closed it.
type memorySink struct {
	buf    bytes.Buffer
	closed bool
}

// Write appends p to the buffer. After Commit or Abort it writes nothing and
// returns io.ErrClosedPipe.
func (m *memorySink) Write(p []byte) (int, error) {
	if !m.closed {
		return m.buf.Write(p)
	}
	return 0, io.ErrClosedPipe
}

// Commit closes the sink and keeps whatever has been buffered.
func (m *memorySink) Commit() error {
	m.closed = true
	return nil
}

// Abort closes the sink and discards everything buffered so far.
func (m *memorySink) Abort() error {
	m.closed = true
	m.buf.Reset()
	return nil
}
// errFromRC wraps a non-zero return code from the named operation in an
// error whose message is "<op>:<rc>".
func errFromRC(op string, rc int32) error {
	return &rcError{op: op, rc: rc}
}

// rcError pairs an operation name with the numeric return code it produced.
type rcError struct {
	op string
	rc int32
}

// Error renders the error as "<op>:<rc>", e.g. "write_chunk:-3".
func (e *rcError) Error() string {
	return e.op + ":" + i32s(e.rc)
}

// i32s formats v as a base-10 string.
//
// The value is widened to int64 before negation: negating math.MinInt32 as
// an int32 wraps back to itself, which would skip the digit loop and yield
// just "-".
func i32s(v int32) string {
	if v == 0 {
		return "0"
	}
	n := int64(v)
	neg := n < 0
	if neg {
		n = -n
	}
	// 10 digits plus an optional sign fit comfortably in 12 bytes.
	var buf [12]byte
	pos := len(buf)
	for n > 0 {
		pos--
		buf[pos] = byte('0' + n%10)
		n /= 10
	}
	if neg {
		pos--
		buf[pos] = '-'
	}
	return string(buf[pos:])
}
// State is the updater's published lifecycle state. The empty string is
// treated as "unset" and is accepted by Allowed.
type State string

const (
	StateRunning          State = "running"
	StateReady            State = "ready"
	StatePreparing        State = "preparing"
	StateReceiving        State = "receiving"
	StateStaged           State = "staged"
	StateCommitting       State = "committing"
	StateRebooting        State = "rebooting"
	StateFailed           State = "failed"
	StateRollbackDetected State = "rollback_detected"
)

// Allowed reports whether s is the unset state or one of the declared State
// constants.
func (s State) Allowed() bool {
	switch s {
	case "":
		// Unset: nothing has been published yet.
		return true
	case StateRunning, StateReady, StatePreparing, StateReceiving:
		return true
	case StateStaged, StateCommitting, StateRebooting:
		return true
	case StateFailed, StateRollbackDetected:
		return true
	default:
		return false
	}
}

const (
	PrepareTargetMCU           = "mcu"
	TargetUpdaterMain          = "updater/main"
	DigestAlgXXHash32          = "xxhash32"
	DefaultMaxChunkSize uint32 = 2048
)

// PrepareRequest is the payload of cmd/self/updater/prepare. Metadata is
// kept opaque so the sender can add fields without a firmware rebuild.
type PrepareRequest struct {
	JobID           string `json:"job_id,omitempty"`
	Target          string `json:"target,omitempty"`
	ExpectedImageID string `json:"expected_image_id,omitempty"`
	Metadata        any    `json:"metadata,omitempty"`
}

// CommitRequest is the payload of cmd/self/updater/commit.
type CommitRequest struct {
	JobID           string `json:"job_id,omitempty"`
	ExpectedImageID string `json:"expected_image_id,omitempty"`
	Metadata        any    `json:"metadata,omitempty"`
}

// PrepareReply is the success reply to a prepare call.
type PrepareReply struct {
	Ready        bool   `json:"ready"`
	Target       string `json:"target"`
	MaxChunkSize uint32 `json:"max_chunk_size"`
}

// CommitReply is the success reply to a commit call.
type CommitReply struct {
	Accepted       bool `json:"accepted"`
	RebootRequired bool `json:"reboot_required,omitempty"`
}

// Reply is the refusal/error shape. Successful prepare and commit calls
// answer with PrepareReply and CommitReply instead.
type Reply struct {
	OK       bool   `json:"ok"`
	Accepted bool   `json:"accepted,omitempty"`
	Error    string `json:"error,omitempty"`
}

// Refusal error strings. The Lua side compares against these, so the
// values are part of the wire contract.
const (
	ErrBusy           = "busy"
	ErrNothingStaged  = "nothing_staged"
	ErrTargetMismatch = "target_mismatch"
	// ErrApplyUnavailable is returned when the commit RPC sees a valid
	// staged descriptor but no Applier is wired to actually trigger the
	// slot-switch + reboot, so the device never claims an apply succeeded
	// on a build where the apply path does not exist.
	ErrApplyUnavailable = "apply_unavailable"
)

// SoftwareFact is the retained payload at state/self/software per
// docs/firmware-alignment-update.md §"Identity facts". boot_id is generated
// per boot and kept in RAM only; payload_sha256 is bare 64-char lower-hex.
type SoftwareFact struct {
	Version       string `json:"version"`
	BuildID       string `json:"build_id"`
	ImageID       string `json:"image_id"`
	BootID        string `json:"boot_id"`
	PayloadSHA256 string `json:"payload_sha256,omitempty"`
}

// UpdaterFact is the retained payload at state/self/updater. The nullable
// fields are pointers without omitempty, so an absent value is published as
// an explicit JSON null rather than a missing property.
type UpdaterFact struct {
	State          State   `json:"state"`
	LastError      *string `json:"last_error"`
	PendingVersion *string `json:"pending_version"`
	PendingImageID *string `json:"pending_image_id"`
	StagedImageID  *string `json:"staged_image_id"`
	JobID          *string `json:"job_id"`
}

// HealthFact is the retained payload at state/self/health. Reason is
// optional and omitted when empty.
type HealthFact struct {
	State  string `json:"state"`
	Reason string `json:"reason,omitempty"`
}
// StagedDescriptor describes a staged image: its identity, length, slot and
// payload hash.
type StagedDescriptor struct {
	Version       string `json:"version"`
	BuildID       string `json:"build_id"`
	ImageID       string `json:"image_id"`
	Length        uint32 `json:"length"`
	Slot          uint8  `json:"slot"`
	PayloadSHA256 string `json:"payload_sha256"`
}

// StagePayload is the request of the local updater/main staging RPC, which
// fabric invokes after xfer_commit has verified size and transfer digest.
type StagePayload struct {
	LinkID    string `json:"link_id"`
	XferID    string `json:"xfer_id"`
	Target    string `json:"target"`
	Size      uint32 `json:"size"`
	DigestAlg string `json:"digest_alg"`
	Digest    string `json:"digest"`
	Meta      any    `json:"meta,omitempty"`
	Artefact  []byte `json:"artefact,omitempty"`
}

// StageReply is the reply of the staging RPC.
type StageReply struct {
	OK    bool   `json:"ok"`
	Err   string `json:"err,omitempty"`
	Stage string `json:"stage,omitempty"`
}

// Identity is the build-time version/build/image stamp handed to the
// updater when it is constructed.
type Identity struct {
	Version string
	Build   string
	ImageID string
}

// MetadataReader is the read side of the update metadata: the hash of the
// running payload and the staged descriptor, if there is one.
type MetadataReader interface {
	PayloadSHA256() string
	StagedDescriptor() (StagedDescriptor, bool)
}

// MetadataWriter is the write side: staging records a verified descriptor
// through it, and prepare clears it.
type MetadataWriter interface {
	WriteStagedDescriptor(d StagedDescriptor) error
	ClearStagedDescriptor() error
}

// MemoryMetadata is an in-memory MetadataReader and MetadataWriter.
//
// It deliberately keeps two payload hashes apart:
//   - runningPayloadSHA is the hash of the image that is running. It only
//     changes through SetRunningPayloadSHA and is what PayloadSHA256 returns.
//   - The staged hash travels inside the StagedDescriptor and disappears
//     with it on ClearStagedDescriptor.
//
// Sharing one field would let a cleared or failed stage leave its hash
// visible as the running payload hash.
type MemoryMetadata struct {
	mu                sync.Mutex
	runningPayloadSHA string
	desc              StagedDescriptor
	hasDesc           bool
}

// NewMemoryMetadata returns an empty MemoryMetadata: no running hash and no
// staged descriptor.
func NewMemoryMetadata() *MemoryMetadata {
	return new(MemoryMetadata)
}

// SetRunningPayloadSHA records the hash of the currently running image.
func (m *MemoryMetadata) SetRunningPayloadSHA(sha string) {
	m.mu.Lock()
	m.runningPayloadSHA = sha
	m.mu.Unlock()
}

// PayloadSHA256 returns the hash recorded by SetRunningPayloadSHA, or "".
func (m *MemoryMetadata) PayloadSHA256() string {
	m.mu.Lock()
	sha := m.runningPayloadSHA
	m.mu.Unlock()
	return sha
}

// StagedDescriptor returns the stored descriptor and whether one is present.
func (m *MemoryMetadata) StagedDescriptor() (StagedDescriptor, bool) {
	m.mu.Lock()
	d, present := m.desc, m.hasDesc
	m.mu.Unlock()
	return d, present
}

// WriteStagedDescriptor stores d as the staged descriptor. The running hash
// is intentionally left untouched; the staged hash lives only inside d.
func (m *MemoryMetadata) WriteStagedDescriptor(d StagedDescriptor) error {
	m.mu.Lock()
	m.desc, m.hasDesc = d, true
	m.mu.Unlock()
	return nil
}

// ClearStagedDescriptor forgets the staged descriptor, if any.
func (m *MemoryMetadata) ClearStagedDescriptor() error {
	m.mu.Lock()
	m.desc, m.hasDesc = StagedDescriptor{}, false
	m.mu.Unlock()
	return nil
}

// nullMetadata is a read-only MetadataReader that reports no running hash
// and no staged descriptor.
type nullMetadata struct{}

func (nullMetadata) PayloadSHA256() string { return "" }

func (nullMetadata) StagedDescriptor() (StagedDescriptor, bool) {
	return StagedDescriptor{}, false
}
+func New(opts Options) *Service { + v := opts.Verifier + if v == nil { + v = StubVerifier() + } + a := opts.Applier + if a == nil { + a = RefusingApplier() + } + mr := opts.Metadata + mw := opts.MetadataWrite + if mr == nil && mw == nil { + shared := NewMemoryMetadata() + mr = shared + mw = shared + } else if mr == nil { + mr = nullMetadata{} + } else if mw == nil { + // Reader-only: writes from staging become no-ops. + mw = noopMetadataWriter{} + } + return &Service{ + conn: opts.Conn, + verifier: v, + applier: a, + identity: opts.Identity, + metadata: mr, + metadataWrite: mw, + state: StateRunning, + } +} + +// noopMetadataWriter is the writer-side fallback when the caller +// supplied a MetadataReader without a matching writer. +type noopMetadataWriter struct{} + +func (noopMetadataWriter) WriteStagedDescriptor(d StagedDescriptor) error { + return nil +} +func (noopMetadataWriter) ClearStagedDescriptor() error { + return nil +} + +// Run binds the RPC + staging topics, publishes the initial fact +// surface, and watches the fabric link-state retain for ready-true +// edges (W10). Blocks until ctx is cancelled. +func (s *Service) Run(ctx context.Context) { + prepareSub := s.conn.Subscribe(TopicPrepareRPC) + defer s.conn.Unsubscribe(prepareSub) + + commitSub := s.conn.Subscribe(TopicCommitRPC) + defer s.conn.Unsubscribe(commitSub) + + stageSub := s.conn.Subscribe(TopicStageRPC) + defer s.conn.Unsubscribe(stageSub) + + linkSub := s.conn.Subscribe(TopicFabricLink) + defer s.conn.Unsubscribe(linkSub) + + // Initial fact publish: tells the CM5 we're alive and reports + // build identity + the freshly generated boot_id. + s.PublishSoftware() + s.PublishUpdater() + s.PublishHealth("ok", "") + + // Track per-link ready state so we only republish on the + // !Ready -> Ready edge, not on every retain churn. 
+ prevReady := map[string]bool{} + + for { + select { + case <-ctx.Done(): + return + case msg, ok := <-prepareSub.Channel(): + if !ok || msg == nil { + continue + } + s.handlePrepare(msg) + case msg, ok := <-commitSub.Channel(): + if !ok || msg == nil { + continue + } + s.handleCommit(msg) + case msg, ok := <-stageSub.Channel(): + if !ok || msg == nil { + continue + } + s.handleStage(msg) + case msg, ok := <-linkSub.Channel(): + if !ok || msg == nil { + continue + } + linkID, ready := decodeLinkState(msg) + if linkID == "" { + continue + } + was := prevReady[linkID] + if ready && !was { + // W10: post-hello_ack republish. Mirrors the spec line + // "republished after every successful boot AND on every + // newly established session (hello_ack), warm or cold". + s.Republish() + } + prevReady[linkID] = ready + } + } +} + +// Republish re-emits all retained `state/self/*` facts. Wired up to +// fabric's session lifecycle so every new hello_ack triggers a fresh +// retain — required by the spec for warm-and-cold session resumes. +func (s *Service) Republish() { + s.PublishSoftware() + s.PublishUpdater() + s.PublishHealth("ok", "") +} + +// transitionTo updates state under the lock and publishes the updater +// fact. Returns the previous state for callers that want to log or +// confirm a precondition. 
+func (s *Service) transitionTo(next State, lastError, pendingVersion string) State { + s.mu.Lock() + prev := s.state + s.state = next + if lastError != "" || (next != StateFailed && next != StateRollbackDetected) { + s.lastError = lastError + } + if pendingVersion != "" { + s.pendingVersion = pendingVersion + } else if next == StatePreparing || next == StateReady || next == StateReceiving { + s.pendingVersion = "" + } + s.mu.Unlock() + s.PublishUpdater() + return prev +} + +func (s *Service) setJobContext(jobID, pendingImageID string) { + s.mu.Lock() + s.jobID = jobID + s.pendingImageID = pendingImageID + s.stagedImageID = "" + s.mu.Unlock() +} + +func (s *Service) setStagedImage(imageID, version string) { + s.mu.Lock() + s.stagedImageID = imageID + if version != "" { + s.pendingVersion = version + } + s.mu.Unlock() +} + +func (s *Service) clearStagedImage() { + s.mu.Lock() + s.stagedImageID = "" + s.pendingVersion = "" + s.mu.Unlock() +} + +// markPrepareDone clears the preparing flag. handlePrepare/handleCommit +// guard re-entry through this. +func (s *Service) markPrepareDone() { + s.mu.Lock() + s.preparing = false + s.mu.Unlock() +} + +// boot-time initialization helper — main.go calls this before opening +// fabric so the first software-fact publish has a non-empty boot_id. +func (s *Service) ensureBootID() string { + id := BootID() + if id == "" { + id = GenerateBootID() + } + return id +} + +// reply is a thin convenience wrapper that tolerates nil msg (defensive +// against bus quirks observed during fabric-protocol bring-up where a +// ctx cancel could land a nil message on the channel). +func (s *Service) reply(msg *bus.Message, payload any) { + if msg == nil || !msg.CanReply() { + return + } + s.conn.Reply(msg, payload, false) +} + +// decodeLinkState extracts the link_id and ready flag from a +// state/fabric/link/ retain. 
// jsonDecode turns a bus payload into a T.
//
//   - nil, an empty json.RawMessage and an empty []byte decode to the zero T
//     and report success.
//   - A payload that already is a T is returned as-is.
//   - json.RawMessage and []byte are unmarshalled directly.
//   - Anything else is marshalled to JSON and unmarshalled into T, which
//     covers callers passing e.g. map[string]any.
//
// The boolean is false when marshalling or unmarshalling fails.
func jsonDecode[T any](payload any) (T, bool) {
	var out T
	var raw []byte

	switch v := payload.(type) {
	case nil:
		return out, true
	case T:
		return v, true
	case json.RawMessage:
		raw = v
		if len(raw) == 0 {
			return out, true
		}
	case []byte:
		raw = v
		if len(raw) == 0 {
			return out, true
		}
	default:
		encoded, err := json.Marshal(payload)
		if err != nil {
			return out, false
		}
		raw = encoded
	}

	if err := json.Unmarshal(raw, &out); err != nil {
		return out, false
	}
	return out, true
}
+type fakeApplier struct { + canCalls []StagedDescriptor + rebootCalls []StagedDescriptor +} + +func (f *fakeApplier) CanApply(d StagedDescriptor) error { + f.canCalls = append(f.canCalls, d) + return nil +} + +func (f *fakeApplier) ArmReboot(d StagedDescriptor) { + f.rebootCalls = append(f.rebootCalls, d) +} + +// ---- boot_id (W6) --------------------------------------------------- + +func TestBootIDIs16HexChars(t *testing.T) { + resetBootIDForTest() + id := GenerateBootID() + if len(id) != 16 { + t.Fatalf("len = %d, want 16", len(id)) + } + if _, err := hex.DecodeString(id); err != nil { + t.Fatalf("not hex: %v", err) + } +} + +func TestBootIDIsCachedAcrossCalls(t *testing.T) { + // Within a process boot, GenerateBootID is idempotent — multiple + // callers see the same value. + resetBootIDForTest() + a := GenerateBootID() + b := GenerateBootID() + if a != b { + t.Fatalf("non-idempotent: %s vs %s", a, b) + } +} + +func TestBootIDChangesAfterReset(t *testing.T) { + // resetBootIDForTest mimics a successful boot. 10 successive boots + // must all produce unique values (master R3 failure-mode list: + // "RNG-never-seeded / from-constant" guard). + seen := make(map[string]struct{}) + for i := 0; i < 10; i++ { + resetBootIDForTest() + id := GenerateBootID() + if _, dup := seen[id]; dup { + t.Fatalf("boot %d duplicated id %s", i, id) + } + seen[id] = struct{}{} + } +} + +func TestBootIDIsNotAllZero(t *testing.T) { + // "Generated-before-entropy" guard: all-zero sentinel should never + // be returned. The fallback path explicitly walks past it. 
+ for i := 0; i < 20; i++ { + resetBootIDForTest() + id := GenerateBootID() + if id == "0000000000000000" { + t.Fatal("got all-zero boot_id") + } + } +} + +// ---- state machine + RPC handlers (W4) ------------------------------ + +func waitForFact[T any](t *testing.T, sub *bus.Subscription, want func(T) bool) T { + t.Helper() + deadline := time.After(2 * time.Second) + for { + select { + case msg := <-sub.Channel(): + if msg == nil { + continue + } + fact, ok := msg.Payload.(T) + if !ok { + continue + } + if want == nil || want(fact) { + return fact + } + case <-deadline: + t.Fatal("timeout waiting for fact") + } + } +} + +func strValue(p *string) string { + if p == nil { + return "" + } + return *p +} + +func testStagePayload(id string, artefact []byte) StagePayload { + return StagePayload{ + LinkID: "mcu0", + XferID: id, + Target: TargetUpdaterMain, + Size: uint32(len(artefact)), + DigestAlg: DigestAlgXXHash32, + Digest: "deadbeef", + Artefact: artefact, + } +} + +func runService(t *testing.T, b *bus.Bus, opts Options) (*Service, context.CancelFunc) { + t.Helper() + resetBootIDForTest() + if opts.Conn == nil { + t.Fatal("Options.Conn is required") + } + if opts.Identity.Version == "" { + opts.Identity = Identity{Version: "0.0.0-test", Build: "build-test", ImageID: "img-test"} + } + // Subscribe to the software-fact topic BEFORE starting Run, so we + // catch the initial publish without racing the goroutine's bus + // subscriptions. The probe lives on its own connection so it + // doesn't interfere with the caller's subscriptions. 
+ probeConn := b.NewConnection("updater-probe") + probe := probeConn.Subscribe(TopicSoftwareFact) + svc := New(opts) + ctx, cancel := context.WithCancel(context.Background()) + go svc.Run(ctx) + select { + case msg := <-probe.Channel(): + if msg == nil { + t.Fatal("nil software fact at boot") + } + case <-time.After(2 * time.Second): + cancel() + t.Fatal("updater service did not publish initial software fact") + } + probeConn.Unsubscribe(probe) + return svc, cancel +} + +func TestPublishesInitialFactsOnRun(t *testing.T) { + b := newTestBus() + conn := b.NewConnection("updater") + observer := b.NewConnection("observer") + swSub := observer.Subscribe(TopicSoftwareFact) + defer observer.Unsubscribe(swSub) + upSub := observer.Subscribe(TopicUpdaterFact) + defer observer.Unsubscribe(upSub) + hSub := observer.Subscribe(TopicHealthFact) + defer observer.Unsubscribe(hSub) + + _, cancel := runService(t, b, Options{ + Conn: conn, + Identity: Identity{Version: "1.2.3", Build: "abc", ImageID: "img-1"}, + }) + defer cancel() + + sw := waitForFact[SoftwareFact](t, swSub, nil) + if sw.Version != "1.2.3" || sw.BuildID != "abc" || sw.ImageID != "img-1" { + t.Fatalf("software identity wrong: %+v", sw) + } + if len(sw.BootID) != 16 { + t.Fatalf("boot_id len = %d, want 16 chars: %q", len(sw.BootID), sw.BootID) + } + + up := waitForFact[UpdaterFact](t, upSub, nil) + if up.State != StateRunning { + t.Fatalf("updater state = %q, want %q", up.State, StateRunning) + } + + h := waitForFact[HealthFact](t, hSub, nil) + if h.State != "ok" { + t.Fatalf("health state = %q, want ok", h.State) + } +} + +func TestPrepareTransitionsToReady(t *testing.T) { + b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + upSub := caller.Subscribe(TopicUpdaterFact) + defer caller.Unsubscribe(upSub) + + _, cancel := runService(t, b, Options{Conn: conn}) + defer cancel() + + // drain initial running fact + _ = waitForFact[UpdaterFact](t, upSub, func(f UpdaterFact) bool { 
return f.State == StateRunning }) + + req := caller.NewMessage(TopicPrepareRPC, PrepareRequest{Target: PrepareTargetMCU}, false) + replySub := caller.Request(req) + defer caller.Unsubscribe(replySub) + + select { + case msg := <-replySub.Channel(): + reply, ok := msg.Payload.(PrepareReply) + if !ok { + t.Fatalf("reply payload type = %T", msg.Payload) + } + if !reply.Ready || reply.Target != TargetUpdaterMain || reply.MaxChunkSize != DefaultMaxChunkSize { + t.Fatalf("prepare reply = %+v, want ready target max_chunk_size", reply) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for prepare reply") + } + + up := waitForFact[UpdaterFact](t, upSub, func(f UpdaterFact) bool { return f.State == StateReady }) + if up.LastError != nil { + t.Fatalf("last_error not cleared on prepare: %q", strValue(up.LastError)) + } +} + +func TestCommitWithoutStagedReturnsNothingStaged(t *testing.T) { + b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + + _, cancel := runService(t, b, Options{Conn: conn}) + defer cancel() + + req := caller.NewMessage(TopicCommitRPC, CommitRequest{}, false) + replySub := caller.Request(req) + defer caller.Unsubscribe(replySub) + + select { + case msg := <-replySub.Channel(): + reply, ok := msg.Payload.(Reply) + if !ok { + t.Fatalf("reply payload type = %T", msg.Payload) + } + if reply.OK { + t.Fatalf("commit unexpectedly OK without staged image: %+v", reply) + } + if reply.Error != ErrNothingStaged { + t.Fatalf("commit error = %q, want %q", reply.Error, ErrNothingStaged) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for commit reply") + } +} + +func TestCommitWithoutStagedStateRefusesEvenWithDescriptor(t *testing.T) { + // Both halves of the staged condition are required: a descriptor + // in metadata AND state == staged. 
A descriptor without the + // matching state means the receiver didn't actually finish, so + // commit must refuse rather than push into committing/rebooting. + b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + + md := &fakeMetadata{ + has: true, + staged: StagedDescriptor{Version: "9.9.9", BuildID: "bx", ImageID: "ix", Length: 4096, Slot: 1, PayloadSHA256: strings.Repeat("a", 64)}, + } + _, cancel := runService(t, b, Options{Conn: conn, Metadata: md, Applier: &fakeApplier{}}) + defer cancel() + + req := caller.NewMessage(TopicCommitRPC, CommitRequest{}, false) + replySub := caller.Request(req) + defer caller.Unsubscribe(replySub) + select { + case msg := <-replySub.Channel(): + reply, _ := msg.Payload.(Reply) + if reply.OK || reply.Error != ErrNothingStaged { + t.Fatalf("commit reply = %+v, want refusal=nothing_staged", reply) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for commit reply") + } +} + +func TestCommitWithoutApplierReturnsApplyUnavailable(t *testing.T) { + // Spec safety: the commit RPC must not claim success when the MCU + // has no apply hook wired (the production default RefusingApplier + // returns ErrApplyUnavailable). State stays at staged; the + // receiver-staged descriptor remains valid for a subsequent + // commit once a real Applier is wired. + b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + upSub := caller.Subscribe(TopicUpdaterFact) + defer caller.Unsubscribe(upSub) + + verif := &fakeVerifierAccept{manifest: Manifest{Version: "9.9.9", PayloadSHA256: strings.Repeat("a", 64), PayloadLength: 4}} + memMD := NewMemoryMetadata() + _, cancel := runService(t, b, Options{ + Conn: conn, + Verifier: verif, + Metadata: memMD, + MetadataWrite: memMD, + // No Applier supplied — defaults to RefusingApplier. + }) + defer cancel() + + // Drive updater/main staging to staged state. 
+ rreq := caller.NewMessage(TopicStageRPC, testStagePayload("xfer-x", []byte("blob")), false) + rsub := caller.Request(rreq) + defer caller.Unsubscribe(rsub) + <-rsub.Channel() + + creq := caller.NewMessage(TopicCommitRPC, CommitRequest{}, false) + csub := caller.Request(creq) + defer caller.Unsubscribe(csub) + select { + case msg := <-csub.Channel(): + reply, _ := msg.Payload.(Reply) + if reply.OK || reply.Error != ErrApplyUnavailable { + t.Fatalf("commit reply = %+v, want refusal=apply_unavailable", reply) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for commit reply") + } + + // State must NOT have transitioned to committing/rebooting — that would lie. + settle := time.After(150 * time.Millisecond) + for { + select { + case msg := <-upSub.Channel(): + fact, _ := msg.Payload.(UpdaterFact) + if fact.State == StateCommitting || fact.State == StateRebooting { + t.Fatalf("state transitioned to %s despite refusing applier", fact.State) + } + case <-settle: + return + } + } +} + +func TestCommitWithFakeApplierTransitionsToRebooting(t *testing.T) { + // With a real Applier supplied, the staged descriptor in metadata and state + // drives commit through committing to rebooting. + b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + upSub := caller.Subscribe(TopicUpdaterFact) + defer caller.Unsubscribe(upSub) + + verif := &fakeVerifierAccept{manifest: Manifest{Version: "9.9.9", PayloadSHA256: strings.Repeat("a", 64), PayloadLength: 4}} + memMD := NewMemoryMetadata() + app := &fakeApplier{} + _, cancel := runService(t, b, Options{ + Conn: conn, + Verifier: verif, + Applier: app, + Metadata: memMD, + MetadataWrite: memMD, + }) + defer cancel() + + // Stage via updater/main. + rreq := caller.NewMessage(TopicStageRPC, testStagePayload("x", []byte("blob")), false) + <-caller.Request(rreq).Channel() + + // Commit. 
+ creq := caller.NewMessage(TopicCommitRPC, CommitRequest{}, false) + csub := caller.Request(creq) + defer caller.Unsubscribe(csub) + select { + case msg := <-csub.Channel(): + reply, _ := msg.Payload.(CommitReply) + if !reply.Accepted || !reply.RebootRequired { + t.Fatalf("commit reply = %+v", reply) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout") + } + + if len(app.canCalls) != 1 || len(app.rebootCalls) != 1 { + t.Fatalf("Applier hooks fired wrong: can=%d reboot=%d, want 1+1", + len(app.canCalls), len(app.rebootCalls)) + } + if app.rebootCalls[0].Version != "9.9.9" { + t.Fatalf("ArmReboot got descriptor.Version = %q, want 9.9.9", app.rebootCalls[0].Version) + } + up := waitForFact[UpdaterFact](t, upSub, func(f UpdaterFact) bool { return f.State == StateRebooting }) + if strValue(up.PendingVersion) != "9.9.9" { + t.Fatalf("pending_version = %q", strValue(up.PendingVersion)) + } +} + +// ---- updater/main staging path with fakes ---------------------------- + +func TestStageStubVerifierPublishesFailed(t *testing.T) { + // Production stub: any artefact is rejected. State must transition + // to failed with last_error matching the sentinel. 
+ b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + upSub := caller.Subscribe(TopicUpdaterFact) + defer caller.Unsubscribe(upSub) + + _, cancel := runService(t, b, Options{Conn: conn, Verifier: StubVerifier()}) + defer cancel() + + req := caller.NewMessage(TopicStageRPC, testStagePayload("xfer-1", []byte("blob")), false) + replySub := caller.Request(req) + defer caller.Unsubscribe(replySub) + + select { + case msg := <-replySub.Channel(): + reply, ok := msg.Payload.(StageReply) + if !ok || reply.OK { + t.Fatalf("stage unexpectedly OK with stub: %+v", reply) + } + if !strings.Contains(reply.Err, "verifier_stub") { + t.Fatalf("stage err = %q, want stub sentinel", reply.Err) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for stage reply") + } + + up := waitForFact[UpdaterFact](t, upSub, func(f UpdaterFact) bool { return f.State == StateFailed }) + if !strings.Contains(strValue(up.LastError), "verifier_stub") { + t.Fatalf("last_error = %q, want stub sentinel", strValue(up.LastError)) + } +} + +func TestStageFakeAcceptWritesStagedDescriptor(t *testing.T) { + // W11: on verifier success staging writes the manifest's + // fields to the metadata writer. A subsequent commit RPC reads + // the descriptor back via the matching reader and transitions + // to rebooting with the same pending_version. 
+ b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + upSub := caller.Subscribe(TopicUpdaterFact) + defer caller.Unsubscribe(upSub) + + verif := &fakeVerifierAccept{manifest: Manifest{ + Version: "9.9.9", + BuildID: "bx", + ImageID: "ix", + PayloadSHA256: "deadbeef", + PayloadLength: 4, + }} + memMD := NewMemoryMetadata() + _, cancel := runService(t, b, Options{ + Conn: conn, + Verifier: verif, + Applier: &fakeApplier{}, // success path; production default refuses + Metadata: memMD, + MetadataWrite: memMD, + }) + defer cancel() + + // Drive updater/main staging to verifier success. + req := caller.NewMessage(TopicStageRPC, testStagePayload("xfer-w11", []byte("blob")), false) + replySub := caller.Request(req) + defer caller.Unsubscribe(replySub) + select { + case msg := <-replySub.Channel(): + reply, _ := msg.Payload.(StageReply) + if !reply.OK { + t.Fatalf("stage reply not ok: %+v", reply) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for stage reply") + } + + // Reader sees the staged descriptor + its embedded payload hash. + desc, ok := memMD.StagedDescriptor() + if !ok { + t.Fatal("staged descriptor not persisted") + } + if desc.Version != "9.9.9" || desc.PayloadSHA256 != "deadbeef" || desc.Length != 4 { + t.Fatalf("descriptor wrong: %+v", desc) + } + // WriteStagedDescriptor must not promote the staged hash into the + // running-image hash. Running hash stays "" until SetRunningPayloadSHA is + // called at the next boot. + if got := memMD.PayloadSHA256(); got != "" { + t.Fatalf("running payload_sha256 leaked from staged descriptor: %q", got) + } + + // Commit RPC now succeeds because the reader sees the descriptor + // AND state is staged AND a real Applier is wired. 
+ creq := caller.NewMessage(TopicCommitRPC, CommitRequest{}, false) + csub := caller.Request(creq) + defer caller.Unsubscribe(csub) + select { + case msg := <-csub.Channel(): + reply, _ := msg.Payload.(CommitReply) + if !reply.Accepted { + t.Fatalf("commit reply not ok: %+v", reply) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for commit reply") + } + + up := waitForFact[UpdaterFact](t, upSub, func(f UpdaterFact) bool { return f.State == StateRebooting }) + if strValue(up.PendingVersion) != "9.9.9" { + t.Fatalf("pending_version = %q, want 9.9.9", strValue(up.PendingVersion)) + } +} + +func TestStageFailureClearsStaleStagedDescriptor(t *testing.T) { + // A (stage A) -> (prepare for B) -> (stage B fails) flow must not leave + // descriptor A persisted. The next commit should return nothing_staged + // rather than committing stale firmware. + b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + + // Pre-stage: a real descriptor sitting in metadata from an earlier + // successful flow. + memMD := NewMemoryMetadata() + _ = memMD.WriteStagedDescriptor(StagedDescriptor{Version: "1.0.0", PayloadSHA256: "old"}) + + // Service uses a verifier that always rejects. + verif := &fakeVerifierReject{err: errString("bad_signature")} + _, cancel := runService(t, b, Options{ + Conn: conn, + Verifier: verif, + Applier: &fakeApplier{}, + Metadata: memMD, + MetadataWrite: memMD, + }) + defer cancel() + + // Drive updater/main staging to failure. + rreq := caller.NewMessage(TopicStageRPC, testStagePayload("x", []byte("blob")), false) + rsub := caller.Request(rreq) + defer caller.Unsubscribe(rsub) + select { + case <-rsub.Channel(): + case <-time.After(2 * time.Second): + t.Fatal("timeout") + } + + // The stale descriptor must have been cleared. 
+ if _, ok := memMD.StagedDescriptor(); ok { + t.Fatalf("stale staged descriptor survived receiver failure") + } + + // Commit must refuse with nothing_staged rather than commit the + // stale image. + creq := caller.NewMessage(TopicCommitRPC, CommitRequest{}, false) + csub := caller.Request(creq) + defer caller.Unsubscribe(csub) + select { + case msg := <-csub.Channel(): + reply, _ := msg.Payload.(Reply) + if reply.OK || reply.Error != ErrNothingStaged { + t.Fatalf("commit reply = %+v, want refusal=nothing_staged", reply) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout") + } +} + +func TestPrepareClearsStaleStagedDescriptor(t *testing.T) { + // A new prepare invalidates any prior persisted stage so a partial- + // failure subsequent transfer can't accidentally commit the + // previously-staged image. + b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + + memMD := NewMemoryMetadata() + _ = memMD.WriteStagedDescriptor(StagedDescriptor{Version: "1.0.0", PayloadSHA256: "old"}) + + _, cancel := runService(t, b, Options{ + Conn: conn, + Metadata: memMD, + MetadataWrite: memMD, + Applier: &fakeApplier{}, + }) + defer cancel() + + preq := caller.NewMessage(TopicPrepareRPC, PrepareRequest{Target: PrepareTargetMCU}, false) + psub := caller.Request(preq) + defer caller.Unsubscribe(psub) + select { + case msg := <-psub.Channel(): + reply, _ := msg.Payload.(PrepareReply) + if !reply.Ready { + t.Fatalf("prepare reply = %+v", reply) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout") + } + + if _, ok := memMD.StagedDescriptor(); ok { + t.Fatalf("stale staged descriptor survived prepare") + } +} + +func TestStageFakeAcceptPublishesStaged(t *testing.T) { + // Test fake exercises the success path that fabric-security will + // flesh out in production. State -> staged, pending_version mirrors + // the manifest's build version, reply.OK = true. 
+ b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + upSub := caller.Subscribe(TopicUpdaterFact) + defer caller.Unsubscribe(upSub) + + verif := &fakeVerifierAccept{manifest: Manifest{Version: "9.9.9", BuildID: "bx", ImageID: "ix", PayloadSHA256: strings.Repeat("a", 64), PayloadLength: 4}} + _, cancel := runService(t, b, Options{Conn: conn, Verifier: verif}) + defer cancel() + + req := caller.NewMessage(TopicStageRPC, testStagePayload("xfer-2", []byte("blob")), false) + replySub := caller.Request(req) + defer caller.Unsubscribe(replySub) + + select { + case msg := <-replySub.Channel(): + reply, ok := msg.Payload.(StageReply) + if !ok || !reply.OK || reply.Stage != "staged" { + t.Fatalf("stage reply = %+v ok-type=%v", reply, ok) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for stage reply") + } + + up := waitForFact[UpdaterFact](t, upSub, func(f UpdaterFact) bool { return f.State == StateStaged }) + if strValue(up.PendingVersion) != "9.9.9" { + t.Fatalf("pending_version = %q, want 9.9.9", strValue(up.PendingVersion)) + } +} + +func TestStageFakeRejectPublishesFailed(t *testing.T) { + b := newTestBus() + conn := b.NewConnection("updater") + caller := b.NewConnection("caller") + upSub := caller.Subscribe(TopicUpdaterFact) + defer caller.Unsubscribe(upSub) + + verif := &fakeVerifierReject{err: errString("manifest_check_failed")} + _, cancel := runService(t, b, Options{Conn: conn, Verifier: verif}) + defer cancel() + + req := caller.NewMessage(TopicStageRPC, testStagePayload("xfer-3", []byte("blob")), false) + replySub := caller.Request(req) + defer caller.Unsubscribe(replySub) + + select { + case msg := <-replySub.Channel(): + reply, ok := msg.Payload.(StageReply) + if !ok || reply.OK { + t.Fatalf("stage unexpectedly OK: %+v", reply) + } + if reply.Err != "manifest_check_failed" { + t.Fatalf("stage err = %q, want manifest_check_failed", reply.Err) + } + case <-time.After(2 * time.Second): + 
t.Fatal("timeout waiting for stage reply") + } + + up := waitForFact[UpdaterFact](t, upSub, func(f UpdaterFact) bool { return f.State == StateFailed }) + if strValue(up.LastError) != "manifest_check_failed" { + t.Fatalf("last_error = %q, want manifest_check_failed", strValue(up.LastError)) + } +} + +func TestRepublishOnLinkReadyEdge(t *testing.T) { + // W10 contract: the updater republishes its retained state/self/* + // surface on every !Ready -> Ready transition observed on + // state/fabric/link/. Verifies the edge is detected without + // double-firing on subsequent retains that keep Ready=true. + b := newTestBus() + conn := b.NewConnection("updater") + observer := b.NewConnection("observer") + swSub := observer.Subscribe(TopicSoftwareFact) + defer observer.Unsubscribe(swSub) + + _, cancel := runService(t, b, Options{Conn: conn}) + defer cancel() + + // Drain the initial software fact emitted on Run start. + _ = waitForFact[SoftwareFact](t, swSub, nil) + + // Publish a link-state retain with Ready=false first; should not + // trigger a republish. + publisher := b.NewConnection("test-fabric") + publisher.Publish(publisher.NewMessage( + bus.T("state", "fabric", "link", "mcu0"), + map[string]any{"ready": false, "established": false}, + true, + )) + // Brief wait then drop everything that's already in the channel. + time.Sleep(50 * time.Millisecond) + for len(swSub.Channel()) > 0 { + <-swSub.Channel() + } + + // Now flip Ready to true: the !Ready -> Ready edge MUST trigger a + // software-fact republish. + publisher.Publish(publisher.NewMessage( + bus.T("state", "fabric", "link", "mcu0"), + map[string]any{"ready": true, "established": true, "peer_sid": "cm5-x"}, + true, + )) + _ = waitForFact[SoftwareFact](t, swSub, nil) + + // Subsequent Ready=true retain (no edge) should NOT trigger another + // republish. We assert by checking the channel is empty after a + // short settle window. 
+ publisher.Publish(publisher.NewMessage( + bus.T("state", "fabric", "link", "mcu0"), + map[string]any{"ready": true, "established": true, "peer_sid": "cm5-x", "last_rx_ms": int64(123)}, + true, + )) + settled := time.After(150 * time.Millisecond) + for { + select { + case <-swSub.Channel(): + t.Fatal("unexpected republish on subsequent Ready=true retain") + case <-settled: + return + } + } +} + +// ---- jsonDecode robustness ------------------------------------------ + +func TestJSONDecodeAcceptsTypedAndRaw(t *testing.T) { + t1, ok := jsonDecode[PrepareRequest](PrepareRequest{Target: "x"}) + if !ok || t1.Target != "x" { + t.Fatalf("typed: %v %v", ok, t1) + } + raw := json.RawMessage(`{"target":"y"}`) + t2, ok := jsonDecode[PrepareRequest](raw) + if !ok || t2.Target != "y" { + t.Fatalf("raw: %v %v", ok, t2) + } + t3, ok := jsonDecode[PrepareRequest](nil) + if !ok || t3.Target != "" { + t.Fatalf("nil: %v %v", ok, t3) + } + t4, ok := jsonDecode[PrepareRequest]([]byte(`{"target":"z"}`)) + if !ok || t4.Target != "z" { + t.Fatalf("bytes: %v %v", ok, t4) + } +} + +func TestIdentityFromStageMetaAppliesUploadMetadata(t *testing.T) { + defaults := Identity{Version: "0.0.0-dev", Build: "local", ImageID: "img-dev"} + meta := json.RawMessage(`{"version":"13.0","build":"fw-update-e2e-13.0","image_id":"mcu-dev-13.0"}`) + + ident, applied := identityFromStageMeta(defaults, meta) + if !applied { + t.Fatal("metadata was not applied") + } + if ident.Version != "13.0" || ident.Build != "fw-update-e2e-13.0" || ident.ImageID != "mcu-dev-13.0" { + t.Fatalf("identity = %+v", ident) + } +} + +func TestIdentityFromStageMetaAcceptsBuildIDAndKeepsDefaults(t *testing.T) { + defaults := Identity{Version: "1.0.0", Build: "old-build", ImageID: "old-img"} + meta := map[string]any{"build_id": "new-build"} + + ident, applied := identityFromStageMeta(defaults, meta) + if !applied { + t.Fatal("metadata was not applied") + } + if ident.Version != "1.0.0" || ident.Build != "new-build" || ident.ImageID 
!= "old-img" { + t.Fatalf("identity = %+v", ident) + } +} + +func TestIdentityFromStageMetaAcceptsNestedRequestMetadata(t *testing.T) { + defaults := Identity{Version: "0.0.0-dev", Build: "local", ImageID: "img-dev"} + meta := map[string]any{ + "request": map[string]any{ + "expected_image_id": "mcu-dev-13.0", + "metadata": map[string]any{ + "version": "13.0", + "build": "fw-update-e2e-13.0", + }, + }, + } + + ident, applied := identityFromStageMeta(defaults, meta) + if !applied { + t.Fatal("metadata was not applied") + } + if ident.Version != "13.0" || ident.Build != "fw-update-e2e-13.0" || ident.ImageID != "mcu-dev-13.0" { + t.Fatalf("identity = %+v", ident) + } +} + +// ---- memorySink behaviour ------------------------------------------- + +func TestMemorySinkAbortClearsBuffer(t *testing.T) { + s := &memorySink{} + _, _ = s.Write([]byte("hello")) + _ = s.Abort() + if got := s.buf.Len(); got != 0 { + t.Fatalf("after abort buf len = %d, want 0", got) + } +} + +func TestMemorySinkCommitClosesWrites(t *testing.T) { + s := &memorySink{} + _, _ = s.Write([]byte("hello")) + if err := s.Commit(); err != nil { + t.Fatalf("commit: %v", err) + } + _, err := s.Write([]byte("more")) + if err != io.ErrClosedPipe { + t.Fatalf("write after commit err = %v, want io.ErrClosedPipe", err) + } +} + +// errString is a tiny error type for tests that don't want to import +// the standard errors package twice. +type errString string + +func (e errString) Error() string { return string(e) } + +// Compile-time assert that bytes.NewReader satisfies the verifier API. +var _ io.Reader = bytes.NewReader(nil) diff --git a/services/updater/verifier.go b/services/updater/verifier.go new file mode 100644 index 0000000..6adaf11 --- /dev/null +++ b/services/updater/verifier.go @@ -0,0 +1,115 @@ +package updater + +import ( + "errors" + "io" +) + +// Manifest is the small subset of the signed-image manifest that updater +// staging needs after verification succeeds. 
The full canonical manifest +// lives in pico2-a-b/imagev1 (added in fabric-security); this type is the +// local representation we can carry across the staging -> updater -> +// state/self/updater pipeline without depending on imagev1. +type Manifest struct { + Version string + BuildID string + ImageID string + PayloadSHA256 string + PayloadLength uint32 +} + +// SlotSink is what the verifier writes verified payload bytes into. +// In production this lands in the inactive abupdate slot; in tests it +// can be backed by a bytes.Buffer or similar. Keep the interface tiny. +type SlotSink interface { + io.Writer + // Commit finalises the staged write. Called after the verifier has + // finished streaming and confirms the payload SHA-256 matches the + // manifest. Returns only an error; the signature carries no + // descriptor fields back to the caller. + Commit() error + // Abort rolls back any partial write so the next prepare/commit + // starts from a clean slot. + Abort() error +} + +// Verifier is updater/main staging's hook into signed-image verification. The +// production wiring on the fabric-update branch passes a stub that +// always rejects (ErrUnsignedNotSupported); fabric-security ships a +// real adapter over pico2-a-b/imagev1.Verify that fills the same +// interface. +type Verifier interface { + // Verify reads the artefact bytes from r, validates the signed + // envelope (header + manifest + signature), and on success streams + // the verified payload into sink. Returns the trusted manifest the + // staging path propagates to the staged descriptor and software fact. + // + // On failure: sink.Abort is called by the verifier itself before + // returning so staging doesn't have to special-case it. + Verify(r io.Reader, sink SlotSink) (Manifest, error) +} + +// ErrUnsignedNotSupported is the sentinel returned by the production +// stub on this branch. The wire `last_error` value is set to its +// Error() string so Lua-side test harnesses can grep for it.
+var ErrUnsignedNotSupported = errors.New("verifier_stub: unsigned images not supported on this build") + +// Applier is the slot-switch + reboot hook for the commit RPC. Split in two so +// handleCommit can publish the rebooting retain and reply accepted before the +// reboot fires; a single Apply method that rebooted internally would otherwise +// skip both the wire reply and the state/self/updater retain. +// +// The fabric-update branch ships a refusing default (RefusingApplier) +// so the commit RPC never lies about apply success on a branch where +// the apply path doesn't exist. fabric-security supplies a real +// abupdate-backed implementation that triggers REBOOT_TYPE_FLASH_UPDATE +// into the staged slot. +type Applier interface { + // CanApply validates that the apply path is wired and the + // descriptor is acceptable. Quick, no side effects beyond minimal + // validation. Errors here surface in the commit reply as + // {ok:false, error:<err text>}; the canonical committing/rebooting + // retains are NOT published. + CanApply(d StagedDescriptor) error + + // ArmReboot schedules the slot-switch + reboot. Called only AFTER + // handleCommit has published state=rebooting and replied accepted to the + // caller. Real implementations may reboot inside this call (it + // won't return); the spec contract is that callers must do their + // pre-reboot work first. + ArmReboot(d StagedDescriptor) +} + +// refusingApplier is the production default. CanApply always returns +// an error whose text is ErrApplyUnavailable so commit refuses with +// `error: "apply_unavailable"` and never reaches ArmReboot. +type refusingApplier struct{} + +// RefusingApplier returns the safe-default Applier for this branch.
+func RefusingApplier() Applier { return refusingApplier{} } + +func (refusingApplier) CanApply(d StagedDescriptor) error { + _ = d + return errors.New(ErrApplyUnavailable) +} + +// ArmReboot is a contract-required no-op for the refusing default — +// CanApply rejects every descriptor, so the commit handler never +// calls this. Defined for interface conformance. +func (refusingApplier) ArmReboot(d StagedDescriptor) { _ = d } + +// stubVerifier is the production verifier this branch ships with. It +// always rejects, so no untrusted firmware can stage. fabric-security +// replaces this with a real imagev1-backed adapter. +type stubVerifier struct{} + +// StubVerifier returns the rejecting production verifier. Staging wiring +// takes a Verifier; production passes this, tests pass fakes. +func StubVerifier() Verifier { return stubVerifier{} } + +// Verify never accepts an artefact: it ignores r, aborts sink when one is +// supplied, and returns a zero Manifest with ErrUnsignedNotSupported. +func (stubVerifier) Verify(r io.Reader, sink SlotSink) (Manifest, error) { + _ = r + if sink != nil { + // Abort's error is discarded; the caller only sees the rejection below. + _ = sink.Abort() + } + return Manifest{}, ErrUnsignedNotSupported +} diff --git a/services/updater/verifier_passthrough.go b/services/updater/verifier_passthrough.go new file mode 100644 index 0000000..e1e6f91 --- /dev/null +++ b/services/updater/verifier_passthrough.go @@ -0,0 +1,50 @@ +package updater + +import ( + "crypto/sha256" + "encoding/hex" + "errors" + "io" +) + +// passthroughVerifier accepts any artefact, streams its bytes straight +// into sink while computing SHA-256, and returns a synthetic manifest +// with the artefact length + computed hash. Intended for the bringup +// stack on this branch where the signed-image v1 envelope (header + +// canonical manifest + Ed25519 signature) is not yet implemented. +// +// Replace with a real verifier when fabric-security lands; this exists +// so fw-update-e2e can drive the staging → applier → reboot path +// end-to-end without the signed-image scaffolding in place.
+type passthroughVerifier struct { + identity Identity +} + +// PassthroughVerifier returns a Verifier that accepts any artefact and +// fills the manifest with identity (caller-supplied), the artefact +// length, and the SHA-256 of the streamed payload. Reboot-time apply +// is gated by the Applier; a passthrough verifier without a real +// applier still ends with state=failed(apply_unavailable) at commit. +func PassthroughVerifier(identity Identity) Verifier { + return passthroughVerifier{identity: identity} +} + +func (v passthroughVerifier) Verify(r io.Reader, sink SlotSink) (Manifest, error) { + if sink == nil { + return Manifest{}, errors.New("passthrough_verifier: nil sink") + } + hasher := sha256.New() + mw := io.MultiWriter(sink, hasher) + n, err := io.Copy(mw, r) + if err != nil { + _ = sink.Abort() + return Manifest{}, err + } + return Manifest{ + Version: v.identity.Version, + BuildID: v.identity.Build, + ImageID: v.identity.ImageID, + PayloadSHA256: hex.EncodeToString(hasher.Sum(nil)), + PayloadLength: uint32(n), + }, nil +}