From c8db970d5b1b36d0cd8a65c192910a61e3b2ec16 Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 2 Apr 2026 09:29:45 +0000 Subject: [PATCH 01/65] feat: fabric session protocol and wire format Add the fabric service for CM5/MCU communication over UART. This implements the v1 JSON-lines protocol from fabric.md: - Session state machine with hello/hello_ack handshake - CM5-driven ping/pong heartbeat with 45s stale timeout - Incoming pub/call/unretain dispatch with import rules - Outgoing export replay gated on peer handshake - Outgoing wire-call support for remote RPC - Pending call tracking with timeout and correlation - Ping guard: drop pings when link is not up - Structured session logging (log/logKV) - Build-tag-gated transport trace (fabric_trace) - ShmringTransport for TinyGo cooperative scheduler - RWTransport for host testing with buffered io --- services/fabric/fabric.go | 43 + services/fabric/fabric_test.go | 1498 +++++++++++++++++++++++++ services/fabric/session.go | 880 +++++++++++++++ services/fabric/session_timer_test.go | 20 + services/fabric/trace.go | 45 + services/fabric/trace_disabled.go | 5 + services/fabric/trace_enabled.go | 5 + services/fabric/transport_rw.go | 97 ++ services/fabric/transport_shmring.go | 148 +++ services/fabric/wire.go | 90 ++ 10 files changed, 2831 insertions(+) create mode 100644 services/fabric/fabric.go create mode 100644 services/fabric/fabric_test.go create mode 100644 services/fabric/session.go create mode 100644 services/fabric/session_timer_test.go create mode 100644 services/fabric/trace.go create mode 100644 services/fabric/trace_disabled.go create mode 100644 services/fabric/trace_enabled.go create mode 100644 services/fabric/transport_rw.go create mode 100644 services/fabric/transport_shmring.go create mode 100644 services/fabric/wire.go diff --git a/services/fabric/fabric.go b/services/fabric/fabric.go new file mode 100644 index 0000000..d0e3cd9 --- /dev/null +++ b/services/fabric/fabric.go @@ -0,0 +1,43 @@ +package 
fabric + +import ( + "context" + "sync/atomic" + + "devicecode-go/bus" + "devicecode-go/x/strconvx" +) + +// Transport abstracts the byte stream as newline-delimited JSON lines. +type Transport interface { + ReadLine() ([]byte, error) + WriteLine(data []byte) error + Close() error +} + +const protoVersion = 1 +const defaultLinkID = "mcu0" + +var nextSessionID atomic.Uint64 + +func newLocalSID() string { + return "mcu-sid-" + strconvx.Utoa64(nextSessionID.Add(1)) +} + +// Run starts the fabric session. Blocks until ctx is cancelled or the +// transport returns an unrecoverable error. The MCU is respond-only: +// it never initiates hello or ping. It waits for hello from the CM5 +// and replies with hello_ack; it responds to ping with pong. The CM5 +// owns heartbeat cadence — the MCU marks the link stale if nothing +// arrives within the timeout. +func Run(ctx context.Context, tr Transport, conn *bus.Connection, nodeID, peerID string) { + s := session{ + linkID: defaultLinkID, + nodeID: nodeID, + peerID: peerID, + localSID: newLocalSID(), + tr: tr, + conn: conn, + } + s.run(ctx) +} diff --git a/services/fabric/fabric_test.go b/services/fabric/fabric_test.go new file mode 100644 index 0000000..775393c --- /dev/null +++ b/services/fabric/fabric_test.go @@ -0,0 +1,1498 @@ +package fabric + +import ( + "bytes" + "context" + "encoding/json" + "errors" + "io" + "strings" + "testing" + "time" + + "devicecode-go/bus" + "devicecode-go/types" + "devicecode-go/x/shmring" +) + +func pipePair() (*RWTransport, *RWTransport) { + r1, w1 := io.Pipe() + r2, w2 := io.Pipe() + return NewRWTransport(r2, w1), NewRWTransport(r1, w2) +} + +func newBus() *bus.Bus { return bus.NewBus(3, "+", "#") } + +type captureTransport struct { + writes [][]byte + writeErr error +} + +func (t *captureTransport) ReadLine() ([]byte, error) { return nil, io.EOF } + +func (t *captureTransport) WriteLine(data []byte) error { + if t.writeErr != nil { + return t.writeErr + } + cp := append([]byte(nil), data...) 
+ t.writes = append(t.writes, cp) + return nil +} + +func (t *captureTransport) Close() error { return nil } + +func readMsg[T any](t *testing.T, tr Transport) T { + t.Helper() + line, err := tr.ReadLine() + if err != nil { + t.Fatalf("ReadLine: %v", err) + } + var msg T + if err := json.Unmarshal(line, &msg); err != nil { + t.Fatalf("Unmarshal %q: %v", line, err) + } + return msg +} + +func sendMsg(t *testing.T, tr Transport, v any) { + t.Helper() + b := marshal(v) + if err := tr.WriteLine(b[:len(b)-1]); err != nil { + t.Fatalf("WriteLine: %v", err) + } +} + +func bringUp(t *testing.T, cm5 Transport) wireHelloAck { + t.Helper() + sendMsg(t, cm5, wireHello{ + T: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "s1", Proto: protoVersion, + }) + ack := readMsg[wireHelloAck](t, cm5) + if !ack.OK || ack.Node != "mcu-1" || ack.SID == "" || ack.Proto != protoVersion { + t.Fatalf("bad hello_ack: %+v", ack) + } + time.Sleep(50 * time.Millisecond) + return ack +} + +func unlockExports(t *testing.T, cm5 Transport, sid string) { + t.Helper() + sendMsg(t, cm5, wirePing{T: "ping", TS: 77, SID: sid}) + pong := readMsg[wirePong](t, cm5) + if pong.T != "pong" { + t.Fatalf("expected pong, got %q", pong.T) + } +} + +// ---- codec ---- + +func TestCodecRoundTrip(t *testing.T) { + orig := wireHello{T: "hello", Node: "mcu-1", Peer: "cm5-local", SID: "abc", Proto: protoVersion} + data := marshal(orig) + if !bytes.HasSuffix(data, []byte("\n")) { + t.Error("marshal should end with newline") + } + jsonPart := data[:len(data)-1] + if bytes.Contains(jsonPart, []byte("\n")) { + t.Error("JSON should not contain embedded newlines") + } + if wireType(jsonPart) != "hello" { + t.Errorf("wireType = %q", wireType(jsonPart)) + } + var dec wireHello + json.Unmarshal(jsonPart, &dec) + if dec != orig { + t.Errorf("round-trip: %+v vs %+v", dec, orig) + } +} + +func TestCodecAllTypes(t *testing.T) { + for _, tc := range []struct { + v any + want string + }{ + {wireHello{T: "hello"}, "hello"}, + 
{wireHelloAck{T: "hello_ack"}, "hello_ack"}, + {wirePing{T: "ping", TS: 1}, "ping"}, + {wirePong{T: "pong", TS: 2}, "pong"}, + {wirePub{T: "pub", Topic: []string{"a"}}, "pub"}, + {wireUnretain{T: "unretain", Topic: []string{"a"}}, "unretain"}, + {wireCall{T: "call", ID: "c1"}, "call"}, + {wireReply{T: "reply", Corr: "c1", OK: true}, "reply"}, + } { + b := marshal(tc.v) + if got := wireType(b[:len(b)-1]); got != tc.want { + t.Errorf("wireType = %q, want %q", got, tc.want) + } + } +} + +func TestWireTypeBadInput(t *testing.T) { + for _, b := range [][]byte{[]byte("not json"), []byte(`{"no_t":true}`), nil} { + if got := wireType(b); got != "" { + t.Errorf("wireType(%q) = %q, want empty", b, got) + } + } +} + +// ---- transport ---- + +func TestTransportRoundTrip(t *testing.T) { + a, b := pipePair() + done := make(chan struct{}) + go func() { + defer close(done) + line, err := b.ReadLine() + if err != nil { + t.Errorf("ReadLine: %v", err) + return + } + if string(line) != `{"t":"ping","ts":99}` { + t.Errorf("got %q", line) + } + }() + sendMsg(t, a, wirePing{T: "ping", TS: 99}) + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatal("timeout") + } +} + +func TestOversizeLineRecovery(t *testing.T) { + big := `{"t":"ping","ts":0,"x":"` + strings.Repeat("x", maxLineLen+100) + `"}` + input := big + "\n" + `{"t":"ping","ts":3}` + "\n" + tr := NewRWTransport(strings.NewReader(input), io.Discard) + _, err := tr.ReadLine() + if !errors.Is(err, ErrLineTooLong) { + t.Fatalf("expected ErrLineTooLong, got %v", err) + } + line, err := tr.ReadLine() + if err != nil { + t.Fatalf("second ReadLine: %v", err) + } + if string(line) != `{"t":"ping","ts":3}` { + t.Errorf("got %q", line) + } +} + +// ---- shmring transport ---- + +func TestShmringTransportRoundTrip(t *testing.T) { + rx := shmring.New(256) + tx := shmring.New(256) + mcuTr := NewShmringTransport(rx, tx) + defer mcuTr.Close() + + rx.TryWriteFrom([]byte(`{"t":"ping","ts":42}` + "\n")) + line, err := 
mcuTr.ReadLine() + if err != nil { + t.Fatalf("ReadLine: %v", err) + } + if string(line) != `{"t":"ping","ts":42}` { + t.Errorf("got %q", line) + } + + if err := mcuTr.WriteLine([]byte(`{"t":"pong","ts":42}`)); err != nil { + t.Fatalf("WriteLine: %v", err) + } + var out [128]byte + n := tx.TryReadInto(out[:]) + if string(out[:n]) != `{"t":"pong","ts":42}`+"\n" { + t.Errorf("tx got %q", out[:n]) + } +} + +func TestShmringTransportMultiLine(t *testing.T) { + rx := shmring.New(256) + tr := NewShmringTransport(rx, shmring.New(256)) + defer tr.Close() + rx.TryWriteFrom([]byte(`{"t":"ping","ts":1}` + "\n" + `{"t":"ping","ts":2}` + "\n")) + line1, _ := tr.ReadLine() + line2, _ := tr.ReadLine() + if string(line1) != `{"t":"ping","ts":1}` { + t.Errorf("line1 = %q", line1) + } + if string(line2) != `{"t":"ping","ts":2}` { + t.Errorf("line2 = %q", line2) + } +} + +func TestShmringTransportReadLineWrapsAcrossSegments(t *testing.T) { + rx := shmring.New(8) + tr := NewShmringTransport(rx, shmring.New(8)) + defer tr.Close() + + rx.TryWriteFrom([]byte("123456")) + var discard [6]byte + if n := rx.TryReadInto(discard[:]); n != len(discard) { + t.Fatalf("priming read = %d, want %d", n, len(discard)) + } + if n := rx.TryWriteFrom([]byte("ab\n")); n != 3 { + t.Fatalf("wrapped write = %d, want 3", n) + } + + line, err := tr.ReadLine() + if err != nil { + t.Fatalf("ReadLine: %v", err) + } + if string(line) != "ab" { + t.Errorf("got %q", line) + } +} + +func TestShmringTransportWriteLineWrapsAcrossSegments(t *testing.T) { + tx := shmring.New(8) + tr := NewShmringTransport(shmring.New(8), tx) + defer tr.Close() + + tx.TryWriteFrom([]byte("123456")) + var discard [6]byte + if n := tx.TryReadInto(discard[:]); n != len(discard) { + t.Fatalf("priming read = %d, want %d", n, len(discard)) + } + + if err := tr.WriteLine([]byte("ab")); err != nil { + t.Fatalf("WriteLine: %v", err) + } + var out [3]byte + if n := tx.TryReadInto(out[:]); n != len(out) { + t.Fatalf("wrapped read = %d, want %d", n, 
len(out)) + } + if string(out[:]) != "ab\n" { + t.Errorf("tx got %q", out[:]) + } +} + +func TestShmringTransportOversize(t *testing.T) { + rx := shmring.New(4096) + tr := NewShmringTransport(rx, shmring.New(256)) + defer tr.Close() + big := make([]byte, maxLineLen+100) + for i := range big { + big[i] = 'x' + } + rx.TryWriteFrom(big) + rx.TryWriteFrom([]byte("\n")) + rx.TryWriteFrom([]byte(`{"t":"ping","ts":7}` + "\n")) + _, err := tr.ReadLine() + if !errors.Is(err, ErrLineTooLong) { + t.Fatalf("expected ErrLineTooLong, got %v", err) + } + line, err := tr.ReadLine() + if err != nil { + t.Fatalf("second ReadLine: %v", err) + } + if string(line) != `{"t":"ping","ts":7}` { + t.Errorf("got %q", line) + } +} + +func TestShmringTransportCloseUnblocks(t *testing.T) { + tr := NewShmringTransport(shmring.New(256), shmring.New(256)) + done := make(chan struct{}) + go func() { tr.ReadLine(); close(done) }() + tr.Close() + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatal("ReadLine did not unblock") + } +} + +// ---- handshake ---- + +func TestHandshake(t *testing.T) { + mcu, cm5 := pipePair() + b := newBus() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") + + sendMsg(t, cm5, wireHello{ + T: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "s1", Proto: protoVersion, + }) + ack := readMsg[wireHelloAck](t, cm5) + if !ack.OK || ack.Node != "mcu-1" || ack.SID == "" || ack.Proto != protoVersion { + t.Errorf("bad ack: %+v", ack) + } + time.Sleep(50 * time.Millisecond) + sendMsg(t, cm5, wirePing{T: "ping", TS: 99, SID: "s1"}) + pong := readMsg[wirePong](t, cm5) + if pong.TS != 99 || pong.SID != ack.SID { + t.Errorf("bad pong: %+v ack=%+v", pong, ack) + } +} + +func TestSessionReset(t *testing.T) { + mcu, cm5 := pipePair() + b := newBus() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", 
"cm5-local") + bringUp(t, cm5) + + sendMsg(t, cm5, wireHello{T: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "s2", Proto: protoVersion}) + ack := readMsg[wireHelloAck](t, cm5) + if !ack.OK || ack.SID == "" || ack.Proto != protoVersion { + t.Error("hello_ack.OK = false") + } + sendMsg(t, cm5, wirePing{T: "ping", TS: 55, SID: "s2"}) + pong := readMsg[wirePong](t, cm5) + if pong.TS != 55 || pong.SID != ack.SID { + t.Errorf("bad pong: %+v ack=%+v", pong, ack) + } +} + +func TestRejectsWrongPeer(t *testing.T) { + mcu, cm5 := pipePair() + b := newBus() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") + + sendMsg(t, cm5, wireHello{T: "hello", Node: "cm5-local", Peer: "mcu-999", SID: "s1", Proto: protoVersion}) + gotLine := make(chan readResult, 1) + go func() { + line, err := cm5.ReadLine() + gotLine <- readResult{line: line, err: err} + }() + select { + case <-gotLine: + t.Fatal("got response to wrong-peer hello") + case <-time.After(200 * time.Millisecond): + } + sendMsg(t, cm5, wireHello{T: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "s2", Proto: protoVersion}) + select { + case res := <-gotLine: + if res.err != nil { + t.Fatalf("ReadLine error: %v", res.err) + } + var ack wireHelloAck + if err := json.Unmarshal(res.line, &ack); err != nil { + t.Fatalf("expected hello_ack: %v", err) + } + if !ack.OK { + t.Fatal("hello_ack.OK = false") + } + case <-time.After(2 * time.Second): + t.Fatal("no hello_ack for correct peer") + } +} + +func TestRejectsMissingNodeWhenPeerPinned(t *testing.T) { + mcu, cm5 := pipePair() + b := newBus() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") + + gotLine := make(chan readResult, 1) + go func() { + line, err := cm5.ReadLine() + gotLine <- readResult{line: line, err: err} + }() + + sendMsg(t, cm5, wireHello{T: "hello", Peer: "mcu-1", SID: "s1", 
Proto: protoVersion}) + select { + case <-gotLine: + t.Fatal("got response to hello without node") + case <-time.After(200 * time.Millisecond): + } + + sendMsg(t, cm5, wireHello{T: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "s2", Proto: protoVersion}) + select { + case res := <-gotLine: + if res.err != nil { + t.Fatalf("ReadLine error: %v", res.err) + } + var ack wireHelloAck + if err := json.Unmarshal(res.line, &ack); err != nil { + t.Fatalf("expected hello_ack: %v", err) + } + if !ack.OK { + t.Fatal("hello_ack.OK = false") + } + case <-time.After(2 * time.Second): + t.Fatal("no hello_ack for correct peer") + } +} + +func TestPingPong(t *testing.T) { + mcu, cm5 := pipePair() + b := newBus() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") + ack := bringUp(t, cm5) + sendMsg(t, cm5, wirePing{T: "ping", TS: 42, SID: "s1"}) + pong := readMsg[wirePong](t, cm5) + if pong.TS != 42 || pong.SID != ack.SID { + t.Errorf("bad pong: %+v ack=%+v", pong, ack) + } +} + +func TestMCUNeverInitiates(t *testing.T) { + mcu, cm5 := pipePair() + b := newBus() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") + gotLine := make(chan struct{}) + go func() { cm5.ReadLine(); close(gotLine) }() + select { + case <-gotLine: + t.Fatal("MCU sent unsolicited message") + case <-time.After(2 * time.Second): + } + cancel() +} + +func TestUnknownTypeIgnored(t *testing.T) { + mcu, cm5 := pipePair() + b := newBus() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") + bringUp(t, cm5) + cm5.WriteLine([]byte(`{"t":"future_msg"}`)) + sendMsg(t, cm5, wirePing{T: "ping", TS: 1}) + pong := readMsg[wirePong](t, cm5) + if pong.TS != 1 { + t.Errorf("pong.TS = %d", pong.TS) + } +} + +func TestMalformedJSONIgnored(t *testing.T) { + 
mcu, cm5 := pipePair() + b := newBus() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") + bringUp(t, cm5) + cm5.WriteLine([]byte("not json")) + sendMsg(t, cm5, wirePing{T: "ping", TS: 2}) + pong := readMsg[wirePong](t, cm5) + if pong.TS != 2 { + t.Errorf("pong.TS = %d", pong.TS) + } +} + +func TestCancelClosesCleanly(t *testing.T) { + mcu, cm5 := pipePair() + b := newBus() + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan struct{}) + go func() { Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local"); close(done) }() + bringUp(t, cm5) + cancel() + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatal("Run did not return") + } +} + +func TestLinkStatePublishedOnHandshake(t *testing.T) { + mcu, cm5 := pipePair() + b := newBus() + observer := b.NewConnection("observer") + sub := observer.Subscribe(bus.T("state", "fabric", "link", "mcu0")) + defer observer.Unsubscribe(sub) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") + + ack := bringUp(t, cm5) + + var sawOpening bool + deadline := time.After(2 * time.Second) + for { + select { + case msg := <-sub.Channel(): + if msg == nil { + t.Fatal("nil link-state message") + } + payload, ok := msg.Payload.(linkStatePayload) + if !ok { + t.Fatalf("payload type = %T, want linkStatePayload", msg.Payload) + } + if payload.Status == "opening" { + sawOpening = true + } + if payload.Status == "ready" { + if payload.LinkID != "mcu0" { + t.Fatalf("link_id = %q, want mcu0", payload.LinkID) + } + if !payload.Ready || !payload.Established { + t.Fatalf("expected ready/established link state, got %+v", payload) + } + if payload.PeerID != "cm5-local" { + t.Fatalf("peer_id = %q, want cm5-local", payload.PeerID) + } + if payload.LocalSID != ack.SID { + t.Fatalf("local_sid = %q, want %q", payload.LocalSID, 
ack.SID) + } + if payload.PeerSID != "s1" { + t.Fatalf("peer_sid = %q, want s1", payload.PeerSID) + } + if !sawOpening { + t.Fatal("did not observe opening link state before ready") + } + return + } + case <-deadline: + t.Fatal("timed out waiting for ready link state") + } + } +} + +// ---- remap ---- + +func topicString(t bus.Topic) string { + if t == nil { + return "" + } + var parts []string + for i := 0; i < t.Len(); i++ { + parts = append(parts, t.At(i).(string)) + } + return strings.Join(parts, "/") +} + +func TestImportPublishTopic(t *testing.T) { + for _, tc := range []struct { + wire []string + want string + }{ + {[]string{"config", "device"}, "config/device"}, + {[]string{"config", "other"}, ""}, + {[]string{"unknown", "x"}, ""}, + {nil, ""}, + } { + got := importPublishTopic(tc.wire) + if gotStr := topicString(got); gotStr != tc.want { + t.Errorf("importPublishTopic(%v) = %q, want %q", tc.wire, gotStr, tc.want) + } + } +} + +func TestImportCallTopic(t *testing.T) { + for _, tc := range []struct { + wire []string + want string + }{ + {[]string{"rpc", "hal", "read_state"}, "rpc/hal/read_state"}, + {[]string{"rpc", "hal", "dump"}, "rpc/hal/dump"}, + {[]string{"rpc", "hal", "other"}, ""}, + {[]string{"config", "device"}, ""}, + {nil, ""}, + } { + got := importCallTopic(tc.wire) + if gotStr := topicString(got); gotStr != tc.want { + t.Errorf("importCallTopic(%v) = %q, want %q", tc.wire, gotStr, tc.want) + } + } +} + +func TestExportTopic(t *testing.T) { + for _, tc := range []struct { + bus bus.Topic + want []string + }{ + {bus.T("hal", "cap", "env", "temperature", "core", "value"), []string{"state", "env", "temperature", "core", "value"}}, + {bus.T("hal", "cap", "power", "battery", "internal", "value"), []string{"state", "power", "battery", "internal", "value"}}, + {bus.T("hal", "state"), []string{"state", "hal"}}, + {bus.T("hal", "cap", "gpio", "fan", "value"), nil}, + {bus.T("other", "topic"), nil}, + } { + got := exportTopic(tc.bus) + if tc.want == nil { 
+ if got != nil { + t.Errorf("exportTopic(%v) = %v, want nil", tc.bus, got) + } + } else { + if !slicesEqual(got, tc.want) { + t.Errorf("exportTopic(%v) = %v, want %v", tc.bus, got, tc.want) + } + } + } +} + +func TestExportCallTopic(t *testing.T) { + for _, tc := range []struct { + bus bus.Topic + want []string + }{ + {bus.T("fabric", "out", "rpc", "hal", "read_state"), []string{"rpc", "hal", "read_state"}}, + {bus.T("fabric", "out", "rpc", "hal", "dump"), nil}, + {bus.T("fabric", "out", "rpc", "hal"), nil}, + {bus.T("other", "topic"), nil}, + } { + got := exportCallTopic(tc.bus) + if tc.want == nil { + if got != nil { + t.Errorf("exportCallTopic(%v) = %v, want nil", tc.bus, got) + } + } else if !slicesEqual(got, tc.want) { + t.Errorf("exportCallTopic(%v) = %v, want %v", tc.bus, got, tc.want) + } + } +} + +func TestExportCallPatterns(t *testing.T) { + patterns := exportCallPatterns() + if len(patterns) != 1 { + t.Fatalf("len(exportCallPatterns()) = %d, want 1", len(patterns)) + } + if got := topicString(patterns[0]); got != "fabric/out/rpc/hal/read_state" { + t.Fatalf("exportCallPatterns()[0] = %q, want fabric/out/rpc/hal/read_state", got) + } +} + +func slicesEqual(a, b []string) bool { + if len(a) != len(b) { + return false + } + for i := range a { + if a[i] != b[i] { + return false + } + } + return true +} + +// ---- pub import ---- + +func TestPubImport(t *testing.T) { + mcu, cm5 := pipePair() + b := newBus() + conn := b.NewConnection("fabric") + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, conn, "mcu-1", "cm5-local") + bringUp(t, cm5) + + reader := b.NewConnection("test") + sub := reader.Subscribe(bus.T("config", "device")) + + sendMsg(t, cm5, wirePub{ + T: "pub", + Topic: []string{"config", "device"}, + Payload: json.RawMessage(`{"mode":"normal"}`), + Retain: false, + }) + + select { + case m := <-sub.Channel(): + if m == nil { + t.Fatal("nil message") + } + raw, ok := m.Payload.(json.RawMessage) + if !ok { + 
t.Fatalf("payload type = %T, want json.RawMessage", m.Payload) + } + if string(raw) != `{"mode":"normal"}` { + t.Errorf("payload = %s", raw) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for imported pub") + } +} + +// ---- pub export ---- + +func TestPubExport(t *testing.T) { + mcu, cm5 := pipePair() + b := newBus() + fabricConn := b.NewConnection("fabric") + publishConn := b.NewConnection("hal") + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local") + ack := bringUp(t, cm5) + unlockExports(t, cm5, ack.SID) + + publishConn.Publish(publishConn.NewMessage( + bus.T("hal", "cap", "env", "temperature", "core", "value"), + map[string]int{"deci_c": 412}, + true, + )) + + msg := readMsg[wirePub](t, cm5) + if msg.T != "pub" { + t.Fatalf("expected pub, got %q", msg.T) + } + want := []string{"state", "env", "temperature", "core", "value"} + if !slicesEqual(msg.Topic, want) { + t.Errorf("topic = %v, want %v", msg.Topic, want) + } + if !msg.Retain { + t.Error("expected retain=true") + } +} + +func TestDrainExportsReturnsWhenSubscriptionClosed(t *testing.T) { + b := newBus() + conn := b.NewConnection("fabric") + sub := conn.Subscribe(bus.T("state", "#")) + conn.Unsubscribe(sub) + + s := session{ + link: linkUp, + exportSubs: []*bus.Subscription{sub}, + } + + done := make(chan struct{}) + go func() { + s.drainExports() + close(done) + }() + + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatal("drainExports did not return") + } +} + +func TestDrainExportsWaitsForStartupHoldoff(t *testing.T) { + b := newBus() + conn := b.NewConnection("fabric") + pub := b.NewConnection("hal") + sub := conn.Subscribe(bus.T("hal", "cap", "env", "#")) + defer conn.Unsubscribe(sub) + + msg := pub.NewMessage( + bus.T("hal", "cap", "env", "temperature", "core", "value"), + map[string]int{"deci_c": 412}, + true, + ) + + s := session{ + link: linkUp, + exportsArmed: true, + exportSubs: 
[]*bus.Subscription{sub}, + exportReadyAt: time.Now().Add(time.Second), + } + + pub.Publish(msg) + + done := make(chan struct{}) + go func() { + s.drainExports() + close(done) + }() + + select { + case <-done: + case <-time.After(2 * time.Second): + t.Fatal("drainExports did not return") + } +} + +// ---- unretain ---- + +func TestPubIgnoredBeforeHandshake(t *testing.T) { + mcu, cm5 := pipePair() + b := newBus() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") + + sendMsg(t, cm5, wirePub{ + T: "pub", Topic: []string{"config", "device"}, + Payload: json.RawMessage(`{"v":1}`), Retain: true, + }) + time.Sleep(50 * time.Millisecond) + + reader := b.NewConnection("test") + sub := reader.Subscribe(bus.T("config", "device")) + defer reader.Unsubscribe(sub) + select { + case m := <-sub.Channel(): + t.Fatalf("unexpected pre-handshake publish: %+v", m) + case <-time.After(100 * time.Millisecond): + } +} + +func TestUnretainIgnoredBeforeHandshake(t *testing.T) { + mcu, cm5 := pipePair() + b := newBus() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") + + writer := b.NewConnection("writer") + writer.Publish(writer.NewMessage(bus.T("config", "device"), json.RawMessage(`{"v":1}`), true)) + + reader := b.NewConnection("test") + sub := reader.Subscribe(bus.T("config", "device")) + defer reader.Unsubscribe(sub) + select { + case m := <-sub.Channel(): + if m == nil || m.Payload == nil { + t.Fatalf("expected retained config/device, got %+v", m) + } + case <-time.After(2 * time.Second): + t.Fatal("timed out waiting for retained config/device") + } + + sendMsg(t, cm5, wireUnretain{T: "unretain", Topic: []string{"config", "device"}}) + select { + case m := <-sub.Channel(): + t.Fatalf("unexpected pre-handshake unretain effect: %+v", m) + case <-time.After(100 * time.Millisecond): + } +} + +func 
TestUnretain(t *testing.T) { + mcu, cm5 := pipePair() + b := newBus() + conn := b.NewConnection("fabric") + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, conn, "mcu-1", "cm5-local") + bringUp(t, cm5) + + sendMsg(t, cm5, wirePub{ + T: "pub", Topic: []string{"config", "device"}, + Payload: json.RawMessage(`{"v":1}`), Retain: true, + }) + time.Sleep(50 * time.Millisecond) + sendMsg(t, cm5, wireUnretain{T: "unretain", Topic: []string{"config", "device"}}) + time.Sleep(50 * time.Millisecond) + + reader := b.NewConnection("test") + sub := reader.Subscribe(bus.T("config", "device")) + select { + case m := <-sub.Channel(): + if m != nil && m.Payload != nil { + t.Errorf("expected no retained message, got %+v", m) + } + case <-time.After(100 * time.Millisecond): + } +} + +// ---- call import ---- + +func TestCallIgnoredBeforeHandshake(t *testing.T) { + mcu, cm5 := pipePair() + b := newBus() + fabricConn := b.NewConnection("fabric") + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local") + + handler := b.NewConnection("handler") + sub := handler.Subscribe(bus.T("rpc", "hal", "dump")) + defer handler.Unsubscribe(sub) + + sendMsg(t, cm5, wireCall{ + T: "call", ID: "pre-hello-1", Topic: []string{"rpc", "hal", "dump"}, + Payload: json.RawMessage(`{}`), TimeoutMs: 5000, + }) + + select { + case m := <-sub.Channel(): + t.Fatalf("unexpected pre-handshake call dispatch: %+v", m) + case <-time.After(100 * time.Millisecond): + } +} + +func TestCallImport(t *testing.T) { + mcu, cm5 := pipePair() + b := newBus() + fabricConn := b.NewConnection("fabric") + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local") + bringUp(t, cm5) + + handler := b.NewConnection("handler") + sub := handler.Subscribe(bus.T("rpc", "hal", "dump")) + go func() { + for m := range sub.Channel() { + handler.Reply(m, 
map[string]string{"result": "ok"}, false) + } + }() + + sendMsg(t, cm5, wireCall{ + T: "call", ID: "test-corr-1", Topic: []string{"rpc", "hal", "dump"}, + Payload: json.RawMessage(`{}`), TimeoutMs: 5000, + }) + + reply := readMsg[wireReply](t, cm5) + if reply.Corr != "test-corr-1" { + t.Errorf("corr = %q", reply.Corr) + } + if !reply.OK { + t.Errorf("reply not ok: %s", reply.Err) + } +} + +func TestCallNoRoute(t *testing.T) { + mcu, cm5 := pipePair() + b := newBus() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") + bringUp(t, cm5) + + sendMsg(t, cm5, wireCall{ + T: "call", ID: "no-route-1", Topic: []string{"unknown", "endpoint"}, + Payload: json.RawMessage(`{}`), TimeoutMs: 1000, + }) + + reply := readMsg[wireReply](t, cm5) + if reply.Corr != "no-route-1" { + t.Errorf("corr = %q", reply.Corr) + } + if reply.OK { + t.Error("expected ok=false") + } + if reply.Err != "no_route" { + t.Errorf("err = %q, want no_route", reply.Err) + } +} + +func TestCallHandlerError(t *testing.T) { + mcu, cm5 := pipePair() + b := newBus() + fabricConn := b.NewConnection("fabric") + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local") + bringUp(t, cm5) + + handler := b.NewConnection("handler") + sub := handler.Subscribe(bus.T("rpc", "hal", "read_state")) + go func() { + for m := range sub.Channel() { + handler.Reply(m, struct { + OK bool `json:"ok"` + Error string `json:"error"` + }{OK: false, Error: "device_busy"}, false) + } + }() + + sendMsg(t, cm5, wireCall{ + T: "call", ID: "err-1", Topic: []string{"rpc", "hal", "read_state"}, + Payload: json.RawMessage(`{}`), TimeoutMs: 5000, + }) + + reply := readMsg[wireReply](t, cm5) + if reply.Corr != "err-1" { + t.Errorf("corr = %q", reply.Corr) + } + if reply.OK { + t.Error("expected ok=false for handler error") + } + if reply.Err != "device_busy" { + t.Errorf("err = %q, want 
device_busy", reply.Err) + } +} + +func TestCallDoesNotBlockPing(t *testing.T) { + mcu, cm5 := pipePair() + b := newBus() + fabricConn := b.NewConnection("fabric") + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local") + bringUp(t, cm5) + + handler := b.NewConnection("handler") + sub := handler.Subscribe(bus.T("rpc", "hal", "read_state")) + go func() { + for m := range sub.Channel() { + time.Sleep(300 * time.Millisecond) + handler.Reply(m, map[string]string{"result": "ok"}, false) + } + }() + + sendMsg(t, cm5, wireCall{ + T: "call", ID: "slow-1", Topic: []string{"rpc", "hal", "read_state"}, + Payload: json.RawMessage(`{}`), TimeoutMs: 1000, + }) + sendMsg(t, cm5, wirePing{T: "ping", TS: 77, SID: "s1"}) + + type readResult struct { + line []byte + err error + } + first := make(chan readResult, 1) + go func() { + line, err := cm5.ReadLine() + first <- readResult{line: line, err: err} + }() + + select { + case res := <-first: + if res.err != nil { + t.Fatalf("ReadLine: %v", res.err) + } + if got := wireType(res.line); got != "pong" { + t.Fatalf("first response type = %q, want pong", got) + } + var pong wirePong + if err := json.Unmarshal(res.line, &pong); err != nil { + t.Fatalf("Unmarshal pong: %v", err) + } + if pong.TS != 77 || pong.SID == "" { + t.Fatalf("bad pong: %+v", pong) + } + case <-time.After(150 * time.Millisecond): + t.Fatal("ping blocked behind slow call") + } + + reply := readMsg[wireReply](t, cm5) + if reply.Corr != "slow-1" { + t.Errorf("corr = %q", reply.Corr) + } + if !reply.OK { + t.Errorf("reply not ok: %s", reply.Err) + } +} + +func TestCallExport(t *testing.T) { + mcu, cm5 := pipePair() + b := newBus() + fabricConn := b.NewConnection("fabric") + reqConn := b.NewConnection("caller") + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local") + ack := bringUp(t, cm5) + unlockExports(t, cm5, ack.SID) + + type 
result struct { + msg *bus.Message + err error + } + done := make(chan result, 1) + go func() { + msg, err := reqConn.RequestWait(context.Background(), reqConn.NewMessage( + bus.T("fabric", "out", "rpc", "hal", "read_state"), + map[string]string{"ask": "status"}, + false, + )) + done <- result{msg: msg, err: err} + }() + + call := readMsg[wireCall](t, cm5) + if call.T != "call" { + t.Fatalf("expected call, got %q", call.T) + } + want := []string{"rpc", "hal", "read_state"} + if !slicesEqual(call.Topic, want) { + t.Fatalf("topic = %v, want %v", call.Topic, want) + } + var payload map[string]string + if err := json.Unmarshal(call.Payload, &payload); err != nil { + t.Fatalf("Unmarshal payload: %v", err) + } + if payload["ask"] != "status" { + t.Fatalf("payload.ask = %q, want status", payload["ask"]) + } + + sendMsg(t, cm5, wireReply{ + T: "reply", + Corr: call.ID, + OK: true, + Payload: json.RawMessage(`{"ok":true,"remote":"cm5"}`), + }) + + select { + case res := <-done: + if res.err != nil { + t.Fatalf("RequestWait: %v", res.err) + } + if res.msg == nil { + t.Fatal("nil bus reply") + } + reply, ok := res.msg.Payload.(map[string]any) + if !ok { + t.Fatalf("payload type = %T, want map[string]any", res.msg.Payload) + } + if reply["remote"] != "cm5" { + t.Fatalf("reply.remote = %#v", reply["remote"]) + } + if reply["ok"] != true { + t.Fatalf("reply.ok = %#v", reply["ok"]) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for local reply") + } +} + +func TestCallExportOnlyConfiguredRule(t *testing.T) { + mcu, cm5 := pipePair() + b := newBus() + fabricConn := b.NewConnection("fabric") + reqConn := b.NewConnection("caller") + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local") + ack := bringUp(t, cm5) + unlockExports(t, cm5, ack.SID) + + reqCtx, reqCancel := context.WithTimeout(context.Background(), 250*time.Millisecond) + defer reqCancel() + go func() { + _, _ = 
reqConn.RequestWait(reqCtx, reqConn.NewMessage( + bus.T("fabric", "out", "rpc", "hal", "dump"), + map[string]string{"ask": "status"}, + false, + )) + }() + + gotLine := make(chan struct{}) + go func() { + _, _ = cm5.ReadLine() + close(gotLine) + }() + + select { + case <-gotLine: + t.Fatal("got wire call for unconfigured export rule") + case <-time.After(200 * time.Millisecond): + } +} + +func TestPendingWireCallsTimeout(t *testing.T) { + b := newBus() + fabricConn := b.NewConnection("fabric") + reqConn := b.NewConnection("caller") + msg := reqConn.NewMessage( + bus.T("fabric", "out", "rpc", "hal", "read_state"), + map[string]string{"ask": "status"}, + false, + ) + sub := reqConn.Request(msg) + defer reqConn.Unsubscribe(sub) + + s := session{ + conn: fabricConn, + pendingWireCalls: []*pendingWireCall{ + {id: "wire-1", req: msg, deadline: time.Now().Add(-time.Millisecond)}, + }, + } + + s.drainPendingWireCalls(time.Now()) + + select { + case reply := <-sub.Channel(): + if reply == nil { + t.Fatal("nil timeout reply") + } + out, ok := reply.Payload.(types.ErrorReply) + if !ok { + t.Fatalf("payload type = %T, want types.ErrorReply", reply.Payload) + } + if out.OK { + t.Fatal("expected ok=false") + } + if out.Error != "timeout" { + t.Fatalf("error = %q, want timeout", out.Error) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for timeout reply") + } +} + +func TestDrainExportsDropsUnmarshalablePayload(t *testing.T) { + b := newBus() + fabricConn := b.NewConnection("fabric") + pubConn := b.NewConnection("publisher") + tr := &captureTransport{} + s := session{ + conn: fabricConn, + tr: tr, + link: linkUp, + } + + s.setupExports() + defer s.teardownExports() + + pubConn.Publish(pubConn.NewMessage(bus.T("hal", "state"), make(chan int), false)) + s.drainExports() + + if len(tr.writes) != 0 { + t.Fatalf("writes = %d, want 0", len(tr.writes)) + } +} + +func TestDrainPendingCallsReportsMarshalFailure(t *testing.T) { + b := newBus() + fabricConn := 
b.NewConnection("fabric") + handlerConn := b.NewConnection("handler") + tr := &captureTransport{} + + sub := handlerConn.Subscribe(bus.T("rpc", "hal", "read_state")) + defer handlerConn.Unsubscribe(sub) + req := fabricConn.NewMessage(bus.T("rpc", "hal", "read_state"), map[string]string{"ask": "status"}, false) + replySub := fabricConn.Request(req) + + var msg *bus.Message + select { + case msg = <-sub.Channel(): + if msg == nil { + t.Fatal("nil request message") + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for request message") + } + handlerConn.Reply(msg, make(chan int), false) + + s := session{ + conn: fabricConn, + tr: tr, + pendingCalls: []*pendingCall{{ + id: "call-1", + sub: replySub, + deadline: time.Now().Add(time.Second), + }}, + } + + s.drainPendingCalls(time.Now()) + + if len(tr.writes) != 1 { + t.Fatalf("writes = %d, want 1", len(tr.writes)) + } + var reply wireReply + if err := json.Unmarshal(tr.writes[0], &reply); err != nil { + t.Fatalf("Unmarshal reply: %v", err) + } + if reply.Corr != "call-1" { + t.Fatalf("corr = %q, want call-1", reply.Corr) + } + if reply.OK { + t.Fatal("expected ok=false") + } + if reply.Err != errPayloadMarshal { + t.Fatalf("err = %q, want %q", reply.Err, errPayloadMarshal) + } +} + +func TestDrainOutgoingWireCallsReportsMarshalFailure(t *testing.T) { + b := newBus() + fabricConn := b.NewConnection("fabric") + reqConn := b.NewConnection("caller") + tr := &captureTransport{} + s := session{ + conn: fabricConn, + tr: tr, + link: linkUp, + } + + s.setupExports() + defer s.teardownExports() + + msg := reqConn.NewMessage( + bus.T("fabric", "out", "rpc", "hal", "read_state"), + make(chan int), + false, + ) + replySub := reqConn.Request(msg) + defer reqConn.Unsubscribe(replySub) + + s.drainOutgoingWireCalls(time.Now()) + + if len(tr.writes) != 0 { + t.Fatalf("writes = %d, want 0", len(tr.writes)) + } + if len(s.pendingWireCalls) != 0 { + t.Fatalf("pendingWireCalls = %d, want 0", len(s.pendingWireCalls)) + } + 
+ select { + case reply := <-replySub.Channel(): + if reply == nil { + t.Fatal("nil reply") + } + out, ok := reply.Payload.(types.ErrorReply) + if !ok { + t.Fatalf("payload type = %T, want types.ErrorReply", reply.Payload) + } + if out.OK { + t.Fatal("expected ok=false") + } + if out.Error != errPayloadMarshal { + t.Fatalf("error = %q, want %q", out.Error, errPayloadMarshal) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for marshal failure reply") + } +} + +func TestDrainOutgoingWireCallsReportsWriteFailure(t *testing.T) { + b := newBus() + fabricConn := b.NewConnection("fabric") + reqConn := b.NewConnection("caller") + tr := &captureTransport{writeErr: errors.New("boom")} + s := session{ + conn: fabricConn, + tr: tr, + link: linkUp, + } + + s.setupExports() + defer s.teardownExports() + + msg := reqConn.NewMessage( + bus.T("fabric", "out", "rpc", "hal", "read_state"), + map[string]string{"ask": "status"}, + false, + ) + replySub := reqConn.Request(msg) + defer reqConn.Unsubscribe(replySub) + + s.drainOutgoingWireCalls(time.Now()) + + if s.link != linkDown { + t.Fatalf("link = %v, want %v", s.link, linkDown) + } + if len(s.pendingWireCalls) != 0 { + t.Fatalf("pendingWireCalls = %d, want 0", len(s.pendingWireCalls)) + } + + select { + case reply := <-replySub.Channel(): + if reply == nil { + t.Fatal("nil reply") + } + out, ok := reply.Payload.(types.ErrorReply) + if !ok { + t.Fatalf("payload type = %T, want types.ErrorReply", reply.Payload) + } + if out.OK { + t.Fatal("expected ok=false") + } + if out.Error != "transport_write_failed" { + t.Fatalf("error = %q, want transport_write_failed", out.Error) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for write failure reply") + } +} + +func TestCallExportPeerReset(t *testing.T) { + mcu, cm5 := pipePair() + b := newBus() + fabricConn := b.NewConnection("fabric") + reqConn := b.NewConnection("caller") + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + 
go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local") + ack := bringUp(t, cm5) + unlockExports(t, cm5, ack.SID) + + type result struct { + msg *bus.Message + err error + } + done := make(chan result, 1) + go func() { + msg, err := reqConn.RequestWait(context.Background(), reqConn.NewMessage( + bus.T("fabric", "out", "rpc", "hal", "read_state"), + map[string]string{"ask": "status"}, + false, + )) + done <- result{msg: msg, err: err} + }() + + call := readMsg[wireCall](t, cm5) + if call.T != "call" { + t.Fatalf("expected call, got %q", call.T) + } + + sendMsg(t, cm5, wireHello{ + T: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "fresh-session", Proto: protoVersion, + }) + _ = readMsg[wireHelloAck](t, cm5) + + select { + case res := <-done: + if res.err != nil { + t.Fatalf("RequestWait: %v", res.err) + } + if res.msg == nil { + t.Fatal("nil bus reply") + } + out, ok := res.msg.Payload.(types.ErrorReply) + if !ok { + t.Fatalf("payload type = %T, want types.ErrorReply", res.msg.Payload) + } + if out.OK { + t.Fatal("expected ok=false") + } + if out.Error != "peer_session_changed" { + t.Fatalf("error = %q, want peer_session_changed", out.Error) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for peer-reset reply") + } +} + +func TestEchoedHelloAckIgnoredDuringOutgoingCall(t *testing.T) { + mcu, cm5 := pipePair() + b := newBus() + fabricConn := b.NewConnection("fabric") + reqConn := b.NewConnection("caller") + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local") + ack := bringUp(t, cm5) + unlockExports(t, cm5, ack.SID) + + type result struct { + msg *bus.Message + err error + } + done := make(chan result, 1) + go func() { + msg, err := reqConn.RequestWait(context.Background(), reqConn.NewMessage( + bus.T("fabric", "out", "rpc", "hal", "read_state"), + map[string]string{"ask": "status"}, + false, + )) + done <- result{msg: msg, err: err} + }() + + call := readMsg[wireCall](t, cm5) + 
if call.T != "call" { + t.Fatalf("expected call, got %q", call.T) + } + + sendMsg(t, cm5, wireHelloAck{ + T: "hello_ack", Node: "mcu-1", SID: ack.SID, Proto: protoVersion, OK: true, + }) + + sendMsg(t, cm5, wireReply{ + T: "reply", + Corr: call.ID, + OK: true, + Payload: json.RawMessage(`{"ok":true,"remote":"cm5"}`), + }) + + select { + case res := <-done: + if res.err != nil { + t.Fatalf("RequestWait: %v", res.err) + } + if res.msg == nil { + t.Fatal("nil bus reply") + } + reply, ok := res.msg.Payload.(map[string]any) + if !ok { + t.Fatalf("payload type = %T, want map[string]any", res.msg.Payload) + } + if reply["remote"] != "cm5" || reply["ok"] != true { + t.Fatalf("unexpected reply payload: %#v", reply) + } + case <-time.After(2 * time.Second): + t.Fatal("timeout waiting for local reply after echoed hello_ack") + } +} diff --git a/services/fabric/session.go b/services/fabric/session.go new file mode 100644 index 0000000..d9be723 --- /dev/null +++ b/services/fabric/session.go @@ -0,0 +1,880 @@ +package fabric + +import ( + "context" + "encoding/json" + "errors" + "time" + + "devicecode-go/bus" + "devicecode-go/types" + "devicecode-go/x/strconvx" +) + +// ---- link state ---- + +type linkState int + +const ( + linkDown linkState = iota + linkUp +) + +// ---- timeouts (local policy) ---- + +const ( + staleTimeout = 45 * time.Second + callTimeoutDef = 5 * time.Second + waitLogEvery = 2 * time.Second + exportStartHoldoff = 1 * time.Second + // Give the serial reactor a chance to drain hello_ack before + // promoteLink publishes bus state and starts more work. This avoids + // relying on incidental println/GC timing in TinyGo. + postHelloAckSettle = 10 * time.Millisecond + // exportMaxPerTick caps the total export messages sent per drain + // cycle across all subscriptions, keeping UART throughput within + // the 115200-baud link capacity. 
+ exportMaxPerTick = 1 + exportWaitFallback = 15 * time.Second + errPayloadMarshal = "payload_marshal_failed" +) + +// session manages the fabric link state machine over a Transport. +// +// All bus access happens in the main loop goroutine only. TinyGo's +// cooperative scheduler panics if multiple goroutines contend on +// the bus's internal sync.Mutex. +type session struct { + linkID string + nodeID string + peerID string + localSID string + tr Transport + conn *bus.Connection + + link linkState + remoteNode string + peerSID string + peerProto int + helloSeen bool + lastRxAt time.Time + lastTxAt time.Time + lastPongAt time.Time + exportReadyAt time.Time + exportWaitUntil time.Time + exportsArmed bool + + exportSubs []*bus.Subscription + exportCallSubs []*bus.Subscription + pendingCalls []*pendingCall + pendingWireCalls []*pendingWireCall + nextWireCallID uint64 +} + +func (s *session) log(msg string) { + println("[fabric]", "sid", s.localSID, msg) +} + +func (s *session) logKV(msg, key, value string) { + println("[fabric]", "sid", s.localSID, msg, key, value) +} + +type pendingCall struct { + id string + sub *bus.Subscription + deadline time.Time +} + +type pendingWireCall struct { + id string + req *bus.Message + deadline time.Time +} + +type readResult struct { + line []byte + err error +} + +type linkStatePayload struct { + LinkID string `json:"link_id"` + Status string `json:"status"` + Ready bool `json:"ready"` + Established bool `json:"established"` + PeerID string `json:"peer_id"` + LocalSID string `json:"local_sid"` + PeerSID string `json:"peer_sid,omitempty"` + RemoteID string `json:"remote_id,omitempty"` + PeerProto int `json:"peer_proto,omitempty"` + LastRxUnixMilli int64 `json:"last_rx_unix_ms,omitempty"` + LastTxUnixMilli int64 `json:"last_tx_unix_ms,omitempty"` + LastPongUnixMilli int64 `json:"last_pong_unix_ms,omitempty"` + PendingCalls int `json:"pending_calls"` + PendingWireCalls int `json:"pending_wire_calls"` + Reason string 
`json:"reason,omitempty"` + Err string `json:"err,omitempty"` +} + +// run is the main loop. Blocks until ctx is cancelled. +func (s *session) run(ctx context.Context) { + lines := make(chan readResult, 1) + + go func() { + defer close(lines) + for { + line, err := s.tr.ReadLine() + if err != nil { + if errors.Is(err, ErrLineTooLong) { + s.log("oversized line dropped") + continue + } + select { + case lines <- readResult{err: err}: + case <-ctx.Done(): + } + return + } + cp := make([]byte, len(line)) + copy(cp, line) + select { + case lines <- readResult{line: cp}: + case <-ctx.Done(): + return + } + } + }() + + defer s.tr.Close() + defer s.teardownExports() + defer s.teardownPendingCalls() + defer s.teardownPendingWireCalls("link_down") + defer s.log("run stop") + + stale := time.NewTimer(staleTimeout) + defer stale.Stop() + + waitTick := time.NewTicker(waitLogEvery) + defer waitTick.Stop() + + // Poll subscription channels periodically. Needed because select + // blocks until a line/timer fires; without this, exported bus + // messages and async call replies would sit in subscription channels. 
+ exportTick := time.NewTicker(50 * time.Millisecond) + defer exportTick.Stop() + + s.logWaiting() + s.publishLinkState("", "") + s.log("run start") + + for { + select { + case <-ctx.Done(): + return + + case res, ok := <-lines: + if !ok { + return + } + if res.err != nil { + s.handleLinkDown("transport_down", res.err.Error()) + return + } + if s.dispatch(res.line) { + resetTimer(stale, staleTimeout) + } + + case <-exportTick.C: + s.drainExports() + s.drainPendingCalls(time.Now()) + s.drainWireCalls(time.Now()) + + case <-waitTick.C: + s.logWaiting() + + case <-stale.C: + if s.link == linkUp { + s.handleLinkDown("peer_stale", "") + } else { + stale.Reset(staleTimeout) + } + } + } +} + +func resetTimer(t *time.Timer, d time.Duration) { + if !t.Stop() { + select { + case <-t.C: + default: + } + } + t.Reset(d) +} + +func unixMilli(t time.Time) int64 { + if t.IsZero() { + return 0 + } + return t.UnixMilli() +} + +func (s *session) currentStatus() string { + if s.link == linkUp { + return "ready" + } + return "opening" +} + +func (s *session) publishLinkState(reason, err string) { + if s.conn == nil { + return + } + status := s.currentStatus() + if s.link != linkUp && (reason != "" || err != "") { + status = "down" + } + s.conn.Publish(s.conn.NewMessage( + bus.T("state", "fabric", "link", s.linkID), + linkStatePayload{ + LinkID: s.linkID, + Status: status, + Ready: s.link == linkUp, + Established: s.link == linkUp, + PeerID: s.peerID, + LocalSID: s.localSID, + PeerSID: s.peerSID, + RemoteID: s.remoteNode, + PeerProto: s.peerProto, + LastRxUnixMilli: unixMilli(s.lastRxAt), + LastTxUnixMilli: unixMilli(s.lastTxAt), + LastPongUnixMilli: unixMilli(s.lastPongAt), + PendingCalls: len(s.pendingCalls), + PendingWireCalls: len(s.pendingWireCalls), + Reason: reason, + Err: err, + }, + true, + )) +} + +func (s *session) noteRx(msgType string) { + s.lastRxAt = time.Now() + if msgType == "pong" { + s.lastPongAt = s.lastRxAt + } +} + +func (s *session) noteTx() { + s.lastTxAt = 
time.Now() +} + +func (s *session) handleLinkDown(reason, err string) { + pendingReason := reason + if pendingReason == "" { + pendingReason = "link_down" + } + s.link = linkDown + s.remoteNode = "" + s.peerSID = "" + s.peerProto = 0 + s.helloSeen = false + s.exportReadyAt = time.Time{} + s.exportWaitUntil = time.Time{} + s.exportsArmed = false + s.teardownExports() + s.teardownPendingCalls() + s.teardownPendingWireCalls(pendingReason) + s.publishLinkState(reason, err) + if err != "" { + s.logKV("link down", "err", err) + } else if reason != "" { + s.logKV("link down", "reason", reason) + } +} + +// promoteLink transitions to linkUp, tearing down any prior session state. +func (s *session) promoteLink(reason string) { + if s.link == linkUp { + s.teardownExports() + s.teardownPendingCalls() + if reason == "" { + reason = "peer_reset" + } + s.teardownPendingWireCalls(reason) + } + s.link = linkUp + s.exportReadyAt = time.Time{} + s.exportWaitUntil = time.Now().Add(exportWaitFallback) + s.exportsArmed = false + s.publishLinkState(reason, "") +} + +// ---- dispatch ---- + +func (s *session) dispatch(line []byte) bool { + msgType := wireType(line) + switch msgType { + case "hello": + return s.onHello(line) + case "hello_ack": + return s.onHelloAck(line) + case "ping": + return s.onPing(line) + case "pong": + return s.onPong(line) + case "pub": + return s.onPub(line) + case "unretain": + return s.onUnretain(line) + case "call": + return s.onCall(line) + case "reply": + return s.onReply(line) + default: + if msgType == "" { + s.log("invalid frame dropped") + } else { + s.logKV("unknown frame type dropped", "type", msgType) + } + return false + } +} + +func (s *session) notePeerIdentity(node, sid string, proto int) string { + reason := "" + if s.link == linkUp && s.peerSID != "" && sid != "" && s.peerSID != sid { + reason = "peer_session_changed" + } + if node != "" { + s.remoteNode = node + } + if sid != "" { + s.peerSID = sid + } + if proto > 0 { + s.peerProto = proto + 
} + return reason +} + +func (s *session) isSelfControlFrame(node, sid string) bool { + if sid != "" && sid == s.localSID { + return true + } + if node != "" && node == s.nodeID { + return true + } + return false +} + +func hasWirePrefix(topic, prefix []string) bool { + if len(topic) < len(prefix) { + return false + } + for i := range prefix { + if topic[i] != prefix[i] { + return false + } + } + return true +} + +func (s *session) onHello(line []byte) bool { + var msg wireHello + if json.Unmarshal(line, &msg) != nil { + s.log("malformed hello dropped") + return false + } + s.noteRx("hello") + if msg.Peer != "" && msg.Peer != s.nodeID { + s.log("hello dropped: wrong peer") + return false + } + if s.peerID != "" && msg.Node != s.peerID { + s.log("hello dropped: wrong node") + return false + } + reason := s.notePeerIdentity(msg.Node, msg.SID, msg.Proto) + s.helloSeen = true + s.logKV("hello rx", "peer_sid", msg.SID) + + if !s.writeLine(marshal(wireHelloAck{ + T: "hello_ack", + Node: s.nodeID, + SID: s.localSID, + Proto: protoVersion, + OK: true, + })) { + return true + } + s.log("hello_ack tx") + time.Sleep(postHelloAckSettle) + s.promoteLink(reason) + return true +} + +func (s *session) armExports(reason string) { + if s.link != linkUp || s.exportsArmed { + return + } + s.setupExports() + s.exportReadyAt = time.Now().Add(exportStartHoldoff) + s.exportWaitUntil = time.Time{} + s.exportsArmed = true + s.logKV("export replay armed", "reason", reason) +} + +func (s *session) onHelloAck(line []byte) bool { + var msg wireHelloAck + if json.Unmarshal(line, &msg) != nil { + s.log("malformed hello_ack dropped") + return false + } + if s.isSelfControlFrame(msg.Node, msg.SID) { + s.log("echoed hello_ack ignored") + return true + } + s.noteRx("hello_ack") + if !msg.OK { + s.log("hello_ack rejected by peer") + s.handleLinkDown("hello_rejected", "") + return true + } + reason := s.notePeerIdentity(msg.Node, msg.SID, msg.Proto) + s.helloSeen = true + s.logKV("hello_ack rx", 
"peer_sid", msg.SID) + s.promoteLink(reason) + return true +} + +func (s *session) onPing(line []byte) bool { + var msg wirePing + if json.Unmarshal(line, &msg) != nil { + s.log("malformed ping dropped") + return false + } + s.noteRx("ping") + if s.link != linkUp { + s.log("ping dropped: link not up") + return true + } + reason := s.notePeerIdentity("", msg.SID, 0) + if reason != "" { + s.logKV("peer session changed", "reason", reason) + s.teardownExports() + s.teardownPendingCalls() + s.teardownPendingWireCalls(reason) + s.exportsArmed = false + } + s.armExports("peer_ping") + s.logKV("ping rx", "peer_sid", msg.SID) + if !s.writeLine(marshal(wirePong{T: "pong", TS: msg.TS, SID: s.localSID})) { + return true + } + s.log("pong tx") + s.publishLinkState(reason, "") + return true +} + +func (s *session) onPong(line []byte) bool { + var msg wirePong + if json.Unmarshal(line, &msg) != nil { + s.log("malformed pong dropped") + return false + } + if s.isSelfControlFrame("", msg.SID) { + s.log("echoed pong ignored") + return true + } + s.noteRx("pong") + reason := s.notePeerIdentity("", msg.SID, 0) + if reason != "" { + s.logKV("peer session changed", "reason", reason) + s.teardownExports() + s.teardownPendingCalls() + s.teardownPendingWireCalls(reason) + s.exportsArmed = false + } + s.armExports("peer_pong") + s.publishLinkState(reason, "") + return true +} + +func (s *session) onPub(line []byte) bool { + var msg wirePub + if json.Unmarshal(line, &msg) != nil { + s.log("malformed pub dropped") + return false + } + s.noteRx("pub") + if s.link != linkUp { + s.log("pub dropped before handshake") + return true + } + t := importPublishTopic(msg.Topic) + if t == nil { + if hasWirePrefix(msg.Topic, []string{"state"}) { + s.log("echoed state pub ignored") + return true + } + s.log("incoming pub dropped: no_route") + return true + } + s.armExports("peer_pub") + s.conn.Publish(s.conn.NewMessage(t, msg.Payload, msg.Retain)) + return true +} + +func (s *session) onUnretain(line 
[]byte) bool { + var msg wireUnretain + if json.Unmarshal(line, &msg) != nil { + println("[fabric] malformed unretain dropped") + return false + } + s.noteRx("unretain") + if s.link != linkUp { + println("[fabric] unretain dropped before handshake") + return true + } + t := importPublishTopic(msg.Topic) + if t == nil { + println("[fabric] incoming unretain dropped: no_route") + return true + } + s.armExports("peer_unretain") + s.conn.Publish(s.conn.NewMessage(t, nil, true)) + return true +} + +func (s *session) onCall(line []byte) bool { + var msg wireCall + if json.Unmarshal(line, &msg) != nil { + println("[fabric] malformed call dropped") + return false + } + s.noteRx("call") + if s.link != linkUp { + println("[fabric] call dropped before handshake") + return true + } + t := importCallTopic(msg.Topic) + if t == nil { + println("[fabric] incoming call dropped: no_route") + s.writeLine(marshal(wireReply{T: "reply", Corr: msg.ID, OK: false, Err: "no_route"})) + return true + } + s.armExports("peer_call") + + timeout := callTimeoutDef + if msg.TimeoutMs > 0 { + timeout = time.Duration(msg.TimeoutMs) * time.Millisecond + } + busMsg := s.conn.NewMessage(t, msg.Payload, false) + sub := s.conn.Request(busMsg) + s.pendingCalls = append(s.pendingCalls, &pendingCall{ + id: msg.ID, + sub: sub, + deadline: time.Now().Add(timeout), + }) + return true +} + +func (s *session) onReply(line []byte) bool { + var msg wireReply + if json.Unmarshal(line, &msg) != nil { + println("[fabric] malformed reply dropped") + return false + } + s.noteRx("reply") + s.armExports("peer_reply") + + for i, call := range s.pendingWireCalls { + if call.id != msg.Corr { + continue + } + s.pendingWireCalls = append(s.pendingWireCalls[:i], s.pendingWireCalls[i+1:]...) 
+ if !call.req.CanReply() { + return true + } + if !msg.OK { + s.conn.Reply(call.req, types.ErrorReply{OK: false, Error: msg.Err}, false) + return true + } + s.conn.Reply(call.req, decodePayload(msg.Payload), false) + return true + } + + println("[fabric] unexpected reply dropped:", msg.Corr) + return true +} + +func checkBusError(payload any) string { + b, err := json.Marshal(payload) + if err != nil { + return "" + } + var probe struct { + OK bool `json:"ok"` + Error string `json:"error"` + } + if json.Unmarshal(b, &probe) == nil && !probe.OK && probe.Error != "" { + return probe.Error + } + return "" +} + +func marshalPayload(payload any) (json.RawMessage, error) { + b, err := json.Marshal(payload) + if err != nil { + return nil, err + } + return json.RawMessage(b), nil +} + +// ---- export lifecycle ---- +// +// Exports are drained inline in the main loop (no extra goroutines) +// to avoid TinyGo cooperative scheduler mutex panics. + +func (s *session) setupExports() { + if s.conn == nil { + return + } + for _, p := range exportPatterns() { + s.exportSubs = append(s.exportSubs, s.conn.Subscribe(p)) + } + for _, p := range exportCallPatterns() { + s.exportCallSubs = append(s.exportCallSubs, s.conn.Subscribe(p)) + } +} + +func (s *session) teardownExports() { + for _, sub := range s.exportSubs { + s.conn.Unsubscribe(sub) + } + s.exportSubs = nil + for _, sub := range s.exportCallSubs { + s.conn.Unsubscribe(sub) + } + s.exportCallSubs = nil +} + +func (s *session) teardownPendingCalls() { + for _, call := range s.pendingCalls { + s.conn.Unsubscribe(call.sub) + } + s.pendingCalls = nil +} + +func (s *session) teardownPendingWireCalls(reason string) { + for _, call := range s.pendingWireCalls { + if call.req != nil && call.req.CanReply() { + s.conn.Reply(call.req, types.ErrorReply{OK: false, Error: reason}, false) + } + } + s.pendingWireCalls = nil +} + +// drainExports does a non-blocking read of each export subscription +// and writes any messages to the wire. 
Called from the main loop. +func (s *session) drainExports() { + if s.link != linkUp { + return + } + if !s.exportsArmed { + if !s.exportWaitUntil.IsZero() && !time.Now().Before(s.exportWaitUntil) { + s.armExports("fallback") + } else { + return + } + } + if !s.exportReadyAt.IsZero() && time.Now().Before(s.exportReadyAt) { + return + } + total := 0 + for _, sub := range s.exportSubs { + for { + if total >= exportMaxPerTick { + return + } + select { + case m, ok := <-sub.Channel(): + if !ok || m == nil { + goto nextSub + } + wire := exportTopic(m.Topic) + if wire == nil { + continue + } + payload, err := marshalPayload(m.Payload) + if err != nil { + println("[fabric] export payload dropped:", err.Error()) + continue + } + if !s.writeLine(marshal(wirePub{ + T: "pub", + Topic: wire, + Payload: payload, + Retain: m.Retained, + })) { + return + } + total++ + default: + goto nextSub + } + } + nextSub: + } +} + +func (s *session) drainPendingCalls(now time.Time) { + if len(s.pendingCalls) == 0 { + return + } + + keep := s.pendingCalls[:0] + for _, call := range s.pendingCalls { + select { + case reply, ok := <-call.sub.Channel(): + s.conn.Unsubscribe(call.sub) + if !ok || reply == nil { + if !s.writeLine(marshal(wireReply{T: "reply", Corr: call.id, OK: false, Err: "timeout"})) { + return + } + continue + } + if errStr := checkBusError(reply.Payload); errStr != "" { + if !s.writeLine(marshal(wireReply{T: "reply", Corr: call.id, OK: false, Err: errStr})) { + return + } + continue + } + payload, err := marshalPayload(reply.Payload) + if err != nil { + if !s.writeLine(marshal(wireReply{T: "reply", Corr: call.id, OK: false, Err: errPayloadMarshal})) { + return + } + continue + } + if !s.writeLine(marshal(wireReply{T: "reply", Corr: call.id, OK: true, Payload: payload})) { + return + } + continue + default: + } + + if !now.Before(call.deadline) { + s.conn.Unsubscribe(call.sub) + if !s.writeLine(marshal(wireReply{T: "reply", Corr: call.id, OK: false, Err: "timeout"})) { + return 
+ } + continue + } + + keep = append(keep, call) + } + + s.pendingCalls = keep +} + +func (s *session) drainWireCalls(now time.Time) { + s.drainOutgoingWireCalls(now) + s.drainPendingWireCalls(now) +} + +func (s *session) drainOutgoingWireCalls(now time.Time) { + if s.link != linkUp || len(s.exportCallSubs) == 0 { + return + } + + for _, sub := range s.exportCallSubs { + for { + select { + case msg, ok := <-sub.Channel(): + if !ok || msg == nil { + goto nextSub + } + + wireTopic := exportCallTopic(msg.Topic) + if wireTopic == nil { + continue + } + + payload, err := marshalPayload(msg.Payload) + if err != nil { + println("[fabric] outgoing call dropped:", err.Error()) + if msg.CanReply() { + s.conn.Reply(msg, types.ErrorReply{OK: false, Error: errPayloadMarshal}, false) + } + continue + } + id := s.nextWireCallID + s.nextWireCallID++ + corr := "wire-" + strconvx.Utoa64(id) + if msg.CanReply() { + s.pendingWireCalls = append(s.pendingWireCalls, &pendingWireCall{ + id: corr, + req: msg, + deadline: now.Add(callTimeoutDef), + }) + } + if !s.writeLine(marshal(wireCall{ + T: "call", + ID: corr, + Topic: wireTopic, + Payload: payload, + TimeoutMs: int(callTimeoutDef / time.Millisecond), + })) { + return + } + default: + goto nextSub + } + } + nextSub: + } +} + +func (s *session) drainPendingWireCalls(now time.Time) { + if len(s.pendingWireCalls) == 0 { + return + } + + keep := s.pendingWireCalls[:0] + for _, call := range s.pendingWireCalls { + if !now.Before(call.deadline) { + if call.req != nil && call.req.CanReply() { + s.conn.Reply(call.req, types.ErrorReply{OK: false, Error: "timeout"}, false) + } + continue + } + keep = append(keep, call) + } + s.pendingWireCalls = keep +} + +// ---- transport write ---- + +func (s *session) writeLine(data []byte) bool { + if len(data) > 0 && data[len(data)-1] == '\n' { + data = data[:len(data)-1] + } + if err := s.tr.WriteLine(data); err != nil { + if errors.Is(err, ErrLineTooLong) { + s.log("oversized write dropped") + return 
true + } + s.handleLinkDown("transport_write_failed", err.Error()) + return false + } + s.noteTx() + return true +} + +func (s *session) logWaiting() { + if s.helloSeen { + return + } + s.log("waiting for connection start") +} diff --git a/services/fabric/session_timer_test.go b/services/fabric/session_timer_test.go new file mode 100644 index 0000000..f846c9b --- /dev/null +++ b/services/fabric/session_timer_test.go @@ -0,0 +1,20 @@ +package fabric + +import ( + "testing" + "time" +) + +func TestResetTimerDrainsExpiredTick(t *testing.T) { + timer := time.NewTimer(5 * time.Millisecond) + defer timer.Stop() + + <-timer.C + resetTimer(timer, 40*time.Millisecond) + + select { + case <-timer.C: + t.Fatal("timer fired from a stale tick") + case <-time.After(15 * time.Millisecond): + } +} diff --git a/services/fabric/trace.go b/services/fabric/trace.go new file mode 100644 index 0000000..4c2637b --- /dev/null +++ b/services/fabric/trace.go @@ -0,0 +1,45 @@ +package fabric + +func traceLine(dir string, data []byte) { + if !fabricTraceEnabled { + return + } + println("[fabric-trace]", dir, "len", len(data), "line", tracePreview(data)) +} + +func tracePreview(data []byte) string { + const max = 200 + if len(data) > max { + data = data[:max] + } + out := make([]byte, 0, len(data)*2+3) + for _, b := range data { + switch b { + case '\n': + out = append(out, '\\', 'n') + case '\r': + out = append(out, '\\', 'r') + case '\t': + out = append(out, '\\', 't') + default: + if b < 0x20 || b > 0x7e { + out = append(out, '\\', 'x') + out = append(out, hexNibble(b>>4), hexNibble(b)) + } else { + out = append(out, b) + } + } + } + if len(data) == max { + out = append(out, '.', '.', '.') + } + return string(out) +} + +func hexNibble(v byte) byte { + v &= 0x0f + if v < 10 { + return '0' + v + } + return 'a' + (v - 10) +} diff --git a/services/fabric/trace_disabled.go b/services/fabric/trace_disabled.go new file mode 100644 index 0000000..734daec --- /dev/null +++ 
b/services/fabric/trace_disabled.go @@ -0,0 +1,5 @@ +//go:build !fabric_trace + +package fabric + +const fabricTraceEnabled = false diff --git a/services/fabric/trace_enabled.go b/services/fabric/trace_enabled.go new file mode 100644 index 0000000..4ed89ca --- /dev/null +++ b/services/fabric/trace_enabled.go @@ -0,0 +1,5 @@ +//go:build fabric_trace + +package fabric + +const fabricTraceEnabled = true diff --git a/services/fabric/transport_rw.go b/services/fabric/transport_rw.go new file mode 100644 index 0000000..871316e --- /dev/null +++ b/services/fabric/transport_rw.go @@ -0,0 +1,97 @@ +package fabric + +import ( + "bufio" + "fmt" + "io" + "sync" +) + +// Used for USB serial (fabric-test) and host-side unit tests. + +const maxLineLen = 2048 + +var ErrLineTooLong = fmt.Errorf("line exceeds %d bytes", maxLineLen) + +// RWTransport implements Transport over an io.Reader + io.Writer. +type RWTransport struct { + r *bufio.Reader + mu sync.Mutex + w *bufio.Writer + closers []io.Closer +} + +func NewRWTransport(r io.Reader, w io.Writer) *RWTransport { + t := &RWTransport{ + r: bufio.NewReaderSize(r, maxLineLen), + w: bufio.NewWriter(w), + } + var rc io.Closer + if c, ok := r.(io.Closer); ok { + rc = c + t.closers = append(t.closers, c) + } + if c, ok := w.(io.Closer); ok { + if c != rc { + t.closers = append(t.closers, c) + } + } + return t +} + +func (t *RWTransport) ReadLine() ([]byte, error) { + var buf []byte + for { + seg, more, err := t.r.ReadLine() + if err != nil { + return nil, err + } + buf = append(buf, seg...) 
+ if !more { + break + } + if len(buf) > maxLineLen { + for more { + _, more, err = t.r.ReadLine() + if err != nil { + return nil, err + } + } + return nil, ErrLineTooLong + } + } + if len(buf) > maxLineLen { + return nil, ErrLineTooLong + } + traceLine("rx", buf) + return buf, nil +} + +func (t *RWTransport) WriteLine(data []byte) error { + if len(data) > maxLineLen { + return ErrLineTooLong + } + t.mu.Lock() + defer t.mu.Unlock() + if _, err := t.w.Write(data); err != nil { + return err + } + if err := t.w.WriteByte('\n'); err != nil { + return err + } + if err := t.w.Flush(); err != nil { + return err + } + traceLine("tx", data) + return nil +} + +func (t *RWTransport) Close() error { + var first error + for _, c := range t.closers { + if err := c.Close(); err != nil && first == nil { + first = err + } + } + return first +} diff --git a/services/fabric/transport_shmring.go b/services/fabric/transport_shmring.go new file mode 100644 index 0000000..dece826 --- /dev/null +++ b/services/fabric/transport_shmring.go @@ -0,0 +1,148 @@ +package fabric + +import ( + "context" + "fmt" + + "devicecode-go/x/shmring" +) + +// ShmringTransport implements Transport over two shmring rings (RX + TX). +// Used for UART0 in production (main.go). +type ShmringTransport struct { + rx *shmring.Ring + tx *shmring.Ring + cancel context.CancelFunc + ctx context.Context + buf []byte + over bool // draining an oversize line +} + +func NewShmringTransport(rx, tx *shmring.Ring) *ShmringTransport { + ctx, cancel := context.WithCancel(context.Background()) + return &ShmringTransport{ + rx: rx, + tx: tx, + cancel: cancel, + ctx: ctx, + buf: make([]byte, 0, 256), + } +} + +func (t *ShmringTransport) ReadLine() ([]byte, error) { + t.buf = t.buf[:0] + t.over = false + + for { + p1, p2 := t.rx.ReadAcquire() + if len(p1)+len(p2) == 0 { + select { + case <-t.ctx.Done(): + return nil, fmt.Errorf("transport closed") + case <-t.rx.Readable(): + continue + } + } + + // Scan p1 for newline. 
+ if idx := findByte(p1, '\n'); idx >= 0 { + if !t.over { + t.buf = append(t.buf, p1[:idx]...) + } + t.rx.ReadRelease(idx + 1) + if t.over { + t.buf = t.buf[:0] + t.over = false + return nil, ErrLineTooLong + } + if len(t.buf) > maxLineLen { + return nil, ErrLineTooLong + } + out := make([]byte, len(t.buf)) + copy(out, t.buf) + traceLine("rx", out) + return out, nil + } + + // No newline in p1 — consume it, check p2. + if !t.over { + t.buf = append(t.buf, p1...) + } + + if idx := findByte(p2, '\n'); idx >= 0 { + if !t.over { + t.buf = append(t.buf, p2[:idx]...) + } + t.rx.ReadRelease(len(p1) + idx + 1) + if t.over { + t.buf = t.buf[:0] + t.over = false + return nil, ErrLineTooLong + } + if len(t.buf) > maxLineLen { + return nil, ErrLineTooLong + } + out := make([]byte, len(t.buf)) + copy(out, t.buf) + traceLine("rx", out) + return out, nil + } + + // No newline — consume everything, wait for more. + if !t.over { + t.buf = append(t.buf, p2...) + } + t.rx.ReadRelease(len(p1) + len(p2)) + + // Check for oversize. 
+ if len(t.buf) > maxLineLen { + t.buf = t.buf[:0] + t.over = true + } + } +} + +func (t *ShmringTransport) WriteLine(data []byte) error { + if len(data) > maxLineLen { + return ErrLineTooLong + } + line := append(data, '\n') + written := 0 + + for written < len(line) { + p1, p2 := t.tx.WriteAcquire() + if len(p1)+len(p2) == 0 { + select { + case <-t.ctx.Done(): + return fmt.Errorf("transport closed") + case <-t.tx.Writable(): + continue + } + } + + remaining := line[written:] + n := copy(p1, remaining) + remaining = remaining[n:] + if len(remaining) > 0 && len(p2) > 0 { + n += copy(p2, remaining) + } + t.tx.WriteCommit(n) + written += n + } + traceLine("tx", data) + return nil +} + +func (t *ShmringTransport) Close() error { + t.cancel() + return nil +} + +func findByte(b []byte, c byte) int { + for i, v := range b { + if v == c { + return i + } + } + return -1 +} diff --git a/services/fabric/wire.go b/services/fabric/wire.go new file mode 100644 index 0000000..62df959 --- /dev/null +++ b/services/fabric/wire.go @@ -0,0 +1,90 @@ +package fabric + +import "encoding/json" + +// ---- Wire message types (fabric.md §4) ---- + +type wireCaps struct { + Pub bool `json:"pub,omitempty"` + Call bool `json:"call,omitempty"` +} + +type wireHello struct { + T string `json:"t"` + Node string `json:"node"` + Peer string `json:"peer"` + SID string `json:"sid"` + Proto int `json:"proto,omitempty"` + Caps *wireCaps `json:"caps,omitempty"` +} + +type wireHelloAck struct { + T string `json:"t"` + Node string `json:"node"` + SID string `json:"sid,omitempty"` + Proto int `json:"proto,omitempty"` + OK bool `json:"ok"` +} + +type wirePing struct { + T string `json:"t"` + TS int64 `json:"ts"` + SID string `json:"sid,omitempty"` +} + +type wirePong struct { + T string `json:"t"` + TS int64 `json:"ts"` + SID string `json:"sid,omitempty"` +} + +// Not wired yet — defined for forward compatibility. 
// ---- codec helpers ----

// marshal encodes v as compact JSON and appends a trailing newline.
// An encode failure panics with a "fabric: marshal: " prefixed message;
// this should be unreachable for the wire structs.
func marshal(v any) []byte {
	encoded, err := json.Marshal(v)
	if err == nil {
		return append(encoded, '\n')
	}
	panic("fabric: marshal: " + err.Error())
}

// wireType extracts the "t" field from a JSON line.
// It returns whatever was decoded into that field, which is the empty string
// when the line is not valid JSON or carries no string "t".
func wireType(line []byte) string {
	var envelope struct {
		T string `json:"t"`
	}
	// The decode error is deliberately dropped: callers only need the tag,
	// and an undecodable line simply yields an empty one.
	_ = json.Unmarshal(line, &envelope)
	return envelope.T
}
--- services/fabric/bridge.go | 249 ++++++++++++++++++++++++++++ services/fabric/bridge_test.go | 290 +++++++++++++++++++++++++++++++++ services/fabric/remap.go | 190 +++++++++++++++++++++ 3 files changed, 729 insertions(+) create mode 100644 services/fabric/bridge.go create mode 100644 services/fabric/bridge_test.go create mode 100644 services/fabric/remap.go diff --git a/services/fabric/bridge.go b/services/fabric/bridge.go new file mode 100644 index 0000000..008d69e --- /dev/null +++ b/services/fabric/bridge.go @@ -0,0 +1,249 @@ +package fabric + +import ( + "context" + "encoding/json" + + "devicecode-go/bus" + "devicecode-go/types" +) + +var ( + tConfigDevice = bus.T("config", "device") + tConfigHAL = bus.T("config", "hal") + tRPCHALDump = bus.T("rpc", "hal", "dump") + tHALState = bus.T("hal", "state") +) + +type dumpReply struct { + OK bool `json:"ok"` + Method string `json:"method"` + Echo any `json:"echo,omitempty"` + HAL *types.HALState `json:"hal,omitempty"` + Applied bool `json:"applied"` + ConfigCount int `json:"config_count,omitempty"` + ConfigError string `json:"config_error,omitempty"` +} + +// RunBridge connects generic fabric import topics to concrete MCU services. +// +// The current Lua-side config exports `config/mcu -> config/device`, while the +// MCU HAL consumes `config/hal`. This bridge translates matching HAL configs. +// It also exposes a minimal `rpc/hal/dump` endpoint so CM5 proxy calls have a +// real MCU-side responder. 
+func RunBridge(ctx context.Context, conn *bus.Connection) { + cfgSub := conn.Subscribe(tConfigDevice) + dumpSub := conn.Subscribe(tRPCHALDump) + halStateSub := conn.Subscribe(tHALState) + defer conn.Unsubscribe(cfgSub) + defer conn.Unsubscribe(dumpSub) + defer conn.Unsubscribe(halStateSub) + + var lastHAL *types.HALState + var appliedConfig bool + var appliedConfigCount int + var lastConfigErr string + + for { + select { + case <-ctx.Done(): + return + + case msg, ok := <-halStateSub.Channel(): + if !ok || msg == nil { + return + } + if st, ok := decodeHALState(msg.Payload); ok { + stCopy := st + lastHAL = &stCopy + } + + case msg, ok := <-cfgSub.Channel(): + if !ok || msg == nil { + return + } + processConfigDevice(conn, msg, &appliedConfig, &appliedConfigCount, &lastConfigErr) + + case msg, ok := <-dumpSub.Channel(): + if !ok || msg == nil { + return + } + if !msg.CanReply() { + continue + } + if !drainConfigDevice(conn, cfgSub, &appliedConfig, &appliedConfigCount, &lastConfigErr) { + return + } + drainHALState(halStateSub, &lastHAL) + conn.Reply(msg, dumpReply{ + OK: true, + Method: "dump", + Echo: decodePayload(msg.Payload), + HAL: lastHAL, + Applied: appliedConfig, + ConfigCount: appliedConfigCount, + ConfigError: lastConfigErr, + }, false) + } + } +} + +func drainConfigDevice(conn *bus.Connection, cfgSub *bus.Subscription, appliedConfig *bool, appliedConfigCount *int, lastConfigErr *string) bool { + for { + select { + case msg, ok := <-cfgSub.Channel(): + if !ok || msg == nil { + return false + } + processConfigDevice(conn, msg, appliedConfig, appliedConfigCount, lastConfigErr) + default: + return true + } + } +} + +func drainHALState(halSub *bus.Subscription, lastHAL **types.HALState) { + for { + select { + case msg, ok := <-halSub.Channel(): + if !ok || msg == nil { + return + } + if st, ok := decodeHALState(msg.Payload); ok { + stCopy := st + *lastHAL = &stCopy + } + default: + return + } + } +} + +func processConfigDevice(conn *bus.Connection, msg 
// decodePayload converts a bus payload into a value suitable for echoing in
// a reply.
//
// A nil payload yields nil. json.RawMessage and []byte payloads are decoded
// as JSON; if they are empty the result is nil, and if they are not valid
// JSON the result is a private copy of the raw bytes. Any other payload is
// returned unchanged.
func decodePayload(payload any) any {
	switch v := payload.(type) {
	case nil:
		return nil
	case json.RawMessage:
		return decodePayloadBytes(v)
	case []byte:
		return decodePayloadBytes(v)
	default:
		return v
	}
}

// decodePayloadBytes decodes raw as JSON, falling back to a copy of the
// bytes when decoding fails. The fallback never aliases raw, so the caller
// may keep reusing or mutating its buffer after the call.
func decodePayloadBytes(raw []byte) any {
	if len(raw) == 0 {
		return nil
	}
	var out any
	if err := json.Unmarshal(raw, &out); err == nil {
		return out
	}
	cp := make([]byte, len(raw))
	copy(cp, raw)
	return cp
}
return decodeHALConfigBytes(b) + } +} + +func decodeHALConfigBytes(b []byte) (types.HALConfig, string) { + var probe map[string]json.RawMessage + if err := json.Unmarshal(b, &probe); err != nil { + return types.HALConfig{}, "json_unmarshal_failed: " + err.Error() + "; raw=" + truncateRawJSON(b) + } + if _, ok := probe["devices"]; !ok { + return types.HALConfig{}, "missing_devices_field; raw=" + truncateRawJSON(b) + } + + // Lua encodes empty tables as {} (object) not [] (array). + // Normalize known slice fields so Go unmarshal accepts them. + for _, key := range []string{"devices", "pollers"} { + if raw, ok := probe[key]; ok && len(raw) == 2 && raw[0] == '{' && raw[1] == '}' { + probe[key] = json.RawMessage("[]") + } + } + fixed, err := json.Marshal(probe) + if err != nil { + return types.HALConfig{}, "normalize_failed: " + err.Error() + } + + var out types.HALConfig + if err := json.Unmarshal(fixed, &out); err != nil { + return types.HALConfig{}, "hal_config_unmarshal_failed: " + err.Error() + "; raw=" + truncateRawJSON(fixed) + } + return out, "" +} + +func truncateRawJSON(b []byte) string { + if len(b) == 0 { + return "" + } + const max = 160 + if len(b) <= max { + return string(b) + } + return string(b[:max]) + "..." 
+} diff --git a/services/fabric/bridge_test.go b/services/fabric/bridge_test.go new file mode 100644 index 0000000..a8ca9fe --- /dev/null +++ b/services/fabric/bridge_test.go @@ -0,0 +1,290 @@ +package fabric + +import ( + "context" + "encoding/json" + "testing" + "time" + + "devicecode-go/bus" + "devicecode-go/types" +) + +func TestBridgeMapsConfigDeviceToConfigHAL(t *testing.T) { + b := newBus() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + bridgeConn := b.NewConnection("fabric-bridge") + go RunBridge(ctx, bridgeConn) + + writer := b.NewConnection("writer") + reader := b.NewConnection("reader") + sub := reader.Subscribe(bus.T("config", "hal")) + defer reader.Unsubscribe(sub) + + in := types.HALConfig{ + Devices: []types.HALDevice{ + {ID: "led0", Type: "gpio_led"}, + }, + } + raw, err := json.Marshal(in) + if err != nil { + t.Fatalf("Marshal: %v", err) + } + writer.Publish(writer.NewMessage(bus.T("config", "device"), json.RawMessage(raw), true)) + + select { + case msg := <-sub.Channel(): + if msg == nil { + t.Fatal("config/hal subscription closed") + } + out, ok := msg.Payload.(types.HALConfig) + if !ok { + t.Fatalf("payload type = %T, want types.HALConfig", msg.Payload) + } + if len(out.Devices) != 1 || out.Devices[0].ID != "led0" { + t.Fatalf("unexpected config: %+v", out) + } + if !msg.Retained { + t.Fatal("config/hal message was not retained") + } + case <-time.After(2 * time.Second): + t.Fatal("timed out waiting for config/hal") + } +} + +func TestBridgeIgnoresNonHALConfigObject(t *testing.T) { + b := newBus() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + bridgeConn := b.NewConnection("fabric-bridge") + go RunBridge(ctx, bridgeConn) + + writer := b.NewConnection("writer") + reader := b.NewConnection("reader") + sub := reader.Subscribe(bus.T("config", "hal")) + defer reader.Unsubscribe(sub) + + writer.Publish(writer.NewMessage(bus.T("config", "device"), 
json.RawMessage(`{"source":"monitor_auto_probe"}`), true)) + + select { + case msg := <-sub.Channel(): + t.Fatalf("unexpected config/hal message: %+v", msg) + case <-time.After(150 * time.Millisecond): + } + + req := b.NewConnection("requester") + reqCtx, reqCancel := context.WithTimeout(context.Background(), 2*time.Second) + defer reqCancel() + + replyMsg, err := req.RequestWait(reqCtx, req.NewMessage( + bus.T("rpc", "hal", "dump"), + json.RawMessage(`{"ask":"status"}`), + false, + )) + if err != nil { + t.Fatalf("RequestWait: %v", err) + } + + reply, ok := replyMsg.Payload.(dumpReply) + if !ok { + t.Fatalf("reply payload type = %T, want dumpReply", replyMsg.Payload) + } + if reply.Applied { + t.Fatal("reply.Applied = true, want false") + } + if reply.ConfigError == "" { + t.Fatal("reply.ConfigError = empty, want decode error") + } +} + +func TestBridgeRepliesToHALDump(t *testing.T) { + b := newBus() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + bridgeConn := b.NewConnection("fabric-bridge") + go RunBridge(ctx, bridgeConn) + time.Sleep(10 * time.Millisecond) + + writer := b.NewConnection("writer") + req := b.NewConnection("requester") + + writer.Publish(writer.NewMessage(bus.T("hal", "state"), types.HALState{ + Level: "ready", + Status: "", + TS: 123, + }, true)) + + reqCtx, reqCancel := context.WithTimeout(context.Background(), 2*time.Second) + defer reqCancel() + + replyMsg, err := req.RequestWait(reqCtx, req.NewMessage( + bus.T("rpc", "hal", "dump"), + json.RawMessage(`{"ask":"status"}`), + false, + )) + if err != nil { + t.Fatalf("RequestWait: %v", err) + } + + reply, ok := replyMsg.Payload.(dumpReply) + if !ok { + t.Fatalf("reply payload type = %T, want dumpReply", replyMsg.Payload) + } + if !reply.OK { + t.Fatalf("reply.OK = false: %+v", reply) + } + if reply.Method != "dump" { + t.Fatalf("reply.Method = %q", reply.Method) + } + echo, ok := reply.Echo.(map[string]any) + if !ok { + t.Fatalf("reply.Echo type = %T, want 
map[string]any", reply.Echo) + } + if echo["ask"] != "status" { + t.Fatalf("reply.Echo.ask = %#v", echo["ask"]) + } + if reply.HAL == nil || reply.HAL.Level != "ready" || reply.HAL.TS != 123 { + t.Fatalf("reply.HAL = %+v", reply.HAL) + } + if reply.Applied { + t.Fatal("reply.Applied = true, want false") + } + if reply.ConfigCount != 0 { + t.Fatalf("reply.ConfigCount = %d, want 0", reply.ConfigCount) + } +} + +func TestBridgeDumpReflectsAppliedConfig(t *testing.T) { + b := newBus() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + bridgeConn := b.NewConnection("fabric-bridge") + go RunBridge(ctx, bridgeConn) + time.Sleep(10 * time.Millisecond) + + writer := b.NewConnection("writer") + req := b.NewConnection("requester") + + in := types.HALConfig{ + Devices: []types.HALDevice{ + {ID: "led0", Type: "gpio_led"}, + }, + } + raw, err := json.Marshal(in) + if err != nil { + t.Fatalf("Marshal: %v", err) + } + writer.Publish(writer.NewMessage(bus.T("config", "device"), json.RawMessage(raw), true)) + + reqCtx, reqCancel := context.WithTimeout(context.Background(), 2*time.Second) + defer reqCancel() + + replyMsg, err := req.RequestWait(reqCtx, req.NewMessage( + bus.T("rpc", "hal", "dump"), + json.RawMessage(`{"ask":"status"}`), + false, + )) + if err != nil { + t.Fatalf("RequestWait: %v", err) + } + + reply, ok := replyMsg.Payload.(dumpReply) + if !ok { + t.Fatalf("reply payload type = %T, want dumpReply", replyMsg.Payload) + } + if !reply.Applied { + t.Fatal("reply.Applied = false, want true") + } + if reply.ConfigCount != 1 { + t.Fatalf("reply.ConfigCount = %d, want 1", reply.ConfigCount) + } + if reply.ConfigError != "" { + t.Fatalf("reply.ConfigError = %q, want empty", reply.ConfigError) + } +} + +func TestBridgeAcceptsLuaEmptyObjectsAsArrays(t *testing.T) { + b := newBus() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + bridgeConn := b.NewConnection("fabric-bridge") + go RunBridge(ctx, bridgeConn) + 
time.Sleep(10 * time.Millisecond) + + writer := b.NewConnection("writer") + reader := b.NewConnection("reader") + sub := reader.Subscribe(bus.T("config", "hal")) + defer reader.Unsubscribe(sub) + + // Lua encodes empty tables as {} (objects) not [] (arrays). + writer.Publish(writer.NewMessage(bus.T("config", "device"), + json.RawMessage(`{"devices":{},"pollers":{}}`), true)) + + select { + case msg := <-sub.Channel(): + if msg == nil { + t.Fatal("config/hal subscription closed") + } + out, ok := msg.Payload.(types.HALConfig) + if !ok { + t.Fatalf("payload type = %T, want types.HALConfig", msg.Payload) + } + if out.Devices == nil || out.Pollers == nil { + t.Fatalf("expected non-nil slices, got devices=%v pollers=%v", out.Devices, out.Pollers) + } + case <-time.After(2 * time.Second): + t.Fatal("timed out waiting for config/hal") + } +} + +func TestBridgeDumpDrainsPendingConfigBeforeReply(t *testing.T) { + b := newBus() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + bridgeConn := b.NewConnection("fabric-bridge") + go RunBridge(ctx, bridgeConn) + time.Sleep(10 * time.Millisecond) + + writer := b.NewConnection("writer") + req := b.NewConnection("requester") + + in := types.HALConfig{ + Devices: []types.HALDevice{}, + Pollers: []types.PollSpec{}, + } + raw, err := json.Marshal(in) + if err != nil { + t.Fatalf("Marshal: %v", err) + } + writer.Publish(writer.NewMessage(bus.T("config", "device"), json.RawMessage(raw), true)) + + reqCtx, reqCancel := context.WithTimeout(context.Background(), 2*time.Second) + defer reqCancel() + + replyMsg, err := req.RequestWait(reqCtx, req.NewMessage( + bus.T("rpc", "hal", "dump"), + json.RawMessage(`{"ask":"status"}`), + false, + )) + if err != nil { + t.Fatalf("RequestWait: %v", err) + } + + reply, ok := replyMsg.Payload.(dumpReply) + if !ok { + t.Fatalf("reply payload type = %T, want dumpReply", replyMsg.Payload) + } + if !reply.Applied { + t.Fatal("reply.Applied = false, want true") + } + if 
reply.ConfigCount != 1 { + t.Fatalf("reply.ConfigCount = %d, want 1", reply.ConfigCount) + } +} diff --git a/services/fabric/remap.go b/services/fabric/remap.go new file mode 100644 index 0000000..086f233 --- /dev/null +++ b/services/fabric/remap.go @@ -0,0 +1,190 @@ +package fabric + +import "devicecode-go/bus" + +// Topic remapping rules matching the shipped Lua fabric link contract. +// +// CM5 -> MCU wire publish: +// ["config","device"] -> config/device +// +// CM5 -> MCU wire call: +// ["rpc","hal","read_state"] -> rpc/hal/read_state +// ["rpc","hal","dump"] -> rpc/hal/dump +// +// MCU local bus publish -> wire: +// hal/cap/env/# -> ["state","env",...] +// hal/cap/power/# -> ["state","power",...] +// hal/state -> ["state","hal"] +// +// MCU local bus call -> wire: +// fabric/out/rpc/hal/read_state -> ["rpc","hal","read_state"] + +type wireImportRule struct { + wire []string + local []string +} + +type busExportRule struct { + localPrefix []string + remotePrefix []string + suffix bool +} + +var importPublishRules = []wireImportRule{ + { + wire: []string{"config", "device"}, + local: []string{"config", "device"}, + }, +} + +var importCallRules = []wireImportRule{ + { + wire: []string{"rpc", "hal", "read_state"}, + local: []string{"rpc", "hal", "read_state"}, + }, + { + wire: []string{"rpc", "hal", "dump"}, + local: []string{"rpc", "hal", "dump"}, + }, +} + +var exportPublishRules = []busExportRule{ + { + localPrefix: []string{"hal", "cap", "env"}, + remotePrefix: []string{"state", "env"}, + suffix: true, + }, + { + localPrefix: []string{"hal", "cap", "power"}, + remotePrefix: []string{"state", "power"}, + suffix: true, + }, + { + localPrefix: []string{"hal", "state"}, + remotePrefix: []string{"state", "hal"}, + }, +} + +var exportCallRules = []busExportRule{ + { + localPrefix: []string{"fabric", "out", "rpc", "hal", "read_state"}, + remotePrefix: []string{"rpc", "hal", "read_state"}, + }, +} + +func importPublishTopic(wire []string) bus.Topic { + return 
wireImport(wire, importPublishRules) +} + +func importCallTopic(wire []string) bus.Topic { + return wireImport(wire, importCallRules) +} + +func exportTopic(t bus.Topic) []string { + return busExport(t, exportPublishRules) +} + +func exportPatterns() []bus.Topic { + return exportPatternsFor(exportPublishRules) +} + +func exportCallTopic(t bus.Topic) []string { + return busExport(t, exportCallRules) +} + +func exportCallPatterns() []bus.Topic { + return exportPatternsFor(exportCallRules) +} + +func wireImport(wire []string, rules []wireImportRule) bus.Topic { + for _, rule := range rules { + if slicesEqualStrings(wire, rule.wire) { + return stringsToTopic(rule.local) + } + } + return nil +} + +func busExport(t bus.Topic, rules []busExportRule) []string { + for _, rule := range rules { + out, ok := applyBusExportRule(t, rule) + if ok { + return out + } + } + return nil +} + +func applyBusExportRule(t bus.Topic, rule busExportRule) ([]string, bool) { + if t.Len() < len(rule.localPrefix) { + return nil, false + } + for i, want := range rule.localPrefix { + if str(t, i) != want { + return nil, false + } + } + if !rule.suffix && t.Len() != len(rule.localPrefix) { + return nil, false + } + + out := make([]string, 0, len(rule.remotePrefix)+maxInt(0, t.Len()-len(rule.localPrefix))) + out = append(out, rule.remotePrefix...) 
// slicesEqualStrings reports whether a and b have the same length and hold
// the same strings in the same order. A nil slice and an empty slice compare
// equal.
func slicesEqualStrings(a, b []string) bool {
	if len(a) != len(b) {
		return false
	}
	for idx, want := range a {
		if b[idx] != want {
			return false
		}
	}
	return true
}

// maxInt returns the larger of a and b.
func maxInt(a, b int) int {
	if a <= b {
		return b
	}
	return a
}
--- .devcontainer/devcontainer.json | 25 +- .gitignore | 3 +- cmd/fabric-test/main.go | 84 ++++ main.go | 367 ++++++------------ .../hal/internal/provider/resources_host.go | 52 +++ services/hal/internal/provider/setup_none.go | 2 +- .../provider/setups/pico_bb_proto_1.go | 4 +- x/fmtx/fmtx_host.go | 26 +- x/strconvx/strconvx_host.go | 51 ++- 9 files changed, 322 insertions(+), 292 deletions(-) create mode 100644 cmd/fabric-test/main.go create mode 100644 services/hal/internal/provider/resources_host.go diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index ea3f884..20224e9 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,23 +1,10 @@ -// For format details, see https://aka.ms/devcontainer.json. For config options, see the -// README at: https://github.com/devcontainers/templates/tree/main/src/ubuntu { "name": "Ubuntu", - // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile "image": "mcr.microsoft.com/devcontainers/base:noble", - - // Features to add to the dev container. More info: https://containers.dev/features. - // "features": {}, - - // Use 'forwardPorts' to make a list of ports inside the container available locally. - // "forwardPorts": [], - - // Use 'postCreateCommand' to run commands after the container is created. + "features": { + "ghcr.io/devcontainers/features/node:1": { + "version": "20" + } + }, "postCreateCommand": "sh .devcontainer/postCreateCommand.sh" - - - // Configure tool-specific properties. - // "customizations": {}, - - // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. 
- // "remoteUser": "root" -} +} \ No newline at end of file diff --git a/.gitignore b/.gitignore index f6e73b6..01fc028 100644 --- a/.gitignore +++ b/.gitignore @@ -1,2 +1,3 @@ build/ -.vscode/settings.json \ No newline at end of file +.vscode/settings.json +.DS_Store \ No newline at end of file diff --git a/cmd/fabric-test/main.go b/cmd/fabric-test/main.go new file mode 100644 index 0000000..245a678 --- /dev/null +++ b/cmd/fabric-test/main.go @@ -0,0 +1,84 @@ +//go:build tinygo && rp2350 + +// fabric-test: exercises the fabric protocol over USB serial with real HAL sensors. +// +// tinygo build -target=pico2 -tags "pico_bb_proto_1" -stack-size=8KB -o build/fabric-test.elf ./cmd/fabric-test + +package main + +import ( + "context" + "machine" + "time" + + "devicecode-go/bus" + "devicecode-go/services/fabric" + "devicecode-go/services/hal" + "devicecode-go/types" +) + +const halTimeout = 5 * time.Second + +var halReadiness = bus.T("hal", "state") + +func main() { + time.Sleep(3 * time.Second) + + ctx := context.Background() + b := bus.NewBus(3, "+", "#") + halConn := b.NewConnection("hal") + + go hal.Run(ctx, halConn) + if !waitHALReady(ctx, halConn, halTimeout) { + return + } + + conn := b.NewConnection("fabric") + tr := fabric.NewRWTransport(&serialRW{}, &serialRW{}) + fabric.Run(ctx, tr, conn, "mcu-1", "cm5-local") +} + +func waitHALReady(ctx context.Context, c *bus.Connection, d time.Duration) bool { + sub := c.Subscribe(halReadiness) + defer c.Unsubscribe(sub) + ctx2, cancel := context.WithTimeout(ctx, d) + defer cancel() + for { + select { + case m := <-sub.Channel(): + if st, ok := m.Payload.(types.HALState); ok && st.Level == "ready" { + return true + } + case <-ctx2.Done(): + return false + } + } +} + +type serialRW struct{} + +func (s *serialRW) Read(p []byte) (int, error) { + if len(p) == 0 { + return 0, nil + } + for machine.Serial.Buffered() == 0 { + time.Sleep(time.Millisecond) + } + n := 0 + for n < len(p) && machine.Serial.Buffered() > 0 { + b, err := 
machine.Serial.ReadByte() + if err != nil { + if n > 0 { + return n, nil + } + return 0, err + } + p[n] = b + n++ + } + return n, nil +} + +func (s *serialRW) Write(p []byte) (int, error) { + return machine.Serial.Write(p) +} diff --git a/main.go b/main.go index 6c682af..580270a 100644 --- a/main.go +++ b/main.go @@ -6,6 +6,7 @@ import ( "time" "devicecode-go/bus" + "devicecode-go/services/fabric" "devicecode-go/services/hal" "devicecode-go/types" "devicecode-go/x/shmring" @@ -18,6 +19,8 @@ import ( const halTimeout = 5 * time.Second const pwmTop = 4095 +const handshakeOnlyOutput = true +const fabricSessionWaitLogEvery = 2 * time.Second // Thermal (deci-°C) const ( @@ -132,10 +135,6 @@ const ( type Reactor struct { ui *bus.Connection - // UART - jsonOut *shmring.Ring // telemetry (JSON UART TX) - // Logger UART1 already handled by global logger (see SetUART1) - // inputs (latest) vin_mV, vbat_mV int32 iin_mA, ibat_mA int32 @@ -164,9 +163,6 @@ type Reactor struct { // misc now time.Time - - // telemetry drop counters (bytes) - droppedUART0Bytes int } func NewReactor(ui *bus.Connection) *Reactor { @@ -209,12 +205,12 @@ func (r *Reactor) updateLatchesFromValues() { // Over-temp latch if r.freshTMP() { if r.lastTDeci >= TEMP_LIMIT { - if !r.otActive { + if !handshakeOnlyOutput && !r.otActive { log.Println("[thermal] over-temp → latch active") } r.otActive = true } else if r.lastTDeci <= (TEMP_LIMIT - TEMP_HYST) { - if r.otActive { + if !handshakeOnlyOutput && r.otActive { log.Println("[thermal] temp recovered below hysteresis") } r.otActive = false @@ -235,7 +231,9 @@ func (r *Reactor) updateLatchesFromValues() { // ---- sequencing (non-blocking) ---- func (r *Reactor) startUpSeq() { - log.Println("[power] PG debounced + Temp OK → rails UP") + if !handshakeOnlyOutput { + log.Println("[power] PG debounced + Temp OK → rails UP") + } r.state = stateUpSeq r.seqIdx = 0 // next to apply r.nextActionDue = r.now // first step fires immediately @@ -245,7 +243,9 @@ func (r 
*Reactor) startUpSeq() { } func (r *Reactor) startDownSeq() { - log.Println("[power] brownout/stale/over-temp → rails DOWN") + if !handshakeOnlyOutput { + log.Println("[power] brownout/stale/over-temp → rails DOWN") + } r.state = stateDownSeq if r.seqOnCount < 0 { r.seqOnCount = 0 @@ -274,7 +274,9 @@ func (r *Reactor) advanceSequenceIfDue() { return } step := powerSeq[r.seqIdx] - log.Println("[event] powering rail UP: ", step.Name) + if !handshakeOnlyOutput { + log.Println("[event] powering rail UP: ", step.Name) + } r.publishSwitch(step.Name, true) r.seqOnCount++ r.seqIdx++ @@ -289,7 +291,9 @@ func (r *Reactor) advanceSequenceIfDue() { return } step := powerSeq[r.seqIdx] - log.Println("[event] powering rail down: ", step.Name) + if !handshakeOnlyOutput { + log.Println("[event] powering rail down: ", step.Name) + } r.publishSwitch(step.Name, false) r.seqOnCount-- r.seqIdx-- @@ -325,7 +329,9 @@ func (r *Reactor) stepFSM() { // If actively powering down and inputs become stably good, reverse. 
if r.state == stateDownSeq && r.pgStable { - log.Println("[power] inputs stably good → reverse to UP sequence") + if !handshakeOnlyOutput { + log.Println("[power] inputs stably good → reverse to UP sequence") + } r.startUpSeq() return } @@ -375,97 +381,30 @@ func (r *Reactor) OnCharger(v types.ChargerValue) { r.vin_mV = v.VIN_mV r.iin_mA = v.IIn_mA r.tsVIN = r.now - - // JSON: {"power/charger/internal/vin":..,"vsys":..,"iin":..} - if r.jsonOut != nil { - var w jsonw - w.write = r.jsonWrite - w.begin() - w.kvInt("power/charger/internal/vin", int(v.VIN_mV)) - w.kvInt("power/charger/internal/vsys", int(v.VSYS_mV)) - w.kvInt("power/charger/internal/iin", int(v.IIn_mA)) - // Full bitfield maps (0/1) for LOCF pipelines - { - it := types.NewBitIter(types.SystemStatus(v.Sys), types.SystemStatusTable[:]) - for { - bitName, set, ok := it.NextAny() - if !ok { - break - } - if set { - w.kvInt("power/charger/internal/system/"+bitName, 1) - } else { - w.kvInt("power/charger/internal/system/"+bitName, 0) - } - } - } - { - it := types.NewBitIter(types.ChargeStatusBits(v.Status), types.ChargeStatusTable[:]) - for { - bitName, set, ok := it.NextAny() - if !ok { - break - } - if set { - w.kvInt("power/charger/internal/status/"+bitName, 1) - } else { - w.kvInt("power/charger/internal/status/"+bitName, 0) - } - } - } - { - it := types.NewBitIter(types.ChargerStateBits(v.State), types.ChargerStateTable[:]) - for { - bitName, set, ok := it.NextAny() - if !ok { - break - } - if set { - w.kvInt("power/charger/internal/state/"+bitName, 1) - } else { - w.kvInt("power/charger/internal/state/"+bitName, 0) - } - } - } - w.end() - } } func (r *Reactor) OnBattery(v types.BatteryValue) { r.vbat_mV = v.PackMilliV r.ibat_mA = v.IBatMilliA r.tsVBAT = r.now - - // JSON: {"power/battery/internal/vbat":..,"ibat":..} - if r.jsonOut != nil { - var w jsonw - w.write = r.jsonWrite - w.begin() - w.kvInt("power/battery/internal/vbat", int(v.PackMilliV)) - w.kvInt("power/battery/internal/ibat", 
int(v.IBatMilliA)) - w.kvInt("power/battery/internal/bsr", int(v.BSR_uOhmPerCell)) - w.end() - } } -func (r *Reactor) OnTempDeciC(label string, deci int, jsonKey string) { - log.Deci(label, deci) - if r.jsonOut != nil { - var w jsonw - w.write = r.jsonWrite - w.begin() - w.kvInt(jsonKey, deci) - w.end() +func (r *Reactor) OnTempDeciC(label string, deci int) { + if handshakeOnlyOutput { + return } + log.Deci(label, deci) } // ---- memory snapshot telemetry (every ~2 s in main loop) ---- func (r *Reactor) emitMemSnapshot() { + if handshakeOnlyOutput { + return + } var ms runtime.MemStats runtime.GC() runtime.ReadMemStats(&ms) - // log line log.Println( "[mem] ", "alloc:", int(ms.Alloc), " ", @@ -473,33 +412,34 @@ func (r *Reactor) emitMemSnapshot() { "mallocs:", int(ms.Mallocs), " ", "frees:", int(ms.Frees), ) - // JSON (minimal to keep overhead low) - if r.jsonOut != nil { - var w jsonw - w.write = r.jsonWrite - w.begin() - w.kvInt("sys/mem/alloc", int(ms.Alloc)) - w.end() - } } // ----------------------------------------------------------------------------- // Main // ----------------------------------------------------------------------------- +const buildTag = "fabric-20260401c" + func main() { // Allow early USB/console settle if needed time.Sleep(3 * time.Second) + println("[main] build:", buildTag) log.SetStart(time.Now()) ctx := context.Background() - log.Println("[main] bootstrapping bus …") + if !handshakeOnlyOutput { + log.Println("[main] bootstrapping bus …") + } b := bus.NewBus(3, "+", "#") halConn := b.NewConnection("hal") uiConn := b.NewConnection("ui") + bridgeConn := b.NewConnection("fabric-bridge") - log.Println("[main] starting hal.Run …") + if !handshakeOnlyOutput { + log.Println("[main] starting hal.Run …") + } + go fabric.RunBridge(ctx, bridgeConn) go hal.Run(ctx, halConn) // Wait for retained hal/state=ready (or time out) @@ -511,7 +451,9 @@ func main() { } // Subscriptions (env + power) - log.Println("[main] subscribing env + power …") + if 
!handshakeOnlyOutput { + log.Println("[main] subscribing env + power …") + } tempSub := uiConn.Subscribe(tTempValue) tempDieSub := uiConn.Subscribe(tDieTempValue) humidSub := uiConn.Subscribe(tHumValue) @@ -519,22 +461,33 @@ func main() { stSub := uiConn.Subscribe(stTopic) evSub := uiConn.Subscribe(evTopic) - // UART sessions (TX only needed for our use) + // UART sessions const ( - uartTele = "uart0" // telemetry JSON - uartLog = "uart1" // log mirror + uartFabric = "uart1" // fabric link to CM5 ) - subSessOpenTele := uiConn.Subscribe(tSessOpened(uartTele)) - subSessOpenLog := uiConn.Subscribe(tSessOpened(uartLog)) - subSessClosedTele := uiConn.Subscribe(tSessClosed(uartTele)) - subSessClosedLog := uiConn.Subscribe(tSessClosed(uartLog)) + subSessOpenFabric := uiConn.Subscribe(tSessOpened(uartFabric)) + subSessClosedFabric := uiConn.Subscribe(tSessClosed(uartFabric)) - // Kick open requests (fire-and-forget; events carry handles) - uiConn.Publish(uiConn.NewMessage(tSessOpen(uartTele), nil, false)) - uiConn.Publish(uiConn.NewMessage(tSessOpen(uartLog), nil, false)) + // Kick open requests + uiConn.Publish(uiConn.NewMessage(tSessOpen(uartFabric), nil, false)) - // Retry back-off guards - var retryTeleAt, retryLogAt time.Time + var retryFabricAt time.Time + var fabricCancel context.CancelFunc + var fabricDone chan struct{} + var fabricSessionOpen bool + nextFabricWaitLog := time.Now() + + stopFabricSession := func() { + if fabricCancel == nil { + return + } + fabricCancel() + fabricCancel = nil + if fabricDone != nil { + <-fabricDone + fabricDone = nil + } + } // Reactor r := NewReactor(uiConn) @@ -544,35 +497,39 @@ func main() { defer ticker.Stop() memTick := 0 - log.Println("[main] entering reactor loop …") for { select { // ---- UART session opened/closed ---- - case m := <-subSessOpenTele.Channel(): + case m := <-subSessOpenFabric.Channel(): if ev, ok := m.Payload.(types.SerialSessionOpened); ok { - r.jsonOut = shmring.Get(shmring.Handle(ev.TXHandle)) - 
log.Println("[uart0] telemetry session opened") + // Tear down previous fabric session if any. + stopFabricSession() + rx := shmring.Get(shmring.Handle(ev.RXHandle)) + tx := shmring.Get(shmring.Handle(ev.TXHandle)) + tr := fabric.NewShmringTransport(rx, tx) + fabricConn := b.NewConnection("fabric") + fabricCtx, cancel := context.WithCancel(ctx) + done := make(chan struct{}) + fabricCancel = cancel + fabricDone = done + fabricSessionOpen = true + go func() { + defer close(done) + fabric.Run(fabricCtx, tr, fabricConn, "mcu-1", "cm5-local") + }() } - case m := <-subSessOpenLog.Channel(): - if ev, ok := m.Payload.(types.SerialSessionOpened); ok { - log.SetUART1(shmring.Get(shmring.Handle(ev.TXHandle))) - log.Println("[uart1] log session opened") - } - case <-subSessClosedTele.Channel(): - r.jsonOut = nil - log.Println("[uart0] telemetry session closed") - // Auto-reopen with back-off - if time.Now().After(retryTeleAt) { - uiConn.Publish(uiConn.NewMessage(tSessOpen(uartTele), nil, false)) - retryTeleAt = time.Now().Add(2 * time.Second) + case <-subSessClosedFabric.Channel(): + // Ignore stale close events — the open handler already + // tears down the previous session before starting a new one. 
+ if !fabricSessionOpen { + continue } - case <-subSessClosedLog.Channel(): - log.SetUART1(nil) - log.Println("[uart1] log session closed") - // Auto-reopen with back-off - if time.Now().After(retryLogAt) { - uiConn.Publish(uiConn.NewMessage(tSessOpen(uartLog), nil, false)) - retryLogAt = time.Now().Add(2 * time.Second) + stopFabricSession() + fabricSessionOpen = false + nextFabricWaitLog = time.Now() + if time.Now().After(retryFabricAt) { + uiConn.Publish(uiConn.NewMessage(tSessOpen(uartFabric), nil, false)) + retryFabricAt = time.Now().Add(2 * time.Second) } // ---- Env prints ---- @@ -585,18 +542,12 @@ func main() { deci := int(v.DeciC) r.lastTDeci = deci r.tsTemp = r.now - r.OnTempDeciC("[value] env/temperature/core °C=", deci, "env/temperature/core") + r.OnTempDeciC("[value] env/temperature/core °C=", deci) } case m := <-humidSub.Channel(): if v, ok := m.Payload.(types.HumidityValue); ok { - log.Hundredths("[value] env/humidity/core %RH=", int(v.RHx100)) - // JSON - if r.jsonOut != nil { - var w jsonw - w.write = r.jsonWrite - w.begin() - w.kvInt("env/humidity/core", int(v.RHx100)) - w.end() + if !handshakeOnlyOutput { + log.Hundredths("[value] env/humidity/core %RH=", int(v.RHx100)) } } @@ -609,7 +560,7 @@ func main() { aht20Alive = false r.lastTDeci = deci r.tsTemp = r.now - r.OnTempDeciC("[value] env/temperature/core °C=", deci, "env/temperature/core") + r.OnTempDeciC("[value] env/temperature/core °C=", deci) } } @@ -624,7 +575,7 @@ func main() { r.OnCharger(v) printCapValue(m, &r.iin_mA, nil, &r.ibat_mA, nil) case types.TemperatureValue: - r.OnTempDeciC("[value] power/temperature/internal °C=", int(v.DeciC), "power/temperature/internal") + r.OnTempDeciC("[value] power/temperature/internal °C=", int(v.DeciC)) } case m := <-stSub.Channel(): @@ -632,24 +583,14 @@ func main() { case m := <-evSub.Channel(): printCapEvent(m) - // JSON: {"///event":""} - if r.jsonOut != nil { - dom, _ := m.Topic.At(2).(string) - kind, _ := m.Topic.At(3).(string) - name, _ := 
m.Topic.At(4).(string) - tag, _ := m.Topic.At(6).(string) - if dom != "" && kind != "" && name != "" && tag != "" { - var w jsonw - w.write = r.jsonWrite - w.begin() - w.kvStr(dom+"/"+kind+"/"+name+"/event", tag) - w.end() - } - } // ---- Supervisory tick ---- case <-ticker.C: r.now = time.Now() + if handshakeOnlyOutput && !fabricSessionOpen && !r.now.Before(nextFabricWaitLog) { + log.Println("[main] waiting for fabric connection start") + nextFabricWaitLog = r.now.Add(fabricSessionWaitLogEvery) + } // 1) Run FSM (includes symmetric reversal) r.stepFSM() @@ -698,22 +639,6 @@ func waitHALReady(ctx context.Context, c *bus.Connection, d time.Duration) bool // Centralised UART write helpers (handle partial writes) // ----------------------------------------------------------------------------- -// uart0 (telemetry JSON) — returns bytes written; tracks dropped bytes on partial writes. -func (r *Reactor) jsonWrite(b []byte) int { - if r == nil || r.jsonOut == nil || len(b) == 0 { - return 0 - } - n := r.jsonOut.TryWriteFrom(b) - if n < len(b) { - r.droppedUART0Bytes += (len(b) - n) - // Rate-limited note - if r.droppedUART0Bytes == (len(b)-n) || (r.droppedUART0Bytes%1024) == 0 { - log.Println("[uart0] dropped bytes =", r.droppedUART0Bytes) - } - } - return n -} - // uart1 (logger mirror) — returns bytes written; tracks dropped bytes on partial writes. 
func (l *Logger) logWrite(b []byte) int { if l == nil || l.target == nil || len(b) == 0 { @@ -732,94 +657,14 @@ func (l *Logger) logWrite(b []byte) int { return n } -// ----------------------------------------------------------------------------- -// Minimal streaming JSON writer for shmring (no buffers/allocs) -// ----------------------------------------------------------------------------- - -type jsonw struct { - write func([]byte) int - first bool -} - -func (w *jsonw) begin() { - w.first = true - if w.write != nil { - w.write([]byte("{")) - } -} -func (w *jsonw) end() { - if w.write != nil { - w.write([]byte("}\n")) - } -} -func (w *jsonw) comma() { - if w.write == nil { - return - } - if !w.first { - w.write([]byte(",")) - } else { - w.first = false - } -} -func (w *jsonw) key(k string) { - if w.write == nil { - return - } - w.write([]byte(`"`)) - w.write([]byte(k)) - w.write([]byte(`":`)) -} -func (w *jsonw) kvInt(k string, v int) { - w.comma() - w.key(k) - if w.write != nil { - w.write([]byte(strconvx.Itoa(v))) - } -} -func (w *jsonw) kvStr(k, s string) { - w.comma() - w.key(k) - if w.write == nil { - return - } - w.write([]byte(`"`)) - for i := 0; i < len(s); i++ { - c := s[i] - switch c { - case '\\', '"': - w.write([]byte{'\\', c}) - case '\b': - w.write([]byte{'\\', 'b'}) - case '\f': - w.write([]byte{'\\', 'f'}) - case '\n': - w.write([]byte{'\\', 'n'}) - case '\r': - w.write([]byte{'\\', 'r'}) - case '\t': - w.write([]byte{'\\', 't'}) - default: - if c < 0x20 { - var buf [6]byte - buf[0], buf[1], buf[2], buf[3] = '\\', 'u', '0', '0' - const hex = "0123456789abcdef" - buf[4] = hex[c>>4] - buf[5] = hex[c&0xF] - w.write(buf[:]) - } else { - w.write([]byte{c}) - } - } - } - w.write([]byte(`"`)) -} - // ----------------------------------------------------------------------------- // Printing helpers (via Logger) // ----------------------------------------------------------------------------- func printCapValue(m *bus.Message, lastIIn *int32, _ *bool, 
lastIBat *int32, _ *bool) { + if handshakeOnlyOutput { + return + } // hal/cap////value dom, _ := m.Topic.At(2).(string) kind, _ := m.Topic.At(3).(string) @@ -904,6 +749,9 @@ func (r *Reactor) logPrefixStatus(path, label string) { } func printCapStatus(m *bus.Message) { + if handshakeOnlyOutput { + return + } // hal/cap////status dom, _ := m.Topic.At(2).(string) kind, _ := m.Topic.At(3).(string) @@ -927,6 +775,9 @@ func printCapStatus(m *bus.Message) { } func printCapEvent(m *bus.Message) { + if handshakeOnlyOutput { + return + } // hal/cap////event/ dom, _ := m.Topic.At(2).(string) kind, _ := m.Topic.At(3).(string) diff --git a/services/hal/internal/provider/resources_host.go b/services/hal/internal/provider/resources_host.go new file mode 100644 index 0000000..884ab54 --- /dev/null +++ b/services/hal/internal/provider/resources_host.go @@ -0,0 +1,52 @@ +//go:build !(rp2040 || rp2350) + +package provider + +import ( + "time" + + "devicecode-go/errcode" + "devicecode-go/services/hal/internal/core" + "devicecode-go/services/hal/internal/provider/setups" + + "tinygo.org/x/drivers" +) + +var ( + SelectedPlan setups.ResourcePlan + InitialHALConfig core.HALConfig +) + +type hostRegistry struct{} + +func NewResources() core.Resources { + return core.Resources{Reg: hostRegistry{}} +} + +func (hostRegistry) ClassOf(id core.ResourceID) (core.BusClass, bool) { + return 0, false +} + +func (hostRegistry) ClaimI2C(devID string, id core.ResourceID) (drivers.I2C, error) { + return nil, errcode.Unsupported +} + +func (hostRegistry) ReleaseI2C(devID string, id core.ResourceID) {} + +func (hostRegistry) ClaimSerial(devID string, id core.ResourceID) (core.SerialPort, error) { + return nil, errcode.Unsupported +} + +func (hostRegistry) ReleaseSerial(devID string, id core.ResourceID) {} + +func (hostRegistry) ClaimPin(devID string, pin int, fn core.PinFunc) (core.PinHandle, error) { + return nil, errcode.Unsupported +} + +func (hostRegistry) ReleasePin(devID string, pin int) {} + 
+func (hostRegistry) SubscribeGPIOEdges(devID string, pin int, sel core.GPIOEdge, debounce time.Duration, buf int) (core.GPIOEdgeStream, error) { + return nil, errcode.Unsupported +} + +func (hostRegistry) UnsubscribeGPIOEdges(devID string, pin int) {} diff --git a/services/hal/internal/provider/setup_none.go b/services/hal/internal/provider/setup_none.go index 2863103..9bfa5b8 100644 --- a/services/hal/internal/provider/setup_none.go +++ b/services/hal/internal/provider/setup_none.go @@ -1,4 +1,4 @@ -//go:build !((rp2040 || rp2350) && (pico_rich_dev || pico_bb_proto_1)) +//go:build (rp2040 || rp2350) && !(pico_rich_dev || pico_bb_proto_1) package provider diff --git a/services/hal/internal/provider/setups/pico_bb_proto_1.go b/services/hal/internal/provider/setups/pico_bb_proto_1.go index 58a8101..ae3d94d 100644 --- a/services/hal/internal/provider/setups/pico_bb_proto_1.go +++ b/services/hal/internal/provider/setups/pico_bb_proto_1.go @@ -58,8 +58,8 @@ var SelectedSetup = types.HALConfig{ Domain: "io", Name: "uart1", Baud: 115_200, - RXSize: 32, - TXSize: 512, + RXSize: 256, + TXSize: 2048, }}, {ID: "charger0", Type: "ltc4015", Params: ltc4015dev.Params{ diff --git a/x/fmtx/fmtx_host.go b/x/fmtx/fmtx_host.go index 848722a..7640d3e 100644 --- a/x/fmtx/fmtx_host.go +++ b/x/fmtx/fmtx_host.go @@ -5,12 +5,30 @@ package fmtx import ( "fmt" "io" + "os" + "strings" ) +// DefaultOutput matches the MCU API surface so tests and callers can redirect +// Print/Printf without depending on the host's process stdout directly. +var DefaultOutput io.Writer = os.Stdout + func Sprintf(format string, a ...any) string { return fmt.Sprintf(format, a...) } -func Printf(format string, a ...any) (int, error) { return fmt.Printf(format, a...) } +func Printf(format string, a ...any) (int, error) { return Fprintf(DefaultOutput, format, a...) } func Fprintf(w io.Writer, format string, a ...any) (int, error) { return fmt.Fprintf(w, format, a...) 
} func Errorf(format string, a ...any) error { return fmt.Errorf(format, a...) } -func Sprint(a ...any) string { return fmt.Sprint(a...) } -func Fprint(w io.Writer, a ...any) (int, error) { return fmt.Fprint(w, a...) } -func Print(a ...any) (int, error) { return fmt.Print(a...) } + +// Keep host behavior aligned with the MCU formatter, which always separates +// Sprint/Fprint operands with spaces. +func Sprint(a ...any) string { + var b strings.Builder + for i, v := range a { + if i > 0 { + b.WriteByte(' ') + } + b.WriteString(fmt.Sprint(v)) + } + return b.String() +} +func Fprint(w io.Writer, a ...any) (int, error) { return io.WriteString(w, Sprint(a...)) } +func Print(a ...any) (int, error) { return Fprint(DefaultOutput, a...) } diff --git a/x/strconvx/strconvx_host.go b/x/strconvx/strconvx_host.go index bf918c6..398ace4 100644 --- a/x/strconvx/strconvx_host.go +++ b/x/strconvx/strconvx_host.go @@ -4,18 +4,55 @@ package strconvx import "strconv" -// The goal is signature parity with strconv. -// Delegate straight through. +// Mirror the MCU helpers on host builds so tests exercise the same parsing +// rules, including base-0 prefix handling for 0b/0o/0x inputs. 
-func Itoa(i int) string { return strconv.Itoa(i) } -func Atoi(s string) (int, error) { return strconv.Atoi(s) } -func FormatInt(i int64, base int) string { return strconv.FormatInt(i, base) } -func FormatUint(u uint64, base int) string { return strconv.FormatUint(u, base) } -func ParseInt(s string, base, bitSize int) (int64, error) { return strconv.ParseInt(s, base, bitSize) } +func Itoa(i int) string { return strconv.Itoa(i) } +func Itoa64(i int64) string { return strconv.FormatInt(i, 10) } +func Utoa64(u uint64) string { return strconv.FormatUint(u, 10) } +func Atoi(s string) (int, error) { return strconv.Atoi(s) } +func FormatInt(i int64, base int) string { return strconv.FormatInt(i, base) } +func FormatUint(u uint64, base int) string { return strconv.FormatUint(u, base) } +func ParseInt(s string, base, bitSize int) (int64, error) { + if base != 0 { + return strconv.ParseInt(s, base, bitSize) + } + neg := false + if len(s) > 0 && (s[0] == '+' || s[0] == '-') { + neg = s[0] == '-' + s = s[1:] + } + base = detectBase(&s) + if neg { + s = "-" + s + } + return strconv.ParseInt(s, base, bitSize) +} func ParseUint(s string, base, bitSize int) (uint64, error) { + if base == 0 { + base = detectBase(&s) + } return strconv.ParseUint(s, base, bitSize) } func FormatFloat(f float64, fmt byte, prec, bitSize int) string { return strconv.FormatFloat(f, fmt, prec, bitSize) } func ParseFloat(s string, bitSize int) (float64, error) { return strconv.ParseFloat(s, bitSize) } + +func detectBase(ps *string) int { + s := *ps + if len(s) >= 2 && s[0] == '0' { + switch s[1] { + case 'x', 'X': + *ps = s[2:] + return 16 + case 'b', 'B': + *ps = s[2:] + return 2 + case 'o', 'O': + *ps = s[2:] + return 8 + } + } + return 10 +} From 435025f9697fe57e46169d27130d6c6e8b2c76a4 Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 2 Apr 2026 09:50:58 +0000 Subject: [PATCH 04/65] cleanup: consistent logging, fix indentation, document design decisions - Replace all bare println() in session handlers 
with s.log()/s.logKV() so every log line includes the session ID - Include actual error detail in JSON unmarshal failure logs - Fix writeLine() indentation (cosmetic, logic was correct) - Document timing constant relationships and interdependencies - Document postHelloAckSettle as TinyGo scheduler constraint - Document SID-change handling asymmetry vs Lua side - Document hardcoded import rules as intentional v1 scope - Document wireCaps as forward-compatibility stub --- services/fabric/remap.go | 5 +++ services/fabric/session.go | 71 +++++++++++++++++++++++--------------- services/fabric/wire.go | 2 ++ 3 files changed, 50 insertions(+), 28 deletions(-) diff --git a/services/fabric/remap.go b/services/fabric/remap.go index 086f233..b39af4b 100644 --- a/services/fabric/remap.go +++ b/services/fabric/remap.go @@ -4,6 +4,11 @@ import "devicecode-go/bus" // Topic remapping rules matching the shipped Lua fabric link contract. // +// These rules are hardcoded and exact-match for v1. The Lua (CM5) side +// uses config-driven wildcard rules, but the MCU only needs a fixed set +// of routes. If new routes are required, add them here and on the Lua +// config side. +// // CM5 -> MCU wire publish: // ["config","device"] -> config/device // diff --git a/services/fabric/session.go b/services/fabric/session.go index d9be723..3995d32 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -21,20 +21,30 @@ const ( ) // ---- timeouts (local policy) ---- +// +// Timing relationships: +// staleTimeout (45s) > exportWaitFallback (15s) > callTimeoutDef (5s) +// +// The CM5 sends pings every 15s of TX inactivity. The MCU marks the +// peer stale after 45s without any RX, giving a 30s margin. The +// exportWaitFallback arms exports if no peer traffic arrives within +// 15s of link-up (normally armed earlier by peer_pub). 
const ( staleTimeout = 45 * time.Second callTimeoutDef = 5 * time.Second waitLogEvery = 2 * time.Second exportStartHoldoff = 1 * time.Second - // Give the serial reactor a chance to drain hello_ack before - // promoteLink publishes bus state and starts more work. This avoids - // relying on incidental println/GC timing in TinyGo. + // postHelloAckSettle gives the serial reactor goroutine a chance + // to drain the hello_ack bytes from the TX shmring before + // promoteLink publishes bus state and triggers export work. + // TinyGo's cooperative scheduler does not preempt, so without + // this yield the reactor may not run until the next tick. postHelloAckSettle = 10 * time.Millisecond // exportMaxPerTick caps the total export messages sent per drain // cycle across all subscriptions, keeping UART throughput within // the 115200-baud link capacity. - exportMaxPerTick = 1 + exportMaxPerTick = 1 exportWaitFallback = 15 * time.Second errPayloadMarshal = "payload_marshal_failed" ) @@ -340,6 +350,11 @@ func (s *session) dispatch(line []byte) bool { } } +// notePeerIdentity records the remote peer's node, SID, and proto version. +// If the SID changes mid-session, the returned reason triggers a full +// teardown of exports and pending calls on the Go side. Note: the Lua +// side only tears down pending calls on SID change, not exports — this +// asymmetry is intentional since the CM5 re-subscribes on reconnect. 
func (s *session) notePeerIdentity(node, sid string, proto int) string { reason := "" if s.link == linkUp && s.peerSID != "" && sid != "" && s.peerSID != sid { @@ -381,8 +396,8 @@ func hasWirePrefix(topic, prefix []string) bool { func (s *session) onHello(line []byte) bool { var msg wireHello - if json.Unmarshal(line, &msg) != nil { - s.log("malformed hello dropped") + if err := json.Unmarshal(line, &msg); err != nil { + s.logKV("malformed hello dropped", "err", err.Error()) return false } s.noteRx("hello") @@ -426,8 +441,8 @@ func (s *session) armExports(reason string) { func (s *session) onHelloAck(line []byte) bool { var msg wireHelloAck - if json.Unmarshal(line, &msg) != nil { - s.log("malformed hello_ack dropped") + if err := json.Unmarshal(line, &msg); err != nil { + s.logKV("malformed hello_ack dropped", "err", err.Error()) return false } if s.isSelfControlFrame(msg.Node, msg.SID) { @@ -449,8 +464,8 @@ func (s *session) onHelloAck(line []byte) bool { func (s *session) onPing(line []byte) bool { var msg wirePing - if json.Unmarshal(line, &msg) != nil { - s.log("malformed ping dropped") + if err := json.Unmarshal(line, &msg); err != nil { + s.logKV("malformed ping dropped", "err", err.Error()) return false } s.noteRx("ping") @@ -478,8 +493,8 @@ func (s *session) onPing(line []byte) bool { func (s *session) onPong(line []byte) bool { var msg wirePong - if json.Unmarshal(line, &msg) != nil { - s.log("malformed pong dropped") + if err := json.Unmarshal(line, &msg); err != nil { + s.logKV("malformed pong dropped", "err", err.Error()) return false } if s.isSelfControlFrame("", msg.SID) { @@ -502,8 +517,8 @@ func (s *session) onPong(line []byte) bool { func (s *session) onPub(line []byte) bool { var msg wirePub - if json.Unmarshal(line, &msg) != nil { - s.log("malformed pub dropped") + if err := json.Unmarshal(line, &msg); err != nil { + s.logKV("malformed pub dropped", "err", err.Error()) return false } s.noteRx("pub") @@ -527,18 +542,18 @@ func (s *session) 
onPub(line []byte) bool { func (s *session) onUnretain(line []byte) bool { var msg wireUnretain - if json.Unmarshal(line, &msg) != nil { - println("[fabric] malformed unretain dropped") + if err := json.Unmarshal(line, &msg); err != nil { + s.logKV("malformed unretain dropped", "err", err.Error()) return false } s.noteRx("unretain") if s.link != linkUp { - println("[fabric] unretain dropped before handshake") + s.log("unretain dropped before handshake") return true } t := importPublishTopic(msg.Topic) if t == nil { - println("[fabric] incoming unretain dropped: no_route") + s.log("incoming unretain dropped: no_route") return true } s.armExports("peer_unretain") @@ -548,18 +563,18 @@ func (s *session) onUnretain(line []byte) bool { func (s *session) onCall(line []byte) bool { var msg wireCall - if json.Unmarshal(line, &msg) != nil { - println("[fabric] malformed call dropped") + if err := json.Unmarshal(line, &msg); err != nil { + s.logKV("malformed call dropped", "err", err.Error()) return false } s.noteRx("call") if s.link != linkUp { - println("[fabric] call dropped before handshake") + s.log("call dropped before handshake") return true } t := importCallTopic(msg.Topic) if t == nil { - println("[fabric] incoming call dropped: no_route") + s.log("incoming call dropped: no_route") s.writeLine(marshal(wireReply{T: "reply", Corr: msg.ID, OK: false, Err: "no_route"})) return true } @@ -581,8 +596,8 @@ func (s *session) onCall(line []byte) bool { func (s *session) onReply(line []byte) bool { var msg wireReply - if json.Unmarshal(line, &msg) != nil { - println("[fabric] malformed reply dropped") + if err := json.Unmarshal(line, &msg); err != nil { + s.logKV("malformed reply dropped", "err", err.Error()) return false } s.noteRx("reply") @@ -604,7 +619,7 @@ func (s *session) onReply(line []byte) bool { return true } - println("[fabric] unexpected reply dropped:", msg.Corr) + s.logKV("unexpected reply dropped", "corr", msg.Corr) return true } @@ -708,7 +723,7 @@ func (s 
*session) drainExports() { } payload, err := marshalPayload(m.Payload) if err != nil { - println("[fabric] export payload dropped:", err.Error()) + s.logKV("export payload dropped", "err", err.Error()) continue } if !s.writeLine(marshal(wirePub{ @@ -803,7 +818,7 @@ func (s *session) drainOutgoingWireCalls(now time.Time) { payload, err := marshalPayload(msg.Payload) if err != nil { - println("[fabric] outgoing call dropped:", err.Error()) + s.logKV("outgoing call dropped", "err", err.Error()) if msg.CanReply() { s.conn.Reply(msg, types.ErrorReply{OK: false, Error: errPayloadMarshal}, false) } @@ -861,7 +876,7 @@ func (s *session) writeLine(data []byte) bool { data = data[:len(data)-1] } if err := s.tr.WriteLine(data); err != nil { - if errors.Is(err, ErrLineTooLong) { + if errors.Is(err, ErrLineTooLong) { s.log("oversized write dropped") return true } diff --git a/services/fabric/wire.go b/services/fabric/wire.go index 62df959..75657ba 100644 --- a/services/fabric/wire.go +++ b/services/fabric/wire.go @@ -4,6 +4,8 @@ import "encoding/json" // ---- Wire message types (fabric.md §4) ---- +// wireCaps is carried in hello for forward compatibility. The Lua side +// sends caps but neither side enforces them in v1. type wireCaps struct { Pub bool `json:"pub,omitempty"` Call bool `json:"call,omitempty"` From b43ee67e353ae02cbe45fec336d240bee20757e9 Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 2 Apr 2026 10:16:50 +0000 Subject: [PATCH 05/65] feat: emit unretain on export when retained state is cleared When a local retained publish is cleared (retain=true, payload=nil), the export drain now sends a wireUnretain instead of a wirePub. This lets the CM5 clear the corresponding retained topic on its side. 
--- services/fabric/fabric_test.go | 38 ++++++++++++++++++++++++++++++++++ services/fabric/session.go | 10 +++++++++ 2 files changed, 48 insertions(+) diff --git a/services/fabric/fabric_test.go b/services/fabric/fabric_test.go index 775393c..a830db9 100644 --- a/services/fabric/fabric_test.go +++ b/services/fabric/fabric_test.go @@ -733,6 +733,44 @@ func TestPubExport(t *testing.T) { } } +func TestUnretainExport(t *testing.T) { + mcu, cm5 := pipePair() + b := newBus() + fabricConn := b.NewConnection("fabric") + publishConn := b.NewConnection("hal") + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local") + ack := bringUp(t, cm5) + unlockExports(t, cm5, ack.SID) + + // Publish retained value first. + publishConn.Publish(publishConn.NewMessage( + bus.T("hal", "cap", "env", "temperature", "core", "value"), + map[string]int{"deci_c": 412}, + true, + )) + pub := readMsg[wirePub](t, cm5) + if pub.T != "pub" || !pub.Retain { + t.Fatalf("expected retained pub, got t=%q retain=%v", pub.T, pub.Retain) + } + + // Clear retained state (retain=true, payload=nil). 
+ publishConn.Publish(publishConn.NewMessage( + bus.T("hal", "cap", "env", "temperature", "core", "value"), + nil, + true, + )) + unr := readMsg[wireUnretain](t, cm5) + if unr.T != "unretain" { + t.Fatalf("expected unretain, got %q", unr.T) + } + want := []string{"state", "env", "temperature", "core", "value"} + if !slicesEqual(unr.Topic, want) { + t.Errorf("topic = %v, want %v", unr.Topic, want) + } +} + func TestDrainExportsReturnsWhenSubscriptionClosed(t *testing.T) { b := newBus() conn := b.NewConnection("fabric") diff --git a/services/fabric/session.go b/services/fabric/session.go index 3995d32..544f87b 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -721,6 +721,16 @@ func (s *session) drainExports() { if wire == nil { continue } + if m.Retained && m.Payload == nil { + if !s.writeLine(marshal(wireUnretain{ + T: "unretain", + Topic: wire, + })) { + return + } + total++ + continue + } payload, err := marshalPayload(m.Payload) if err != nil { s.logKV("export payload dropped", "err", err.Error()) From 2e630b33bdabecbcc49a015cf061e25465f5033b Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 2 Apr 2026 14:27:06 +0000 Subject: [PATCH 06/65] fix: retained replay overflow and double-unsubscribe panic MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Bus queue length increased from 3 to 16 so retained state replay on export subscribe does not overflow. With 3, only the last 3 of ~10 retained hal/cap topics survived, giving CM5 an incomplete snapshot. Nil out call.sub after unsubscribe in drainPendingCalls so that a subsequent writeLine failure -> handleLinkDown -> teardownPendingCalls does not close the same subscription channel twice (panic). Document rpc/hal/read_state import rule as a placeholder with no handler — CM5 calls to this endpoint will timeout until implemented. 
--- main.go | 6 +++++- services/fabric/remap.go | 3 +++ services/fabric/session.go | 7 ++++++- 3 files changed, 14 insertions(+), 2 deletions(-) diff --git a/main.go b/main.go index 580270a..e25d5db 100644 --- a/main.go +++ b/main.go @@ -431,7 +431,11 @@ func main() { if !handshakeOnlyOutput { log.Println("[main] bootstrapping bus …") } - b := bus.NewBus(3, "+", "#") + // Queue length must be large enough to hold the retained replay burst + // when fabric subscribes to wildcard export patterns (hal/cap/env/#, + // hal/cap/power/#). With ~10 retained topics per pattern and a channel + // of length N, messages beyond N are dropped during initial subscribe. + b := bus.NewBus(16, "+", "#") halConn := b.NewConnection("hal") uiConn := b.NewConnection("ui") bridgeConn := b.NewConnection("fabric-bridge") diff --git a/services/fabric/remap.go b/services/fabric/remap.go index b39af4b..fa1c3bb 100644 --- a/services/fabric/remap.go +++ b/services/fabric/remap.go @@ -44,6 +44,9 @@ var importPublishRules = []wireImportRule{ var importCallRules = []wireImportRule{ { + // Placeholder: no handler subscribes to rpc/hal/read_state yet. + // CM5 calls will forward onto the bus and timeout. Remove or + // implement before production use. 
wire: []string{"rpc", "hal", "read_state"}, local: []string{"rpc", "hal", "read_state"}, }, diff --git a/services/fabric/session.go b/services/fabric/session.go index 544f87b..153f747 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -676,7 +676,10 @@ func (s *session) teardownExports() { func (s *session) teardownPendingCalls() { for _, call := range s.pendingCalls { - s.conn.Unsubscribe(call.sub) + if call.sub != nil { + s.conn.Unsubscribe(call.sub) + call.sub = nil + } } s.pendingCalls = nil } @@ -763,6 +766,7 @@ func (s *session) drainPendingCalls(now time.Time) { select { case reply, ok := <-call.sub.Channel(): s.conn.Unsubscribe(call.sub) + call.sub = nil // prevent double-unsubscribe in teardownPendingCalls if !ok || reply == nil { if !s.writeLine(marshal(wireReply{T: "reply", Corr: call.id, OK: false, Err: "timeout"})) { return @@ -791,6 +795,7 @@ func (s *session) drainPendingCalls(now time.Time) { if !now.Before(call.deadline) { s.conn.Unsubscribe(call.sub) + call.sub = nil if !s.writeLine(marshal(wireReply{T: "reply", Corr: call.id, OK: false, Err: "timeout"})) { return } From b170de8934cac52d9951ae83f84ccbdf421789d0 Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 2 Apr 2026 14:48:31 +0000 Subject: [PATCH 07/65] fix: bus queue overflow on retained replay, remove dead import route MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Increase bus queue from 16 to 32. pico_bb_proto_1 publishes ~26 retained topics across env/power domains (info + status + value per capability). Queue of 16 still overflowed during export subscribe replay. Remove rpc/hal/read_state import call rule — no handler subscribes to this topic, so CM5 calls would silently timeout. The export call rule now routes rpc/hal/dump (matching the bridge handler). 
--- main.go | 11 +++++----- services/fabric/fabric_test.go | 37 +++++++++++++++++----------------- services/fabric/remap.go | 15 ++------------ 3 files changed, 26 insertions(+), 37 deletions(-) diff --git a/main.go b/main.go index e25d5db..f06d91f 100644 --- a/main.go +++ b/main.go @@ -431,11 +431,12 @@ func main() { if !handshakeOnlyOutput { log.Println("[main] bootstrapping bus …") } - // Queue length must be large enough to hold the retained replay burst - // when fabric subscribes to wildcard export patterns (hal/cap/env/#, - // hal/cap/power/#). With ~10 retained topics per pattern and a channel - // of length N, messages beyond N are dropped during initial subscribe. - b := bus.NewBus(16, "+", "#") + // Queue length must cover the retained replay burst when fabric + // subscribes to wildcard export patterns (hal/cap/env/#, + // hal/cap/power/#). Each capability publishes retained info + + // status + value; pico_bb_proto_1 has ~26 retained topics across + // env and power domains. 32 provides margin for growth. 
+ b := bus.NewBus(32, "+", "#") halConn := b.NewConnection("hal") uiConn := b.NewConnection("ui") bridgeConn := b.NewConnection("fabric-bridge") diff --git a/services/fabric/fabric_test.go b/services/fabric/fabric_test.go index a830db9..9aa625b 100644 --- a/services/fabric/fabric_test.go +++ b/services/fabric/fabric_test.go @@ -583,7 +583,6 @@ func TestImportCallTopic(t *testing.T) { wire []string want string }{ - {[]string{"rpc", "hal", "read_state"}, "rpc/hal/read_state"}, {[]string{"rpc", "hal", "dump"}, "rpc/hal/dump"}, {[]string{"rpc", "hal", "other"}, ""}, {[]string{"config", "device"}, ""}, @@ -625,8 +624,7 @@ func TestExportCallTopic(t *testing.T) { bus bus.Topic want []string }{ - {bus.T("fabric", "out", "rpc", "hal", "read_state"), []string{"rpc", "hal", "read_state"}}, - {bus.T("fabric", "out", "rpc", "hal", "dump"), nil}, + {bus.T("fabric", "out", "rpc", "hal", "dump"), []string{"rpc", "hal", "dump"}}, {bus.T("fabric", "out", "rpc", "hal"), nil}, {bus.T("other", "topic"), nil}, } { @@ -646,8 +644,8 @@ func TestExportCallPatterns(t *testing.T) { if len(patterns) != 1 { t.Fatalf("len(exportCallPatterns()) = %d, want 1", len(patterns)) } - if got := topicString(patterns[0]); got != "fabric/out/rpc/hal/read_state" { - t.Fatalf("exportCallPatterns()[0] = %q, want fabric/out/rpc/hal/read_state", got) + if got := topicString(patterns[0]); got != "fabric/out/rpc/hal/dump" { + t.Fatalf("exportCallPatterns()[0] = %q, want fabric/out/rpc/hal/dump", got) } } @@ -1005,7 +1003,7 @@ func TestCallHandlerError(t *testing.T) { bringUp(t, cm5) handler := b.NewConnection("handler") - sub := handler.Subscribe(bus.T("rpc", "hal", "read_state")) + sub := handler.Subscribe(bus.T("rpc", "hal", "dump")) go func() { for m := range sub.Channel() { handler.Reply(m, struct { @@ -1016,7 +1014,7 @@ func TestCallHandlerError(t *testing.T) { }() sendMsg(t, cm5, wireCall{ - T: "call", ID: "err-1", Topic: []string{"rpc", "hal", "read_state"}, + T: "call", ID: "err-1", Topic: 
[]string{"rpc", "hal", "dump"}, Payload: json.RawMessage(`{}`), TimeoutMs: 5000, }) @@ -1042,7 +1040,7 @@ func TestCallDoesNotBlockPing(t *testing.T) { bringUp(t, cm5) handler := b.NewConnection("handler") - sub := handler.Subscribe(bus.T("rpc", "hal", "read_state")) + sub := handler.Subscribe(bus.T("rpc", "hal", "dump")) go func() { for m := range sub.Channel() { time.Sleep(300 * time.Millisecond) @@ -1051,7 +1049,7 @@ func TestCallDoesNotBlockPing(t *testing.T) { }() sendMsg(t, cm5, wireCall{ - T: "call", ID: "slow-1", Topic: []string{"rpc", "hal", "read_state"}, + T: "call", ID: "slow-1", Topic: []string{"rpc", "hal", "dump"}, Payload: json.RawMessage(`{}`), TimeoutMs: 1000, }) sendMsg(t, cm5, wirePing{T: "ping", TS: 77, SID: "s1"}) @@ -1112,7 +1110,7 @@ func TestCallExport(t *testing.T) { done := make(chan result, 1) go func() { msg, err := reqConn.RequestWait(context.Background(), reqConn.NewMessage( - bus.T("fabric", "out", "rpc", "hal", "read_state"), + bus.T("fabric", "out", "rpc", "hal", "dump"), map[string]string{"ask": "status"}, false, )) @@ -1123,7 +1121,7 @@ func TestCallExport(t *testing.T) { if call.T != "call" { t.Fatalf("expected call, got %q", call.T) } - want := []string{"rpc", "hal", "read_state"} + want := []string{"rpc", "hal", "dump"} if !slicesEqual(call.Topic, want) { t.Fatalf("topic = %v, want %v", call.Topic, want) } @@ -1176,11 +1174,12 @@ func TestCallExportOnlyConfiguredRule(t *testing.T) { ack := bringUp(t, cm5) unlockExports(t, cm5, ack.SID) + // Use an unconfigured topic — only fabric/out/rpc/hal/dump is routed. 
reqCtx, reqCancel := context.WithTimeout(context.Background(), 250*time.Millisecond) defer reqCancel() go func() { _, _ = reqConn.RequestWait(reqCtx, reqConn.NewMessage( - bus.T("fabric", "out", "rpc", "hal", "dump"), + bus.T("fabric", "out", "rpc", "hal", "not_configured"), map[string]string{"ask": "status"}, false, )) @@ -1204,7 +1203,7 @@ func TestPendingWireCallsTimeout(t *testing.T) { fabricConn := b.NewConnection("fabric") reqConn := b.NewConnection("caller") msg := reqConn.NewMessage( - bus.T("fabric", "out", "rpc", "hal", "read_state"), + bus.T("fabric", "out", "rpc", "hal", "dump"), map[string]string{"ask": "status"}, false, ) @@ -1268,9 +1267,9 @@ func TestDrainPendingCallsReportsMarshalFailure(t *testing.T) { handlerConn := b.NewConnection("handler") tr := &captureTransport{} - sub := handlerConn.Subscribe(bus.T("rpc", "hal", "read_state")) + sub := handlerConn.Subscribe(bus.T("rpc", "hal", "dump")) defer handlerConn.Unsubscribe(sub) - req := fabricConn.NewMessage(bus.T("rpc", "hal", "read_state"), map[string]string{"ask": "status"}, false) + req := fabricConn.NewMessage(bus.T("rpc", "hal", "dump"), map[string]string{"ask": "status"}, false) replySub := fabricConn.Request(req) var msg *bus.Message @@ -1329,7 +1328,7 @@ func TestDrainOutgoingWireCallsReportsMarshalFailure(t *testing.T) { defer s.teardownExports() msg := reqConn.NewMessage( - bus.T("fabric", "out", "rpc", "hal", "read_state"), + bus.T("fabric", "out", "rpc", "hal", "dump"), make(chan int), false, ) @@ -1380,7 +1379,7 @@ func TestDrainOutgoingWireCallsReportsWriteFailure(t *testing.T) { defer s.teardownExports() msg := reqConn.NewMessage( - bus.T("fabric", "out", "rpc", "hal", "read_state"), + bus.T("fabric", "out", "rpc", "hal", "dump"), map[string]string{"ask": "status"}, false, ) @@ -1434,7 +1433,7 @@ func TestCallExportPeerReset(t *testing.T) { done := make(chan result, 1) go func() { msg, err := reqConn.RequestWait(context.Background(), reqConn.NewMessage( - bus.T("fabric", "out", 
"rpc", "hal", "read_state"), + bus.T("fabric", "out", "rpc", "hal", "dump"), map[string]string{"ask": "status"}, false, )) @@ -1492,7 +1491,7 @@ func TestEchoedHelloAckIgnoredDuringOutgoingCall(t *testing.T) { done := make(chan result, 1) go func() { msg, err := reqConn.RequestWait(context.Background(), reqConn.NewMessage( - bus.T("fabric", "out", "rpc", "hal", "read_state"), + bus.T("fabric", "out", "rpc", "hal", "dump"), map[string]string{"ask": "status"}, false, )) diff --git a/services/fabric/remap.go b/services/fabric/remap.go index fa1c3bb..48eb9d7 100644 --- a/services/fabric/remap.go +++ b/services/fabric/remap.go @@ -13,16 +13,12 @@ import "devicecode-go/bus" // ["config","device"] -> config/device // // CM5 -> MCU wire call: -// ["rpc","hal","read_state"] -> rpc/hal/read_state // ["rpc","hal","dump"] -> rpc/hal/dump // // MCU local bus publish -> wire: // hal/cap/env/# -> ["state","env",...] // hal/cap/power/# -> ["state","power",...] // hal/state -> ["state","hal"] -// -// MCU local bus call -> wire: -// fabric/out/rpc/hal/read_state -> ["rpc","hal","read_state"] type wireImportRule struct { wire []string @@ -43,13 +39,6 @@ var importPublishRules = []wireImportRule{ } var importCallRules = []wireImportRule{ - { - // Placeholder: no handler subscribes to rpc/hal/read_state yet. - // CM5 calls will forward onto the bus and timeout. Remove or - // implement before production use. 
- wire: []string{"rpc", "hal", "read_state"}, - local: []string{"rpc", "hal", "read_state"}, - }, { wire: []string{"rpc", "hal", "dump"}, local: []string{"rpc", "hal", "dump"}, @@ -75,8 +64,8 @@ var exportPublishRules = []busExportRule{ var exportCallRules = []busExportRule{ { - localPrefix: []string{"fabric", "out", "rpc", "hal", "read_state"}, - remotePrefix: []string{"rpc", "hal", "read_state"}, + localPrefix: []string{"fabric", "out", "rpc", "hal", "dump"}, + remotePrefix: []string{"rpc", "hal", "dump"}, }, } From da1f50baa757010b10ed973d92b2f1e1ba9d3c41 Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 2 Apr 2026 15:13:03 +0000 Subject: [PATCH 08/65] refactor: use message type constants instead of string literals Define msgHello, msgHelloAck, msgPing, msgPong, msgPub, msgUnretain, msgCall, msgReply constants in wire.go. Replace all raw string literals in dispatch, noteRx, and wire struct construction throughout session.go. --- services/fabric/fabric_test.go | 22 +-- services/fabric/session.go | 238 ++++++++++++++++++--------------- services/fabric/wire.go | 15 ++- 3 files changed, 157 insertions(+), 118 deletions(-) diff --git a/services/fabric/fabric_test.go b/services/fabric/fabric_test.go index 9aa625b..2feab90 100644 --- a/services/fabric/fabric_test.go +++ b/services/fabric/fabric_test.go @@ -808,7 +808,7 @@ func TestDrainExportsWaitsForStartupHoldoff(t *testing.T) { s := session{ link: linkUp, - exportsArmed: true, + exportsEnabled: true, exportSubs: []*bus.Subscription{sub}, exportReadyAt: time.Now().Add(time.Second), } @@ -1212,12 +1212,12 @@ func TestPendingWireCallsTimeout(t *testing.T) { s := session{ conn: fabricConn, - pendingWireCalls: []*pendingWireCall{ + outboundCalls: []*outboundCall{ {id: "wire-1", req: msg, deadline: time.Now().Add(-time.Millisecond)}, }, } - s.drainPendingWireCalls(time.Now()) + s.drainOutboundPending(time.Now()) select { case reply := <-sub.Channel(): @@ -1286,14 +1286,14 @@ func TestDrainPendingCallsReportsMarshalFailure(t 
*testing.T) { s := session{ conn: fabricConn, tr: tr, - pendingCalls: []*pendingCall{{ + inboundCalls: []*inboundCall{{ id: "call-1", sub: replySub, deadline: time.Now().Add(time.Second), }}, } - s.drainPendingCalls(time.Now()) + s.drainInbound(time.Now()) if len(tr.writes) != 1 { t.Fatalf("writes = %d, want 1", len(tr.writes)) @@ -1335,13 +1335,13 @@ func TestDrainOutgoingWireCallsReportsMarshalFailure(t *testing.T) { replySub := reqConn.Request(msg) defer reqConn.Unsubscribe(replySub) - s.drainOutgoingWireCalls(time.Now()) + s.drainOutboundNew(time.Now()) if len(tr.writes) != 0 { t.Fatalf("writes = %d, want 0", len(tr.writes)) } - if len(s.pendingWireCalls) != 0 { - t.Fatalf("pendingWireCalls = %d, want 0", len(s.pendingWireCalls)) + if len(s.outboundCalls) != 0 { + t.Fatalf("outboundCalls = %d, want 0", len(s.outboundCalls)) } select { @@ -1386,13 +1386,13 @@ func TestDrainOutgoingWireCallsReportsWriteFailure(t *testing.T) { replySub := reqConn.Request(msg) defer reqConn.Unsubscribe(replySub) - s.drainOutgoingWireCalls(time.Now()) + s.drainOutboundNew(time.Now()) if s.link != linkDown { t.Fatalf("link = %v, want %v", s.link, linkDown) } - if len(s.pendingWireCalls) != 0 { - t.Fatalf("pendingWireCalls = %d, want 0", len(s.pendingWireCalls)) + if len(s.outboundCalls) != 0 { + t.Fatalf("outboundCalls = %d, want 0", len(s.outboundCalls)) } select { diff --git a/services/fabric/session.go b/services/fabric/session.go index 153f747..18a8734 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -46,7 +46,35 @@ const ( // the 115200-baud link capacity. 
exportMaxPerTick = 1 exportWaitFallback = 15 * time.Second - errPayloadMarshal = "payload_marshal_failed" + errPayloadMarshal = "payload_marshal_failed" +) + +// ---- link reasons and error strings ---- + +const ( + reasonLinkDown = "link_down" + reasonPeerStale = "peer_stale" + reasonPeerReset = "peer_reset" + reasonPeerSessionChanged = "peer_session_changed" + reasonHelloRejected = "hello_rejected" + reasonTransportDown = "transport_down" + reasonTransportWrite = "transport_write_failed" + reasonNoRoute = "no_route" + reasonTimeout = "timeout" +) + +// ---- export arm reasons ---- + +// ---- export trigger reasons (why exports were enabled) ---- + +const ( + exportTriggerPub = "peer_pub" + exportTriggerPing = "peer_ping" + exportTriggerPong = "peer_pong" + exportTriggerCall = "peer_call" + exportTriggerReply = "peer_reply" + exportTriggerUnretain = "peer_unretain" + exportTriggerFallback = "fallback" ) // session manages the fabric link state machine over a Transport. @@ -72,13 +100,13 @@ type session struct { lastPongAt time.Time exportReadyAt time.Time exportWaitUntil time.Time - exportsArmed bool + exportsEnabled bool exportSubs []*bus.Subscription exportCallSubs []*bus.Subscription - pendingCalls []*pendingCall - pendingWireCalls []*pendingWireCall - nextWireCallID uint64 + inboundCalls []*inboundCall + outboundCalls []*outboundCall + nextOutboundID uint64 } func (s *session) log(msg string) { @@ -89,13 +117,13 @@ func (s *session) logKV(msg, key, value string) { println("[fabric]", "sid", s.localSID, msg, key, value) } -type pendingCall struct { +type inboundCall struct { id string sub *bus.Subscription deadline time.Time } -type pendingWireCall struct { +type outboundCall struct { id string req *bus.Message deadline time.Time @@ -119,8 +147,8 @@ type linkStatePayload struct { LastRxUnixMilli int64 `json:"last_rx_unix_ms,omitempty"` LastTxUnixMilli int64 `json:"last_tx_unix_ms,omitempty"` LastPongUnixMilli int64 `json:"last_pong_unix_ms,omitempty"` - 
PendingCalls int `json:"pending_calls"` - PendingWireCalls int `json:"pending_wire_calls"` + InboundCalls int `json:"inbound_calls"` + OutboundCalls int `json:"outbound_calls"` Reason string `json:"reason,omitempty"` Err string `json:"err,omitempty"` } @@ -156,8 +184,8 @@ func (s *session) run(ctx context.Context) { defer s.tr.Close() defer s.teardownExports() - defer s.teardownPendingCalls() - defer s.teardownPendingWireCalls("link_down") + defer s.teardownInbound() + defer s.teardownOutbound(reasonLinkDown) defer s.log("run stop") stale := time.NewTimer(staleTimeout) @@ -186,7 +214,7 @@ func (s *session) run(ctx context.Context) { return } if res.err != nil { - s.handleLinkDown("transport_down", res.err.Error()) + s.handleLinkDown(reasonTransportDown, res.err.Error()) return } if s.dispatch(res.line) { @@ -195,15 +223,15 @@ func (s *session) run(ctx context.Context) { case <-exportTick.C: s.drainExports() - s.drainPendingCalls(time.Now()) - s.drainWireCalls(time.Now()) + s.drainInbound(time.Now()) + s.drainOutbound(time.Now()) case <-waitTick.C: s.logWaiting() case <-stale.C: if s.link == linkUp { - s.handleLinkDown("peer_stale", "") + s.handleLinkDown(reasonPeerStale, "") } else { stale.Reset(staleTimeout) } @@ -258,8 +286,8 @@ func (s *session) publishLinkState(reason, err string) { LastRxUnixMilli: unixMilli(s.lastRxAt), LastTxUnixMilli: unixMilli(s.lastTxAt), LastPongUnixMilli: unixMilli(s.lastPongAt), - PendingCalls: len(s.pendingCalls), - PendingWireCalls: len(s.pendingWireCalls), + InboundCalls: len(s.inboundCalls), + OutboundCalls: len(s.outboundCalls), Reason: reason, Err: err, }, @@ -267,21 +295,18 @@ func (s *session) publishLinkState(reason, err string) { )) } -func (s *session) noteRx(msgType string) { +func (s *session) markRx() { s.lastRxAt = time.Now() - if msgType == "pong" { - s.lastPongAt = s.lastRxAt - } } -func (s *session) noteTx() { +func (s *session) markTx() { s.lastTxAt = time.Now() } func (s *session) handleLinkDown(reason, err string) 
{ pendingReason := reason if pendingReason == "" { - pendingReason = "link_down" + pendingReason = reasonLinkDown } s.link = linkDown s.remoteNode = "" @@ -290,10 +315,10 @@ func (s *session) handleLinkDown(reason, err string) { s.helloSeen = false s.exportReadyAt = time.Time{} s.exportWaitUntil = time.Time{} - s.exportsArmed = false + s.exportsEnabled = false s.teardownExports() - s.teardownPendingCalls() - s.teardownPendingWireCalls(pendingReason) + s.teardownInbound() + s.teardownOutbound(pendingReason) s.publishLinkState(reason, err) if err != "" { s.logKV("link down", "err", err) @@ -306,16 +331,16 @@ func (s *session) handleLinkDown(reason, err string) { func (s *session) promoteLink(reason string) { if s.link == linkUp { s.teardownExports() - s.teardownPendingCalls() + s.teardownInbound() if reason == "" { - reason = "peer_reset" + reason = reasonPeerReset } - s.teardownPendingWireCalls(reason) + s.teardownOutbound(reason) } s.link = linkUp s.exportReadyAt = time.Time{} s.exportWaitUntil = time.Now().Add(exportWaitFallback) - s.exportsArmed = false + s.exportsEnabled = false s.publishLinkState(reason, "") } @@ -324,21 +349,21 @@ func (s *session) promoteLink(reason string) { func (s *session) dispatch(line []byte) bool { msgType := wireType(line) switch msgType { - case "hello": + case msgHello: return s.onHello(line) - case "hello_ack": + case msgHelloAck: return s.onHelloAck(line) - case "ping": + case msgPing: return s.onPing(line) - case "pong": + case msgPong: return s.onPong(line) - case "pub": + case msgPub: return s.onPub(line) - case "unretain": + case msgUnretain: return s.onUnretain(line) - case "call": + case msgCall: return s.onCall(line) - case "reply": + case msgReply: return s.onReply(line) default: if msgType == "" { @@ -358,7 +383,7 @@ func (s *session) dispatch(line []byte) bool { func (s *session) notePeerIdentity(node, sid string, proto int) string { reason := "" if s.link == linkUp && s.peerSID != "" && sid != "" && s.peerSID != sid { - 
reason = "peer_session_changed" + reason = reasonPeerSessionChanged } if node != "" { s.remoteNode = node @@ -400,7 +425,7 @@ func (s *session) onHello(line []byte) bool { s.logKV("malformed hello dropped", "err", err.Error()) return false } - s.noteRx("hello") + s.markRx() if msg.Peer != "" && msg.Peer != s.nodeID { s.log("hello dropped: wrong peer") return false @@ -414,7 +439,7 @@ func (s *session) onHello(line []byte) bool { s.logKV("hello rx", "peer_sid", msg.SID) if !s.writeLine(marshal(wireHelloAck{ - T: "hello_ack", + T: msgHelloAck, Node: s.nodeID, SID: s.localSID, Proto: protoVersion, @@ -428,14 +453,14 @@ func (s *session) onHello(line []byte) bool { return true } -func (s *session) armExports(reason string) { - if s.link != linkUp || s.exportsArmed { +func (s *session) enableExports(reason string) { + if s.link != linkUp || s.exportsEnabled { return } s.setupExports() s.exportReadyAt = time.Now().Add(exportStartHoldoff) s.exportWaitUntil = time.Time{} - s.exportsArmed = true + s.exportsEnabled = true s.logKV("export replay armed", "reason", reason) } @@ -449,10 +474,10 @@ func (s *session) onHelloAck(line []byte) bool { s.log("echoed hello_ack ignored") return true } - s.noteRx("hello_ack") + s.markRx() if !msg.OK { s.log("hello_ack rejected by peer") - s.handleLinkDown("hello_rejected", "") + s.handleLinkDown(reasonHelloRejected, "") return true } reason := s.notePeerIdentity(msg.Node, msg.SID, msg.Proto) @@ -468,7 +493,7 @@ func (s *session) onPing(line []byte) bool { s.logKV("malformed ping dropped", "err", err.Error()) return false } - s.noteRx("ping") + s.markRx() if s.link != linkUp { s.log("ping dropped: link not up") return true @@ -477,13 +502,13 @@ func (s *session) onPing(line []byte) bool { if reason != "" { s.logKV("peer session changed", "reason", reason) s.teardownExports() - s.teardownPendingCalls() - s.teardownPendingWireCalls(reason) - s.exportsArmed = false + s.teardownInbound() + s.teardownOutbound(reason) + s.exportsEnabled = false 
} - s.armExports("peer_ping") + s.enableExports(exportTriggerPing) s.logKV("ping rx", "peer_sid", msg.SID) - if !s.writeLine(marshal(wirePong{T: "pong", TS: msg.TS, SID: s.localSID})) { + if !s.writeLine(marshal(wirePong{T: msgPong, TS: msg.TS, SID: s.localSID})) { return true } s.log("pong tx") @@ -501,16 +526,17 @@ func (s *session) onPong(line []byte) bool { s.log("echoed pong ignored") return true } - s.noteRx("pong") + s.markRx() + s.lastPongAt = s.lastRxAt reason := s.notePeerIdentity("", msg.SID, 0) if reason != "" { s.logKV("peer session changed", "reason", reason) s.teardownExports() - s.teardownPendingCalls() - s.teardownPendingWireCalls(reason) - s.exportsArmed = false + s.teardownInbound() + s.teardownOutbound(reason) + s.exportsEnabled = false } - s.armExports("peer_pong") + s.enableExports(exportTriggerPong) s.publishLinkState(reason, "") return true } @@ -521,7 +547,7 @@ func (s *session) onPub(line []byte) bool { s.logKV("malformed pub dropped", "err", err.Error()) return false } - s.noteRx("pub") + s.markRx() if s.link != linkUp { s.log("pub dropped before handshake") return true @@ -535,7 +561,7 @@ func (s *session) onPub(line []byte) bool { s.log("incoming pub dropped: no_route") return true } - s.armExports("peer_pub") + s.enableExports(exportTriggerPub) s.conn.Publish(s.conn.NewMessage(t, msg.Payload, msg.Retain)) return true } @@ -546,7 +572,7 @@ func (s *session) onUnretain(line []byte) bool { s.logKV("malformed unretain dropped", "err", err.Error()) return false } - s.noteRx("unretain") + s.markRx() if s.link != linkUp { s.log("unretain dropped before handshake") return true @@ -556,7 +582,7 @@ func (s *session) onUnretain(line []byte) bool { s.log("incoming unretain dropped: no_route") return true } - s.armExports("peer_unretain") + s.enableExports(exportTriggerUnretain) s.conn.Publish(s.conn.NewMessage(t, nil, true)) return true } @@ -567,7 +593,7 @@ func (s *session) onCall(line []byte) bool { s.logKV("malformed call dropped", "err", 
err.Error()) return false } - s.noteRx("call") + s.markRx() if s.link != linkUp { s.log("call dropped before handshake") return true @@ -575,10 +601,10 @@ func (s *session) onCall(line []byte) bool { t := importCallTopic(msg.Topic) if t == nil { s.log("incoming call dropped: no_route") - s.writeLine(marshal(wireReply{T: "reply", Corr: msg.ID, OK: false, Err: "no_route"})) + s.writeLine(marshal(wireReply{T: msgReply, Corr: msg.ID, OK: false, Err: reasonNoRoute})) return true } - s.armExports("peer_call") + s.enableExports(exportTriggerCall) timeout := callTimeoutDef if msg.TimeoutMs > 0 { @@ -586,7 +612,7 @@ func (s *session) onCall(line []byte) bool { } busMsg := s.conn.NewMessage(t, msg.Payload, false) sub := s.conn.Request(busMsg) - s.pendingCalls = append(s.pendingCalls, &pendingCall{ + s.inboundCalls = append(s.inboundCalls, &inboundCall{ id: msg.ID, sub: sub, deadline: time.Now().Add(timeout), @@ -600,14 +626,14 @@ func (s *session) onReply(line []byte) bool { s.logKV("malformed reply dropped", "err", err.Error()) return false } - s.noteRx("reply") - s.armExports("peer_reply") + s.markRx() + s.enableExports(exportTriggerReply) - for i, call := range s.pendingWireCalls { + for i, call := range s.outboundCalls { if call.id != msg.Corr { continue } - s.pendingWireCalls = append(s.pendingWireCalls[:i], s.pendingWireCalls[i+1:]...) + s.outboundCalls = append(s.outboundCalls[:i], s.outboundCalls[i+1:]...) 
if !call.req.CanReply() { return true } @@ -674,23 +700,23 @@ func (s *session) teardownExports() { s.exportCallSubs = nil } -func (s *session) teardownPendingCalls() { - for _, call := range s.pendingCalls { +func (s *session) teardownInbound() { + for _, call := range s.inboundCalls { if call.sub != nil { s.conn.Unsubscribe(call.sub) call.sub = nil } } - s.pendingCalls = nil + s.inboundCalls = nil } -func (s *session) teardownPendingWireCalls(reason string) { - for _, call := range s.pendingWireCalls { +func (s *session) teardownOutbound(reason string) { + for _, call := range s.outboundCalls { if call.req != nil && call.req.CanReply() { s.conn.Reply(call.req, types.ErrorReply{OK: false, Error: reason}, false) } } - s.pendingWireCalls = nil + s.outboundCalls = nil } // drainExports does a non-blocking read of each export subscription @@ -699,9 +725,9 @@ func (s *session) drainExports() { if s.link != linkUp { return } - if !s.exportsArmed { + if !s.exportsEnabled { if !s.exportWaitUntil.IsZero() && !time.Now().Before(s.exportWaitUntil) { - s.armExports("fallback") + s.enableExports(exportTriggerFallback) } else { return } @@ -726,7 +752,7 @@ func (s *session) drainExports() { } if m.Retained && m.Payload == nil { if !s.writeLine(marshal(wireUnretain{ - T: "unretain", + T: msgUnretain, Topic: wire, })) { return @@ -740,7 +766,7 @@ func (s *session) drainExports() { continue } if !s.writeLine(marshal(wirePub{ - T: "pub", + T: msgPub, Topic: wire, Payload: payload, Retain: m.Retained, @@ -756,37 +782,37 @@ func (s *session) drainExports() { } } -func (s *session) drainPendingCalls(now time.Time) { - if len(s.pendingCalls) == 0 { +func (s *session) drainInbound(now time.Time) { + if len(s.inboundCalls) == 0 { return } - keep := s.pendingCalls[:0] - for _, call := range s.pendingCalls { + keep := s.inboundCalls[:0] + for _, call := range s.inboundCalls { select { case reply, ok := <-call.sub.Channel(): s.conn.Unsubscribe(call.sub) - call.sub = nil // prevent 
double-unsubscribe in teardownPendingCalls + call.sub = nil // prevent double-unsubscribe in teardownInbound if !ok || reply == nil { - if !s.writeLine(marshal(wireReply{T: "reply", Corr: call.id, OK: false, Err: "timeout"})) { + if !s.writeLine(marshal(wireReply{T: msgReply, Corr: call.id, OK: false, Err: reasonTimeout})) { return } continue } if errStr := checkBusError(reply.Payload); errStr != "" { - if !s.writeLine(marshal(wireReply{T: "reply", Corr: call.id, OK: false, Err: errStr})) { + if !s.writeLine(marshal(wireReply{T: msgReply, Corr: call.id, OK: false, Err: errStr})) { return } continue } payload, err := marshalPayload(reply.Payload) if err != nil { - if !s.writeLine(marshal(wireReply{T: "reply", Corr: call.id, OK: false, Err: errPayloadMarshal})) { + if !s.writeLine(marshal(wireReply{T: msgReply, Corr: call.id, OK: false, Err: errPayloadMarshal})) { return } continue } - if !s.writeLine(marshal(wireReply{T: "reply", Corr: call.id, OK: true, Payload: payload})) { + if !s.writeLine(marshal(wireReply{T: msgReply, Corr: call.id, OK: true, Payload: payload})) { return } continue @@ -796,7 +822,7 @@ func (s *session) drainPendingCalls(now time.Time) { if !now.Before(call.deadline) { s.conn.Unsubscribe(call.sub) call.sub = nil - if !s.writeLine(marshal(wireReply{T: "reply", Corr: call.id, OK: false, Err: "timeout"})) { + if !s.writeLine(marshal(wireReply{T: msgReply, Corr: call.id, OK: false, Err: reasonTimeout})) { return } continue @@ -805,15 +831,15 @@ func (s *session) drainPendingCalls(now time.Time) { keep = append(keep, call) } - s.pendingCalls = keep + s.inboundCalls = keep } -func (s *session) drainWireCalls(now time.Time) { - s.drainOutgoingWireCalls(now) - s.drainPendingWireCalls(now) +func (s *session) drainOutbound(now time.Time) { + s.drainOutboundNew(now) + s.drainOutboundPending(now) } -func (s *session) drainOutgoingWireCalls(now time.Time) { +func (s *session) drainOutboundNew(now time.Time) { if s.link != linkUp || len(s.exportCallSubs) == 
0 { return } @@ -839,18 +865,18 @@ func (s *session) drainOutgoingWireCalls(now time.Time) { } continue } - id := s.nextWireCallID - s.nextWireCallID++ + id := s.nextOutboundID + s.nextOutboundID++ corr := "wire-" + strconvx.Utoa64(id) if msg.CanReply() { - s.pendingWireCalls = append(s.pendingWireCalls, &pendingWireCall{ + s.outboundCalls = append(s.outboundCalls, &outboundCall{ id: corr, req: msg, deadline: now.Add(callTimeoutDef), }) } if !s.writeLine(marshal(wireCall{ - T: "call", + T: msgCall, ID: corr, Topic: wireTopic, Payload: payload, @@ -866,22 +892,22 @@ func (s *session) drainOutgoingWireCalls(now time.Time) { } } -func (s *session) drainPendingWireCalls(now time.Time) { - if len(s.pendingWireCalls) == 0 { +func (s *session) drainOutboundPending(now time.Time) { + if len(s.outboundCalls) == 0 { return } - keep := s.pendingWireCalls[:0] - for _, call := range s.pendingWireCalls { + keep := s.outboundCalls[:0] + for _, call := range s.outboundCalls { if !now.Before(call.deadline) { if call.req != nil && call.req.CanReply() { - s.conn.Reply(call.req, types.ErrorReply{OK: false, Error: "timeout"}, false) + s.conn.Reply(call.req, types.ErrorReply{OK: false, Error: reasonTimeout}, false) } continue } keep = append(keep, call) } - s.pendingWireCalls = keep + s.outboundCalls = keep } // ---- transport write ---- @@ -895,10 +921,10 @@ func (s *session) writeLine(data []byte) bool { s.log("oversized write dropped") return true } - s.handleLinkDown("transport_write_failed", err.Error()) + s.handleLinkDown(reasonTransportWrite, err.Error()) return false } - s.noteTx() + s.markTx() return true } diff --git a/services/fabric/wire.go b/services/fabric/wire.go index 75657ba..908d8b1 100644 --- a/services/fabric/wire.go +++ b/services/fabric/wire.go @@ -2,7 +2,20 @@ package fabric import "encoding/json" -// ---- Wire message types (fabric.md §4) ---- +// ---- Wire message type identifiers (fabric.md §4) ---- + +const ( + msgHello = "hello" + msgHelloAck = "hello_ack" + 
msgPing = "ping" + msgPong = "pong" + msgPub = "pub" + msgUnretain = "unretain" + msgCall = "call" + msgReply = "reply" +) + +// ---- Wire message structs ---- // wireCaps is carried in hello for forward compatibility. The Lua side // sends caps but neither side enforces them in v1. From 068096179c0cfe83c032bb5fb198d449f64e47ab Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 2 Apr 2026 16:08:57 +0000 Subject: [PATCH 09/65] refactor: rename remoteNode to peerNode, align with Lua naming Rename remoteNode -> peerNode and RemoteID -> PeerNode in the link state payload to match the Lua side's peer_* naming convention. Update JSON tag from remote_id to peer_node. --- services/fabric/session.go | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/services/fabric/session.go b/services/fabric/session.go index 18a8734..1d29297 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -91,7 +91,7 @@ type session struct { conn *bus.Connection link linkState - remoteNode string + peerNode string peerSID string peerProto int helloSeen bool @@ -142,7 +142,7 @@ type linkStatePayload struct { PeerID string `json:"peer_id"` LocalSID string `json:"local_sid"` PeerSID string `json:"peer_sid,omitempty"` - RemoteID string `json:"remote_id,omitempty"` + PeerNode string `json:"peer_node,omitempty"` PeerProto int `json:"peer_proto,omitempty"` LastRxUnixMilli int64 `json:"last_rx_unix_ms,omitempty"` LastTxUnixMilli int64 `json:"last_tx_unix_ms,omitempty"` @@ -281,7 +281,7 @@ func (s *session) publishLinkState(reason, err string) { PeerID: s.peerID, LocalSID: s.localSID, PeerSID: s.peerSID, - RemoteID: s.remoteNode, + PeerNode: s.peerNode, PeerProto: s.peerProto, LastRxUnixMilli: unixMilli(s.lastRxAt), LastTxUnixMilli: unixMilli(s.lastTxAt), @@ -309,7 +309,7 @@ func (s *session) handleLinkDown(reason, err string) { pendingReason = reasonLinkDown } s.link = linkDown - s.remoteNode = "" + s.peerNode = "" s.peerSID = "" s.peerProto = 0 s.helloSeen = 
false @@ -386,7 +386,7 @@ func (s *session) notePeerIdentity(node, sid string, proto int) string { reason = reasonPeerSessionChanged } if node != "" { - s.remoteNode = node + s.peerNode = node } if sid != "" { s.peerSID = sid From a46446f414a30be8f764d5c1aae2a5ee5fae9c17 Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 2 Apr 2026 16:10:26 +0000 Subject: [PATCH 10/65] style: gofmt --- services/fabric/fabric_test.go | 8 ++++---- services/fabric/session.go | 36 +++++++++++++++++----------------- 2 files changed, 22 insertions(+), 22 deletions(-) diff --git a/services/fabric/fabric_test.go b/services/fabric/fabric_test.go index 2feab90..0d45dd2 100644 --- a/services/fabric/fabric_test.go +++ b/services/fabric/fabric_test.go @@ -807,10 +807,10 @@ func TestDrainExportsWaitsForStartupHoldoff(t *testing.T) { ) s := session{ - link: linkUp, - exportsEnabled: true, - exportSubs: []*bus.Subscription{sub}, - exportReadyAt: time.Now().Add(time.Second), + link: linkUp, + exportsEnabled: true, + exportSubs: []*bus.Subscription{sub}, + exportReadyAt: time.Now().Add(time.Second), } pub.Publish(msg) diff --git a/services/fabric/session.go b/services/fabric/session.go index 1d29297..2ff9c5d 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -46,7 +46,7 @@ const ( // the 115200-baud link capacity. 
exportMaxPerTick = 1 exportWaitFallback = 15 * time.Second - errPayloadMarshal = "payload_marshal_failed" + errPayloadMarshal = "payload_marshal_failed" ) // ---- link reasons and error strings ---- @@ -90,23 +90,23 @@ type session struct { tr Transport conn *bus.Connection - link linkState - peerNode string - peerSID string - peerProto int - helloSeen bool - lastRxAt time.Time - lastTxAt time.Time - lastPongAt time.Time - exportReadyAt time.Time + link linkState + peerNode string + peerSID string + peerProto int + helloSeen bool + lastRxAt time.Time + lastTxAt time.Time + lastPongAt time.Time + exportReadyAt time.Time exportWaitUntil time.Time exportsEnabled bool - exportSubs []*bus.Subscription - exportCallSubs []*bus.Subscription - inboundCalls []*inboundCall - outboundCalls []*outboundCall - nextOutboundID uint64 + exportSubs []*bus.Subscription + exportCallSubs []*bus.Subscription + inboundCalls []*inboundCall + outboundCalls []*outboundCall + nextOutboundID uint64 } func (s *session) log(msg string) { @@ -147,8 +147,8 @@ type linkStatePayload struct { LastRxUnixMilli int64 `json:"last_rx_unix_ms,omitempty"` LastTxUnixMilli int64 `json:"last_tx_unix_ms,omitempty"` LastPongUnixMilli int64 `json:"last_pong_unix_ms,omitempty"` - InboundCalls int `json:"inbound_calls"` - OutboundCalls int `json:"outbound_calls"` + InboundCalls int `json:"inbound_calls"` + OutboundCalls int `json:"outbound_calls"` Reason string `json:"reason,omitempty"` Err string `json:"err,omitempty"` } @@ -287,7 +287,7 @@ func (s *session) publishLinkState(reason, err string) { LastTxUnixMilli: unixMilli(s.lastTxAt), LastPongUnixMilli: unixMilli(s.lastPongAt), InboundCalls: len(s.inboundCalls), - OutboundCalls: len(s.outboundCalls), + OutboundCalls: len(s.outboundCalls), Reason: reason, Err: err, }, From c17e7d17c58b90aa9d9910bb2ac1b6a6c75a576f Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 2 Apr 2026 16:23:36 +0000 Subject: [PATCH 11/65] refactor: centralise dispatch unmarshal, remove 
helloSeen, remove logWaiting on start MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Unmarshal incoming frames once in dispatch via wireMsg union struct instead of double-parsing (wireType + per-handler unmarshal). markRx() is now called once in dispatch rather than in every handler. Remove helloSeen flag — logWaiting now checks peerSID != "" which is already managed by notePeerIdentity and cleared on link down. Remove redundant logWaiting() call at session start — the waitTick prints the same message after 2 seconds. --- services/fabric/session.go | 100 ++++++++++--------------------------- services/fabric/wire.go | 21 ++++++++ 2 files changed, 47 insertions(+), 74 deletions(-) diff --git a/services/fabric/session.go b/services/fabric/session.go index 2ff9c5d..ae9813f 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -94,7 +94,6 @@ type session struct { peerNode string peerSID string peerProto int - helloSeen bool lastRxAt time.Time lastTxAt time.Time lastPongAt time.Time @@ -200,7 +199,6 @@ func (s *session) run(ctx context.Context) { exportTick := time.NewTicker(50 * time.Millisecond) defer exportTick.Stop() - s.logWaiting() s.publishLinkState("", "") s.log("run start") @@ -312,7 +310,6 @@ func (s *session) handleLinkDown(reason, err string) { s.peerNode = "" s.peerSID = "" s.peerProto = 0 - s.helloSeen = false s.exportReadyAt = time.Time{} s.exportWaitUntil = time.Time{} s.exportsEnabled = false @@ -347,29 +344,34 @@ func (s *session) promoteLink(reason string) { // ---- dispatch ---- func (s *session) dispatch(line []byte) bool { - msgType := wireType(line) - switch msgType { + var msg wireMsg + if err := json.Unmarshal(line, &msg); err != nil { + s.logKV("malformed frame dropped", "err", err.Error()) + return false + } + s.markRx() + switch msg.T { case msgHello: - return s.onHello(line) + return s.onHello(&msg) case msgHelloAck: - return s.onHelloAck(line) + return s.onHelloAck(&msg) case 
msgPing: - return s.onPing(line) + return s.onPing(&msg) case msgPong: - return s.onPong(line) + return s.onPong(&msg) case msgPub: - return s.onPub(line) + return s.onPub(&msg) case msgUnretain: - return s.onUnretain(line) + return s.onUnretain(&msg) case msgCall: - return s.onCall(line) + return s.onCall(&msg) case msgReply: - return s.onReply(line) + return s.onReply(&msg) default: - if msgType == "" { + if msg.T == "" { s.log("invalid frame dropped") } else { - s.logKV("unknown frame type dropped", "type", msgType) + s.logKV("unknown frame type dropped", "type", msg.T) } return false } @@ -419,13 +421,7 @@ func hasWirePrefix(topic, prefix []string) bool { return true } -func (s *session) onHello(line []byte) bool { - var msg wireHello - if err := json.Unmarshal(line, &msg); err != nil { - s.logKV("malformed hello dropped", "err", err.Error()) - return false - } - s.markRx() +func (s *session) onHello(msg *wireMsg) bool { if msg.Peer != "" && msg.Peer != s.nodeID { s.log("hello dropped: wrong peer") return false @@ -435,7 +431,6 @@ func (s *session) onHello(line []byte) bool { return false } reason := s.notePeerIdentity(msg.Node, msg.SID, msg.Proto) - s.helloSeen = true s.logKV("hello rx", "peer_sid", msg.SID) if !s.writeLine(marshal(wireHelloAck{ @@ -464,36 +459,23 @@ func (s *session) enableExports(reason string) { s.logKV("export replay armed", "reason", reason) } -func (s *session) onHelloAck(line []byte) bool { - var msg wireHelloAck - if err := json.Unmarshal(line, &msg); err != nil { - s.logKV("malformed hello_ack dropped", "err", err.Error()) - return false - } +func (s *session) onHelloAck(msg *wireMsg) bool { if s.isSelfControlFrame(msg.Node, msg.SID) { s.log("echoed hello_ack ignored") return true } - s.markRx() if !msg.OK { s.log("hello_ack rejected by peer") s.handleLinkDown(reasonHelloRejected, "") return true } reason := s.notePeerIdentity(msg.Node, msg.SID, msg.Proto) - s.helloSeen = true s.logKV("hello_ack rx", "peer_sid", msg.SID) 
s.promoteLink(reason) return true } -func (s *session) onPing(line []byte) bool { - var msg wirePing - if err := json.Unmarshal(line, &msg); err != nil { - s.logKV("malformed ping dropped", "err", err.Error()) - return false - } - s.markRx() +func (s *session) onPing(msg *wireMsg) bool { if s.link != linkUp { s.log("ping dropped: link not up") return true @@ -516,17 +498,11 @@ func (s *session) onPing(line []byte) bool { return true } -func (s *session) onPong(line []byte) bool { - var msg wirePong - if err := json.Unmarshal(line, &msg); err != nil { - s.logKV("malformed pong dropped", "err", err.Error()) - return false - } +func (s *session) onPong(msg *wireMsg) bool { if s.isSelfControlFrame("", msg.SID) { s.log("echoed pong ignored") return true } - s.markRx() s.lastPongAt = s.lastRxAt reason := s.notePeerIdentity("", msg.SID, 0) if reason != "" { @@ -541,13 +517,7 @@ func (s *session) onPong(line []byte) bool { return true } -func (s *session) onPub(line []byte) bool { - var msg wirePub - if err := json.Unmarshal(line, &msg); err != nil { - s.logKV("malformed pub dropped", "err", err.Error()) - return false - } - s.markRx() +func (s *session) onPub(msg *wireMsg) bool { if s.link != linkUp { s.log("pub dropped before handshake") return true @@ -566,13 +536,7 @@ func (s *session) onPub(line []byte) bool { return true } -func (s *session) onUnretain(line []byte) bool { - var msg wireUnretain - if err := json.Unmarshal(line, &msg); err != nil { - s.logKV("malformed unretain dropped", "err", err.Error()) - return false - } - s.markRx() +func (s *session) onUnretain(msg *wireMsg) bool { if s.link != linkUp { s.log("unretain dropped before handshake") return true @@ -587,13 +551,7 @@ func (s *session) onUnretain(line []byte) bool { return true } -func (s *session) onCall(line []byte) bool { - var msg wireCall - if err := json.Unmarshal(line, &msg); err != nil { - s.logKV("malformed call dropped", "err", err.Error()) - return false - } - s.markRx() +func (s *session) 
onCall(msg *wireMsg) bool { if s.link != linkUp { s.log("call dropped before handshake") return true @@ -620,13 +578,7 @@ func (s *session) onCall(line []byte) bool { return true } -func (s *session) onReply(line []byte) bool { - var msg wireReply - if err := json.Unmarshal(line, &msg); err != nil { - s.logKV("malformed reply dropped", "err", err.Error()) - return false - } - s.markRx() +func (s *session) onReply(msg *wireMsg) bool { s.enableExports(exportTriggerReply) for i, call := range s.outboundCalls { @@ -929,7 +881,7 @@ func (s *session) writeLine(data []byte) bool { } func (s *session) logWaiting() { - if s.helloSeen { + if s.peerSID != "" { return } s.log("waiting for connection start") diff --git a/services/fabric/wire.go b/services/fabric/wire.go index 908d8b1..aab8ea8 100644 --- a/services/fabric/wire.go +++ b/services/fabric/wire.go @@ -83,6 +83,27 @@ type wireReply struct { Err string `json:"err,omitempty"` } +// wireMsg is a union struct for single-pass unmarshal in dispatch. +// Fields are the superset of all message types. Only the fields +// relevant to the T value are populated; the rest are zero. +type wireMsg struct { + T string `json:"t"` + Node string `json:"node,omitempty"` + Peer string `json:"peer,omitempty"` + SID string `json:"sid,omitempty"` + Proto int `json:"proto,omitempty"` + OK bool `json:"ok,omitempty"` + Caps *wireCaps `json:"caps,omitempty"` + TS int64 `json:"ts,omitempty"` + Topic []string `json:"topic,omitempty"` + Payload json.RawMessage `json:"payload,omitempty"` + Retain bool `json:"retain,omitempty"` + ID string `json:"id,omitempty"` + Corr string `json:"corr,omitempty"` + TimeoutMs int `json:"timeout_ms,omitempty"` + Err string `json:"err,omitempty"` +} + // ---- codec helpers ---- // marshal returns compact JSON with a trailing newline. 
From ba2d3e837728eaa243eb1bbe1c9cdadf7498de25 Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 2 Apr 2026 16:38:38 +0000 Subject: [PATCH 12/65] refactor: centralise link-up guard in dispatch MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move the link-up check from individual handlers (onPing, onPub, onUnretain, onCall) into dispatch. Only hello and hello_ack are accepted before the link is established — they are the handshake. All other message types (ping, pong, pub, unretain, call, reply) are dropped with a single log line if the link is not up. This removes scattered precondition checks from 4 handlers and ensures pong/reply are also rejected before handshake, which they previously were not. --- services/fabric/session.go | 39 ++++++++++++++++++-------------------- 1 file changed, 18 insertions(+), 21 deletions(-) diff --git a/services/fabric/session.go b/services/fabric/session.go index ae9813f..2fc0013 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -350,6 +350,23 @@ func (s *session) dispatch(line []byte) bool { return false } s.markRx() + + // Only hello and hello_ack are accepted before the link is up — + // they're the handshake that establishes the session. Everything + // else (ping, pong, pub, call, reply, unretain) requires an + // established link. Stale traffic from a previous session or + // messages from an unrecognised peer are dropped here rather + // than scattered across individual handlers. + switch msg.T { + case msgHello, msgHelloAck: + // Handshake messages are always accepted. 
+ default: + if s.link != linkUp { + s.logKV("dropped before handshake", "type", msg.T) + return true + } + } + switch msg.T { case msgHello: return s.onHello(&msg) @@ -368,11 +385,7 @@ func (s *session) dispatch(line []byte) bool { case msgReply: return s.onReply(&msg) default: - if msg.T == "" { - s.log("invalid frame dropped") - } else { - s.logKV("unknown frame type dropped", "type", msg.T) - } + s.logKV("unknown message type dropped", "type", msg.T) return false } } @@ -476,10 +489,6 @@ func (s *session) onHelloAck(msg *wireMsg) bool { } func (s *session) onPing(msg *wireMsg) bool { - if s.link != linkUp { - s.log("ping dropped: link not up") - return true - } reason := s.notePeerIdentity("", msg.SID, 0) if reason != "" { s.logKV("peer session changed", "reason", reason) @@ -518,10 +527,6 @@ func (s *session) onPong(msg *wireMsg) bool { } func (s *session) onPub(msg *wireMsg) bool { - if s.link != linkUp { - s.log("pub dropped before handshake") - return true - } t := importPublishTopic(msg.Topic) if t == nil { if hasWirePrefix(msg.Topic, []string{"state"}) { @@ -537,10 +542,6 @@ func (s *session) onPub(msg *wireMsg) bool { } func (s *session) onUnretain(msg *wireMsg) bool { - if s.link != linkUp { - s.log("unretain dropped before handshake") - return true - } t := importPublishTopic(msg.Topic) if t == nil { s.log("incoming unretain dropped: no_route") @@ -552,10 +553,6 @@ func (s *session) onUnretain(msg *wireMsg) bool { } func (s *session) onCall(msg *wireMsg) bool { - if s.link != linkUp { - s.log("call dropped before handshake") - return true - } t := importCallTopic(msg.Topic) if t == nil { s.log("incoming call dropped: no_route") From 85a92606e90974a60747c9be8fc7936f1114392f Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 2 Apr 2026 16:53:14 +0000 Subject: [PATCH 13/65] refactor: centralise validation in validateInbound, add status consts MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Add validateInbound() to 
gate all non-handshake messages on link state and session ID match. Removes SID-change teardown logic from onPing and onPong — mismatched SIDs are now dropped at the dispatch level. Add statusReady/statusOpening/statusDown constants for link state payload strings. Fix test helper unlockExports to use the CM5's SID (matching peerSID) rather than the MCU's ack SID. --- services/fabric/fabric_test.go | 31 +++++++++------- services/fabric/session.go | 68 ++++++++++++++++------------------ 2 files changed, 48 insertions(+), 51 deletions(-) diff --git a/services/fabric/fabric_test.go b/services/fabric/fabric_test.go index 0d45dd2..9991918 100644 --- a/services/fabric/fabric_test.go +++ b/services/fabric/fabric_test.go @@ -62,10 +62,12 @@ func sendMsg(t *testing.T, tr Transport, v any) { } } +const testCM5SID = "s1" + func bringUp(t *testing.T, cm5 Transport) wireHelloAck { t.Helper() sendMsg(t, cm5, wireHello{ - T: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "s1", Proto: protoVersion, + T: "hello", Node: "cm5-local", Peer: "mcu-1", SID: testCM5SID, Proto: protoVersion, }) ack := readMsg[wireHelloAck](t, cm5) if !ack.OK || ack.Node != "mcu-1" || ack.SID == "" || ack.Proto != protoVersion { @@ -75,9 +77,9 @@ func bringUp(t *testing.T, cm5 Transport) wireHelloAck { return ack } -func unlockExports(t *testing.T, cm5 Transport, sid string) { +func unlockExports(t *testing.T, cm5 Transport) { t.Helper() - sendMsg(t, cm5, wirePing{T: "ping", TS: 77, SID: sid}) + sendMsg(t, cm5, wirePing{T: "ping", TS: 77, SID: testCM5SID}) pong := readMsg[wirePong](t, cm5) if pong.T != "pong" { t.Fatalf("expected pong, got %q", pong.T) @@ -709,8 +711,8 @@ func TestPubExport(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local") - ack := bringUp(t, cm5) - unlockExports(t, cm5, ack.SID) + bringUp(t, cm5) + unlockExports(t, cm5) publishConn.Publish(publishConn.NewMessage( bus.T("hal", "cap", "env", 
"temperature", "core", "value"), @@ -739,8 +741,8 @@ func TestUnretainExport(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local") - ack := bringUp(t, cm5) - unlockExports(t, cm5, ack.SID) + bringUp(t, cm5) + unlockExports(t, cm5) // Publish retained value first. publishConn.Publish(publishConn.NewMessage( @@ -1100,8 +1102,8 @@ func TestCallExport(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local") - ack := bringUp(t, cm5) - unlockExports(t, cm5, ack.SID) + bringUp(t, cm5) + unlockExports(t, cm5) type result struct { msg *bus.Message @@ -1171,8 +1173,8 @@ func TestCallExportOnlyConfiguredRule(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local") - ack := bringUp(t, cm5) - unlockExports(t, cm5, ack.SID) + bringUp(t, cm5) + unlockExports(t, cm5) // Use an unconfigured topic — only fabric/out/rpc/hal/dump is routed. reqCtx, reqCancel := context.WithTimeout(context.Background(), 250*time.Millisecond) @@ -1423,8 +1425,8 @@ func TestCallExportPeerReset(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local") - ack := bringUp(t, cm5) - unlockExports(t, cm5, ack.SID) + bringUp(t, cm5) + unlockExports(t, cm5) type result struct { msg *bus.Message @@ -1482,7 +1484,7 @@ func TestEchoedHelloAckIgnoredDuringOutgoingCall(t *testing.T) { defer cancel() go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local") ack := bringUp(t, cm5) - unlockExports(t, cm5, ack.SID) + unlockExports(t, cm5) type result struct { msg *bus.Message @@ -1503,6 +1505,7 @@ func TestEchoedHelloAckIgnoredDuringOutgoingCall(t *testing.T) { t.Fatalf("expected call, got %q", call.T) } + // Send an echoed hello_ack (our own SID) — should be ignored. 
sendMsg(t, cm5, wireHelloAck{ T: "hello_ack", Node: "mcu-1", SID: ack.SID, Proto: protoVersion, OK: true, }) diff --git a/services/fabric/session.go b/services/fabric/session.go index 2fc0013..c071586 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -20,6 +20,14 @@ const ( linkUp ) +// ---- link status strings (published in link state payload) ---- + +const ( + statusReady = "ready" + statusOpening = "opening" + statusDown = "down" +) + // ---- timeouts (local policy) ---- // // Timing relationships: @@ -256,9 +264,9 @@ func unixMilli(t time.Time) int64 { func (s *session) currentStatus() string { if s.link == linkUp { - return "ready" + return statusReady } - return "opening" + return statusOpening } func (s *session) publishLinkState(reason, err string) { @@ -267,7 +275,7 @@ func (s *session) publishLinkState(reason, err string) { } status := s.currentStatus() if s.link != linkUp && (reason != "" || err != "") { - status = "down" + status = statusDown } s.conn.Publish(s.conn.NewMessage( bus.T("state", "fabric", "link", s.linkID), @@ -343,6 +351,24 @@ func (s *session) promoteLink(reason string) { // ---- dispatch ---- +// validateInbound checks whether a message should be processed. +// Handshake messages (hello, hello_ack) are always accepted. +// All others require an established link and a matching session ID. 
+func (s *session) validateInbound(msg *wireMsg) bool { + if msg.T == msgHello || msg.T == msgHelloAck { + return true + } + if s.link != linkUp { + s.logKV("dropped before handshake", "type", msg.T) + return false + } + if s.peerSID != "" && msg.SID != "" && msg.SID != s.peerSID { + s.logKV("dropped: wrong session", "type", msg.T) + return false + } + return true +} + func (s *session) dispatch(line []byte) bool { var msg wireMsg if err := json.Unmarshal(line, &msg); err != nil { @@ -350,23 +376,9 @@ func (s *session) dispatch(line []byte) bool { return false } s.markRx() - - // Only hello and hello_ack are accepted before the link is up — - // they're the handshake that establishes the session. Everything - // else (ping, pong, pub, call, reply, unretain) requires an - // established link. Stale traffic from a previous session or - // messages from an unrecognised peer are dropped here rather - // than scattered across individual handlers. - switch msg.T { - case msgHello, msgHelloAck: - // Handshake messages are always accepted. 
- default: - if s.link != linkUp { - s.logKV("dropped before handshake", "type", msg.T) - return true - } + if !s.validateInbound(&msg) { + return true } - switch msg.T { case msgHello: return s.onHello(&msg) @@ -489,21 +501,12 @@ func (s *session) onHelloAck(msg *wireMsg) bool { } func (s *session) onPing(msg *wireMsg) bool { - reason := s.notePeerIdentity("", msg.SID, 0) - if reason != "" { - s.logKV("peer session changed", "reason", reason) - s.teardownExports() - s.teardownInbound() - s.teardownOutbound(reason) - s.exportsEnabled = false - } s.enableExports(exportTriggerPing) s.logKV("ping rx", "peer_sid", msg.SID) if !s.writeLine(marshal(wirePong{T: msgPong, TS: msg.TS, SID: s.localSID})) { return true } s.log("pong tx") - s.publishLinkState(reason, "") return true } @@ -513,16 +516,7 @@ func (s *session) onPong(msg *wireMsg) bool { return true } s.lastPongAt = s.lastRxAt - reason := s.notePeerIdentity("", msg.SID, 0) - if reason != "" { - s.logKV("peer session changed", "reason", reason) - s.teardownExports() - s.teardownInbound() - s.teardownOutbound(reason) - s.exportsEnabled = false - } s.enableExports(exportTriggerPong) - s.publishLinkState(reason, "") return true } From adc4ac3fd75b5aa65a031120d262c898b297a3aa Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 2 Apr 2026 17:04:51 +0000 Subject: [PATCH 14/65] refactor: enable exports on link-up, remove deferred arming MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Exports are now enabled immediately in promoteLink when the link transitions to up, instead of being deferred and re-triggered by every incoming message handler. Removes enableExports(), exportWaitFallback, exportWaitUntil, and all exportTrigger* constants. The fallback timer and per-handler arming were unnecessary complexity — exports should start when the session is established, not when arbitrary messages arrive. 
--- services/fabric/session.go | 76 ++++++++++---------------------------- 1 file changed, 19 insertions(+), 57 deletions(-) diff --git a/services/fabric/session.go b/services/fabric/session.go index c071586..7cf191d 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -31,12 +31,11 @@ const ( // ---- timeouts (local policy) ---- // // Timing relationships: -// staleTimeout (45s) > exportWaitFallback (15s) > callTimeoutDef (5s) +// staleTimeout (45s) > callTimeoutDef (5s) // // The CM5 sends pings every 15s of TX inactivity. The MCU marks the -// peer stale after 45s without any RX, giving a 30s margin. The -// exportWaitFallback arms exports if no peer traffic arrives within -// 15s of link-up (normally armed earlier by peer_pub). +// peer stale after 45s without any RX, giving a 30s margin. +// Exports are enabled immediately on link-up (after exportStartHoldoff). const ( staleTimeout = 45 * time.Second @@ -52,9 +51,8 @@ const ( // exportMaxPerTick caps the total export messages sent per drain // cycle across all subscriptions, keeping UART throughput within // the 115200-baud link capacity. - exportMaxPerTick = 1 - exportWaitFallback = 15 * time.Second - errPayloadMarshal = "payload_marshal_failed" + exportMaxPerTick = 1 + errPayloadMarshal = "payload_marshal_failed" ) // ---- link reasons and error strings ---- @@ -71,20 +69,6 @@ const ( reasonTimeout = "timeout" ) -// ---- export arm reasons ---- - -// ---- export trigger reasons (why exports were enabled) ---- - -const ( - exportTriggerPub = "peer_pub" - exportTriggerPing = "peer_ping" - exportTriggerPong = "peer_pong" - exportTriggerCall = "peer_call" - exportTriggerReply = "peer_reply" - exportTriggerUnretain = "peer_unretain" - exportTriggerFallback = "fallback" -) - // session manages the fabric link state machine over a Transport. // // All bus access happens in the main loop goroutine only. 
TinyGo's @@ -98,16 +82,15 @@ type session struct { tr Transport conn *bus.Connection - link linkState - peerNode string - peerSID string - peerProto int - lastRxAt time.Time - lastTxAt time.Time - lastPongAt time.Time - exportReadyAt time.Time - exportWaitUntil time.Time - exportsEnabled bool + link linkState + peerNode string + peerSID string + peerProto int + lastRxAt time.Time + lastTxAt time.Time + lastPongAt time.Time + exportReadyAt time.Time + exportsEnabled bool exportSubs []*bus.Subscription exportCallSubs []*bus.Subscription @@ -319,7 +302,6 @@ func (s *session) handleLinkDown(reason, err string) { s.peerSID = "" s.peerProto = 0 s.exportReadyAt = time.Time{} - s.exportWaitUntil = time.Time{} s.exportsEnabled = false s.teardownExports() s.teardownInbound() @@ -343,9 +325,10 @@ func (s *session) promoteLink(reason string) { s.teardownOutbound(reason) } s.link = linkUp - s.exportReadyAt = time.Time{} - s.exportWaitUntil = time.Now().Add(exportWaitFallback) - s.exportsEnabled = false + s.setupExports() + s.exportsEnabled = true + s.exportReadyAt = time.Now().Add(exportStartHoldoff) + s.log("exports enabled") s.publishLinkState(reason, "") } @@ -473,17 +456,6 @@ func (s *session) onHello(msg *wireMsg) bool { return true } -func (s *session) enableExports(reason string) { - if s.link != linkUp || s.exportsEnabled { - return - } - s.setupExports() - s.exportReadyAt = time.Now().Add(exportStartHoldoff) - s.exportWaitUntil = time.Time{} - s.exportsEnabled = true - s.logKV("export replay armed", "reason", reason) -} - func (s *session) onHelloAck(msg *wireMsg) bool { if s.isSelfControlFrame(msg.Node, msg.SID) { s.log("echoed hello_ack ignored") @@ -501,7 +473,6 @@ func (s *session) onHelloAck(msg *wireMsg) bool { } func (s *session) onPing(msg *wireMsg) bool { - s.enableExports(exportTriggerPing) s.logKV("ping rx", "peer_sid", msg.SID) if !s.writeLine(marshal(wirePong{T: msgPong, TS: msg.TS, SID: s.localSID})) { return true @@ -516,7 +487,6 @@ func (s *session) 
onPong(msg *wireMsg) bool { return true } s.lastPongAt = s.lastRxAt - s.enableExports(exportTriggerPong) return true } @@ -530,7 +500,6 @@ func (s *session) onPub(msg *wireMsg) bool { s.log("incoming pub dropped: no_route") return true } - s.enableExports(exportTriggerPub) s.conn.Publish(s.conn.NewMessage(t, msg.Payload, msg.Retain)) return true } @@ -541,7 +510,6 @@ func (s *session) onUnretain(msg *wireMsg) bool { s.log("incoming unretain dropped: no_route") return true } - s.enableExports(exportTriggerUnretain) s.conn.Publish(s.conn.NewMessage(t, nil, true)) return true } @@ -553,7 +521,6 @@ func (s *session) onCall(msg *wireMsg) bool { s.writeLine(marshal(wireReply{T: msgReply, Corr: msg.ID, OK: false, Err: reasonNoRoute})) return true } - s.enableExports(exportTriggerCall) timeout := callTimeoutDef if msg.TimeoutMs > 0 { @@ -570,7 +537,6 @@ func (s *session) onCall(msg *wireMsg) bool { } func (s *session) onReply(msg *wireMsg) bool { - s.enableExports(exportTriggerReply) for i, call := range s.outboundCalls { if call.id != msg.Corr { @@ -669,11 +635,7 @@ func (s *session) drainExports() { return } if !s.exportsEnabled { - if !s.exportWaitUntil.IsZero() && !time.Now().Before(s.exportWaitUntil) { - s.enableExports(exportTriggerFallback) - } else { - return - } + return } if !s.exportReadyAt.IsZero() && time.Now().Before(s.exportReadyAt) { return From 48b8512d4255e02e67906e19d5c5fe8aa08c1292 Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 2 Apr 2026 17:17:33 +0000 Subject: [PATCH 15/65] refactor: make handlers void, always reset stale timer on receive MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Handlers no longer return bool — dispatch always resets the stale timer after any received frame (even malformed ones indicate the peer is alive). This removes the meaningless return true/false from all 8 handlers and simplifies the dispatch/handler contract. 
--- services/fabric/session.go | 80 +++++++++++++++++--------------------- 1 file changed, 35 insertions(+), 45 deletions(-) diff --git a/services/fabric/session.go b/services/fabric/session.go index 7cf191d..123b2f0 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -206,9 +206,8 @@ func (s *session) run(ctx context.Context) { s.handleLinkDown(reasonTransportDown, res.err.Error()) return } - if s.dispatch(res.line) { - resetTimer(stale, staleTimeout) - } + s.dispatch(res.line) + resetTimer(stale, staleTimeout) case <-exportTick.C: s.drainExports() @@ -352,36 +351,35 @@ func (s *session) validateInbound(msg *wireMsg) bool { return true } -func (s *session) dispatch(line []byte) bool { +func (s *session) dispatch(line []byte) { var msg wireMsg if err := json.Unmarshal(line, &msg); err != nil { s.logKV("malformed frame dropped", "err", err.Error()) - return false + return } s.markRx() if !s.validateInbound(&msg) { - return true + return } switch msg.T { case msgHello: - return s.onHello(&msg) + s.onHello(&msg) case msgHelloAck: - return s.onHelloAck(&msg) + s.onHelloAck(&msg) case msgPing: - return s.onPing(&msg) + s.onPing(&msg) case msgPong: - return s.onPong(&msg) + s.onPong(&msg) case msgPub: - return s.onPub(&msg) + s.onPub(&msg) case msgUnretain: - return s.onUnretain(&msg) + s.onUnretain(&msg) case msgCall: - return s.onCall(&msg) + s.onCall(&msg) case msgReply: - return s.onReply(&msg) + s.onReply(&msg) default: s.logKV("unknown message type dropped", "type", msg.T) - return false } } @@ -429,14 +427,14 @@ func hasWirePrefix(topic, prefix []string) bool { return true } -func (s *session) onHello(msg *wireMsg) bool { +func (s *session) onHello(msg *wireMsg) { if msg.Peer != "" && msg.Peer != s.nodeID { s.log("hello dropped: wrong peer") - return false + return } if s.peerID != "" && msg.Node != s.peerID { s.log("hello dropped: wrong node") - return false + return } reason := s.notePeerIdentity(msg.Node, msg.SID, msg.Proto) 
s.logKV("hello rx", "peer_sid", msg.SID) @@ -448,78 +446,72 @@ func (s *session) onHello(msg *wireMsg) bool { Proto: protoVersion, OK: true, })) { - return true + return } s.log("hello_ack tx") time.Sleep(postHelloAckSettle) s.promoteLink(reason) - return true } -func (s *session) onHelloAck(msg *wireMsg) bool { +func (s *session) onHelloAck(msg *wireMsg) { if s.isSelfControlFrame(msg.Node, msg.SID) { s.log("echoed hello_ack ignored") - return true + return } if !msg.OK { s.log("hello_ack rejected by peer") s.handleLinkDown(reasonHelloRejected, "") - return true + return } reason := s.notePeerIdentity(msg.Node, msg.SID, msg.Proto) s.logKV("hello_ack rx", "peer_sid", msg.SID) s.promoteLink(reason) - return true } -func (s *session) onPing(msg *wireMsg) bool { +func (s *session) onPing(msg *wireMsg) { s.logKV("ping rx", "peer_sid", msg.SID) if !s.writeLine(marshal(wirePong{T: msgPong, TS: msg.TS, SID: s.localSID})) { - return true + return } s.log("pong tx") - return true } -func (s *session) onPong(msg *wireMsg) bool { +func (s *session) onPong(msg *wireMsg) { if s.isSelfControlFrame("", msg.SID) { s.log("echoed pong ignored") - return true + return } s.lastPongAt = s.lastRxAt - return true } -func (s *session) onPub(msg *wireMsg) bool { +func (s *session) onPub(msg *wireMsg) { t := importPublishTopic(msg.Topic) if t == nil { if hasWirePrefix(msg.Topic, []string{"state"}) { s.log("echoed state pub ignored") - return true + return } s.log("incoming pub dropped: no_route") - return true + return } s.conn.Publish(s.conn.NewMessage(t, msg.Payload, msg.Retain)) - return true } -func (s *session) onUnretain(msg *wireMsg) bool { +func (s *session) onUnretain(msg *wireMsg) { t := importPublishTopic(msg.Topic) if t == nil { s.log("incoming unretain dropped: no_route") - return true + return } s.conn.Publish(s.conn.NewMessage(t, nil, true)) - return true } -func (s *session) onCall(msg *wireMsg) bool { +func (s *session) onCall(msg *wireMsg) { t := importCallTopic(msg.Topic) 
if t == nil { s.log("incoming call dropped: no_route") s.writeLine(marshal(wireReply{T: msgReply, Corr: msg.ID, OK: false, Err: reasonNoRoute})) - return true + return } timeout := callTimeoutDef @@ -533,10 +525,9 @@ func (s *session) onCall(msg *wireMsg) bool { sub: sub, deadline: time.Now().Add(timeout), }) - return true } -func (s *session) onReply(msg *wireMsg) bool { +func (s *session) onReply(msg *wireMsg) { for i, call := range s.outboundCalls { if call.id != msg.Corr { @@ -544,18 +535,17 @@ func (s *session) onReply(msg *wireMsg) bool { } s.outboundCalls = append(s.outboundCalls[:i], s.outboundCalls[i+1:]...) if !call.req.CanReply() { - return true + return } if !msg.OK { s.conn.Reply(call.req, types.ErrorReply{OK: false, Error: msg.Err}, false) - return true + return } s.conn.Reply(call.req, decodePayload(msg.Payload), false) - return true + return } s.logKV("unexpected reply dropped", "corr", msg.Corr) - return true } func checkBusError(payload any) string { From a9dc3e7692e25c05bd83723078d776685092171e Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 2 Apr 2026 17:23:35 +0000 Subject: [PATCH 16/65] refactor: rename t to localTopic, add status consts Rename single-letter variable t to localTopic in onPub, onUnretain, and onCall for clarity. Add statusReady/statusOpening/statusDown constants for link state payload strings. 
--- services/fabric/session.go | 19 +++++++++---------- 1 file changed, 9 insertions(+), 10 deletions(-) diff --git a/services/fabric/session.go b/services/fabric/session.go index 123b2f0..e7a5c70 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -485,8 +485,8 @@ func (s *session) onPong(msg *wireMsg) { } func (s *session) onPub(msg *wireMsg) { - t := importPublishTopic(msg.Topic) - if t == nil { + localTopic := importPublishTopic(msg.Topic) + if localTopic == nil { if hasWirePrefix(msg.Topic, []string{"state"}) { s.log("echoed state pub ignored") return @@ -494,21 +494,21 @@ func (s *session) onPub(msg *wireMsg) { s.log("incoming pub dropped: no_route") return } - s.conn.Publish(s.conn.NewMessage(t, msg.Payload, msg.Retain)) + s.conn.Publish(s.conn.NewMessage(localTopic, msg.Payload, msg.Retain)) } func (s *session) onUnretain(msg *wireMsg) { - t := importPublishTopic(msg.Topic) - if t == nil { + localTopic := importPublishTopic(msg.Topic) + if localTopic == nil { s.log("incoming unretain dropped: no_route") return } - s.conn.Publish(s.conn.NewMessage(t, nil, true)) + s.conn.Publish(s.conn.NewMessage(localTopic, nil, true)) } func (s *session) onCall(msg *wireMsg) { - t := importCallTopic(msg.Topic) - if t == nil { + localTopic := importCallTopic(msg.Topic) + if localTopic == nil { s.log("incoming call dropped: no_route") s.writeLine(marshal(wireReply{T: msgReply, Corr: msg.ID, OK: false, Err: reasonNoRoute})) return @@ -518,7 +518,7 @@ func (s *session) onCall(msg *wireMsg) { if msg.TimeoutMs > 0 { timeout = time.Duration(msg.TimeoutMs) * time.Millisecond } - busMsg := s.conn.NewMessage(t, msg.Payload, false) + busMsg := s.conn.NewMessage(localTopic, msg.Payload, false) sub := s.conn.Request(busMsg) s.inboundCalls = append(s.inboundCalls, &inboundCall{ id: msg.ID, @@ -528,7 +528,6 @@ func (s *session) onCall(msg *wireMsg) { } func (s *session) onReply(msg *wireMsg) { - for i, call := range s.outboundCalls { if call.id != msg.Corr { 
continue From f9234d40080e1b2f9e6bf2389e0abb24fc5ff843 Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 2 Apr 2026 17:30:20 +0000 Subject: [PATCH 17/65] refactor: inline drainOutbound, extract tick const, optimise checkBusError Inline drainOutboundNew and drainOutboundPending into drainOutbound with comments separating the two phases (forward new calls, expire pending calls). Extract 50ms magic number into exportTickInterval const. Optimise checkBusError: try types.ErrorReply type assertion first (zero alloc) before falling back to JSON marshal/unmarshal for ad-hoc error structs. Add comment explaining why writeLine returns true for oversized frames (transport is still healthy, session continues). --- services/fabric/fabric_test.go | 6 +- services/fabric/session.go | 126 ++++++++++++++++----------------- 2 files changed, 65 insertions(+), 67 deletions(-) diff --git a/services/fabric/fabric_test.go b/services/fabric/fabric_test.go index 9991918..179c7a6 100644 --- a/services/fabric/fabric_test.go +++ b/services/fabric/fabric_test.go @@ -1219,7 +1219,7 @@ func TestPendingWireCallsTimeout(t *testing.T) { }, } - s.drainOutboundPending(time.Now()) + s.drainOutbound(time.Now()) select { case reply := <-sub.Channel(): @@ -1337,7 +1337,7 @@ func TestDrainOutgoingWireCallsReportsMarshalFailure(t *testing.T) { replySub := reqConn.Request(msg) defer reqConn.Unsubscribe(replySub) - s.drainOutboundNew(time.Now()) + s.drainOutbound(time.Now()) if len(tr.writes) != 0 { t.Fatalf("writes = %d, want 0", len(tr.writes)) @@ -1388,7 +1388,7 @@ func TestDrainOutgoingWireCallsReportsWriteFailure(t *testing.T) { replySub := reqConn.Request(msg) defer reqConn.Unsubscribe(replySub) - s.drainOutboundNew(time.Now()) + s.drainOutbound(time.Now()) if s.link != linkDown { t.Fatalf("link = %v, want %v", s.link, linkDown) diff --git a/services/fabric/session.go b/services/fabric/session.go index e7a5c70..6a60885 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -51,8 +51,9 
@@ const ( // exportMaxPerTick caps the total export messages sent per drain // cycle across all subscriptions, keeping UART throughput within // the 115200-baud link capacity. - exportMaxPerTick = 1 - errPayloadMarshal = "payload_marshal_failed" + exportMaxPerTick = 1 + exportTickInterval = 50 * time.Millisecond + errPayloadMarshal = "payload_marshal_failed" ) // ---- link reasons and error strings ---- @@ -187,7 +188,7 @@ func (s *session) run(ctx context.Context) { // Poll subscription channels periodically. Needed because select // blocks until a line/timer fires; without this, exported bus // messages and async call replies would sit in subscription channels. - exportTick := time.NewTicker(50 * time.Millisecond) + exportTick := time.NewTicker(exportTickInterval) defer exportTick.Stop() s.publishLinkState("", "") @@ -548,6 +549,10 @@ func (s *session) onReply(msg *wireMsg) { } func checkBusError(payload any) string { + if e, ok := payload.(types.ErrorReply); ok && !e.OK && e.Error != "" { + return e.Error + } + // Fall back to JSON probe for handlers that reply with ad-hoc structs. b, err := json.Marshal(payload) if err != nil { return "" @@ -729,79 +734,70 @@ func (s *session) drainInbound(now time.Time) { } func (s *session) drainOutbound(now time.Time) { - s.drainOutboundNew(now) - s.drainOutboundPending(now) -} - -func (s *session) drainOutboundNew(now time.Time) { - if s.link != linkUp || len(s.exportCallSubs) == 0 { - return - } - - for _, sub := range s.exportCallSubs { - for { - select { - case msg, ok := <-sub.Channel(): - if !ok || msg == nil { - goto nextSub - } + // Forward new outgoing calls from the local bus onto the wire. 
+ if s.link == linkUp && len(s.exportCallSubs) > 0 { + for _, sub := range s.exportCallSubs { + for { + select { + case msg, ok := <-sub.Channel(): + if !ok || msg == nil { + goto nextSub + } - wireTopic := exportCallTopic(msg.Topic) - if wireTopic == nil { - continue - } + wireTopic := exportCallTopic(msg.Topic) + if wireTopic == nil { + continue + } - payload, err := marshalPayload(msg.Payload) - if err != nil { - s.logKV("outgoing call dropped", "err", err.Error()) + payload, err := marshalPayload(msg.Payload) + if err != nil { + s.logKV("outgoing call dropped", "err", err.Error()) + if msg.CanReply() { + s.conn.Reply(msg, types.ErrorReply{OK: false, Error: errPayloadMarshal}, false) + } + continue + } + id := s.nextOutboundID + s.nextOutboundID++ + corr := "wire-" + strconvx.Utoa64(id) if msg.CanReply() { - s.conn.Reply(msg, types.ErrorReply{OK: false, Error: errPayloadMarshal}, false) + s.outboundCalls = append(s.outboundCalls, &outboundCall{ + id: corr, + req: msg, + deadline: now.Add(callTimeoutDef), + }) } - continue - } - id := s.nextOutboundID - s.nextOutboundID++ - corr := "wire-" + strconvx.Utoa64(id) - if msg.CanReply() { - s.outboundCalls = append(s.outboundCalls, &outboundCall{ - id: corr, - req: msg, - deadline: now.Add(callTimeoutDef), - }) - } - if !s.writeLine(marshal(wireCall{ - T: msgCall, - ID: corr, - Topic: wireTopic, - Payload: payload, - TimeoutMs: int(callTimeoutDef / time.Millisecond), - })) { - return + if !s.writeLine(marshal(wireCall{ + T: msgCall, + ID: corr, + Topic: wireTopic, + Payload: payload, + TimeoutMs: int(callTimeoutDef / time.Millisecond), + })) { + return + } + default: + goto nextSub } - default: - goto nextSub } + nextSub: } - nextSub: } -} -func (s *session) drainOutboundPending(now time.Time) { - if len(s.outboundCalls) == 0 { - return - } - - keep := s.outboundCalls[:0] - for _, call := range s.outboundCalls { - if !now.Before(call.deadline) { - if call.req != nil && call.req.CanReply() { - s.conn.Reply(call.req, 
types.ErrorReply{OK: false, Error: reasonTimeout}, false) + // Expire outbound calls that have timed out waiting for a remote reply. + if len(s.outboundCalls) > 0 { + keep := s.outboundCalls[:0] + for _, call := range s.outboundCalls { + if !now.Before(call.deadline) { + if call.req != nil && call.req.CanReply() { + s.conn.Reply(call.req, types.ErrorReply{OK: false, Error: reasonTimeout}, false) + } + continue } - continue + keep = append(keep, call) } - keep = append(keep, call) + s.outboundCalls = keep } - s.outboundCalls = keep } // ---- transport write ---- @@ -812,6 +808,8 @@ func (s *session) writeLine(data []byte) bool { } if err := s.tr.WriteLine(data); err != nil { if errors.Is(err, ErrLineTooLong) { + // Oversized frame is dropped but the transport is still + // healthy — return true so the session continues. s.log("oversized write dropped") return true } From ac72ff36a6d3c21866c525be6d34cf39e05588f0 Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 2 Apr 2026 17:39:13 +0000 Subject: [PATCH 18/65] refactor: rename wire.go to protocol.go, wire* structs to proto* Rename wire.go to protocol.go to match the Lua side (protocol.lua). Rename all wire-prefixed structs and functions: wireHello -> protoHello, wireMsg -> protoMsg, wireType -> protoType, wireImportRule -> importRule, wireImport -> importMatch, etc. Also renames session.writeLine to session.sendFrame to match Lua's send_frame and avoid confusion with Transport.WriteLine. 
--- services/fabric/fabric_test.go | 142 +++++++++++------------ services/fabric/{wire.go => protocol.go} | 42 +++---- services/fabric/remap.go | 12 +- services/fabric/session.go | 44 +++---- 4 files changed, 120 insertions(+), 120 deletions(-) rename services/fabric/{wire.go => protocol.go} (78%) diff --git a/services/fabric/fabric_test.go b/services/fabric/fabric_test.go index 179c7a6..e737efb 100644 --- a/services/fabric/fabric_test.go +++ b/services/fabric/fabric_test.go @@ -64,12 +64,12 @@ func sendMsg(t *testing.T, tr Transport, v any) { const testCM5SID = "s1" -func bringUp(t *testing.T, cm5 Transport) wireHelloAck { +func bringUp(t *testing.T, cm5 Transport) protoHelloAck { t.Helper() - sendMsg(t, cm5, wireHello{ + sendMsg(t, cm5, protoHello{ T: "hello", Node: "cm5-local", Peer: "mcu-1", SID: testCM5SID, Proto: protoVersion, }) - ack := readMsg[wireHelloAck](t, cm5) + ack := readMsg[protoHelloAck](t, cm5) if !ack.OK || ack.Node != "mcu-1" || ack.SID == "" || ack.Proto != protoVersion { t.Fatalf("bad hello_ack: %+v", ack) } @@ -79,8 +79,8 @@ func bringUp(t *testing.T, cm5 Transport) wireHelloAck { func unlockExports(t *testing.T, cm5 Transport) { t.Helper() - sendMsg(t, cm5, wirePing{T: "ping", TS: 77, SID: testCM5SID}) - pong := readMsg[wirePong](t, cm5) + sendMsg(t, cm5, protoPing{T: "ping", TS: 77, SID: testCM5SID}) + pong := readMsg[protoPong](t, cm5) if pong.T != "pong" { t.Fatalf("expected pong, got %q", pong.T) } @@ -89,7 +89,7 @@ func unlockExports(t *testing.T, cm5 Transport) { // ---- codec ---- func TestCodecRoundTrip(t *testing.T) { - orig := wireHello{T: "hello", Node: "mcu-1", Peer: "cm5-local", SID: "abc", Proto: protoVersion} + orig := protoHello{T: "hello", Node: "mcu-1", Peer: "cm5-local", SID: "abc", Proto: protoVersion} data := marshal(orig) if !bytes.HasSuffix(data, []byte("\n")) { t.Error("marshal should end with newline") @@ -98,10 +98,10 @@ func TestCodecRoundTrip(t *testing.T) { if bytes.Contains(jsonPart, []byte("\n")) { 
t.Error("JSON should not contain embedded newlines") } - if wireType(jsonPart) != "hello" { - t.Errorf("wireType = %q", wireType(jsonPart)) + if protoType(jsonPart) != "hello" { + t.Errorf("protoType = %q", protoType(jsonPart)) } - var dec wireHello + var dec protoHello json.Unmarshal(jsonPart, &dec) if dec != orig { t.Errorf("round-trip: %+v vs %+v", dec, orig) @@ -113,26 +113,26 @@ func TestCodecAllTypes(t *testing.T) { v any want string }{ - {wireHello{T: "hello"}, "hello"}, - {wireHelloAck{T: "hello_ack"}, "hello_ack"}, - {wirePing{T: "ping", TS: 1}, "ping"}, - {wirePong{T: "pong", TS: 2}, "pong"}, - {wirePub{T: "pub", Topic: []string{"a"}}, "pub"}, - {wireUnretain{T: "unretain", Topic: []string{"a"}}, "unretain"}, - {wireCall{T: "call", ID: "c1"}, "call"}, - {wireReply{T: "reply", Corr: "c1", OK: true}, "reply"}, + {protoHello{T: "hello"}, "hello"}, + {protoHelloAck{T: "hello_ack"}, "hello_ack"}, + {protoPing{T: "ping", TS: 1}, "ping"}, + {protoPong{T: "pong", TS: 2}, "pong"}, + {protoPub{T: "pub", Topic: []string{"a"}}, "pub"}, + {protoUnretain{T: "unretain", Topic: []string{"a"}}, "unretain"}, + {protoCall{T: "call", ID: "c1"}, "call"}, + {protoReply{T: "reply", Corr: "c1", OK: true}, "reply"}, } { b := marshal(tc.v) - if got := wireType(b[:len(b)-1]); got != tc.want { - t.Errorf("wireType = %q, want %q", got, tc.want) + if got := protoType(b[:len(b)-1]); got != tc.want { + t.Errorf("protoType = %q, want %q", got, tc.want) } } } func TestWireTypeBadInput(t *testing.T) { for _, b := range [][]byte{[]byte("not json"), []byte(`{"no_t":true}`), nil} { - if got := wireType(b); got != "" { - t.Errorf("wireType(%q) = %q, want empty", b, got) + if got := protoType(b); got != "" { + t.Errorf("protoType(%q) = %q, want empty", b, got) } } } @@ -153,7 +153,7 @@ func TestTransportRoundTrip(t *testing.T) { t.Errorf("got %q", line) } }() - sendMsg(t, a, wirePing{T: "ping", TS: 99}) + sendMsg(t, a, protoPing{T: "ping", TS: 99}) select { case <-done: case <-time.After(2 * 
time.Second): @@ -311,16 +311,16 @@ func TestHandshake(t *testing.T) { defer cancel() go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") - sendMsg(t, cm5, wireHello{ + sendMsg(t, cm5, protoHello{ T: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "s1", Proto: protoVersion, }) - ack := readMsg[wireHelloAck](t, cm5) + ack := readMsg[protoHelloAck](t, cm5) if !ack.OK || ack.Node != "mcu-1" || ack.SID == "" || ack.Proto != protoVersion { t.Errorf("bad ack: %+v", ack) } time.Sleep(50 * time.Millisecond) - sendMsg(t, cm5, wirePing{T: "ping", TS: 99, SID: "s1"}) - pong := readMsg[wirePong](t, cm5) + sendMsg(t, cm5, protoPing{T: "ping", TS: 99, SID: "s1"}) + pong := readMsg[protoPong](t, cm5) if pong.TS != 99 || pong.SID != ack.SID { t.Errorf("bad pong: %+v ack=%+v", pong, ack) } @@ -334,13 +334,13 @@ func TestSessionReset(t *testing.T) { go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") bringUp(t, cm5) - sendMsg(t, cm5, wireHello{T: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "s2", Proto: protoVersion}) - ack := readMsg[wireHelloAck](t, cm5) + sendMsg(t, cm5, protoHello{T: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "s2", Proto: protoVersion}) + ack := readMsg[protoHelloAck](t, cm5) if !ack.OK || ack.SID == "" || ack.Proto != protoVersion { t.Error("hello_ack.OK = false") } - sendMsg(t, cm5, wirePing{T: "ping", TS: 55, SID: "s2"}) - pong := readMsg[wirePong](t, cm5) + sendMsg(t, cm5, protoPing{T: "ping", TS: 55, SID: "s2"}) + pong := readMsg[protoPong](t, cm5) if pong.TS != 55 || pong.SID != ack.SID { t.Errorf("bad pong: %+v ack=%+v", pong, ack) } @@ -353,7 +353,7 @@ func TestRejectsWrongPeer(t *testing.T) { defer cancel() go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") - sendMsg(t, cm5, wireHello{T: "hello", Node: "cm5-local", Peer: "mcu-999", SID: "s1", Proto: protoVersion}) + sendMsg(t, cm5, protoHello{T: "hello", Node: "cm5-local", Peer: "mcu-999", SID: "s1", Proto: protoVersion}) gotLine := make(chan 
readResult, 1) go func() { line, err := cm5.ReadLine() @@ -364,13 +364,13 @@ func TestRejectsWrongPeer(t *testing.T) { t.Fatal("got response to wrong-peer hello") case <-time.After(200 * time.Millisecond): } - sendMsg(t, cm5, wireHello{T: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "s2", Proto: protoVersion}) + sendMsg(t, cm5, protoHello{T: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "s2", Proto: protoVersion}) select { case res := <-gotLine: if res.err != nil { t.Fatalf("ReadLine error: %v", res.err) } - var ack wireHelloAck + var ack protoHelloAck if err := json.Unmarshal(res.line, &ack); err != nil { t.Fatalf("expected hello_ack: %v", err) } @@ -395,20 +395,20 @@ func TestRejectsMissingNodeWhenPeerPinned(t *testing.T) { gotLine <- readResult{line: line, err: err} }() - sendMsg(t, cm5, wireHello{T: "hello", Peer: "mcu-1", SID: "s1", Proto: protoVersion}) + sendMsg(t, cm5, protoHello{T: "hello", Peer: "mcu-1", SID: "s1", Proto: protoVersion}) select { case <-gotLine: t.Fatal("got response to hello without node") case <-time.After(200 * time.Millisecond): } - sendMsg(t, cm5, wireHello{T: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "s2", Proto: protoVersion}) + sendMsg(t, cm5, protoHello{T: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "s2", Proto: protoVersion}) select { case res := <-gotLine: if res.err != nil { t.Fatalf("ReadLine error: %v", res.err) } - var ack wireHelloAck + var ack protoHelloAck if err := json.Unmarshal(res.line, &ack); err != nil { t.Fatalf("expected hello_ack: %v", err) } @@ -427,8 +427,8 @@ func TestPingPong(t *testing.T) { defer cancel() go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") ack := bringUp(t, cm5) - sendMsg(t, cm5, wirePing{T: "ping", TS: 42, SID: "s1"}) - pong := readMsg[wirePong](t, cm5) + sendMsg(t, cm5, protoPing{T: "ping", TS: 42, SID: "s1"}) + pong := readMsg[protoPong](t, cm5) if pong.TS != 42 || pong.SID != ack.SID { t.Errorf("bad pong: %+v ack=%+v", pong, ack) } @@ -458,8 +458,8 @@ func 
TestUnknownTypeIgnored(t *testing.T) { go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") bringUp(t, cm5) cm5.WriteLine([]byte(`{"t":"future_msg"}`)) - sendMsg(t, cm5, wirePing{T: "ping", TS: 1}) - pong := readMsg[wirePong](t, cm5) + sendMsg(t, cm5, protoPing{T: "ping", TS: 1}) + pong := readMsg[protoPong](t, cm5) if pong.TS != 1 { t.Errorf("pong.TS = %d", pong.TS) } @@ -473,8 +473,8 @@ func TestMalformedJSONIgnored(t *testing.T) { go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") bringUp(t, cm5) cm5.WriteLine([]byte("not json")) - sendMsg(t, cm5, wirePing{T: "ping", TS: 2}) - pong := readMsg[wirePong](t, cm5) + sendMsg(t, cm5, protoPing{T: "ping", TS: 2}) + pong := readMsg[protoPong](t, cm5) if pong.TS != 2 { t.Errorf("pong.TS = %d", pong.TS) } @@ -677,7 +677,7 @@ func TestPubImport(t *testing.T) { reader := b.NewConnection("test") sub := reader.Subscribe(bus.T("config", "device")) - sendMsg(t, cm5, wirePub{ + sendMsg(t, cm5, protoPub{ T: "pub", Topic: []string{"config", "device"}, Payload: json.RawMessage(`{"mode":"normal"}`), @@ -720,7 +720,7 @@ func TestPubExport(t *testing.T) { true, )) - msg := readMsg[wirePub](t, cm5) + msg := readMsg[protoPub](t, cm5) if msg.T != "pub" { t.Fatalf("expected pub, got %q", msg.T) } @@ -750,7 +750,7 @@ func TestUnretainExport(t *testing.T) { map[string]int{"deci_c": 412}, true, )) - pub := readMsg[wirePub](t, cm5) + pub := readMsg[protoPub](t, cm5) if pub.T != "pub" || !pub.Retain { t.Fatalf("expected retained pub, got t=%q retain=%v", pub.T, pub.Retain) } @@ -761,7 +761,7 @@ func TestUnretainExport(t *testing.T) { nil, true, )) - unr := readMsg[wireUnretain](t, cm5) + unr := readMsg[protoUnretain](t, cm5) if unr.T != "unretain" { t.Fatalf("expected unretain, got %q", unr.T) } @@ -839,7 +839,7 @@ func TestPubIgnoredBeforeHandshake(t *testing.T) { defer cancel() go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") - sendMsg(t, cm5, wirePub{ + sendMsg(t, cm5, protoPub{ T: "pub", Topic: 
[]string{"config", "device"}, Payload: json.RawMessage(`{"v":1}`), Retain: true, }) @@ -877,7 +877,7 @@ func TestUnretainIgnoredBeforeHandshake(t *testing.T) { t.Fatal("timed out waiting for retained config/device") } - sendMsg(t, cm5, wireUnretain{T: "unretain", Topic: []string{"config", "device"}}) + sendMsg(t, cm5, protoUnretain{T: "unretain", Topic: []string{"config", "device"}}) select { case m := <-sub.Channel(): t.Fatalf("unexpected pre-handshake unretain effect: %+v", m) @@ -894,12 +894,12 @@ func TestUnretain(t *testing.T) { go Run(ctx, mcu, conn, "mcu-1", "cm5-local") bringUp(t, cm5) - sendMsg(t, cm5, wirePub{ + sendMsg(t, cm5, protoPub{ T: "pub", Topic: []string{"config", "device"}, Payload: json.RawMessage(`{"v":1}`), Retain: true, }) time.Sleep(50 * time.Millisecond) - sendMsg(t, cm5, wireUnretain{T: "unretain", Topic: []string{"config", "device"}}) + sendMsg(t, cm5, protoUnretain{T: "unretain", Topic: []string{"config", "device"}}) time.Sleep(50 * time.Millisecond) reader := b.NewConnection("test") @@ -927,7 +927,7 @@ func TestCallIgnoredBeforeHandshake(t *testing.T) { sub := handler.Subscribe(bus.T("rpc", "hal", "dump")) defer handler.Unsubscribe(sub) - sendMsg(t, cm5, wireCall{ + sendMsg(t, cm5, protoCall{ T: "call", ID: "pre-hello-1", Topic: []string{"rpc", "hal", "dump"}, Payload: json.RawMessage(`{}`), TimeoutMs: 5000, }) @@ -956,12 +956,12 @@ func TestCallImport(t *testing.T) { } }() - sendMsg(t, cm5, wireCall{ + sendMsg(t, cm5, protoCall{ T: "call", ID: "test-corr-1", Topic: []string{"rpc", "hal", "dump"}, Payload: json.RawMessage(`{}`), TimeoutMs: 5000, }) - reply := readMsg[wireReply](t, cm5) + reply := readMsg[protoReply](t, cm5) if reply.Corr != "test-corr-1" { t.Errorf("corr = %q", reply.Corr) } @@ -978,12 +978,12 @@ func TestCallNoRoute(t *testing.T) { go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") bringUp(t, cm5) - sendMsg(t, cm5, wireCall{ + sendMsg(t, cm5, protoCall{ T: "call", ID: "no-route-1", Topic: 
[]string{"unknown", "endpoint"}, Payload: json.RawMessage(`{}`), TimeoutMs: 1000, }) - reply := readMsg[wireReply](t, cm5) + reply := readMsg[protoReply](t, cm5) if reply.Corr != "no-route-1" { t.Errorf("corr = %q", reply.Corr) } @@ -1015,12 +1015,12 @@ func TestCallHandlerError(t *testing.T) { } }() - sendMsg(t, cm5, wireCall{ + sendMsg(t, cm5, protoCall{ T: "call", ID: "err-1", Topic: []string{"rpc", "hal", "dump"}, Payload: json.RawMessage(`{}`), TimeoutMs: 5000, }) - reply := readMsg[wireReply](t, cm5) + reply := readMsg[protoReply](t, cm5) if reply.Corr != "err-1" { t.Errorf("corr = %q", reply.Corr) } @@ -1050,11 +1050,11 @@ func TestCallDoesNotBlockPing(t *testing.T) { } }() - sendMsg(t, cm5, wireCall{ + sendMsg(t, cm5, protoCall{ T: "call", ID: "slow-1", Topic: []string{"rpc", "hal", "dump"}, Payload: json.RawMessage(`{}`), TimeoutMs: 1000, }) - sendMsg(t, cm5, wirePing{T: "ping", TS: 77, SID: "s1"}) + sendMsg(t, cm5, protoPing{T: "ping", TS: 77, SID: "s1"}) type readResult struct { line []byte @@ -1071,10 +1071,10 @@ func TestCallDoesNotBlockPing(t *testing.T) { if res.err != nil { t.Fatalf("ReadLine: %v", res.err) } - if got := wireType(res.line); got != "pong" { + if got := protoType(res.line); got != "pong" { t.Fatalf("first response type = %q, want pong", got) } - var pong wirePong + var pong protoPong if err := json.Unmarshal(res.line, &pong); err != nil { t.Fatalf("Unmarshal pong: %v", err) } @@ -1085,7 +1085,7 @@ func TestCallDoesNotBlockPing(t *testing.T) { t.Fatal("ping blocked behind slow call") } - reply := readMsg[wireReply](t, cm5) + reply := readMsg[protoReply](t, cm5) if reply.Corr != "slow-1" { t.Errorf("corr = %q", reply.Corr) } @@ -1119,7 +1119,7 @@ func TestCallExport(t *testing.T) { done <- result{msg: msg, err: err} }() - call := readMsg[wireCall](t, cm5) + call := readMsg[protoCall](t, cm5) if call.T != "call" { t.Fatalf("expected call, got %q", call.T) } @@ -1135,7 +1135,7 @@ func TestCallExport(t *testing.T) { t.Fatalf("payload.ask = 
%q, want status", payload["ask"]) } - sendMsg(t, cm5, wireReply{ + sendMsg(t, cm5, protoReply{ T: "reply", Corr: call.ID, OK: true, @@ -1300,7 +1300,7 @@ func TestDrainPendingCallsReportsMarshalFailure(t *testing.T) { if len(tr.writes) != 1 { t.Fatalf("writes = %d, want 1", len(tr.writes)) } - var reply wireReply + var reply protoReply if err := json.Unmarshal(tr.writes[0], &reply); err != nil { t.Fatalf("Unmarshal reply: %v", err) } @@ -1442,15 +1442,15 @@ func TestCallExportPeerReset(t *testing.T) { done <- result{msg: msg, err: err} }() - call := readMsg[wireCall](t, cm5) + call := readMsg[protoCall](t, cm5) if call.T != "call" { t.Fatalf("expected call, got %q", call.T) } - sendMsg(t, cm5, wireHello{ + sendMsg(t, cm5, protoHello{ T: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "fresh-session", Proto: protoVersion, }) - _ = readMsg[wireHelloAck](t, cm5) + _ = readMsg[protoHelloAck](t, cm5) select { case res := <-done: @@ -1500,17 +1500,17 @@ func TestEchoedHelloAckIgnoredDuringOutgoingCall(t *testing.T) { done <- result{msg: msg, err: err} }() - call := readMsg[wireCall](t, cm5) + call := readMsg[protoCall](t, cm5) if call.T != "call" { t.Fatalf("expected call, got %q", call.T) } // Send an echoed hello_ack (our own SID) — should be ignored. - sendMsg(t, cm5, wireHelloAck{ + sendMsg(t, cm5, protoHelloAck{ T: "hello_ack", Node: "mcu-1", SID: ack.SID, Proto: protoVersion, OK: true, }) - sendMsg(t, cm5, wireReply{ + sendMsg(t, cm5, protoReply{ T: "reply", Corr: call.ID, OK: true, diff --git a/services/fabric/wire.go b/services/fabric/protocol.go similarity index 78% rename from services/fabric/wire.go rename to services/fabric/protocol.go index aab8ea8..6d9e153 100644 --- a/services/fabric/wire.go +++ b/services/fabric/protocol.go @@ -17,23 +17,23 @@ const ( // ---- Wire message structs ---- -// wireCaps is carried in hello for forward compatibility. The Lua side +// protoCaps is carried in hello for forward compatibility. 
The Lua side // sends caps but neither side enforces them in v1. -type wireCaps struct { +type protoCaps struct { Pub bool `json:"pub,omitempty"` Call bool `json:"call,omitempty"` } -type wireHello struct { - T string `json:"t"` - Node string `json:"node"` - Peer string `json:"peer"` - SID string `json:"sid"` - Proto int `json:"proto,omitempty"` - Caps *wireCaps `json:"caps,omitempty"` +type protoHello struct { + T string `json:"t"` + Node string `json:"node"` + Peer string `json:"peer"` + SID string `json:"sid"` + Proto int `json:"proto,omitempty"` + Caps *protoCaps `json:"caps,omitempty"` } -type wireHelloAck struct { +type protoHelloAck struct { T string `json:"t"` Node string `json:"node"` SID string `json:"sid,omitempty"` @@ -41,13 +41,13 @@ type wireHelloAck struct { OK bool `json:"ok"` } -type wirePing struct { +type protoPing struct { T string `json:"t"` TS int64 `json:"ts"` SID string `json:"sid,omitempty"` } -type wirePong struct { +type protoPong struct { T string `json:"t"` TS int64 `json:"ts"` SID string `json:"sid,omitempty"` @@ -55,19 +55,19 @@ type wirePong struct { // Not wired yet — defined for forward compatibility. -type wirePub struct { +type protoPub struct { T string `json:"t"` Topic []string `json:"topic"` Payload json.RawMessage `json:"payload"` Retain bool `json:"retain"` } -type wireUnretain struct { +type protoUnretain struct { T string `json:"t"` Topic []string `json:"topic"` } -type wireCall struct { +type protoCall struct { T string `json:"t"` ID string `json:"id"` Topic []string `json:"topic"` @@ -75,7 +75,7 @@ type wireCall struct { TimeoutMs int `json:"timeout_ms"` } -type wireReply struct { +type protoReply struct { T string `json:"t"` Corr string `json:"corr"` OK bool `json:"ok"` @@ -83,17 +83,17 @@ type wireReply struct { Err string `json:"err,omitempty"` } -// wireMsg is a union struct for single-pass unmarshal in dispatch. +// protoMsg is a union struct for single-pass unmarshal in dispatch. 
// Fields are the superset of all message types. Only the fields // relevant to the T value are populated; the rest are zero. -type wireMsg struct { +type protoMsg struct { T string `json:"t"` Node string `json:"node,omitempty"` Peer string `json:"peer,omitempty"` SID string `json:"sid,omitempty"` Proto int `json:"proto,omitempty"` OK bool `json:"ok,omitempty"` - Caps *wireCaps `json:"caps,omitempty"` + Caps *protoCaps `json:"caps,omitempty"` TS int64 `json:"ts,omitempty"` Topic []string `json:"topic,omitempty"` Payload json.RawMessage `json:"payload,omitempty"` @@ -116,8 +116,8 @@ func marshal(v any) []byte { return append(b, '\n') } -// wireType extracts the "t" field from a JSON line. -func wireType(line []byte) string { +// protoType extracts the "t" field from a JSON line. +func protoType(line []byte) string { var env struct { T string `json:"t"` } diff --git a/services/fabric/remap.go b/services/fabric/remap.go index 48eb9d7..13ee063 100644 --- a/services/fabric/remap.go +++ b/services/fabric/remap.go @@ -20,7 +20,7 @@ import "devicecode-go/bus" // hal/cap/power/# -> ["state","power",...] 
// hal/state -> ["state","hal"] -type wireImportRule struct { +type importRule struct { wire []string local []string } @@ -31,14 +31,14 @@ type busExportRule struct { suffix bool } -var importPublishRules = []wireImportRule{ +var importPublishRules = []importRule{ { wire: []string{"config", "device"}, local: []string{"config", "device"}, }, } -var importCallRules = []wireImportRule{ +var importCallRules = []importRule{ { wire: []string{"rpc", "hal", "dump"}, local: []string{"rpc", "hal", "dump"}, @@ -70,11 +70,11 @@ var exportCallRules = []busExportRule{ } func importPublishTopic(wire []string) bus.Topic { - return wireImport(wire, importPublishRules) + return importMatch(wire, importPublishRules) } func importCallTopic(wire []string) bus.Topic { - return wireImport(wire, importCallRules) + return importMatch(wire, importCallRules) } func exportTopic(t bus.Topic) []string { @@ -93,7 +93,7 @@ func exportCallPatterns() []bus.Topic { return exportPatternsFor(exportCallRules) } -func wireImport(wire []string, rules []wireImportRule) bus.Topic { +func importMatch(wire []string, rules []importRule) bus.Topic { for _, rule := range rules { if slicesEqualStrings(wire, rule.wire) { return stringsToTopic(rule.local) diff --git a/services/fabric/session.go b/services/fabric/session.go index 6a60885..2a7c74f 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -337,7 +337,7 @@ func (s *session) promoteLink(reason string) { // validateInbound checks whether a message should be processed. // Handshake messages (hello, hello_ack) are always accepted. // All others require an established link and a matching session ID. 
-func (s *session) validateInbound(msg *wireMsg) bool { +func (s *session) validateInbound(msg *protoMsg) bool { if msg.T == msgHello || msg.T == msgHelloAck { return true } @@ -353,7 +353,7 @@ func (s *session) validateInbound(msg *wireMsg) bool { } func (s *session) dispatch(line []byte) { - var msg wireMsg + var msg protoMsg if err := json.Unmarshal(line, &msg); err != nil { s.logKV("malformed frame dropped", "err", err.Error()) return @@ -428,7 +428,7 @@ func hasWirePrefix(topic, prefix []string) bool { return true } -func (s *session) onHello(msg *wireMsg) { +func (s *session) onHello(msg *protoMsg) { if msg.Peer != "" && msg.Peer != s.nodeID { s.log("hello dropped: wrong peer") return @@ -440,7 +440,7 @@ func (s *session) onHello(msg *wireMsg) { reason := s.notePeerIdentity(msg.Node, msg.SID, msg.Proto) s.logKV("hello rx", "peer_sid", msg.SID) - if !s.writeLine(marshal(wireHelloAck{ + if !s.sendFrame(marshal(protoHelloAck{ T: msgHelloAck, Node: s.nodeID, SID: s.localSID, @@ -454,7 +454,7 @@ func (s *session) onHello(msg *wireMsg) { s.promoteLink(reason) } -func (s *session) onHelloAck(msg *wireMsg) { +func (s *session) onHelloAck(msg *protoMsg) { if s.isSelfControlFrame(msg.Node, msg.SID) { s.log("echoed hello_ack ignored") return @@ -469,15 +469,15 @@ func (s *session) onHelloAck(msg *wireMsg) { s.promoteLink(reason) } -func (s *session) onPing(msg *wireMsg) { +func (s *session) onPing(msg *protoMsg) { s.logKV("ping rx", "peer_sid", msg.SID) - if !s.writeLine(marshal(wirePong{T: msgPong, TS: msg.TS, SID: s.localSID})) { + if !s.sendFrame(marshal(protoPong{T: msgPong, TS: msg.TS, SID: s.localSID})) { return } s.log("pong tx") } -func (s *session) onPong(msg *wireMsg) { +func (s *session) onPong(msg *protoMsg) { if s.isSelfControlFrame("", msg.SID) { s.log("echoed pong ignored") return @@ -485,7 +485,7 @@ func (s *session) onPong(msg *wireMsg) { s.lastPongAt = s.lastRxAt } -func (s *session) onPub(msg *wireMsg) { +func (s *session) onPub(msg *protoMsg) { 
localTopic := importPublishTopic(msg.Topic) if localTopic == nil { if hasWirePrefix(msg.Topic, []string{"state"}) { @@ -498,7 +498,7 @@ func (s *session) onPub(msg *wireMsg) { s.conn.Publish(s.conn.NewMessage(localTopic, msg.Payload, msg.Retain)) } -func (s *session) onUnretain(msg *wireMsg) { +func (s *session) onUnretain(msg *protoMsg) { localTopic := importPublishTopic(msg.Topic) if localTopic == nil { s.log("incoming unretain dropped: no_route") @@ -507,11 +507,11 @@ func (s *session) onUnretain(msg *wireMsg) { s.conn.Publish(s.conn.NewMessage(localTopic, nil, true)) } -func (s *session) onCall(msg *wireMsg) { +func (s *session) onCall(msg *protoMsg) { localTopic := importCallTopic(msg.Topic) if localTopic == nil { s.log("incoming call dropped: no_route") - s.writeLine(marshal(wireReply{T: msgReply, Corr: msg.ID, OK: false, Err: reasonNoRoute})) + s.sendFrame(marshal(protoReply{T: msgReply, Corr: msg.ID, OK: false, Err: reasonNoRoute})) return } @@ -528,7 +528,7 @@ func (s *session) onCall(msg *wireMsg) { }) } -func (s *session) onReply(msg *wireMsg) { +func (s *session) onReply(msg *protoMsg) { for i, call := range s.outboundCalls { if call.id != msg.Corr { continue @@ -650,7 +650,7 @@ func (s *session) drainExports() { continue } if m.Retained && m.Payload == nil { - if !s.writeLine(marshal(wireUnretain{ + if !s.sendFrame(marshal(protoUnretain{ T: msgUnretain, Topic: wire, })) { @@ -664,7 +664,7 @@ func (s *session) drainExports() { s.logKV("export payload dropped", "err", err.Error()) continue } - if !s.writeLine(marshal(wirePub{ + if !s.sendFrame(marshal(protoPub{ T: msgPub, Topic: wire, Payload: payload, @@ -693,25 +693,25 @@ func (s *session) drainInbound(now time.Time) { s.conn.Unsubscribe(call.sub) call.sub = nil // prevent double-unsubscribe in teardownInbound if !ok || reply == nil { - if !s.writeLine(marshal(wireReply{T: msgReply, Corr: call.id, OK: false, Err: reasonTimeout})) { + if !s.sendFrame(marshal(protoReply{T: msgReply, Corr: call.id, OK: 
false, Err: reasonTimeout})) { return } continue } if errStr := checkBusError(reply.Payload); errStr != "" { - if !s.writeLine(marshal(wireReply{T: msgReply, Corr: call.id, OK: false, Err: errStr})) { + if !s.sendFrame(marshal(protoReply{T: msgReply, Corr: call.id, OK: false, Err: errStr})) { return } continue } payload, err := marshalPayload(reply.Payload) if err != nil { - if !s.writeLine(marshal(wireReply{T: msgReply, Corr: call.id, OK: false, Err: errPayloadMarshal})) { + if !s.sendFrame(marshal(protoReply{T: msgReply, Corr: call.id, OK: false, Err: errPayloadMarshal})) { return } continue } - if !s.writeLine(marshal(wireReply{T: msgReply, Corr: call.id, OK: true, Payload: payload})) { + if !s.sendFrame(marshal(protoReply{T: msgReply, Corr: call.id, OK: true, Payload: payload})) { return } continue @@ -721,7 +721,7 @@ func (s *session) drainInbound(now time.Time) { if !now.Before(call.deadline) { s.conn.Unsubscribe(call.sub) call.sub = nil - if !s.writeLine(marshal(wireReply{T: msgReply, Corr: call.id, OK: false, Err: reasonTimeout})) { + if !s.sendFrame(marshal(protoReply{T: msgReply, Corr: call.id, OK: false, Err: reasonTimeout})) { return } continue @@ -767,7 +767,7 @@ func (s *session) drainOutbound(now time.Time) { deadline: now.Add(callTimeoutDef), }) } - if !s.writeLine(marshal(wireCall{ + if !s.sendFrame(marshal(protoCall{ T: msgCall, ID: corr, Topic: wireTopic, @@ -802,7 +802,7 @@ func (s *session) drainOutbound(now time.Time) { // ---- transport write ---- -func (s *session) writeLine(data []byte) bool { +func (s *session) sendFrame(data []byte) bool { if len(data) > 0 && data[len(data)-1] == '\n' { data = data[:len(data)-1] } From da2117906e5c23323240f8f4f27a2450676d6f75 Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 2 Apr 2026 17:52:06 +0000 Subject: [PATCH 19/65] fix: restore production logging by setting handshakeOnlyOutput to false The const was set to true during fabric debugging to reduce console noise. 
This silenced all power/thermal/charger/HAL logging at compile time. Set to false to restore original logging behaviour. The const and its guards can be fully removed in a follow-up cleanup. --- main.go | 58 ++++++++++++--------------------------------------------- 1 file changed, 12 insertions(+), 46 deletions(-) diff --git a/main.go b/main.go index f06d91f..3526e34 100644 --- a/main.go +++ b/main.go @@ -19,7 +19,6 @@ import ( const halTimeout = 5 * time.Second const pwmTop = 4095 -const handshakeOnlyOutput = true const fabricSessionWaitLogEvery = 2 * time.Second // Thermal (deci-°C) @@ -205,12 +204,12 @@ func (r *Reactor) updateLatchesFromValues() { // Over-temp latch if r.freshTMP() { if r.lastTDeci >= TEMP_LIMIT { - if !handshakeOnlyOutput && !r.otActive { + if !r.otActive { log.Println("[thermal] over-temp → latch active") } r.otActive = true } else if r.lastTDeci <= (TEMP_LIMIT - TEMP_HYST) { - if !handshakeOnlyOutput && r.otActive { + if r.otActive { log.Println("[thermal] temp recovered below hysteresis") } r.otActive = false @@ -231,9 +230,7 @@ func (r *Reactor) updateLatchesFromValues() { // ---- sequencing (non-blocking) ---- func (r *Reactor) startUpSeq() { - if !handshakeOnlyOutput { - log.Println("[power] PG debounced + Temp OK → rails UP") - } + log.Println("[power] PG debounced + Temp OK → rails UP") r.state = stateUpSeq r.seqIdx = 0 // next to apply r.nextActionDue = r.now // first step fires immediately @@ -243,9 +240,7 @@ func (r *Reactor) startUpSeq() { } func (r *Reactor) startDownSeq() { - if !handshakeOnlyOutput { - log.Println("[power] brownout/stale/over-temp → rails DOWN") - } + log.Println("[power] brownout/stale/over-temp → rails DOWN") r.state = stateDownSeq if r.seqOnCount < 0 { r.seqOnCount = 0 @@ -274,9 +269,7 @@ func (r *Reactor) advanceSequenceIfDue() { return } step := powerSeq[r.seqIdx] - if !handshakeOnlyOutput { - log.Println("[event] powering rail UP: ", step.Name) - } + log.Println("[event] powering rail UP: ", step.Name) 
r.publishSwitch(step.Name, true) r.seqOnCount++ r.seqIdx++ @@ -291,9 +284,7 @@ func (r *Reactor) advanceSequenceIfDue() { return } step := powerSeq[r.seqIdx] - if !handshakeOnlyOutput { - log.Println("[event] powering rail down: ", step.Name) - } + log.Println("[event] powering rail down: ", step.Name) r.publishSwitch(step.Name, false) r.seqOnCount-- r.seqIdx-- @@ -329,9 +320,7 @@ func (r *Reactor) stepFSM() { // If actively powering down and inputs become stably good, reverse. if r.state == stateDownSeq && r.pgStable { - if !handshakeOnlyOutput { - log.Println("[power] inputs stably good → reverse to UP sequence") - } + log.Println("[power] inputs stably good → reverse to UP sequence") r.startUpSeq() return } @@ -390,18 +379,12 @@ func (r *Reactor) OnBattery(v types.BatteryValue) { } func (r *Reactor) OnTempDeciC(label string, deci int) { - if handshakeOnlyOutput { - return - } log.Deci(label, deci) } // ---- memory snapshot telemetry (every ~2 s in main loop) ---- func (r *Reactor) emitMemSnapshot() { - if handshakeOnlyOutput { - return - } var ms runtime.MemStats runtime.GC() runtime.ReadMemStats(&ms) @@ -428,9 +411,7 @@ func main() { ctx := context.Background() - if !handshakeOnlyOutput { - log.Println("[main] bootstrapping bus …") - } + log.Println("[main] bootstrapping bus …") // Queue length must cover the retained replay burst when fabric // subscribes to wildcard export patterns (hal/cap/env/#, // hal/cap/power/#). 
Each capability publishes retained info + @@ -441,9 +422,7 @@ func main() { uiConn := b.NewConnection("ui") bridgeConn := b.NewConnection("fabric-bridge") - if !handshakeOnlyOutput { - log.Println("[main] starting hal.Run …") - } + log.Println("[main] starting hal.Run …") go fabric.RunBridge(ctx, bridgeConn) go hal.Run(ctx, halConn) @@ -456,9 +435,7 @@ func main() { } // Subscriptions (env + power) - if !handshakeOnlyOutput { - log.Println("[main] subscribing env + power …") - } + log.Println("[main] subscribing env + power …") tempSub := uiConn.Subscribe(tTempValue) tempDieSub := uiConn.Subscribe(tDieTempValue) humidSub := uiConn.Subscribe(tHumValue) @@ -551,9 +528,7 @@ func main() { } case m := <-humidSub.Channel(): if v, ok := m.Payload.(types.HumidityValue); ok { - if !handshakeOnlyOutput { - log.Hundredths("[value] env/humidity/core %RH=", int(v.RHx100)) - } + log.Hundredths("[value] env/humidity/core %RH=", int(v.RHx100)) } // ---- Die Temp Backup ---- @@ -592,7 +567,7 @@ func main() { // ---- Supervisory tick ---- case <-ticker.C: r.now = time.Now() - if handshakeOnlyOutput && !fabricSessionOpen && !r.now.Before(nextFabricWaitLog) { + if !fabricSessionOpen && !r.now.Before(nextFabricWaitLog) { log.Println("[main] waiting for fabric connection start") nextFabricWaitLog = r.now.Add(fabricSessionWaitLogEvery) } @@ -667,9 +642,6 @@ func (l *Logger) logWrite(b []byte) int { // ----------------------------------------------------------------------------- func printCapValue(m *bus.Message, lastIIn *int32, _ *bool, lastIBat *int32, _ *bool) { - if handshakeOnlyOutput { - return - } // hal/cap////value dom, _ := m.Topic.At(2).(string) kind, _ := m.Topic.At(3).(string) @@ -754,9 +726,6 @@ func (r *Reactor) logPrefixStatus(path, label string) { } func printCapStatus(m *bus.Message) { - if handshakeOnlyOutput { - return - } // hal/cap////status dom, _ := m.Topic.At(2).(string) kind, _ := m.Topic.At(3).(string) @@ -780,9 +749,6 @@ func printCapStatus(m *bus.Message) { } 
func printCapEvent(m *bus.Message) { - if handshakeOnlyOutput { - return - } // hal/cap////event/ dom, _ := m.Topic.At(2).(string) kind, _ := m.Topic.At(3).(string) From 1f349bcecc84e4a1b73840a25f620f1b60de2038 Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 2 Apr 2026 18:04:09 +0000 Subject: [PATCH 20/65] fix: remove handshakeOnlyOutput, clean up main.go naming Remove the handshakeOnlyOutput const and all 17 guard blocks that silenced production logging. Restore original logging behaviour. Remove stale buildTag const. Rename fabricSessionWaitLogEvery to fabricWaitLogInterval. --- main.go | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/main.go b/main.go index 3526e34..527d915 100644 --- a/main.go +++ b/main.go @@ -19,7 +19,7 @@ import ( const halTimeout = 5 * time.Second const pwmTop = 4095 -const fabricSessionWaitLogEvery = 2 * time.Second +const fabricWaitLogInterval = 2 * time.Second // Thermal (deci-°C) const ( @@ -401,12 +401,9 @@ func (r *Reactor) emitMemSnapshot() { // Main // ----------------------------------------------------------------------------- -const buildTag = "fabric-20260401c" - func main() { // Allow early USB/console settle if needed time.Sleep(3 * time.Second) - println("[main] build:", buildTag) log.SetStart(time.Now()) ctx := context.Background() @@ -569,7 +566,7 @@ func main() { r.now = time.Now() if !fabricSessionOpen && !r.now.Before(nextFabricWaitLog) { log.Println("[main] waiting for fabric connection start") - nextFabricWaitLog = r.now.Add(fabricSessionWaitLogEvery) + nextFabricWaitLog = r.now.Add(fabricWaitLogInterval) } // 1) Run FSM (includes symmetric reversal) From 2939b8bb53d8d294d073307f49c0356fc29e9513 Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 2 Apr 2026 20:20:08 +0000 Subject: [PATCH 21/65] refactor: remove bridge.go, inline config and dump handling in session MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The bridge was a separate goroutine handling 
config/device translation and rpc/hal/dump replies independently of the session lifecycle. This was not in the design spec (fabric.md) and ran even when no session was active. Config handling is now inline in onPub: when config/device arrives, the session normalizes Lua empty tables, validates the HAL config, and publishes to config/hal directly. Config state (apply count, errors) is tracked on the session struct. The rpc/hal/dump handler is now inline in onCall: the session builds the reply directly using its cached HAL state and config tracking, with no bus round-trip. - Delete bridge.go and bridge_test.go - Create config.go with decodeHALConfig and helpers (moved from bridge) - Change import rule: config/device → config/hal (was config/device) - Remove rpc/hal/dump from import call rules (handled directly) - Remove RunBridge and bridgeConn from main.go - Add halStateSub to session for HAL state caching - Rewrite bridge tests as session integration tests --- main.go | 2 - services/fabric/bridge.go | 249 ---------------------------- services/fabric/bridge_test.go | 290 --------------------------------- services/fabric/config.go | 124 ++++++++++++++ services/fabric/fabric_test.go | 120 ++++++-------- services/fabric/remap.go | 14 +- services/fabric/session.go | 105 ++++++++++++ 7 files changed, 283 insertions(+), 621 deletions(-) delete mode 100644 services/fabric/bridge.go delete mode 100644 services/fabric/bridge_test.go create mode 100644 services/fabric/config.go diff --git a/main.go b/main.go index 527d915..8e02640 100644 --- a/main.go +++ b/main.go @@ -417,10 +417,8 @@ func main() { b := bus.NewBus(32, "+", "#") halConn := b.NewConnection("hal") uiConn := b.NewConnection("ui") - bridgeConn := b.NewConnection("fabric-bridge") log.Println("[main] starting hal.Run …") - go fabric.RunBridge(ctx, bridgeConn) go hal.Run(ctx, halConn) // Wait for retained hal/state=ready (or time out) diff --git a/services/fabric/bridge.go b/services/fabric/bridge.go deleted file 
mode 100644 index 008d69e..0000000 --- a/services/fabric/bridge.go +++ /dev/null @@ -1,249 +0,0 @@ -package fabric - -import ( - "context" - "encoding/json" - - "devicecode-go/bus" - "devicecode-go/types" -) - -var ( - tConfigDevice = bus.T("config", "device") - tConfigHAL = bus.T("config", "hal") - tRPCHALDump = bus.T("rpc", "hal", "dump") - tHALState = bus.T("hal", "state") -) - -type dumpReply struct { - OK bool `json:"ok"` - Method string `json:"method"` - Echo any `json:"echo,omitempty"` - HAL *types.HALState `json:"hal,omitempty"` - Applied bool `json:"applied"` - ConfigCount int `json:"config_count,omitempty"` - ConfigError string `json:"config_error,omitempty"` -} - -// RunBridge connects generic fabric import topics to concrete MCU services. -// -// The current Lua-side config exports `config/mcu -> config/device`, while the -// MCU HAL consumes `config/hal`. This bridge translates matching HAL configs. -// It also exposes a minimal `rpc/hal/dump` endpoint so CM5 proxy calls have a -// real MCU-side responder. 
-func RunBridge(ctx context.Context, conn *bus.Connection) { - cfgSub := conn.Subscribe(tConfigDevice) - dumpSub := conn.Subscribe(tRPCHALDump) - halStateSub := conn.Subscribe(tHALState) - defer conn.Unsubscribe(cfgSub) - defer conn.Unsubscribe(dumpSub) - defer conn.Unsubscribe(halStateSub) - - var lastHAL *types.HALState - var appliedConfig bool - var appliedConfigCount int - var lastConfigErr string - - for { - select { - case <-ctx.Done(): - return - - case msg, ok := <-halStateSub.Channel(): - if !ok || msg == nil { - return - } - if st, ok := decodeHALState(msg.Payload); ok { - stCopy := st - lastHAL = &stCopy - } - - case msg, ok := <-cfgSub.Channel(): - if !ok || msg == nil { - return - } - processConfigDevice(conn, msg, &appliedConfig, &appliedConfigCount, &lastConfigErr) - - case msg, ok := <-dumpSub.Channel(): - if !ok || msg == nil { - return - } - if !msg.CanReply() { - continue - } - if !drainConfigDevice(conn, cfgSub, &appliedConfig, &appliedConfigCount, &lastConfigErr) { - return - } - drainHALState(halStateSub, &lastHAL) - conn.Reply(msg, dumpReply{ - OK: true, - Method: "dump", - Echo: decodePayload(msg.Payload), - HAL: lastHAL, - Applied: appliedConfig, - ConfigCount: appliedConfigCount, - ConfigError: lastConfigErr, - }, false) - } - } -} - -func drainConfigDevice(conn *bus.Connection, cfgSub *bus.Subscription, appliedConfig *bool, appliedConfigCount *int, lastConfigErr *string) bool { - for { - select { - case msg, ok := <-cfgSub.Channel(): - if !ok || msg == nil { - return false - } - processConfigDevice(conn, msg, appliedConfig, appliedConfigCount, lastConfigErr) - default: - return true - } - } -} - -func drainHALState(halSub *bus.Subscription, lastHAL **types.HALState) { - for { - select { - case msg, ok := <-halSub.Channel(): - if !ok || msg == nil { - return - } - if st, ok := decodeHALState(msg.Payload); ok { - stCopy := st - *lastHAL = &stCopy - } - default: - return - } - } -} - -func processConfigDevice(conn *bus.Connection, msg 
*bus.Message, appliedConfig *bool, appliedConfigCount *int, lastConfigErr *string) { - cfg, err := decodeHALConfig(msg.Payload) - if err != "" { - *lastConfigErr = err - println("[fabric] config/device rejected:", err) - return - } - *appliedConfig = true - *appliedConfigCount++ - *lastConfigErr = "" - println("[fabric] config/device bridged to config/hal", *appliedConfigCount, "devices", len(cfg.Devices)) - conn.Publish(conn.NewMessage(tConfigHAL, cfg, true)) -} - -func decodePayload(payload any) any { - switch v := payload.(type) { - case nil: - return nil - case json.RawMessage: - if len(v) == 0 { - return nil - } - var out any - if err := json.Unmarshal(v, &out); err == nil { - return out - } - return []byte(v) - case []byte: - if len(v) == 0 { - return nil - } - var out any - if err := json.Unmarshal(v, &out); err == nil { - return out - } - cp := make([]byte, len(v)) - copy(cp, v) - return cp - default: - return v - } -} - -func decodeHALState(payload any) (types.HALState, bool) { - switch v := payload.(type) { - case types.HALState: - return v, true - case *types.HALState: - if v == nil { - return types.HALState{}, false - } - return *v, true - case json.RawMessage: - var out types.HALState - return out, json.Unmarshal(v, &out) == nil - case []byte: - var out types.HALState - return out, json.Unmarshal(v, &out) == nil - default: - b, err := json.Marshal(v) - if err != nil { - return types.HALState{}, false - } - var out types.HALState - return out, json.Unmarshal(b, &out) == nil - } -} - -func decodeHALConfig(payload any) (types.HALConfig, string) { - switch v := payload.(type) { - case types.HALConfig: - return v, "" - case *types.HALConfig: - if v == nil { - return types.HALConfig{}, "nil_hal_config" - } - return *v, "" - case json.RawMessage: - return decodeHALConfigBytes(v) - case []byte: - return decodeHALConfigBytes(v) - default: - b, err := json.Marshal(v) - if err != nil { - return types.HALConfig{}, "payload_marshal_failed: " + err.Error() - } - 
return decodeHALConfigBytes(b) - } -} - -func decodeHALConfigBytes(b []byte) (types.HALConfig, string) { - var probe map[string]json.RawMessage - if err := json.Unmarshal(b, &probe); err != nil { - return types.HALConfig{}, "json_unmarshal_failed: " + err.Error() + "; raw=" + truncateRawJSON(b) - } - if _, ok := probe["devices"]; !ok { - return types.HALConfig{}, "missing_devices_field; raw=" + truncateRawJSON(b) - } - - // Lua encodes empty tables as {} (object) not [] (array). - // Normalize known slice fields so Go unmarshal accepts them. - for _, key := range []string{"devices", "pollers"} { - if raw, ok := probe[key]; ok && len(raw) == 2 && raw[0] == '{' && raw[1] == '}' { - probe[key] = json.RawMessage("[]") - } - } - fixed, err := json.Marshal(probe) - if err != nil { - return types.HALConfig{}, "normalize_failed: " + err.Error() - } - - var out types.HALConfig - if err := json.Unmarshal(fixed, &out); err != nil { - return types.HALConfig{}, "hal_config_unmarshal_failed: " + err.Error() + "; raw=" + truncateRawJSON(fixed) - } - return out, "" -} - -func truncateRawJSON(b []byte) string { - if len(b) == 0 { - return "" - } - const max = 160 - if len(b) <= max { - return string(b) - } - return string(b[:max]) + "..." 
-} diff --git a/services/fabric/bridge_test.go b/services/fabric/bridge_test.go deleted file mode 100644 index a8ca9fe..0000000 --- a/services/fabric/bridge_test.go +++ /dev/null @@ -1,290 +0,0 @@ -package fabric - -import ( - "context" - "encoding/json" - "testing" - "time" - - "devicecode-go/bus" - "devicecode-go/types" -) - -func TestBridgeMapsConfigDeviceToConfigHAL(t *testing.T) { - b := newBus() - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - bridgeConn := b.NewConnection("fabric-bridge") - go RunBridge(ctx, bridgeConn) - - writer := b.NewConnection("writer") - reader := b.NewConnection("reader") - sub := reader.Subscribe(bus.T("config", "hal")) - defer reader.Unsubscribe(sub) - - in := types.HALConfig{ - Devices: []types.HALDevice{ - {ID: "led0", Type: "gpio_led"}, - }, - } - raw, err := json.Marshal(in) - if err != nil { - t.Fatalf("Marshal: %v", err) - } - writer.Publish(writer.NewMessage(bus.T("config", "device"), json.RawMessage(raw), true)) - - select { - case msg := <-sub.Channel(): - if msg == nil { - t.Fatal("config/hal subscription closed") - } - out, ok := msg.Payload.(types.HALConfig) - if !ok { - t.Fatalf("payload type = %T, want types.HALConfig", msg.Payload) - } - if len(out.Devices) != 1 || out.Devices[0].ID != "led0" { - t.Fatalf("unexpected config: %+v", out) - } - if !msg.Retained { - t.Fatal("config/hal message was not retained") - } - case <-time.After(2 * time.Second): - t.Fatal("timed out waiting for config/hal") - } -} - -func TestBridgeIgnoresNonHALConfigObject(t *testing.T) { - b := newBus() - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - bridgeConn := b.NewConnection("fabric-bridge") - go RunBridge(ctx, bridgeConn) - - writer := b.NewConnection("writer") - reader := b.NewConnection("reader") - sub := reader.Subscribe(bus.T("config", "hal")) - defer reader.Unsubscribe(sub) - - writer.Publish(writer.NewMessage(bus.T("config", "device"), 
json.RawMessage(`{"source":"monitor_auto_probe"}`), true)) - - select { - case msg := <-sub.Channel(): - t.Fatalf("unexpected config/hal message: %+v", msg) - case <-time.After(150 * time.Millisecond): - } - - req := b.NewConnection("requester") - reqCtx, reqCancel := context.WithTimeout(context.Background(), 2*time.Second) - defer reqCancel() - - replyMsg, err := req.RequestWait(reqCtx, req.NewMessage( - bus.T("rpc", "hal", "dump"), - json.RawMessage(`{"ask":"status"}`), - false, - )) - if err != nil { - t.Fatalf("RequestWait: %v", err) - } - - reply, ok := replyMsg.Payload.(dumpReply) - if !ok { - t.Fatalf("reply payload type = %T, want dumpReply", replyMsg.Payload) - } - if reply.Applied { - t.Fatal("reply.Applied = true, want false") - } - if reply.ConfigError == "" { - t.Fatal("reply.ConfigError = empty, want decode error") - } -} - -func TestBridgeRepliesToHALDump(t *testing.T) { - b := newBus() - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - bridgeConn := b.NewConnection("fabric-bridge") - go RunBridge(ctx, bridgeConn) - time.Sleep(10 * time.Millisecond) - - writer := b.NewConnection("writer") - req := b.NewConnection("requester") - - writer.Publish(writer.NewMessage(bus.T("hal", "state"), types.HALState{ - Level: "ready", - Status: "", - TS: 123, - }, true)) - - reqCtx, reqCancel := context.WithTimeout(context.Background(), 2*time.Second) - defer reqCancel() - - replyMsg, err := req.RequestWait(reqCtx, req.NewMessage( - bus.T("rpc", "hal", "dump"), - json.RawMessage(`{"ask":"status"}`), - false, - )) - if err != nil { - t.Fatalf("RequestWait: %v", err) - } - - reply, ok := replyMsg.Payload.(dumpReply) - if !ok { - t.Fatalf("reply payload type = %T, want dumpReply", replyMsg.Payload) - } - if !reply.OK { - t.Fatalf("reply.OK = false: %+v", reply) - } - if reply.Method != "dump" { - t.Fatalf("reply.Method = %q", reply.Method) - } - echo, ok := reply.Echo.(map[string]any) - if !ok { - t.Fatalf("reply.Echo type = %T, want 
map[string]any", reply.Echo) - } - if echo["ask"] != "status" { - t.Fatalf("reply.Echo.ask = %#v", echo["ask"]) - } - if reply.HAL == nil || reply.HAL.Level != "ready" || reply.HAL.TS != 123 { - t.Fatalf("reply.HAL = %+v", reply.HAL) - } - if reply.Applied { - t.Fatal("reply.Applied = true, want false") - } - if reply.ConfigCount != 0 { - t.Fatalf("reply.ConfigCount = %d, want 0", reply.ConfigCount) - } -} - -func TestBridgeDumpReflectsAppliedConfig(t *testing.T) { - b := newBus() - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - bridgeConn := b.NewConnection("fabric-bridge") - go RunBridge(ctx, bridgeConn) - time.Sleep(10 * time.Millisecond) - - writer := b.NewConnection("writer") - req := b.NewConnection("requester") - - in := types.HALConfig{ - Devices: []types.HALDevice{ - {ID: "led0", Type: "gpio_led"}, - }, - } - raw, err := json.Marshal(in) - if err != nil { - t.Fatalf("Marshal: %v", err) - } - writer.Publish(writer.NewMessage(bus.T("config", "device"), json.RawMessage(raw), true)) - - reqCtx, reqCancel := context.WithTimeout(context.Background(), 2*time.Second) - defer reqCancel() - - replyMsg, err := req.RequestWait(reqCtx, req.NewMessage( - bus.T("rpc", "hal", "dump"), - json.RawMessage(`{"ask":"status"}`), - false, - )) - if err != nil { - t.Fatalf("RequestWait: %v", err) - } - - reply, ok := replyMsg.Payload.(dumpReply) - if !ok { - t.Fatalf("reply payload type = %T, want dumpReply", replyMsg.Payload) - } - if !reply.Applied { - t.Fatal("reply.Applied = false, want true") - } - if reply.ConfigCount != 1 { - t.Fatalf("reply.ConfigCount = %d, want 1", reply.ConfigCount) - } - if reply.ConfigError != "" { - t.Fatalf("reply.ConfigError = %q, want empty", reply.ConfigError) - } -} - -func TestBridgeAcceptsLuaEmptyObjectsAsArrays(t *testing.T) { - b := newBus() - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - bridgeConn := b.NewConnection("fabric-bridge") - go RunBridge(ctx, bridgeConn) - 
time.Sleep(10 * time.Millisecond) - - writer := b.NewConnection("writer") - reader := b.NewConnection("reader") - sub := reader.Subscribe(bus.T("config", "hal")) - defer reader.Unsubscribe(sub) - - // Lua encodes empty tables as {} (objects) not [] (arrays). - writer.Publish(writer.NewMessage(bus.T("config", "device"), - json.RawMessage(`{"devices":{},"pollers":{}}`), true)) - - select { - case msg := <-sub.Channel(): - if msg == nil { - t.Fatal("config/hal subscription closed") - } - out, ok := msg.Payload.(types.HALConfig) - if !ok { - t.Fatalf("payload type = %T, want types.HALConfig", msg.Payload) - } - if out.Devices == nil || out.Pollers == nil { - t.Fatalf("expected non-nil slices, got devices=%v pollers=%v", out.Devices, out.Pollers) - } - case <-time.After(2 * time.Second): - t.Fatal("timed out waiting for config/hal") - } -} - -func TestBridgeDumpDrainsPendingConfigBeforeReply(t *testing.T) { - b := newBus() - ctx, cancel := context.WithCancel(context.Background()) - defer cancel() - - bridgeConn := b.NewConnection("fabric-bridge") - go RunBridge(ctx, bridgeConn) - time.Sleep(10 * time.Millisecond) - - writer := b.NewConnection("writer") - req := b.NewConnection("requester") - - in := types.HALConfig{ - Devices: []types.HALDevice{}, - Pollers: []types.PollSpec{}, - } - raw, err := json.Marshal(in) - if err != nil { - t.Fatalf("Marshal: %v", err) - } - writer.Publish(writer.NewMessage(bus.T("config", "device"), json.RawMessage(raw), true)) - - reqCtx, reqCancel := context.WithTimeout(context.Background(), 2*time.Second) - defer reqCancel() - - replyMsg, err := req.RequestWait(reqCtx, req.NewMessage( - bus.T("rpc", "hal", "dump"), - json.RawMessage(`{"ask":"status"}`), - false, - )) - if err != nil { - t.Fatalf("RequestWait: %v", err) - } - - reply, ok := replyMsg.Payload.(dumpReply) - if !ok { - t.Fatalf("reply payload type = %T, want dumpReply", replyMsg.Payload) - } - if !reply.Applied { - t.Fatal("reply.Applied = false, want true") - } - if 
reply.ConfigCount != 1 { - t.Fatalf("reply.ConfigCount = %d, want 1", reply.ConfigCount) - } -} diff --git a/services/fabric/config.go b/services/fabric/config.go new file mode 100644 index 0000000..28b7cb1 --- /dev/null +++ b/services/fabric/config.go @@ -0,0 +1,124 @@ +package fabric + +import ( + "encoding/json" + + "devicecode-go/types" +) + +// decodeHALConfig extracts a HALConfig from an arbitrary payload, +// normalizing Lua empty-table encoding ({} → []) for known slice fields. +func decodeHALConfig(payload any) (types.HALConfig, string) { + switch v := payload.(type) { + case types.HALConfig: + return v, "" + case *types.HALConfig: + if v == nil { + return types.HALConfig{}, "nil_hal_config" + } + return *v, "" + case json.RawMessage: + return decodeHALConfigBytes(v) + case []byte: + return decodeHALConfigBytes(v) + default: + b, err := json.Marshal(v) + if err != nil { + return types.HALConfig{}, "payload_marshal_failed: " + err.Error() + } + return decodeHALConfigBytes(b) + } +} + +func decodeHALConfigBytes(b []byte) (types.HALConfig, string) { + var probe map[string]json.RawMessage + if err := json.Unmarshal(b, &probe); err != nil { + return types.HALConfig{}, "json_unmarshal_failed: " + err.Error() + "; raw=" + truncateRawJSON(b) + } + if _, ok := probe["devices"]; !ok { + return types.HALConfig{}, "missing_devices_field; raw=" + truncateRawJSON(b) + } + + // Lua encodes empty tables as {} (object) not [] (array). + // Normalize known slice fields so Go unmarshal accepts them. 
+ for _, key := range []string{"devices", "pollers"} { + if raw, ok := probe[key]; ok && len(raw) == 2 && raw[0] == '{' && raw[1] == '}' { + probe[key] = json.RawMessage("[]") + } + } + fixed, err := json.Marshal(probe) + if err != nil { + return types.HALConfig{}, "normalize_failed: " + err.Error() + } + + var out types.HALConfig + if err := json.Unmarshal(fixed, &out); err != nil { + return types.HALConfig{}, "hal_config_unmarshal_failed: " + err.Error() + "; raw=" + truncateRawJSON(fixed) + } + return out, "" +} + +func decodeHALState(payload any) (types.HALState, bool) { + switch v := payload.(type) { + case types.HALState: + return v, true + case *types.HALState: + if v == nil { + return types.HALState{}, false + } + return *v, true + case json.RawMessage: + var out types.HALState + return out, json.Unmarshal(v, &out) == nil + case []byte: + var out types.HALState + return out, json.Unmarshal(v, &out) == nil + default: + b, err := json.Marshal(v) + if err != nil { + return types.HALState{}, false + } + var out types.HALState + return out, json.Unmarshal(b, &out) == nil + } +} + +func decodePayload(payload any) any { + switch v := payload.(type) { + case nil: + return nil + case json.RawMessage: + if len(v) == 0 { + return nil + } + var out any + if err := json.Unmarshal(v, &out); err == nil { + return out + } + return []byte(v) + case []byte: + if len(v) == 0 { + return nil + } + var out any + if err := json.Unmarshal(v, &out); err == nil { + return out + } + cp := make([]byte, len(v)) + copy(cp, v) + return cp + default: + return v + } +} + +func truncateRawJSON(b []byte) string { + if len(b) == 0 { + return "" + } + const max = 160 + if len(b) <= max { + return string(b) + } + return string(b[:max]) + "..." 
+} diff --git a/services/fabric/fabric_test.go b/services/fabric/fabric_test.go index e737efb..f0816c6 100644 --- a/services/fabric/fabric_test.go +++ b/services/fabric/fabric_test.go @@ -568,7 +568,7 @@ func TestImportPublishTopic(t *testing.T) { wire []string want string }{ - {[]string{"config", "device"}, "config/device"}, + {[]string{"config", "device"}, "config/hal"}, {[]string{"config", "other"}, ""}, {[]string{"unknown", "x"}, ""}, {nil, ""}, @@ -585,7 +585,7 @@ func TestImportCallTopic(t *testing.T) { wire []string want string }{ - {[]string{"rpc", "hal", "dump"}, "rpc/hal/dump"}, + // rpc/hal/dump is handled directly by onCall, not via import rules. {[]string{"rpc", "hal", "other"}, ""}, {[]string{"config", "device"}, ""}, {nil, ""}, @@ -675,13 +675,13 @@ func TestPubImport(t *testing.T) { bringUp(t, cm5) reader := b.NewConnection("test") - sub := reader.Subscribe(bus.T("config", "device")) + sub := reader.Subscribe(bus.T("config", "hal")) sendMsg(t, cm5, protoPub{ T: "pub", Topic: []string{"config", "device"}, - Payload: json.RawMessage(`{"mode":"normal"}`), - Retain: false, + Payload: json.RawMessage(`{"devices":[],"pollers":[]}`), + Retain: true, }) select { @@ -689,15 +689,8 @@ func TestPubImport(t *testing.T) { if m == nil { t.Fatal("nil message") } - raw, ok := m.Payload.(json.RawMessage) - if !ok { - t.Fatalf("payload type = %T, want json.RawMessage", m.Payload) - } - if string(raw) != `{"mode":"normal"}` { - t.Errorf("payload = %s", raw) - } case <-time.After(2 * time.Second): - t.Fatal("timeout waiting for imported pub") + t.Fatal("timeout waiting for imported config on config/hal") } } @@ -995,7 +988,7 @@ func TestCallNoRoute(t *testing.T) { } } -func TestCallHandlerError(t *testing.T) { +func TestDumpCallReturnsConfigState(t *testing.T) { mcu, cm5 := pipePair() b := newBus() fabricConn := b.NewConnection("fabric") @@ -1004,35 +997,41 @@ func TestCallHandlerError(t *testing.T) { go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local") bringUp(t, cm5) - 
handler := b.NewConnection("handler") - sub := handler.Subscribe(bus.T("rpc", "hal", "dump")) - go func() { - for m := range sub.Channel() { - handler.Reply(m, struct { - OK bool `json:"ok"` - Error string `json:"error"` - }{OK: false, Error: "device_busy"}, false) - } - }() + // Send config first so the session has state. + sendMsg(t, cm5, protoPub{ + T: "pub", + Topic: []string{"config", "device"}, + Payload: json.RawMessage(`{"devices":[],"pollers":[]}`), + Retain: true, + }) + time.Sleep(100 * time.Millisecond) + // Call dump. sendMsg(t, cm5, protoCall{ - T: "call", ID: "err-1", Topic: []string{"rpc", "hal", "dump"}, - Payload: json.RawMessage(`{}`), TimeoutMs: 5000, + T: "call", ID: "dump-1", Topic: []string{"rpc", "hal", "dump"}, + Payload: json.RawMessage(`{"ask":"status"}`), TimeoutMs: 5000, }) reply := readMsg[protoReply](t, cm5) - if reply.Corr != "err-1" { + if reply.Corr != "dump-1" { t.Errorf("corr = %q", reply.Corr) } - if reply.OK { - t.Error("expected ok=false for handler error") + if !reply.OK { + t.Errorf("expected ok=true, got err=%q", reply.Err) + } + var dump dumpReply + if err := json.Unmarshal(reply.Payload, &dump); err != nil { + t.Fatalf("unmarshal dump reply: %v", err) + } + if !dump.Applied { + t.Error("expected applied=true") } - if reply.Err != "device_busy" { - t.Errorf("err = %q, want device_busy", reply.Err) + if dump.ConfigCount != 1 { + t.Errorf("config_count = %d, want 1", dump.ConfigCount) } } -func TestCallDoesNotBlockPing(t *testing.T) { +func TestDumpCallDoesNotBlockPing(t *testing.T) { mcu, cm5 := pipePair() b := newBus() fabricConn := b.NewConnection("fabric") @@ -1041,56 +1040,35 @@ func TestCallDoesNotBlockPing(t *testing.T) { go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local") bringUp(t, cm5) - handler := b.NewConnection("handler") - sub := handler.Subscribe(bus.T("rpc", "hal", "dump")) - go func() { - for m := range sub.Channel() { - time.Sleep(300 * time.Millisecond) - handler.Reply(m, map[string]string{"result": "ok"}, 
false) - } - }() - + // Send dump call and ping back-to-back. sendMsg(t, cm5, protoCall{ - T: "call", ID: "slow-1", Topic: []string{"rpc", "hal", "dump"}, + T: "call", ID: "dump-1", Topic: []string{"rpc", "hal", "dump"}, Payload: json.RawMessage(`{}`), TimeoutMs: 1000, }) - sendMsg(t, cm5, protoPing{T: "ping", TS: 77, SID: "s1"}) + sendMsg(t, cm5, protoPing{T: "ping", TS: 77, SID: testCM5SID}) type readResult struct { line []byte err error } - first := make(chan readResult, 1) - go func() { - line, err := cm5.ReadLine() - first <- readResult{line: line, err: err} - }() - - select { - case res := <-first: - if res.err != nil { - t.Fatalf("ReadLine: %v", res.err) - } - if got := protoType(res.line); got != "pong" { - t.Fatalf("first response type = %q, want pong", got) - } - var pong protoPong - if err := json.Unmarshal(res.line, &pong); err != nil { - t.Fatalf("Unmarshal pong: %v", err) - } - if pong.TS != 77 || pong.SID == "" { - t.Fatalf("bad pong: %+v", pong) + // Both should arrive — dump reply and pong, in either order. + var gotReply, gotPong bool + for i := 0; i < 2; i++ { + msg := readMsg[protoMsg](t, cm5) + switch msg.T { + case msgReply: + gotReply = true + case msgPong: + gotPong = true + default: + t.Fatalf("unexpected message type %q", msg.T) } - case <-time.After(150 * time.Millisecond): - t.Fatal("ping blocked behind slow call") } - - reply := readMsg[protoReply](t, cm5) - if reply.Corr != "slow-1" { - t.Errorf("corr = %q", reply.Corr) + if !gotReply { + t.Error("missing dump reply") } - if !reply.OK { - t.Errorf("reply not ok: %s", reply.Err) + if !gotPong { + t.Error("missing pong") } } diff --git a/services/fabric/remap.go b/services/fabric/remap.go index 13ee063..175c44d 100644 --- a/services/fabric/remap.go +++ b/services/fabric/remap.go @@ -10,10 +10,10 @@ import "devicecode-go/bus" // config side. 
// // CM5 -> MCU wire publish: -// ["config","device"] -> config/device +// ["config","device"] -> config/hal (with Lua empty-table normalization) // // CM5 -> MCU wire call: -// ["rpc","hal","dump"] -> rpc/hal/dump +// ["rpc","hal","dump"] -> handled directly by session (not via import rules) // // MCU local bus publish -> wire: // hal/cap/env/# -> ["state","env",...] @@ -34,16 +34,12 @@ type busExportRule struct { var importPublishRules = []importRule{ { wire: []string{"config", "device"}, - local: []string{"config", "device"}, + local: []string{"config", "hal"}, }, } -var importCallRules = []importRule{ - { - wire: []string{"rpc", "hal", "dump"}, - local: []string{"rpc", "hal", "dump"}, - }, -} +// rpc/hal/dump is handled directly by onCall, not via import rules. +var importCallRules = []importRule{} var exportPublishRules = []busExportRule{ { diff --git a/services/fabric/session.go b/services/fabric/session.go index 2a7c74f..ed4b2a1 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -95,9 +95,16 @@ type session struct { exportSubs []*bus.Subscription exportCallSubs []*bus.Subscription + halStateSub *bus.Subscription inboundCalls []*inboundCall outboundCalls []*outboundCall nextOutboundID uint64 + + // Config state — tracks config/device → config/hal translation. + lastHALState *types.HALState + configApplied bool + configCount int + lastConfigErr string } func (s *session) log(msg string) { @@ -211,6 +218,7 @@ func (s *session) run(ctx context.Context) { resetTimer(stale, staleTimeout) case <-exportTick.C: + s.drainHALState() s.drainExports() s.drainInbound(time.Now()) s.drainOutbound(time.Now()) @@ -495,6 +503,23 @@ func (s *session) onPub(msg *protoMsg) { s.log("incoming pub dropped: no_route") return } + + // config/device → config/hal: normalize and track. 
+ if topicEquals(localTopic, tConfigHAL) { + cfg, err := decodeHALConfig(msg.Payload) + if err != "" { + s.lastConfigErr = err + s.log("config/device rejected: " + err) + return + } + s.configApplied = true + s.configCount++ + s.lastConfigErr = "" + s.log("config/device applied to config/hal") + s.conn.Publish(s.conn.NewMessage(localTopic, cfg, true)) + return + } + s.conn.Publish(s.conn.NewMessage(localTopic, msg.Payload, msg.Retain)) } @@ -508,6 +533,22 @@ func (s *session) onUnretain(msg *protoMsg) { } func (s *session) onCall(msg *protoMsg) { + // rpc/hal/dump: handle directly — reply with config and HAL state. + if slicesEqualStrings(msg.Topic, dumpCallTopic) { + s.drainHALState() + reply := dumpReply{ + OK: true, + Method: "dump", + Echo: decodePayload(msg.Payload), + HAL: s.lastHALState, + Applied: s.configApplied, + ConfigCount: s.configCount, + ConfigError: s.lastConfigErr, + } + s.sendFrame(marshal(protoReply{T: msgReply, Corr: msg.ID, OK: true, Payload: mustMarshal(reply)})) + return + } + localTopic := importCallTopic(msg.Topic) if localTopic == nil { s.log("incoming call dropped: no_route") @@ -567,6 +608,65 @@ func checkBusError(payload any) string { return "" } +func mustMarshal(v any) json.RawMessage { + b, err := json.Marshal(v) + if err != nil { + return json.RawMessage(`{"error":"marshal_failed"}`) + } + return json.RawMessage(b) +} + +// ---- config/dump types and helpers ---- + +var ( + tConfigHAL = bus.T("config", "hal") + dumpCallTopic = []string{"rpc", "hal", "dump"} +) + +type dumpReply struct { + OK bool `json:"ok"` + Method string `json:"method"` + Echo any `json:"echo,omitempty"` + HAL *types.HALState `json:"hal,omitempty"` + Applied bool `json:"applied"` + ConfigCount int `json:"config_count,omitempty"` + ConfigError string `json:"config_error,omitempty"` +} + +func topicEquals(t bus.Topic, expected bus.Topic) bool { + if t.Len() != expected.Len() { + return false + } + for i := 0; i < t.Len(); i++ { + a, _ := t.At(i).(string) + b, _ := 
expected.At(i).(string) + if a != b { + return false + } + } + return true +} + +func (s *session) drainHALState() { + if s.halStateSub == nil { + return + } + for { + select { + case msg, ok := <-s.halStateSub.Channel(): + if !ok || msg == nil { + return + } + if st, ok := decodeHALState(msg.Payload); ok { + stCopy := st + s.lastHALState = &stCopy + } + default: + return + } + } +} + func marshalPayload(payload any) (json.RawMessage, error) { b, err := json.Marshal(payload) if err != nil { @@ -590,6 +690,7 @@ func (s *session) setupExports() { for _, p := range exportCallPatterns() { s.exportCallSubs = append(s.exportCallSubs, s.conn.Subscribe(p)) } + s.halStateSub = s.conn.Subscribe(bus.T("hal", "state")) } func (s *session) teardownExports() { @@ -601,6 +702,10 @@ func (s *session) teardownExports() { s.conn.Unsubscribe(sub) } s.exportCallSubs = nil + if s.halStateSub != nil { + s.conn.Unsubscribe(s.halStateSub) + s.halStateSub = nil + } } func (s *session) teardownInbound() { From 36e071f4c7191219db568343c893f8f4e6921c28 Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 2 Apr 2026 21:03:21 +0000 Subject: [PATCH 22/65] refactor: move types and vars to top of session.go, drain HAL state only on dump MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Move dumpReply, inboundCall, outboundCall, readResult, linkStatePayload types and tConfigHAL/dumpCallTopic vars to the top of the file with other declarations. Remove drainHALState from the 50ms export tick — it only needs to run when a dump call arrives, not every tick. 
--- services/fabric/session.go | 109 +++++++++++++++++++------------------ 1 file changed, 55 insertions(+), 54 deletions(-) diff --git a/services/fabric/session.go b/services/fabric/session.go index ed4b2a1..8fdd473 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -70,6 +70,61 @@ const ( reasonTimeout = "timeout" ) +// ---- bus topics for config handling ---- + +var ( + tConfigHAL = bus.T("config", "hal") + dumpCallTopic = []string{"rpc", "hal", "dump"} +) + +// ---- types ---- + +type dumpReply struct { + OK bool `json:"ok"` + Method string `json:"method"` + Echo any `json:"echo,omitempty"` + HAL *types.HALState `json:"hal,omitempty"` + Applied bool `json:"applied"` + ConfigCount int `json:"config_count,omitempty"` + ConfigError string `json:"config_error,omitempty"` +} + +type inboundCall struct { + id string + sub *bus.Subscription + deadline time.Time +} + +type outboundCall struct { + id string + req *bus.Message + deadline time.Time +} + +type readResult struct { + line []byte + err error +} + +type linkStatePayload struct { + LinkID string `json:"link_id"` + Status string `json:"status"` + Ready bool `json:"ready"` + Established bool `json:"established"` + PeerID string `json:"peer_id"` + LocalSID string `json:"local_sid"` + PeerSID string `json:"peer_sid,omitempty"` + PeerNode string `json:"peer_node,omitempty"` + PeerProto int `json:"peer_proto,omitempty"` + LastRxUnixMilli int64 `json:"last_rx_unix_ms,omitempty"` + LastTxUnixMilli int64 `json:"last_tx_unix_ms,omitempty"` + LastPongUnixMilli int64 `json:"last_pong_unix_ms,omitempty"` + InboundCalls int `json:"inbound_calls"` + OutboundCalls int `json:"outbound_calls"` + Reason string `json:"reason,omitempty"` + Err string `json:"err,omitempty"` +} + // session manages the fabric link state machine over a Transport. // // All bus access happens in the main loop goroutine only. 
TinyGo's @@ -115,42 +170,6 @@ func (s *session) logKV(msg, key, value string) { println("[fabric]", "sid", s.localSID, msg, key, value) } -type inboundCall struct { - id string - sub *bus.Subscription - deadline time.Time -} - -type outboundCall struct { - id string - req *bus.Message - deadline time.Time -} - -type readResult struct { - line []byte - err error -} - -type linkStatePayload struct { - LinkID string `json:"link_id"` - Status string `json:"status"` - Ready bool `json:"ready"` - Established bool `json:"established"` - PeerID string `json:"peer_id"` - LocalSID string `json:"local_sid"` - PeerSID string `json:"peer_sid,omitempty"` - PeerNode string `json:"peer_node,omitempty"` - PeerProto int `json:"peer_proto,omitempty"` - LastRxUnixMilli int64 `json:"last_rx_unix_ms,omitempty"` - LastTxUnixMilli int64 `json:"last_tx_unix_ms,omitempty"` - LastPongUnixMilli int64 `json:"last_pong_unix_ms,omitempty"` - InboundCalls int `json:"inbound_calls"` - OutboundCalls int `json:"outbound_calls"` - Reason string `json:"reason,omitempty"` - Err string `json:"err,omitempty"` -} - // run is the main loop. Blocks until ctx is cancelled. 
func (s *session) run(ctx context.Context) { lines := make(chan readResult, 1) @@ -218,7 +237,6 @@ func (s *session) run(ctx context.Context) { resetTimer(stale, staleTimeout) case <-exportTick.C: - s.drainHALState() s.drainExports() s.drainInbound(time.Now()) s.drainOutbound(time.Now()) @@ -616,23 +634,6 @@ func mustMarshal(v any) json.RawMessage { return json.RawMessage(b) } -// ---- config/dump types and helpers ---- - -var ( - tConfigHAL = bus.T("config", "hal") - dumpCallTopic = []string{"rpc", "hal", "dump"} -) - -type dumpReply struct { - OK bool `json:"ok"` - Method string `json:"method"` - Echo any `json:"echo,omitempty"` - HAL *types.HALState `json:"hal,omitempty"` - Applied bool `json:"applied"` - ConfigCount int `json:"config_count,omitempty"` - ConfigError string `json:"config_error,omitempty"` -} - func topicEquals(t bus.Topic, expected bus.Topic) bool { if t.Len() != expected.Len() { return false From ec543256f8a79cde28253f78b9cdb6c5dd3656b1 Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 2 Apr 2026 21:07:14 +0000 Subject: [PATCH 23/65] refactor: remove halStateSub, read HAL state on demand in dump handler Replace the persistent halStateSub subscription and drainHALState polling loop with a one-shot subscribe/read/unsubscribe in onCall when a dump is requested. The hal/state topic is retained on the bus, so the subscribe immediately delivers the latest value. Removes halStateSub and lastHALState from the session struct. 
--- services/fabric/session.go | 43 +++++++++++++------------------------- 1 file changed, 14 insertions(+), 29 deletions(-) diff --git a/services/fabric/session.go b/services/fabric/session.go index 8fdd473..474c08b 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -150,13 +150,11 @@ type session struct { exportSubs []*bus.Subscription exportCallSubs []*bus.Subscription - halStateSub *bus.Subscription inboundCalls []*inboundCall outboundCalls []*outboundCall nextOutboundID uint64 // Config state — tracks config/device → config/hal translation. - lastHALState *types.HALState configApplied bool configCount int lastConfigErr string @@ -553,12 +551,24 @@ func (s *session) onUnretain(msg *protoMsg) { func (s *session) onCall(msg *protoMsg) { // rpc/hal/dump: handle directly — reply with config and HAL state. if slicesEqualStrings(msg.Topic, dumpCallTopic) { - s.drainHALState() + var halState *types.HALState + sub := s.conn.Subscribe(bus.T("hal", "state")) + select { + case m := <-sub.Channel(): + if m != nil { + if st, ok := decodeHALState(m.Payload); ok { + halState = &st + } + } + default: + } + s.conn.Unsubscribe(sub) + reply := dumpReply{ OK: true, Method: "dump", Echo: decodePayload(msg.Payload), - HAL: s.lastHALState, + HAL: halState, Applied: s.configApplied, ConfigCount: s.configCount, ConfigError: s.lastConfigErr, @@ -648,26 +658,6 @@ func topicEquals(t bus.Topic, expected bus.Topic) bool { return true } -func (s *session) drainHALState() { - if s.halStateSub == nil { - return - } - for { - select { - case msg, ok := <-s.halStateSub.Channel(): - if !ok || msg == nil { - return - } - if st, ok := decodeHALState(msg.Payload); ok { - stCopy := st - s.lastHALState = &stCopy - } - default: - return - } - } -} - func marshalPayload(payload any) (json.RawMessage, error) { b, err := json.Marshal(payload) if err != nil { @@ -691,7 +681,6 @@ func (s *session) setupExports() { for _, p := range exportCallPatterns() { s.exportCallSubs = 
append(s.exportCallSubs, s.conn.Subscribe(p)) } - s.halStateSub = s.conn.Subscribe(bus.T("hal", "state")) } func (s *session) teardownExports() { @@ -703,10 +692,6 @@ func (s *session) teardownExports() { s.conn.Unsubscribe(sub) } s.exportCallSubs = nil - if s.halStateSub != nil { - s.conn.Unsubscribe(s.halStateSub) - s.halStateSub = nil - } } func (s *session) teardownInbound() { From 31762fd293be67c59c1aedf47f10cfe4dc9908a1 Mon Sep 17 00:00:00 2001 From: cpunt Date: Wed, 8 Apr 2026 14:50:58 +0000 Subject: [PATCH 24/65] feat: add firmware transfer receive over fabric protocol Implements the MCU-side receive path for binary file/firmware transfer over fabric. Uses chunked streaming with CRC32 per-chunk and SHA256 whole-file verification. The RP2350 sink delegates to ab-bringup/abupdate for A/B slot writes and reboot. --- go.mod | 3 + services/fabric/fabric.go | 13 +- services/fabric/fabric_test.go | 7 + services/fabric/protocol.go | 100 +++++++- services/fabric/session.go | 31 ++- services/fabric/transfer.go | 310 +++++++++++++++++++++++ services/fabric/transfer_sink_rp2350.go | 71 ++++++ services/fabric/transfer_sink_stub.go | 18 ++ services/fabric/transfer_test.go | 314 ++++++++++++++++++++++++ 9 files changed, 846 insertions(+), 21 deletions(-) create mode 100644 services/fabric/transfer.go create mode 100644 services/fabric/transfer_sink_rp2350.go create mode 100644 services/fabric/transfer_sink_stub.go create mode 100644 services/fabric/transfer_test.go diff --git a/go.mod b/go.mod index f60f86a..424f179 100644 --- a/go.mod +++ b/go.mod @@ -3,9 +3,12 @@ module devicecode-go go 1.25.0 require ( + ab-bringup v0.0.0 github.com/jangala-dev/tinygo-uartx v0.0.0-20251028085354-58b6258234b3 golang.org/x/exp v0.0.0-20251002181428-27f1f14c8bb9 tinygo.org/x/drivers v0.33.0 ) require github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 // indirect + +replace ab-bringup => ../pico2-a-b diff --git a/services/fabric/fabric.go b/services/fabric/fabric.go index 
d0e3cd9..0db1b6e 100644 --- a/services/fabric/fabric.go +++ b/services/fabric/fabric.go @@ -32,12 +32,13 @@ func newLocalSID() string { // arrives within the timeout. func Run(ctx context.Context, tr Transport, conn *bus.Connection, nodeID, peerID string) { s := session{ - linkID: defaultLinkID, - nodeID: nodeID, - peerID: peerID, - localSID: newLocalSID(), - tr: tr, - conn: conn, + linkID: defaultLinkID, + nodeID: nodeID, + peerID: peerID, + localSID: newLocalSID(), + tr: tr, + conn: conn, + transferFactory: newTransferFactory(), } s.run(ctx) } diff --git a/services/fabric/fabric_test.go b/services/fabric/fabric_test.go index f0816c6..48e2497 100644 --- a/services/fabric/fabric_test.go +++ b/services/fabric/fabric_test.go @@ -121,6 +121,13 @@ func TestCodecAllTypes(t *testing.T) { {protoUnretain{T: "unretain", Topic: []string{"a"}}, "unretain"}, {protoCall{T: "call", ID: "c1"}, "call"}, {protoReply{T: "reply", Corr: "c1", OK: true}, "reply"}, + {protoXferBegin{T: "xfer_begin", ID: "x1"}, "xfer_begin"}, + {protoXferReady{T: "xfer_ready", ID: "x1", OK: true}, "xfer_ready"}, + {protoXferChunk{T: "xfer_chunk", ID: "x1"}, "xfer_chunk"}, + {protoXferNeed{T: "xfer_need", ID: "x1"}, "xfer_need"}, + {protoXferCommit{T: "xfer_commit", ID: "x1"}, "xfer_commit"}, + {protoXferDone{T: "xfer_done", ID: "x1", OK: true}, "xfer_done"}, + {protoXferAbort{T: "xfer_abort", ID: "x1", Reason: "aborted"}, "xfer_abort"}, } { b := marshal(tc.v) if got := protoType(b[:len(b)-1]); got != tc.want { diff --git a/services/fabric/protocol.go b/services/fabric/protocol.go index 6d9e153..70aea53 100644 --- a/services/fabric/protocol.go +++ b/services/fabric/protocol.go @@ -5,14 +5,21 @@ import "encoding/json" // ---- Wire message type identifiers (fabric.md §4) ---- const ( - msgHello = "hello" - msgHelloAck = "hello_ack" - msgPing = "ping" - msgPong = "pong" - msgPub = "pub" - msgUnretain = "unretain" - msgCall = "call" - msgReply = "reply" + msgHello = "hello" + msgHelloAck = "hello_ack" + 
msgPing = "ping" + msgPong = "pong" + msgPub = "pub" + msgUnretain = "unretain" + msgCall = "call" + msgReply = "reply" + msgXferBegin = "xfer_begin" + msgXferReady = "xfer_ready" + msgXferChunk = "xfer_chunk" + msgXferNeed = "xfer_need" + msgXferCommit = "xfer_commit" + msgXferDone = "xfer_done" + msgXferAbort = "xfer_abort" ) // ---- Wire message structs ---- @@ -83,6 +90,66 @@ type protoReply struct { Err string `json:"err,omitempty"` } +type protoXferBegin struct { + T string `json:"t"` + ID string `json:"id"` + Kind string `json:"kind"` + Name string `json:"name"` + Format string `json:"format"` + Enc string `json:"enc"` + Size uint32 `json:"size"` + ChunkRaw uint32 `json:"chunk_raw"` + Chunks uint32 `json:"chunks"` + SHA256 string `json:"sha256"` + Meta json.RawMessage `json:"meta,omitempty"` +} + +type protoXferReady struct { + T string `json:"t"` + ID string `json:"id"` + OK bool `json:"ok"` + Next *uint32 `json:"next,omitempty"` + Err string `json:"err,omitempty"` +} + +type protoXferChunk struct { + T string `json:"t"` + ID string `json:"id"` + Seq uint32 `json:"seq"` + Off uint32 `json:"off"` + N uint32 `json:"n"` + CRC32 string `json:"crc32"` + Data string `json:"data"` +} + +type protoXferNeed struct { + T string `json:"t"` + ID string `json:"id"` + Next uint32 `json:"next"` + Err string `json:"err,omitempty"` +} + +type protoXferCommit struct { + T string `json:"t"` + ID string `json:"id"` + Size uint32 `json:"size"` + SHA256 string `json:"sha256"` +} + +type protoXferDone struct { + T string `json:"t"` + ID string `json:"id"` + OK bool `json:"ok"` + Info json.RawMessage `json:"info,omitempty"` + Err string `json:"err,omitempty"` +} + +type protoXferAbort struct { + T string `json:"t"` + ID string `json:"id"` + Reason string `json:"reason"` +} + // protoMsg is a union struct for single-pass unmarshal in dispatch. // Fields are the superset of all message types. Only the fields // relevant to the T value are populated; the rest are zero. 
@@ -102,6 +169,23 @@ type protoMsg struct { Corr string `json:"corr,omitempty"` TimeoutMs int `json:"timeout_ms,omitempty"` Err string `json:"err,omitempty"` + Kind string `json:"kind,omitempty"` + Name string `json:"name,omitempty"` + Format string `json:"format,omitempty"` + Enc string `json:"enc,omitempty"` + Size uint32 `json:"size,omitempty"` + ChunkRaw uint32 `json:"chunk_raw,omitempty"` + Chunks uint32 `json:"chunks,omitempty"` + SHA256 string `json:"sha256,omitempty"` + Meta json.RawMessage `json:"meta,omitempty"` + Seq uint32 `json:"seq,omitempty"` + Off uint32 `json:"off,omitempty"` + N uint32 `json:"n,omitempty"` + CRC32 string `json:"crc32,omitempty"` + Data string `json:"data,omitempty"` + Next uint32 `json:"next,omitempty"` + Reason string `json:"reason,omitempty"` + Info json.RawMessage `json:"info,omitempty"` } // ---- codec helpers ---- diff --git a/services/fabric/session.go b/services/fabric/session.go index 474c08b..868c802 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -148,11 +148,13 @@ type session struct { exportReadyAt time.Time exportsEnabled bool - exportSubs []*bus.Subscription - exportCallSubs []*bus.Subscription - inboundCalls []*inboundCall - outboundCalls []*outboundCall - nextOutboundID uint64 + exportSubs []*bus.Subscription + exportCallSubs []*bus.Subscription + inboundCalls []*inboundCall + outboundCalls []*outboundCall + nextOutboundID uint64 + incomingTransfer *incomingTransfer + transferFactory transferFactory // Config state — tracks config/device → config/hal translation. configApplied bool @@ -170,6 +172,10 @@ func (s *session) logKV(msg, key, value string) { // run is the main loop. Blocks until ctx is cancelled. 
func (s *session) run(ctx context.Context) { + if s.transferFactory == nil { + s.transferFactory = newTransferFactory() + } + lines := make(chan readResult, 1) go func() { @@ -201,6 +207,7 @@ func (s *session) run(ctx context.Context) { defer s.teardownExports() defer s.teardownInbound() defer s.teardownOutbound(reasonLinkDown) + defer s.abortTransfer(reasonLinkDown) defer s.log("run stop") stale := time.NewTimer(staleTimeout) @@ -330,6 +337,7 @@ func (s *session) handleLinkDown(reason, err string) { s.teardownExports() s.teardownInbound() s.teardownOutbound(pendingReason) + s.abortTransfer(pendingReason) s.publishLinkState(reason, err) if err != "" { s.logKV("link down", "err", err) @@ -341,11 +349,12 @@ func (s *session) handleLinkDown(reason, err string) { // promoteLink transitions to linkUp, tearing down any prior session state. func (s *session) promoteLink(reason string) { if s.link == linkUp { - s.teardownExports() - s.teardownInbound() if reason == "" { reason = reasonPeerReset } + s.abortTransfer(reason) + s.teardownExports() + s.teardownInbound() s.teardownOutbound(reason) } s.link = linkUp @@ -403,6 +412,14 @@ func (s *session) dispatch(line []byte) { s.onCall(&msg) case msgReply: s.onReply(&msg) + case msgXferBegin: + s.onTransferBegin(&msg) + case msgXferChunk: + s.onTransferChunk(&msg) + case msgXferCommit: + s.onTransferCommit(&msg) + case msgXferAbort: + s.onTransferAbort(&msg) default: s.logKV("unknown message type dropped", "type", msg.T) } diff --git a/services/fabric/transfer.go b/services/fabric/transfer.go new file mode 100644 index 0000000..180081b --- /dev/null +++ b/services/fabric/transfer.go @@ -0,0 +1,310 @@ +package fabric + +import ( + "crypto/sha256" + "encoding/base64" + "encoding/hex" + "encoding/json" + "fmt" + "hash" + "hash/crc32" + "strings" + "time" +) + +const postTransferDoneSettle = 10 * time.Millisecond + +type transferMeta struct { + ID string + Kind string + Name string + Format string + Enc string + Size uint32 + 
ChunkRaw uint32 + Chunks uint32 + SHA256 string + Meta json.RawMessage +} + +type transferInfo struct { + BytesWritten uint32 `json:"bytes_written,omitempty"` + SlotXIPAddr uint32 `json:"slot_xip_addr,omitempty"` +} + +func (i transferInfo) isZero() bool { + return i.BytesWritten == 0 && i.SlotXIPAddr == 0 +} + +type transferFactory interface { + Begin(meta transferMeta) (transferSink, error) +} + +type transferSink interface { + WriteChunk(seq, off uint32, data []byte) error + Commit() (transferInfo, error) + Apply() error + Abort(reason string) error +} + +type incomingTransfer struct { + meta transferMeta + sink transferSink + expectedNext uint32 + bytesWritten uint32 + chunksSeen uint32 + hasher hash.Hash +} + +func lowerHex(s string) string { + return strings.ToLower(strings.TrimSpace(s)) +} + +func crc32Hex(data []byte) string { + return fmt.Sprintf("%08x", crc32.ChecksumIEEE(data)) +} + +func sha256Hex(h hash.Hash) string { + sum := h.Sum(nil) + return hex.EncodeToString(sum) +} + +func readyNext(v uint32) *uint32 { + return &v +} + +func infoPayload(info transferInfo) json.RawMessage { + if info.isZero() { + return nil + } + b, err := json.Marshal(info) + if err != nil { + return nil + } + return json.RawMessage(b) +} + +func (s *session) sendTransferReady(id string, ok bool, next *uint32, errStr string) bool { + return s.sendFrame(marshal(protoXferReady{ + T: msgXferReady, + ID: id, + OK: ok, + Next: next, + Err: errStr, + })) +} + +func (s *session) sendTransferNeed(id string, next uint32, errStr string) bool { + return s.sendFrame(marshal(protoXferNeed{ + T: msgXferNeed, + ID: id, + Next: next, + Err: errStr, + })) +} + +func (s *session) sendTransferDone(id string, ok bool, info transferInfo, errStr string) bool { + return s.sendFrame(marshal(protoXferDone{ + T: msgXferDone, + ID: id, + OK: ok, + Info: infoPayload(info), + Err: errStr, + })) +} + +func (s *session) sendTransferAbort(id, reason string) bool { + return s.sendFrame(marshal(protoXferAbort{ 
+ T: msgXferAbort, + ID: id, + Reason: reason, + })) +} + +func (s *session) clearTransfer() *incomingTransfer { + cur := s.incomingTransfer + s.incomingTransfer = nil + return cur +} + +func (s *session) abortTransfer(reason string) { + cur := s.clearTransfer() + if cur == nil { + return + } + if err := cur.sink.Abort(reason); err != nil { + s.logKV("transfer abort failed", "err", err.Error()) + } +} + +func validateTransferBegin(msg *protoMsg) (transferMeta, string) { + if msg.ID == "" { + return transferMeta{}, "xfer_begin.id" + } + if msg.Kind == "" { + return transferMeta{}, "xfer_begin.kind" + } + if msg.Name == "" { + return transferMeta{}, "xfer_begin.name" + } + if msg.Format == "" { + return transferMeta{}, "xfer_begin.format" + } + if msg.Enc == "" { + return transferMeta{}, "xfer_begin.enc" + } + if msg.Size == 0 { + return transferMeta{}, "xfer_begin.size" + } + if msg.ChunkRaw == 0 { + return transferMeta{}, "xfer_begin.chunk_raw" + } + if msg.Chunks == 0 { + return transferMeta{}, "xfer_begin.chunks" + } + if msg.SHA256 == "" { + return transferMeta{}, "xfer_begin.sha256" + } + return transferMeta{ + ID: msg.ID, + Kind: msg.Kind, + Name: msg.Name, + Format: msg.Format, + Enc: msg.Enc, + Size: msg.Size, + ChunkRaw: msg.ChunkRaw, + Chunks: msg.Chunks, + SHA256: lowerHex(msg.SHA256), + Meta: append(json.RawMessage(nil), msg.Meta...), + }, "" +} + +func (s *session) onTransferBegin(msg *protoMsg) { + meta, errStr := validateTransferBegin(msg) + if errStr != "" { + if msg.ID != "" { + s.sendTransferReady(msg.ID, false, nil, "bad_message: "+errStr) + } + s.logKV("xfer_begin dropped", "err", errStr) + return + } + if s.incomingTransfer != nil { + s.sendTransferReady(meta.ID, false, nil, "busy") + return + } + if meta.Enc != "b64url" { + s.sendTransferReady(meta.ID, false, nil, "unsupported_encoding") + return + } + if s.transferFactory == nil { + s.transferFactory = newTransferFactory() + } + sink, err := s.transferFactory.Begin(meta) + if err != nil { + 
s.sendTransferReady(meta.ID, false, nil, err.Error()) + return + } + s.incomingTransfer = &incomingTransfer{ + meta: meta, + sink: sink, + hasher: sha256.New(), + } + s.sendTransferReady(meta.ID, true, readyNext(0), "") +} + +func (s *session) onTransferChunk(msg *protoMsg) { + cur := s.incomingTransfer + if cur == nil || cur.meta.ID != msg.ID { + s.logKV("xfer_chunk dropped", "id", msg.ID) + return + } + if msg.Seq != cur.expectedNext { + s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "unexpected_seq") + return + } + if msg.Off != cur.bytesWritten { + s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "unexpected_offset") + return + } + raw, err := base64.RawURLEncoding.DecodeString(msg.Data) + if err != nil { + s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "decode_failed") + return + } + if uint32(len(raw)) != msg.N || msg.N == 0 { + s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "size_mismatch") + return + } + if crc32Hex(raw) != lowerHex(msg.CRC32) { + s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "bad_crc") + return + } + if cur.bytesWritten+uint32(len(raw)) > cur.meta.Size { + s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "size_mismatch") + return + } + if err := cur.sink.WriteChunk(msg.Seq, msg.Off, raw); err != nil { + _ = cur.sink.Abort(err.Error()) + s.clearTransfer() + s.sendTransferDone(cur.meta.ID, false, transferInfo{}, err.Error()) + return + } + _, _ = cur.hasher.Write(raw) + cur.expectedNext++ + cur.bytesWritten += uint32(len(raw)) + cur.chunksSeen++ + s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "") +} + +func (s *session) onTransferCommit(msg *protoMsg) { + cur := s.incomingTransfer + if cur == nil || cur.meta.ID != msg.ID { + s.logKV("xfer_commit dropped", "id", msg.ID) + return + } + if msg.Size != cur.meta.Size || cur.bytesWritten != cur.meta.Size { + _ = cur.sink.Abort("size_mismatch") + s.clearTransfer() + s.sendTransferDone(cur.meta.ID, false, transferInfo{}, "size_mismatch") + return + } + if lowerHex(msg.SHA256) != 
cur.meta.SHA256 || sha256Hex(cur.hasher) != cur.meta.SHA256 { + _ = cur.sink.Abort("sha256_mismatch") + s.clearTransfer() + s.sendTransferDone(cur.meta.ID, false, transferInfo{}, "sha256_mismatch") + return + } + info, err := cur.sink.Commit() + if err != nil { + _ = cur.sink.Abort(err.Error()) + s.clearTransfer() + s.sendTransferDone(cur.meta.ID, false, transferInfo{}, err.Error()) + return + } + sink := cur.sink + id := cur.meta.ID + s.clearTransfer() + if !s.sendTransferDone(id, true, info, "") { + return + } + time.Sleep(postTransferDoneSettle) + if err := sink.Apply(); err != nil { + s.logKV("transfer apply failed", "err", err.Error()) + } +} + +func (s *session) onTransferAbort(msg *protoMsg) { + cur := s.incomingTransfer + if cur == nil || cur.meta.ID != msg.ID { + s.logKV("xfer_abort dropped", "id", msg.ID) + return + } + reason := msg.Reason + if reason == "" { + reason = "remote_abort" + } + if err := cur.sink.Abort(reason); err != nil { + s.logKV("transfer abort failed", "err", err.Error()) + } + s.clearTransfer() +} diff --git a/services/fabric/transfer_sink_rp2350.go b/services/fabric/transfer_sink_rp2350.go new file mode 100644 index 0000000..705e80b --- /dev/null +++ b/services/fabric/transfer_sink_rp2350.go @@ -0,0 +1,71 @@ +//go:build tinygo && rp2350 + +package fabric + +import ( + "errors" + "fmt" + + "ab-bringup/abupdate" +) + +var errTransferUnsupported = errors.New("unsupported") + +type rp2350TransferFactory struct{} + +type rp2350TransferSink struct { + updater *abupdate.Updater +} + +func newTransferFactory() transferFactory { + return rp2350TransferFactory{} +} + +func (rp2350TransferFactory) Begin(meta transferMeta) (transferSink, error) { + if meta.Kind != "firmware.rp2350" || meta.Format != "bin" { + return nil, errTransferUnsupported + } + + var updater abupdate.Updater + if rc := updater.Init(); rc != 0 { + return nil, fmt.Errorf("updater_init:%d", rc) + } + if rc := updater.BeginUpdate(meta.Size); rc != 0 { + return nil, 
fmt.Errorf("begin_update:%d", rc) + } + + return &rp2350TransferSink{updater: &updater}, nil +} + +func (s *rp2350TransferSink) WriteChunk(seq, off uint32, data []byte) error { + _ = seq + if got := s.updater.BytesWritten(); got != off { + return fmt.Errorf("unexpected_offset:%d", got) + } + if rc := s.updater.WriteChunk(data); rc != 0 { + return fmt.Errorf("write_chunk:%d", rc) + } + return nil +} + +func (s *rp2350TransferSink) Commit() (transferInfo, error) { + if rc := s.updater.FlushFinal(); rc != 0 { + return transferInfo{}, fmt.Errorf("flush_final:%d", rc) + } + return transferInfo{ + BytesWritten: s.updater.BytesWritten(), + SlotXIPAddr: s.updater.SlotStorageAddr(), + }, nil +} + +func (s *rp2350TransferSink) Apply() error { + if rc := s.updater.RebootIntoSlot(); rc != 0 { + return fmt.Errorf("reboot:%d", rc) + } + return nil +} + +func (s *rp2350TransferSink) Abort(reason string) error { + _ = reason + return nil +} diff --git a/services/fabric/transfer_sink_stub.go b/services/fabric/transfer_sink_stub.go new file mode 100644 index 0000000..038ce7f --- /dev/null +++ b/services/fabric/transfer_sink_stub.go @@ -0,0 +1,18 @@ +//go:build !(tinygo && rp2350) + +package fabric + +import "errors" + +var errTransferUnsupported = errors.New("unsupported") + +type unsupportedTransferFactory struct{} + +func newTransferFactory() transferFactory { + return unsupportedTransferFactory{} +} + +func (unsupportedTransferFactory) Begin(meta transferMeta) (transferSink, error) { + _ = meta + return nil, errTransferUnsupported +} diff --git a/services/fabric/transfer_test.go b/services/fabric/transfer_test.go new file mode 100644 index 0000000..7af5a59 --- /dev/null +++ b/services/fabric/transfer_test.go @@ -0,0 +1,314 @@ +package fabric + +import ( + "context" + "crypto/sha256" + "encoding/base64" + "encoding/hex" + "encoding/json" + "strings" + "testing" + "time" + + "devicecode-go/bus" +) + +type fakeTransferFactory struct { + beginMeta transferMeta + beginErr error + sink 
*fakeTransferSink +} + +func (f *fakeTransferFactory) Begin(meta transferMeta) (transferSink, error) { + f.beginMeta = meta + if f.beginErr != nil { + return nil, f.beginErr + } + if f.sink == nil { + f.sink = &fakeTransferSink{} + } + return f.sink, nil +} + +type fakeTransferSink struct { + seqs []uint32 + offs []uint32 + writes [][]byte + writeErr error + commitErr error + applyErr error + commitInfo transferInfo + committed bool + applied bool + abortReasons []string +} + +func (s *fakeTransferSink) WriteChunk(seq, off uint32, data []byte) error { + if s.writeErr != nil { + return s.writeErr + } + s.seqs = append(s.seqs, seq) + s.offs = append(s.offs, off) + s.writes = append(s.writes, append([]byte(nil), data...)) + return nil +} + +func (s *fakeTransferSink) Commit() (transferInfo, error) { + if s.commitErr != nil { + return transferInfo{}, s.commitErr + } + s.committed = true + return s.commitInfo, nil +} + +func (s *fakeTransferSink) Apply() error { + s.applied = true + return s.applyErr +} + +func (s *fakeTransferSink) Abort(reason string) error { + s.abortReasons = append(s.abortReasons, reason) + return nil +} + +func runSessionWithFactory(ctx context.Context, tr Transport, conn *bus.Connection, factory transferFactory) { + s := session{ + linkID: defaultLinkID, + nodeID: "mcu-1", + peerID: "cm5-local", + localSID: "mcu-sid-test", + tr: tr, + conn: conn, + transferFactory: factory, + } + s.run(ctx) +} + +func rawURL(data []byte) string { + return base64.RawURLEncoding.EncodeToString(data) +} + +func sha256String(data []byte) string { + sum := sha256.Sum256(data) + return hex.EncodeToString(sum[:]) +} + +func TestTransferBeginUnsupportedOnHost(t *testing.T) { + b := newBus() + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") + + bringUp(t, cm5) + + payload := []byte("firmware") + sendMsg(t, cm5, protoXferBegin{ + T: msgXferBegin, + ID: 
"xfer-1", + Kind: "firmware.rp2350", + Name: "fw.bin", + Format: "bin", + Enc: "b64url", + Size: uint32(len(payload)), + ChunkRaw: 4, + Chunks: 2, + SHA256: sha256String(payload), + }) + + ready := readMsg[protoXferReady](t, cm5) + if ready.T != msgXferReady || ready.ID != "xfer-1" || ready.OK { + t.Fatalf("bad xfer_ready: %+v", ready) + } + if ready.Err != "unsupported" { + t.Fatalf("xfer_ready.Err = %q, want unsupported", ready.Err) + } +} + +func TestTransferReceiveSuccess(t *testing.T) { + b := newBus() + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + factory := &fakeTransferFactory{ + sink: &fakeTransferSink{ + commitInfo: transferInfo{ + BytesWritten: 10, + SlotXIPAddr: 0x10280000, + }, + }, + } + + go runSessionWithFactory(ctx, mcu, b.NewConnection("fabric"), factory) + bringUp(t, cm5) + + payload := []byte("abcdefghij") + sendMsg(t, cm5, protoXferBegin{ + T: msgXferBegin, + ID: "xfer-2", + Kind: "firmware.rp2350", + Name: "fw.bin", + Format: "bin", + Enc: "b64url", + Size: uint32(len(payload)), + ChunkRaw: 4, + Chunks: 3, + SHA256: sha256String(payload), + }) + + ready := readMsg[protoXferReady](t, cm5) + if !ready.OK || ready.Next == nil || *ready.Next != 0 { + t.Fatalf("bad xfer_ready: %+v", ready) + } + + parts := [][]byte{ + payload[:4], + payload[4:8], + payload[8:], + } + off := uint32(0) + for i, part := range parts { + sendMsg(t, cm5, protoXferChunk{ + T: msgXferChunk, + ID: "xfer-2", + Seq: uint32(i), + Off: off, + N: uint32(len(part)), + CRC32: crc32Hex(part), + Data: rawURL(part), + }) + need := readMsg[protoXferNeed](t, cm5) + if need.Next != uint32(i+1) || need.Err != "" { + t.Fatalf("bad xfer_need[%d]: %+v", i, need) + } + off += uint32(len(part)) + } + + sendMsg(t, cm5, protoXferCommit{ + T: msgXferCommit, + ID: "xfer-2", + Size: uint32(len(payload)), + SHA256: sha256String(payload), + }) + + done := readMsg[protoXferDone](t, cm5) + if !done.OK || done.ID != "xfer-2" { + t.Fatalf("bad 
xfer_done: %+v", done) + } + var info transferInfo + if err := json.Unmarshal(done.Info, &info); err != nil { + t.Fatalf("unmarshal info: %v", err) + } + if info.BytesWritten != 10 || info.SlotXIPAddr != 0x10280000 { + t.Fatalf("bad transfer info: %+v", info) + } + + time.Sleep(20 * time.Millisecond) + + if got := string(factory.sink.writes[0]) + string(factory.sink.writes[1]) + string(factory.sink.writes[2]); got != string(payload) { + t.Fatalf("sink writes = %q, want %q", got, payload) + } + if !factory.sink.committed { + t.Fatal("sink.Commit was not called") + } + if !factory.sink.applied { + t.Fatal("sink.Apply was not called") + } +} + +func TestTransferChunkBadCRCRequestsResend(t *testing.T) { + b := newBus() + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + factory := &fakeTransferFactory{sink: &fakeTransferSink{}} + go runSessionWithFactory(ctx, mcu, b.NewConnection("fabric"), factory) + bringUp(t, cm5) + + payload := []byte("abcd") + sendMsg(t, cm5, protoXferBegin{ + T: msgXferBegin, + ID: "xfer-3", + Kind: "firmware.rp2350", + Name: "fw.bin", + Format: "bin", + Enc: "b64url", + Size: uint32(len(payload)), + ChunkRaw: 4, + Chunks: 1, + SHA256: sha256String(payload), + }) + _ = readMsg[protoXferReady](t, cm5) + + sendMsg(t, cm5, protoXferChunk{ + T: msgXferChunk, + ID: "xfer-3", + Seq: 0, + Off: 0, + N: uint32(len(payload)), + CRC32: "deadbeef", + Data: rawURL(payload), + }) + + need := readMsg[protoXferNeed](t, cm5) + if need.Next != 0 || need.Err != "bad_crc" { + t.Fatalf("bad xfer_need: %+v", need) + } + if len(factory.sink.writes) != 0 { + t.Fatalf("sink received %d writes, want 0", len(factory.sink.writes)) + } +} + +func TestTransferCommitHashMismatchReturnsDoneError(t *testing.T) { + b := newBus() + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + factory := &fakeTransferFactory{sink: &fakeTransferSink{}} + go runSessionWithFactory(ctx, mcu, 
b.NewConnection("fabric"), factory) + bringUp(t, cm5) + + payload := []byte("abcd") + sendMsg(t, cm5, protoXferBegin{ + T: msgXferBegin, + ID: "xfer-4", + Kind: "firmware.rp2350", + Name: "fw.bin", + Format: "bin", + Enc: "b64url", + Size: uint32(len(payload)), + ChunkRaw: 4, + Chunks: 1, + SHA256: sha256String(payload), + }) + _ = readMsg[protoXferReady](t, cm5) + + sendMsg(t, cm5, protoXferChunk{ + T: msgXferChunk, + ID: "xfer-4", + Seq: 0, + Off: 0, + N: uint32(len(payload)), + CRC32: crc32Hex(payload), + Data: rawURL(payload), + }) + _ = readMsg[protoXferNeed](t, cm5) + + sendMsg(t, cm5, protoXferCommit{ + T: msgXferCommit, + ID: "xfer-4", + Size: uint32(len(payload)), + SHA256: strings.Repeat("0", 64), + }) + + done := readMsg[protoXferDone](t, cm5) + if done.OK || done.Err != "sha256_mismatch" { + t.Fatalf("bad xfer_done: %+v", done) + } + if len(factory.sink.abortReasons) == 0 { + t.Fatal("expected sink abort on hash mismatch") + } +} From 362983be62340c37dcf9332addb9582e79463995 Mon Sep 17 00:00:00 2001 From: cpunt Date: Mon, 13 Apr 2026 11:14:45 +0000 Subject: [PATCH 25/65] chore: bump go toolchain to 1.25.1 --- go.mod | 2 +- go.sum | 2 -- 2 files changed, 1 insertion(+), 3 deletions(-) diff --git a/go.mod b/go.mod index 424f179..307ca29 100644 --- a/go.mod +++ b/go.mod @@ -1,6 +1,6 @@ module devicecode-go -go 1.25.0 +go 1.25.1 require ( ab-bringup v0.0.0 diff --git a/go.sum b/go.sum index b2f089d..a00618c 100644 --- a/go.sum +++ b/go.sum @@ -1,7 +1,5 @@ github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 h1:El6M4kTTCOh6aBiKaUGG7oYTSPP8MxqL4YI3kZKwcP4= github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510/go.mod h1:pupxD2MaaD3pAXIBCelhxNneeOaAeabZDe5s4K6zSpQ= -github.com/jangala-dev/tinygo-uartx v0.0.0-20251008020047-bc80b114e3cc h1:HU2VI0lw5wlu1rUgjzSuVH7IWQMNdZEbpDaoxCTVMmY= -github.com/jangala-dev/tinygo-uartx v0.0.0-20251008020047-bc80b114e3cc/go.mod h1:e3HxjGzBZBIsn/oYvWr707ug3IbkglEyivyYVxHRph4= github.com/jangala-dev/tinygo-uartx 
v0.0.0-20251028085354-58b6258234b3 h1:b6mCDQEeeICoGpsbKyh/kfIRnr2DMK/wACLLi0t8uoU= github.com/jangala-dev/tinygo-uartx v0.0.0-20251028085354-58b6258234b3/go.mod h1:e3HxjGzBZBIsn/oYvWr707ug3IbkglEyivyYVxHRph4= golang.org/x/exp v0.0.0-20251002181428-27f1f14c8bb9 h1:TQwNpfvNkxAVlItJf6Cr5JTsVZoC/Sj7K3OZv2Pc14A= From 680ec7ece627747485c3831a139420af15c7ceff Mon Sep 17 00:00:00 2001 From: cpunt Date: Mon, 13 Apr 2026 11:14:50 +0000 Subject: [PATCH 26/65] fix: raise pico2 stack to 8KB for fabric transfer workload --- .vscode/tasks.json | 2 +- README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/.vscode/tasks.json b/.vscode/tasks.json index ea251d8..347c341 100644 --- a/.vscode/tasks.json +++ b/.vscode/tasks.json @@ -23,7 +23,7 @@ "args": [ "build", "-o", "${workspaceFolder}/build/devicecode.elf", - "-stack-size=3KB", + "-stack-size=8KB", "-serial=none", "-target=pico2", "-tags", "pico_bb_proto_1", diff --git a/README.md b/README.md index a94e2d6..5d77520 100644 --- a/README.md +++ b/README.md @@ -4,7 +4,7 @@ tinygo flash -stack-size=3KB -monitor -scheduler tasks -target=pico -tags "pico_bb_proto_1" main.go ## Flashing ISOC Power Board via USB port on Pico2 -tinygo flash -stack-size=3KB -monitor -scheduler tasks -target=pico2 -tags "pico_bb_proto_1" main.go +tinygo flash -stack-size=8KB -monitor -scheduler tasks -target=pico2 -tags "pico_bb_proto_1" main.go ------------------- From fa7ed75402d7ff31c60370c14c5b96527daa0aa3 Mon Sep 17 00:00:00 2001 From: cpunt Date: Mon, 13 Apr 2026 11:15:23 +0000 Subject: [PATCH 27/65] fix: enlarge fabric UART RX/TX buffers to 4KB --- main.go | 10 ++++++++-- .../hal/internal/provider/setups/pico_bb_proto_1.go | 4 ++-- services/hal/internal/provider/setups/pico_rich_dev.go | 4 ++-- 3 files changed, 12 insertions(+), 6 deletions(-) diff --git a/main.go b/main.go index 8e02640..b5905fc 100644 --- a/main.go +++ b/main.go @@ -20,6 +20,8 @@ import ( const halTimeout = 5 * time.Second const pwmTop = 4095 const 
fabricWaitLogInterval = 2 * time.Second +const fabricSerialRXSize = 4096 +const fabricSerialTXSize = 4096 // Thermal (deci-°C) const ( @@ -444,9 +446,13 @@ func main() { ) subSessOpenFabric := uiConn.Subscribe(tSessOpened(uartFabric)) subSessClosedFabric := uiConn.Subscribe(tSessClosed(uartFabric)) + fabricOpenReq := types.SerialSessionOpen{ + RXSize: fabricSerialRXSize, + TXSize: fabricSerialTXSize, + } // Kick open requests - uiConn.Publish(uiConn.NewMessage(tSessOpen(uartFabric), nil, false)) + uiConn.Publish(uiConn.NewMessage(tSessOpen(uartFabric), fabricOpenReq, false)) var retryFabricAt time.Time var fabricCancel context.CancelFunc @@ -505,7 +511,7 @@ func main() { fabricSessionOpen = false nextFabricWaitLog = time.Now() if time.Now().After(retryFabricAt) { - uiConn.Publish(uiConn.NewMessage(tSessOpen(uartFabric), nil, false)) + uiConn.Publish(uiConn.NewMessage(tSessOpen(uartFabric), fabricOpenReq, false)) retryFabricAt = time.Now().Add(2 * time.Second) } diff --git a/services/hal/internal/provider/setups/pico_bb_proto_1.go b/services/hal/internal/provider/setups/pico_bb_proto_1.go index ae3d94d..fd78e55 100644 --- a/services/hal/internal/provider/setups/pico_bb_proto_1.go +++ b/services/hal/internal/provider/setups/pico_bb_proto_1.go @@ -58,8 +58,8 @@ var SelectedSetup = types.HALConfig{ Domain: "io", Name: "uart1", Baud: 115_200, - RXSize: 256, - TXSize: 2048, + RXSize: 4096, + TXSize: 4096, }}, {ID: "charger0", Type: "ltc4015", Params: ltc4015dev.Params{ diff --git a/services/hal/internal/provider/setups/pico_rich_dev.go b/services/hal/internal/provider/setups/pico_rich_dev.go index d2bd8e8..fa20d51 100644 --- a/services/hal/internal/provider/setups/pico_rich_dev.go +++ b/services/hal/internal/provider/setups/pico_rich_dev.go @@ -52,8 +52,8 @@ var SelectedSetup = types.HALConfig{ Domain: "io", Name: "uart1", Baud: 115_200, - RXSize: 32, - TXSize: 512, + RXSize: 4096, + TXSize: 4096, }}, {ID: "charger0", Type: "ltc4015", Params: ltc4015dev.Params{ From 
d6d8ce7dac1afb7c9b84432d992a60601ad5539e Mon Sep 17 00:00:00 2001 From: cpunt Date: Mon, 13 Apr 2026 11:15:53 +0000 Subject: [PATCH 28/65] fix: enlarge fabric session line queue --- services/fabric/session.go | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/services/fabric/session.go b/services/fabric/session.go index 868c802..7590377 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -26,6 +26,7 @@ const ( statusReady = "ready" statusOpening = "opening" statusDown = "down" + lineQueueSize = 32 ) // ---- timeouts (local policy) ---- @@ -176,7 +177,7 @@ func (s *session) run(ctx context.Context) { s.transferFactory = newTransferFactory() } - lines := make(chan readResult, 1) + lines := make(chan readResult, lineQueueSize) go func() { defer close(lines) From 4baaf8c68d003da18544b85f2023407baf570c01 Mon Sep 17 00:00:00 2001 From: cpunt Date: Mon, 13 Apr 2026 11:16:14 +0000 Subject: [PATCH 29/65] diag: trace fabric transfer path and stamp firmware version --- main.go | 2 + services/fabric/session.go | 11 +++++ services/fabric/transfer.go | 99 +++++++++++++++++++++++++++++++++++++ 3 files changed, 112 insertions(+) diff --git a/main.go b/main.go index b5905fc..831c639 100644 --- a/main.go +++ b/main.go @@ -22,6 +22,7 @@ const pwmTop = 4095 const fabricWaitLogInterval = 2 * time.Second const fabricSerialRXSize = 4096 const fabricSerialTXSize = 4096 +const firmwareVersion = "2026-04-09-transfer-rxfix-1" // Thermal (deci-°C) const ( @@ -410,6 +411,7 @@ func main() { ctx := context.Background() + log.Println("[main] firmware version ", firmwareVersion) log.Println("[main] bootstrapping bus …") // Queue length must cover the retained replay burst when fabric // subscribes to wildcard export patterns (hal/cap/env/#, diff --git a/services/fabric/session.go b/services/fabric/session.go index 7590377..0b0b0ef 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -389,6 +389,17 @@ func (s *session) 
validateInbound(msg *protoMsg) bool { func (s *session) dispatch(line []byte) { var msg protoMsg if err := json.Unmarshal(line, &msg); err != nil { + if cur := s.incomingTransfer; cur != nil { + println( + "[fabric]", "sid", s.localSID, + "malformed frame dropped", + "transfer_id", cur.meta.ID, + "expected_next", strconvx.Itoa(int(cur.expectedNext)), + "line_len", strconvx.Itoa(len(line)), + "err", err.Error(), + ) + return + } s.logKV("malformed frame dropped", "err", err.Error()) return } diff --git a/services/fabric/transfer.go b/services/fabric/transfer.go index 180081b..26448c2 100644 --- a/services/fabric/transfer.go +++ b/services/fabric/transfer.go @@ -10,9 +10,12 @@ import ( "hash/crc32" "strings" "time" + + "devicecode-go/x/strconvx" ) const postTransferDoneSettle = 10 * time.Millisecond +const transferProgressLogEvery = 32 type transferMeta struct { ID string @@ -73,6 +76,10 @@ func readyNext(v uint32) *uint32 { return &v } +func u32s(v uint32) string { + return strconvx.Itoa(int(v)) +} + func infoPayload(info transferInfo) json.RawMessage { if info.isZero() { return nil @@ -209,6 +216,15 @@ func (s *session) onTransferBegin(msg *protoMsg) { sink: sink, hasher: sha256.New(), } + println( + "[fabric]", "sid", s.localSID, + "xfer_begin accepted", + "id", meta.ID, + "kind", meta.Kind, + "size", u32s(meta.Size), + "chunks", u32s(meta.Chunks), + "chunk_raw", u32s(meta.ChunkRaw), + ) s.sendTransferReady(meta.ID, true, readyNext(0), "") } @@ -219,31 +235,83 @@ func (s *session) onTransferChunk(msg *protoMsg) { return } if msg.Seq != cur.expectedNext { + println( + "[fabric]", "sid", s.localSID, + "xfer_need sent", + "id", cur.meta.ID, + "next", u32s(cur.expectedNext), + "err", "unexpected_seq", + "seq", u32s(msg.Seq), + ) s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "unexpected_seq") return } if msg.Off != cur.bytesWritten { + println( + "[fabric]", "sid", s.localSID, + "xfer_need sent", + "id", cur.meta.ID, + "next", u32s(cur.expectedNext), + "err", 
"unexpected_offset", + "off", u32s(msg.Off), + "want_off", u32s(cur.bytesWritten), + ) s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "unexpected_offset") return } raw, err := base64.RawURLEncoding.DecodeString(msg.Data) if err != nil { + println( + "[fabric]", "sid", s.localSID, + "xfer_need sent", + "id", cur.meta.ID, + "next", u32s(cur.expectedNext), + "err", "decode_failed", + ) s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "decode_failed") return } if uint32(len(raw)) != msg.N || msg.N == 0 { + println( + "[fabric]", "sid", s.localSID, + "xfer_need sent", + "id", cur.meta.ID, + "next", u32s(cur.expectedNext), + "err", "size_mismatch", + "n", u32s(msg.N), + "decoded", u32s(uint32(len(raw))), + ) s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "size_mismatch") return } if crc32Hex(raw) != lowerHex(msg.CRC32) { + println( + "[fabric]", "sid", s.localSID, + "xfer_need sent", + "id", cur.meta.ID, + "next", u32s(cur.expectedNext), + "err", "bad_crc", + "seq", u32s(msg.Seq), + ) s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "bad_crc") return } if cur.bytesWritten+uint32(len(raw)) > cur.meta.Size { + println( + "[fabric]", "sid", s.localSID, + "xfer_need sent", + "id", cur.meta.ID, + "next", u32s(cur.expectedNext), + "err", "size_mismatch", + "bytes_written", u32s(cur.bytesWritten), + "raw_len", u32s(uint32(len(raw))), + "total", u32s(cur.meta.Size), + ) s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "size_mismatch") return } if err := cur.sink.WriteChunk(msg.Seq, msg.Off, raw); err != nil { + s.logKV("transfer write failed", "err", err.Error()) _ = cur.sink.Abort(err.Error()) s.clearTransfer() s.sendTransferDone(cur.meta.ID, false, transferInfo{}, err.Error()) @@ -253,6 +321,17 @@ func (s *session) onTransferChunk(msg *protoMsg) { cur.expectedNext++ cur.bytesWritten += uint32(len(raw)) cur.chunksSeen++ + if cur.chunksSeen == 1 || (cur.chunksSeen%transferProgressLogEvery) == 0 { + println( + "[fabric]", "sid", s.localSID, + "xfer_chunk accepted", + 
"id", cur.meta.ID, + "seq", u32s(msg.Seq), + "off", u32s(msg.Off), + "n", u32s(msg.N), + "bytes_written", u32s(cur.bytesWritten), + ) + } s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "") } @@ -263,12 +342,22 @@ func (s *session) onTransferCommit(msg *protoMsg) { return } if msg.Size != cur.meta.Size || cur.bytesWritten != cur.meta.Size { + println( + "[fabric]", "sid", s.localSID, + "xfer_commit failed", + "id", cur.meta.ID, + "err", "size_mismatch", + "bytes_written", u32s(cur.bytesWritten), + "msg_size", u32s(msg.Size), + "meta_size", u32s(cur.meta.Size), + ) _ = cur.sink.Abort("size_mismatch") s.clearTransfer() s.sendTransferDone(cur.meta.ID, false, transferInfo{}, "size_mismatch") return } if lowerHex(msg.SHA256) != cur.meta.SHA256 || sha256Hex(cur.hasher) != cur.meta.SHA256 { + println("[fabric]", "sid", s.localSID, "xfer_commit failed", "id", cur.meta.ID, "err", "sha256_mismatch") _ = cur.sink.Abort("sha256_mismatch") s.clearTransfer() s.sendTransferDone(cur.meta.ID, false, transferInfo{}, "sha256_mismatch") @@ -276,6 +365,7 @@ func (s *session) onTransferCommit(msg *protoMsg) { } info, err := cur.sink.Commit() if err != nil { + s.logKV("transfer commit failed", "err", err.Error()) _ = cur.sink.Abort(err.Error()) s.clearTransfer() s.sendTransferDone(cur.meta.ID, false, transferInfo{}, err.Error()) @@ -284,13 +374,21 @@ func (s *session) onTransferCommit(msg *protoMsg) { sink := cur.sink id := cur.meta.ID s.clearTransfer() + println( + "[fabric]", "sid", s.localSID, + "xfer_commit accepted", + "id", id, + "bytes_written", u32s(info.BytesWritten), + ) if !s.sendTransferDone(id, true, info, "") { return } time.Sleep(postTransferDoneSettle) if err := sink.Apply(); err != nil { s.logKV("transfer apply failed", "err", err.Error()) + return } + println("[fabric]", "sid", s.localSID, "transfer apply ok", "id", id) } func (s *session) onTransferAbort(msg *protoMsg) { @@ -306,5 +404,6 @@ func (s *session) onTransferAbort(msg *protoMsg) { if err := 
cur.sink.Abort(reason); err != nil { s.logKV("transfer abort failed", "err", err.Error()) } + println("[fabric]", "sid", s.localSID, "xfer_abort received", "id", cur.meta.ID, "reason", reason) s.clearTransfer() } From 3e22ad8afd054459ee6b1c86cbdf19d0bdba45c7 Mon Sep 17 00:00:00 2001 From: cpunt Date: Mon, 13 Apr 2026 12:03:54 +0000 Subject: [PATCH 30/65] diag: log active partition at boot and include in hal/dump reply Calls abupdate.ActivePartition at startup to record the booted partition (visible over USB monitor) and adds the same value as a 'partition' field on the rpc/hal/dump reply so CM5-side tooling can observe slot switching after each firmware update without needing a parallel USB capture. --- main.go | 7 +++++++ services/fabric/session.go | 9 +++++++++ 2 files changed, 16 insertions(+) diff --git a/main.go b/main.go index 831c639..192c224 100644 --- a/main.go +++ b/main.go @@ -5,6 +5,8 @@ import ( "runtime" "time" + "ab-bringup/abupdate" + "devicecode-go/bus" "devicecode-go/services/fabric" "devicecode-go/services/hal" @@ -412,6 +414,11 @@ func main() { ctx := context.Background() log.Println("[main] firmware version ", firmwareVersion) + if pp, rc := abupdate.ActivePartition(); rc == 0 { + log.Println("[main] active partition ", abupdate.FormatPartition(pp)) + } else { + log.Println("[main] active partition unknown rc=", strconvx.Itoa(int(rc))) + } log.Println("[main] bootstrapping bus …") // Queue length must cover the retained replay burst when fabric // subscribes to wildcard export patterns (hal/cap/env/#, diff --git a/services/fabric/session.go b/services/fabric/session.go index 0b0b0ef..202d892 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -6,6 +6,8 @@ import ( "errors" "time" + "ab-bringup/abupdate" + "devicecode-go/bus" "devicecode-go/types" "devicecode-go/x/strconvx" @@ -88,6 +90,7 @@ type dumpReply struct { Applied bool `json:"applied"` ConfigCount int `json:"config_count,omitempty"` ConfigError string 
`json:"config_error,omitempty"` + Partition string `json:"partition,omitempty"` } type inboundCall struct { @@ -593,6 +596,11 @@ func (s *session) onCall(msg *protoMsg) { } s.conn.Unsubscribe(sub) + partition := "" + if pp, rc := abupdate.ActivePartition(); rc == 0 { + partition = abupdate.FormatPartition(pp) + } + reply := dumpReply{ OK: true, Method: "dump", @@ -601,6 +609,7 @@ func (s *session) onCall(msg *protoMsg) { Applied: s.configApplied, ConfigCount: s.configCount, ConfigError: s.lastConfigErr, + Partition: partition, } s.sendFrame(marshal(protoReply{T: msgReply, Corr: msg.ID, OK: true, Payload: mustMarshal(reply)})) return From 3c9eef3a8a38a397abb03365cca079cc731550e5 Mon Sep 17 00:00:00 2001 From: cpunt Date: Mon, 13 Apr 2026 17:34:00 +0000 Subject: [PATCH 31/65] fix: enlarge tinygo-uartx RX ring and surface drop counters MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The upstream tinygo-uartx software RX ring was hard-coded to 128 bytes — only ~11 ms of headroom at 115200 baud. Any reactor-side stall longer than that (a flash page program, a TinyGo GC pause, a periodic logger burst) caused bytes to be silently dropped at the ISR's `Buffer.Put` and corrupted incoming `xfer_chunk` frames mid-flight. This produced ~80-byte payload truncations and duplicate-seq merged frames during firmware update transfers. This commit: - Adds a replace directive in go.mod pointing at a vendored copy of tinygo-uartx that bumps the ring to 4 KiB with uint16 indices (~355 ms of headroom at 115200) and adds rxHwDrops / rxSwDrops counters on the UART struct. The replace stays in place until the upstream PR (jangala-dev/tinygo-uartx#5) lands and we can re-pin to a tagged release. - Plumbs the new uartx.RXDrops accessor through rp2SerialPort and surfaces it from serial_raw via a [serial-raw] log line that also reports cumulative RX bytes and shmring-full events. 
The line is throttled (1 Hz minimum, only on counter delta or 64 KiB byte quantum) so it stays silent in steady state. - Logs the active A/B partition on the fabric session's "waiting for connection start" message so post-reboot debugging can confirm which slot the MCU is running. - Extends the existing decode_failed / size_mismatch xfer_chunk error logs with seq, off and data_len for clearer diagnostics on any future receive-side parse failure. In a clean steady-state run all three drop counters (ring_full, rx_hw_drops, rx_sw_drops) stay at zero across a full ~336 KiB firmware update transfer. --- go.mod | 2 + services/fabric/fabric.go | 10 +++ services/fabric/session.go | 15 ++-- services/fabric/transfer.go | 6 ++ services/hal/devices/serial_raw/builder.go | 75 +++++++++++++++++++ .../hal/internal/provider/rp2_resources.go | 11 +-- 6 files changed, 107 insertions(+), 12 deletions(-) diff --git a/go.mod b/go.mod index 307ca29..9bcd86c 100644 --- a/go.mod +++ b/go.mod @@ -12,3 +12,5 @@ require ( require github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 // indirect replace ab-bringup => ../pico2-a-b + +replace github.com/jangala-dev/tinygo-uartx => ../tinygo-uartx diff --git a/services/fabric/fabric.go b/services/fabric/fabric.go index 0db1b6e..bb54545 100644 --- a/services/fabric/fabric.go +++ b/services/fabric/fabric.go @@ -4,6 +4,8 @@ import ( "context" "sync/atomic" + "ab-bringup/abupdate" + "devicecode-go/bus" "devicecode-go/x/strconvx" ) @@ -36,9 +38,17 @@ func Run(ctx context.Context, tr Transport, conn *bus.Connection, nodeID, peerID nodeID: nodeID, peerID: peerID, localSID: newLocalSID(), + activePartition: activePartitionForLogs(), tr: tr, conn: conn, transferFactory: newTransferFactory(), } s.run(ctx) } + +func activePartitionForLogs() string { + if pp, rc := abupdate.ActivePartition(); rc == 0 { + return abupdate.FormatPartition(pp) + } + return "unknown" +} diff --git a/services/fabric/session.go b/services/fabric/session.go index 
202d892..746627b 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -135,12 +135,13 @@ type linkStatePayload struct { // cooperative scheduler panics if multiple goroutines contend on // the bus's internal sync.Mutex. type session struct { - linkID string - nodeID string - peerID string - localSID string - tr Transport - conn *bus.Connection + linkID string + nodeID string + peerID string + localSID string + activePartition string + tr Transport + conn *bus.Connection link linkState peerNode string @@ -953,5 +954,5 @@ func (s *session) logWaiting() { if s.peerSID != "" { return } - s.log("waiting for connection start") + s.logKV("waiting for connection start", "partition", s.activePartition) } diff --git a/services/fabric/transfer.go b/services/fabric/transfer.go index 26448c2..2ddc37d 100644 --- a/services/fabric/transfer.go +++ b/services/fabric/transfer.go @@ -267,6 +267,9 @@ func (s *session) onTransferChunk(msg *protoMsg) { "id", cur.meta.ID, "next", u32s(cur.expectedNext), "err", "decode_failed", + "seq", u32s(msg.Seq), + "off", u32s(msg.Off), + "data_len", u32s(uint32(len(msg.Data))), ) s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "decode_failed") return @@ -278,7 +281,10 @@ func (s *session) onTransferChunk(msg *protoMsg) { "id", cur.meta.ID, "next", u32s(cur.expectedNext), "err", "size_mismatch", + "seq", u32s(msg.Seq), + "off", u32s(msg.Off), "n", u32s(msg.N), + "data_len", u32s(uint32(len(msg.Data))), "decoded", u32s(uint32(len(raw))), ) s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "size_mismatch") diff --git a/services/hal/devices/serial_raw/builder.go b/services/hal/devices/serial_raw/builder.go index 5726798..45d9370 100644 --- a/services/hal/devices/serial_raw/builder.go +++ b/services/hal/devices/serial_raw/builder.go @@ -9,6 +9,7 @@ import ( "devicecode-go/services/hal/internal/core" "devicecode-go/types" "devicecode-go/x/shmring" + "devicecode-go/x/strconvx" ) // ---- Parameters ---- @@ -50,6 +51,18 @@ type 
session struct { txHandle shmring.Handle txRing *shmring.Ring + // Reactor-owned observability. Single writer (the reactor goroutine), + // so plain fields are sufficient; no atomics required. + rxBytesTotal uint64 // cumulative bytes drained from UART RX into rxRing + rxRingFullHits uint32 // RX drain breaks because rxRing had no free span + + // Throttle state for the [serial-raw] log emitter. + rxLogLastAt time.Time + rxLogLastHits uint32 + rxLogLastBytesQuantum uint64 + rxLogLastHwDrops uint32 + rxLogLastSwDrops uint32 + // Single worker (reactor) for the port. ctx context.Context cancel context.CancelFunc @@ -311,6 +324,63 @@ func (d *Device) stopSession() { // ---- Reactor (single goroutine) ---- +// rxDropsProvider is implemented by hardware-specific serial ports that can +// report RX-side drop counts from below the reactor (typically from the UART +// driver's ISR, before bytes reach the shmring producer). +type rxDropsProvider interface { + RXDrops() (hw, sw uint32) +} + +// logRxCountersIfDue emits a [serial-raw] line summarising the reactor's +// RX-side counters. When force is false it throttles to at most one line per +// rxLogMinInterval AND suppresses lines where nothing has changed since the +// last emit. When force is true it always emits (used for the final snapshot +// before the reactor exits). 
+func (d *Device) logRxCountersIfDue(s *session, force bool) { + const ( + rxLogMinInterval = 1 * time.Second + rxLogBytesQuantum = 64 * 1024 + ) + + hits := s.rxRingFullHits + bytes := s.rxBytesTotal + quantum := bytes / rxLogBytesQuantum + + var hwDrops, swDrops uint32 + if rp, ok := d.port.(rxDropsProvider); ok { + hwDrops, swDrops = rp.RXDrops() + } + + if !force { + now := time.Now() + if now.Sub(s.rxLogLastAt) < rxLogMinInterval { + return + } + if hits == s.rxLogLastHits && + quantum == s.rxLogLastBytesQuantum && + hwDrops == s.rxLogLastHwDrops && + swDrops == s.rxLogLastSwDrops { + return + } + s.rxLogLastAt = now + } else { + s.rxLogLastAt = time.Now() + } + + println( + "[serial-raw]", "rx", + "uart", d.a.Name, + "bytes_total", strconvx.Utoa64(bytes), + "ring_full", strconvx.Utoa64(uint64(hits)), + "rx_hw_drops", strconvx.Utoa64(uint64(hwDrops)), + "rx_sw_drops", strconvx.Utoa64(uint64(swDrops)), + ) + s.rxLogLastHits = hits + s.rxLogLastBytesQuantum = quantum + s.rxLogLastHwDrops = hwDrops + s.rxLogLastSwDrops = swDrops +} + func (d *Device) reactor(s *session) { defer close(s.done) @@ -325,6 +395,7 @@ func (d *Device) reactor(s *session) { for { p1, p2 := rxR.WriteAcquire() if len(p1) == 0 { + s.rxRingFullHits++ break } n1 := u.TryRead(p1) @@ -333,6 +404,7 @@ func (d *Device) reactor(s *session) { } if n1 < len(p1) { rxR.WriteCommit(n1) + s.rxBytesTotal += uint64(n1) made = true continue } @@ -341,6 +413,7 @@ func (d *Device) reactor(s *session) { n2 = u.TryRead(p2) } rxR.WriteCommit(n1 + n2) + s.rxBytesTotal += uint64(n1 + n2) made = true } @@ -372,8 +445,10 @@ func (d *Device) reactor(s *session) { } // Idle: wait for any edge, then re-check. 
+ d.logRxCountersIfDue(s, false) select { case <-s.ctx.Done(): + d.logRxCountersIfDue(s, true) return case <-u.Readable(): case <-u.Writable(): diff --git a/services/hal/internal/provider/rp2_resources.go b/services/hal/internal/provider/rp2_resources.go index 28e24e6..7120e05 100644 --- a/services/hal/internal/provider/rp2_resources.go +++ b/services/hal/internal/provider/rp2_resources.go @@ -705,11 +705,12 @@ func (r *rp2Registry) ReadOnDieMilliC() int32 { // rp2SerialPort adapts uartx.UART to serialPortX. type rp2SerialPort struct{ u *uartx.UART } -func (p *rp2SerialPort) Readable() <-chan struct{} { return p.u.Readable() } -func (p *rp2SerialPort) Writable() <-chan struct{} { return p.u.Writable() } -func (p *rp2SerialPort) TryRead(b []byte) int { return p.u.TryRead(b) } -func (p *rp2SerialPort) TryWrite(b []byte) int { return p.u.TryWrite(b) } -func (p *rp2SerialPort) Flush() error { return p.u.Flush() } +func (p *rp2SerialPort) Readable() <-chan struct{} { return p.u.Readable() } +func (p *rp2SerialPort) Writable() <-chan struct{} { return p.u.Writable() } +func (p *rp2SerialPort) TryRead(b []byte) int { return p.u.TryRead(b) } +func (p *rp2SerialPort) TryWrite(b []byte) int { return p.u.TryWrite(b) } +func (p *rp2SerialPort) Flush() error { return p.u.Flush() } +func (p *rp2SerialPort) RXDrops() (hw, sw uint32) { return p.u.RXDrops() } func (p *rp2SerialPort) SetBaudRate(br uint32) error { p.u.SetBaudRate(br); return nil } From 4f7b5d9f0864df22e99d05464fa4aa2b8316e206 Mon Sep 17 00:00:00 2001 From: cpunt Date: Tue, 14 Apr 2026 16:51:45 +0000 Subject: [PATCH 32/65] fix: eliminate firmware-transfer byte drops with safe-window GC Under TinyGo's cooperative scheduler, any > 11 ms CPU burst on the shared thread starves the serial_raw reactor goroutine long enough that uartx's 128-byte SW ring overflows and drops incoming bytes. 
In firmware-update transfers the culprits were: - Periodic runtime.GC() forced from emitMemSnapshot, amplified by the allocating log.Println path used in the same function. - Natural TinyGo GC firing on heap pressure from the fabric hot-path (json.Unmarshal + base64.DecodeString on every chunk) during a chunk's wire-time. Changes: - main.go emitMemSnapshot: drop explicit runtime.GC() and switch to the builtin println so the periodic mem snapshot is allocation-free and cannot trigger a collection. - services/fabric/transfer.go onTransferChunk: force runtime.GC() in the ack safe window (after sink.WriteChunk, before sendTransferNeed). Lua is stop-waiting for the ack in that window, nothing is on the wire, so the GC stall can't drop anything. The heap resets to live baseline each chunk and natural GC never fires during subsequent chunk wire-times. - services/fabric/transfer_sink_rp2350.go: protocol-owned 4 KiB stage buffer so flash writes happen in larger batches, not on every UART chunk. - services/fabric/transport_shmring.go + services/hal/devices/serial_raw /builder.go: per-line CRC32 diagnostics at both the serial_raw RX ingress and fabric-rx transport layer so any byte-level regression is localised immediately. - Fabric UART RX/TX buffer sizing centralised in main.go and threaded through session_open. - go.mod: drop the tinygo-uartx replace directive. Enlarging the driver ring was the wrong layer (the event-driven uartx ring is designed small on purpose and proven at higher rates); the correct fix lives in the protocol layer and in eliminating reactor-starving stalls. 
--- go.mod | 2 - main.go | 13 ++- services/fabric/session.go | 11 ++- services/fabric/trace.go | 29 ++++++ services/fabric/transfer.go | 16 +++ services/fabric/transfer_sink_rp2350.go | 85 +++++++++++++++- services/fabric/transport_shmring.go | 11 +++ services/hal/devices/serial_raw/builder.go | 98 +++++++++++-------- .../hal/internal/provider/rp2_resources.go | 11 +-- .../provider/setups/pico_bb_proto_1.go | 4 +- .../internal/provider/setups/pico_rich_dev.go | 4 +- 11 files changed, 218 insertions(+), 66 deletions(-) diff --git a/go.mod b/go.mod index 9bcd86c..307ca29 100644 --- a/go.mod +++ b/go.mod @@ -12,5 +12,3 @@ require ( require github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 // indirect replace ab-bringup => ../pico2-a-b - -replace github.com/jangala-dev/tinygo-uartx => ../tinygo-uartx diff --git a/main.go b/main.go index 192c224..bf71c1e 100644 --- a/main.go +++ b/main.go @@ -391,14 +391,13 @@ func (r *Reactor) OnTempDeciC(label string, deci int) { func (r *Reactor) emitMemSnapshot() { var ms runtime.MemStats - runtime.GC() runtime.ReadMemStats(&ms) - log.Println( - "[mem] ", - "alloc:", int(ms.Alloc), " ", - "heapSys:", int(ms.HeapSys), " ", - "mallocs:", int(ms.Mallocs), " ", - "frees:", int(ms.Frees), + println( + "[mem]", + "alloc", int(ms.Alloc), + "heapSys", int(ms.HeapSys), + "mallocs", int(ms.Mallocs), + "frees", int(ms.Frees), ) } diff --git a/services/fabric/session.go b/services/fabric/session.go index 746627b..1ac37b7 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -400,11 +400,20 @@ func (s *session) dispatch(line []byte) { "transfer_id", cur.meta.ID, "expected_next", strconvx.Itoa(int(cur.expectedNext)), "line_len", strconvx.Itoa(len(line)), + "line_head", tracePreview(line), + "line_tail", traceTailPreview(line), "err", err.Error(), ) return } - s.logKV("malformed frame dropped", "err", err.Error()) + println( + "[fabric]", "sid", s.localSID, + "malformed frame dropped", + "line_len", 
strconvx.Itoa(len(line)), + "line_head", tracePreview(line), + "line_tail", traceTailPreview(line), + "err", err.Error(), + ) return } s.markRx() diff --git a/services/fabric/trace.go b/services/fabric/trace.go index 4c2637b..4a89b50 100644 --- a/services/fabric/trace.go +++ b/services/fabric/trace.go @@ -36,6 +36,35 @@ func tracePreview(data []byte) string { return string(out) } +func traceTailPreview(data []byte) string { + const max = 200 + if len(data) > max { + data = data[len(data)-max:] + } + out := make([]byte, 0, len(data)*2+3) + for _, b := range data { + switch b { + case '\n': + out = append(out, '\\', 'n') + case '\r': + out = append(out, '\\', 'r') + case '\t': + out = append(out, '\\', 't') + default: + if b < 0x20 || b > 0x7e { + out = append(out, '\\', 'x') + out = append(out, hexNibble(b>>4), hexNibble(b)) + } else { + out = append(out, b) + } + } + } + if len(data) == max { + out = append([]byte("..."), out...) + } + return string(out) +} + func hexNibble(v byte) byte { v &= 0x0f if v < 10 { diff --git a/services/fabric/transfer.go b/services/fabric/transfer.go index 2ddc37d..87dde82 100644 --- a/services/fabric/transfer.go +++ b/services/fabric/transfer.go @@ -8,6 +8,7 @@ import ( "fmt" "hash" "hash/crc32" + "runtime" "strings" "time" @@ -80,6 +81,14 @@ func u32s(v uint32) string { return strconvx.Itoa(int(v)) } +func textPreview(s string) string { + return tracePreview([]byte(s)) +} + +func textTailPreview(s string) string { + return traceTailPreview([]byte(s)) +} + func infoPayload(info transferInfo) json.RawMessage { if info.isZero() { return nil @@ -270,6 +279,8 @@ func (s *session) onTransferChunk(msg *protoMsg) { "seq", u32s(msg.Seq), "off", u32s(msg.Off), "data_len", u32s(uint32(len(msg.Data))), + "data_head", textPreview(msg.Data), + "data_tail", textTailPreview(msg.Data), ) s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "decode_failed") return @@ -286,6 +297,8 @@ func (s *session) onTransferChunk(msg *protoMsg) { "n", u32s(msg.N), 
"data_len", u32s(uint32(len(msg.Data))), "decoded", u32s(uint32(len(raw))), + "data_head", textPreview(msg.Data), + "data_tail", textTailPreview(msg.Data), ) s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "size_mismatch") return @@ -335,9 +348,12 @@ func (s *session) onTransferChunk(msg *protoMsg) { "seq", u32s(msg.Seq), "off", u32s(msg.Off), "n", u32s(msg.N), + "data_len", u32s(uint32(len(msg.Data))), "bytes_written", u32s(cur.bytesWritten), ) } + raw = nil + runtime.GC() s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "") } diff --git a/services/fabric/transfer_sink_rp2350.go b/services/fabric/transfer_sink_rp2350.go index 705e80b..efbc58b 100644 --- a/services/fabric/transfer_sink_rp2350.go +++ b/services/fabric/transfer_sink_rp2350.go @@ -5,16 +5,25 @@ package fabric import ( "errors" "fmt" + "time" "ab-bringup/abupdate" ) +const rp2350TransferStageSize = 4096 + var errTransferUnsupported = errors.New("unsupported") type rp2350TransferFactory struct{} type rp2350TransferSink struct { updater *abupdate.Updater + + // Stage verified transfer bytes in protocol code so flash writes happen in + // larger batches instead of directly on every UART chunk. 
+ stage [rp2350TransferStageSize]byte + stageUsed uint32 + accepted uint32 } func newTransferFactory() transferFactory { @@ -37,18 +46,83 @@ func (rp2350TransferFactory) Begin(meta transferMeta) (transferSink, error) { return &rp2350TransferSink{updater: &updater}, nil } -func (s *rp2350TransferSink) WriteChunk(seq, off uint32, data []byte) error { - _ = seq - if got := s.updater.BytesWritten(); got != off { - return fmt.Errorf("unexpected_offset:%d", got) +func (s *rp2350TransferSink) flushStage(seq uint32, force bool) error { + if s.stageUsed == 0 { + return nil } - if rc := s.updater.WriteChunk(data); rc != 0 { + + before := s.updater.BytesWritten() + expected := s.accepted - s.stageUsed + if before != expected { + return fmt.Errorf("unexpected_offset:%d", before) + } + + flushed := s.stageUsed + start := time.Now() + if rc := s.updater.WriteChunk(s.stage[:flushed]); rc != 0 { return fmt.Errorf("write_chunk:%d", rc) } + after := s.updater.BytesWritten() + s.stageUsed = 0 + + dt := time.Since(start) + if force || seq == 0 || (seq%32) == 31 || dt >= 2*time.Millisecond { + println( + "[fabric]", "xfer_sink_flush", + "seq", u32s(seq), + "stage_n", u32s(flushed), + "bytes_before", u32s(before), + "bytes_after", u32s(after), + "dt_us", u32s(uint32(dt/time.Microsecond)), + ) + } + + return nil +} + +func (s *rp2350TransferSink) WriteChunk(seq, off uint32, data []byte) error { + if s.accepted != off { + return fmt.Errorf("unexpected_offset:%d", s.accepted) + } + + remaining := data + for len(remaining) > 0 { + if s.stageUsed == uint32(len(s.stage)) { + if err := s.flushStage(seq, false); err != nil { + return err + } + } + + n := copy(s.stage[s.stageUsed:], remaining) + s.stageUsed += uint32(n) + s.accepted += uint32(n) + remaining = remaining[n:] + + if s.stageUsed == uint32(len(s.stage)) { + if err := s.flushStage(seq, false); err != nil { + return err + } + } + } + + if seq == 0 || (seq%32) == 31 { + println( + "[fabric]", "xfer_sink_stage", + "seq", u32s(seq), + 
"off", u32s(off), + "n", u32s(uint32(len(data))), + "stage_used", u32s(s.stageUsed), + "accepted", u32s(s.accepted), + ) + } + return nil } func (s *rp2350TransferSink) Commit() (transferInfo, error) { + if err := s.flushStage(0, true); err != nil { + return transferInfo{}, err + } if rc := s.updater.FlushFinal(); rc != 0 { return transferInfo{}, fmt.Errorf("flush_final:%d", rc) } @@ -67,5 +141,6 @@ func (s *rp2350TransferSink) Apply() error { func (s *rp2350TransferSink) Abort(reason string) error { _ = reason + s.stageUsed = 0 return nil } diff --git a/services/fabric/transport_shmring.go b/services/fabric/transport_shmring.go index dece826..fe505d6 100644 --- a/services/fabric/transport_shmring.go +++ b/services/fabric/transport_shmring.go @@ -3,6 +3,7 @@ package fabric import ( "context" "fmt" + "hash/crc32" "devicecode-go/x/shmring" ) @@ -29,6 +30,14 @@ func NewShmringTransport(rx, tx *shmring.Ring) *ShmringTransport { } } +func logShmringRXLine(data []byte) { + println( + "[fabric-rx]", "line", + "line_len", len(data), + "line_crc32", fmt.Sprintf("%08x", crc32.ChecksumIEEE(data)), + ) +} + func (t *ShmringTransport) ReadLine() ([]byte, error) { t.buf = t.buf[:0] t.over = false @@ -60,6 +69,7 @@ func (t *ShmringTransport) ReadLine() ([]byte, error) { } out := make([]byte, len(t.buf)) copy(out, t.buf) + logShmringRXLine(out) traceLine("rx", out) return out, nil } @@ -84,6 +94,7 @@ func (t *ShmringTransport) ReadLine() ([]byte, error) { } out := make([]byte, len(t.buf)) copy(out, t.buf) + logShmringRXLine(out) traceLine("rx", out) return out, nil } diff --git a/services/hal/devices/serial_raw/builder.go b/services/hal/devices/serial_raw/builder.go index 45d9370..ca4c6d3 100644 --- a/services/hal/devices/serial_raw/builder.go +++ b/services/hal/devices/serial_raw/builder.go @@ -2,6 +2,8 @@ package serial_raw import ( "context" + "fmt" + "hash/crc32" "sync/atomic" "time" @@ -51,17 +53,15 @@ type session struct { txHandle shmring.Handle txRing *shmring.Ring - // 
Reactor-owned observability. Single writer (the reactor goroutine), - // so plain fields are sufficient; no atomics required. - rxBytesTotal uint64 // cumulative bytes drained from UART RX into rxRing - rxRingFullHits uint32 // RX drain breaks because rxRing had no free span - - // Throttle state for the [serial-raw] log emitter. - rxLogLastAt time.Time - rxLogLastHits uint32 - rxLogLastBytesQuantum uint64 - rxLogLastHwDrops uint32 - rxLogLastSwDrops uint32 + // Reactor-owned observability. Single writer only. + rxBytesTotal uint64 + rxRingFull uint32 + rxLogAt time.Time + rxLogHits uint32 + rxLogQuantum uint64 + rxLineCRC uint32 + rxLineLen uint32 + rxLineCount uint32 // Single worker (reactor) for the port. ctx context.Context @@ -182,6 +182,12 @@ func (d *Device) Control(_ core.CapAddr, verb string, payload any) (core.Enqueue } d.startSession(rxSize, txSize) + println( + "[serial-raw]", "session_open", + "uart", d.a.Name, + "rx_size", strconvx.Itoa(rxSize), + "tx_size", strconvx.Itoa(txSize), + ) // --- Device-level hygiene: drain spurious RX before signalling link up --- // Discard any pre-existing or immediately-arriving bytes on the UART RX path. @@ -324,47 +330,58 @@ func (d *Device) stopSession() { // ---- Reactor (single goroutine) ---- -// rxDropsProvider is implemented by hardware-specific serial ports that can -// report RX-side drop counts from below the reactor (typically from the UART -// driver's ISR, before bytes reach the shmring producer). 
-type rxDropsProvider interface { - RXDrops() (hw, sw uint32) +func (d *Device) noteRXBytes(s *session, chunk []byte) { + if len(chunk) == 0 { + return + } + start := 0 + for i, b := range chunk { + if b != '\n' { + continue + } + if i > start { + s.rxLineCRC = crc32.Update(s.rxLineCRC, crc32.IEEETable, chunk[start:i]) + s.rxLineLen += uint32(i - start) + } + s.rxLineCount++ + println( + "[serial-raw]", "rx_line", + "uart", d.a.Name, + "line_n", strconvx.Utoa64(uint64(s.rxLineCount)), + "line_len", strconvx.Utoa64(uint64(s.rxLineLen)), + "line_crc32", fmt.Sprintf("%08x", s.rxLineCRC), + ) + s.rxLineCRC = 0 + s.rxLineLen = 0 + start = i + 1 + } + if start < len(chunk) { + s.rxLineCRC = crc32.Update(s.rxLineCRC, crc32.IEEETable, chunk[start:]) + s.rxLineLen += uint32(len(chunk) - start) + } } -// logRxCountersIfDue emits a [serial-raw] line summarising the reactor's -// RX-side counters. When force is false it throttles to at most one line per -// rxLogMinInterval AND suppresses lines where nothing has changed since the -// last emit. When force is true it always emits (used for the final snapshot -// before the reactor exits). 
func (d *Device) logRxCountersIfDue(s *session, force bool) { const ( rxLogMinInterval = 1 * time.Second rxLogBytesQuantum = 64 * 1024 ) - hits := s.rxRingFullHits + hits := s.rxRingFull bytes := s.rxBytesTotal quantum := bytes / rxLogBytesQuantum - var hwDrops, swDrops uint32 - if rp, ok := d.port.(rxDropsProvider); ok { - hwDrops, swDrops = rp.RXDrops() - } - if !force { now := time.Now() - if now.Sub(s.rxLogLastAt) < rxLogMinInterval { + if now.Sub(s.rxLogAt) < rxLogMinInterval { return } - if hits == s.rxLogLastHits && - quantum == s.rxLogLastBytesQuantum && - hwDrops == s.rxLogLastHwDrops && - swDrops == s.rxLogLastSwDrops { + if hits == s.rxLogHits && quantum == s.rxLogQuantum { return } - s.rxLogLastAt = now + s.rxLogAt = now } else { - s.rxLogLastAt = time.Now() + s.rxLogAt = time.Now() } println( @@ -372,13 +389,9 @@ func (d *Device) logRxCountersIfDue(s *session, force bool) { "uart", d.a.Name, "bytes_total", strconvx.Utoa64(bytes), "ring_full", strconvx.Utoa64(uint64(hits)), - "rx_hw_drops", strconvx.Utoa64(uint64(hwDrops)), - "rx_sw_drops", strconvx.Utoa64(uint64(swDrops)), ) - s.rxLogLastHits = hits - s.rxLogLastBytesQuantum = quantum - s.rxLogLastHwDrops = hwDrops - s.rxLogLastSwDrops = swDrops + s.rxLogHits = hits + s.rxLogQuantum = quantum } func (d *Device) reactor(s *session) { @@ -395,7 +408,7 @@ func (d *Device) reactor(s *session) { for { p1, p2 := rxR.WriteAcquire() if len(p1) == 0 { - s.rxRingFullHits++ + s.rxRingFull++ break } n1 := u.TryRead(p1) @@ -403,6 +416,7 @@ func (d *Device) reactor(s *session) { break } if n1 < len(p1) { + d.noteRXBytes(s, p1[:n1]) rxR.WriteCommit(n1) s.rxBytesTotal += uint64(n1) made = true @@ -412,6 +426,8 @@ func (d *Device) reactor(s *session) { if len(p2) > 0 { n2 = u.TryRead(p2) } + d.noteRXBytes(s, p1[:n1]) + d.noteRXBytes(s, p2[:n2]) rxR.WriteCommit(n1 + n2) s.rxBytesTotal += uint64(n1 + n2) made = true diff --git a/services/hal/internal/provider/rp2_resources.go 
b/services/hal/internal/provider/rp2_resources.go index 7120e05..28e24e6 100644 --- a/services/hal/internal/provider/rp2_resources.go +++ b/services/hal/internal/provider/rp2_resources.go @@ -705,12 +705,11 @@ func (r *rp2Registry) ReadOnDieMilliC() int32 { // rp2SerialPort adapts uartx.UART to serialPortX. type rp2SerialPort struct{ u *uartx.UART } -func (p *rp2SerialPort) Readable() <-chan struct{} { return p.u.Readable() } -func (p *rp2SerialPort) Writable() <-chan struct{} { return p.u.Writable() } -func (p *rp2SerialPort) TryRead(b []byte) int { return p.u.TryRead(b) } -func (p *rp2SerialPort) TryWrite(b []byte) int { return p.u.TryWrite(b) } -func (p *rp2SerialPort) Flush() error { return p.u.Flush() } -func (p *rp2SerialPort) RXDrops() (hw, sw uint32) { return p.u.RXDrops() } +func (p *rp2SerialPort) Readable() <-chan struct{} { return p.u.Readable() } +func (p *rp2SerialPort) Writable() <-chan struct{} { return p.u.Writable() } +func (p *rp2SerialPort) TryRead(b []byte) int { return p.u.TryRead(b) } +func (p *rp2SerialPort) TryWrite(b []byte) int { return p.u.TryWrite(b) } +func (p *rp2SerialPort) Flush() error { return p.u.Flush() } func (p *rp2SerialPort) SetBaudRate(br uint32) error { p.u.SetBaudRate(br); return nil } diff --git a/services/hal/internal/provider/setups/pico_bb_proto_1.go b/services/hal/internal/provider/setups/pico_bb_proto_1.go index fd78e55..ae3d94d 100644 --- a/services/hal/internal/provider/setups/pico_bb_proto_1.go +++ b/services/hal/internal/provider/setups/pico_bb_proto_1.go @@ -58,8 +58,8 @@ var SelectedSetup = types.HALConfig{ Domain: "io", Name: "uart1", Baud: 115_200, - RXSize: 4096, - TXSize: 4096, + RXSize: 256, + TXSize: 2048, }}, {ID: "charger0", Type: "ltc4015", Params: ltc4015dev.Params{ diff --git a/services/hal/internal/provider/setups/pico_rich_dev.go b/services/hal/internal/provider/setups/pico_rich_dev.go index fa20d51..d2bd8e8 100644 --- a/services/hal/internal/provider/setups/pico_rich_dev.go +++ 
b/services/hal/internal/provider/setups/pico_rich_dev.go @@ -52,8 +52,8 @@ var SelectedSetup = types.HALConfig{ Domain: "io", Name: "uart1", Baud: 115_200, - RXSize: 4096, - TXSize: 4096, + RXSize: 32, + TXSize: 512, }}, {ID: "charger0", Type: "ltc4015", Params: ltc4015dev.Params{ From 9906b6546728999d3face0006124849934b7bb9f Mon Sep 17 00:00:00 2001 From: cpunt Date: Tue, 14 Apr 2026 17:20:38 +0000 Subject: [PATCH 33/65] cleanup: drop per-line CRC32 debug logs from serial_raw and fabric-rx noteRXBytes in serial_raw and logShmringRXLine in transport_shmring were added as byte-level diagnostic tripwires during the firmware-transfer drop investigation. With the underlying bug fixed (see FABRIC_TRANSFER_FIX.md) they only add console noise on every received line. Drop them. The throttled [serial-raw] rx counter log (bytes_total / ring_full) and the throttled [fabric] xfer_sink_stage / xfer_sink_flush logs stay in place as steady-state observability. If per-line CRC32 is ever needed again it's ~30 lines of code to restore; check the repo git history. 
--- services/fabric/transport_shmring.go | 11 ------ services/hal/devices/serial_raw/builder.go | 39 ---------------------- 2 files changed, 50 deletions(-) diff --git a/services/fabric/transport_shmring.go b/services/fabric/transport_shmring.go index fe505d6..dece826 100644 --- a/services/fabric/transport_shmring.go +++ b/services/fabric/transport_shmring.go @@ -3,7 +3,6 @@ package fabric import ( "context" "fmt" - "hash/crc32" "devicecode-go/x/shmring" ) @@ -30,14 +29,6 @@ func NewShmringTransport(rx, tx *shmring.Ring) *ShmringTransport { } } -func logShmringRXLine(data []byte) { - println( - "[fabric-rx]", "line", - "line_len", len(data), - "line_crc32", fmt.Sprintf("%08x", crc32.ChecksumIEEE(data)), - ) -} - func (t *ShmringTransport) ReadLine() ([]byte, error) { t.buf = t.buf[:0] t.over = false @@ -69,7 +60,6 @@ func (t *ShmringTransport) ReadLine() ([]byte, error) { } out := make([]byte, len(t.buf)) copy(out, t.buf) - logShmringRXLine(out) traceLine("rx", out) return out, nil } @@ -94,7 +84,6 @@ func (t *ShmringTransport) ReadLine() ([]byte, error) { } out := make([]byte, len(t.buf)) copy(out, t.buf) - logShmringRXLine(out) traceLine("rx", out) return out, nil } diff --git a/services/hal/devices/serial_raw/builder.go b/services/hal/devices/serial_raw/builder.go index ca4c6d3..bb4fcdf 100644 --- a/services/hal/devices/serial_raw/builder.go +++ b/services/hal/devices/serial_raw/builder.go @@ -2,8 +2,6 @@ package serial_raw import ( "context" - "fmt" - "hash/crc32" "sync/atomic" "time" @@ -59,9 +57,6 @@ type session struct { rxLogAt time.Time rxLogHits uint32 rxLogQuantum uint64 - rxLineCRC uint32 - rxLineLen uint32 - rxLineCount uint32 // Single worker (reactor) for the port. 
ctx context.Context @@ -330,37 +325,6 @@ func (d *Device) stopSession() { // ---- Reactor (single goroutine) ---- -func (d *Device) noteRXBytes(s *session, chunk []byte) { - if len(chunk) == 0 { - return - } - start := 0 - for i, b := range chunk { - if b != '\n' { - continue - } - if i > start { - s.rxLineCRC = crc32.Update(s.rxLineCRC, crc32.IEEETable, chunk[start:i]) - s.rxLineLen += uint32(i - start) - } - s.rxLineCount++ - println( - "[serial-raw]", "rx_line", - "uart", d.a.Name, - "line_n", strconvx.Utoa64(uint64(s.rxLineCount)), - "line_len", strconvx.Utoa64(uint64(s.rxLineLen)), - "line_crc32", fmt.Sprintf("%08x", s.rxLineCRC), - ) - s.rxLineCRC = 0 - s.rxLineLen = 0 - start = i + 1 - } - if start < len(chunk) { - s.rxLineCRC = crc32.Update(s.rxLineCRC, crc32.IEEETable, chunk[start:]) - s.rxLineLen += uint32(len(chunk) - start) - } -} - func (d *Device) logRxCountersIfDue(s *session, force bool) { const ( rxLogMinInterval = 1 * time.Second @@ -416,7 +380,6 @@ func (d *Device) reactor(s *session) { break } if n1 < len(p1) { - d.noteRXBytes(s, p1[:n1]) rxR.WriteCommit(n1) s.rxBytesTotal += uint64(n1) made = true @@ -426,8 +389,6 @@ func (d *Device) reactor(s *session) { if len(p2) > 0 { n2 = u.TryRead(p2) } - d.noteRXBytes(s, p1[:n1]) - d.noteRXBytes(s, p2[:n2]) rxR.WriteCommit(n1 + n2) s.rxBytesTotal += uint64(n1 + n2) made = true From 89e368874a22466b73ad1bb5a95ee3bd2888e615 Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 16 Apr 2026 14:56:21 +0000 Subject: [PATCH 34/65] fix: extend post-xfer_done settle to 250ms before reboot 10ms was not enough for the serial reactor to drain the xfer_done response from the TX shmring before sink.Apply() rebooted the MCU. The CM5 never received xfer_done causing commit_timeout on iteration 8+. 
--- services/fabric/transfer.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/fabric/transfer.go b/services/fabric/transfer.go index 87dde82..64f80b6 100644 --- a/services/fabric/transfer.go +++ b/services/fabric/transfer.go @@ -15,7 +15,7 @@ import ( "devicecode-go/x/strconvx" ) -const postTransferDoneSettle = 10 * time.Millisecond +const postTransferDoneSettle = 250 * time.Millisecond const transferProgressLogEvery = 32 type transferMeta struct { From a5ab390971335c5deb05d24dd16d29b83e8d8610 Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 16 Apr 2026 15:33:01 +0000 Subject: [PATCH 35/65] rename: update ab-bringup -> pico2-a-b module references --- go.mod | 4 ++-- main.go | 2 +- services/fabric/fabric.go | 2 +- services/fabric/session.go | 2 +- services/fabric/transfer_sink_rp2350.go | 2 +- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/go.mod b/go.mod index 307ca29..6962427 100644 --- a/go.mod +++ b/go.mod @@ -3,7 +3,7 @@ module devicecode-go go 1.25.1 require ( - ab-bringup v0.0.0 + pico2-a-b v0.0.0 github.com/jangala-dev/tinygo-uartx v0.0.0-20251028085354-58b6258234b3 golang.org/x/exp v0.0.0-20251002181428-27f1f14c8bb9 tinygo.org/x/drivers v0.33.0 @@ -11,4 +11,4 @@ require ( require github.com/google/shlex v0.0.0-20191202100458-e7afc7fbc510 // indirect -replace ab-bringup => ../pico2-a-b +replace pico2-a-b => ../pico2-a-b diff --git a/main.go b/main.go index 681c8a2..5974bb9 100644 --- a/main.go +++ b/main.go @@ -4,7 +4,7 @@ import ( "context" "time" - "ab-bringup/abupdate" + "pico2-a-b/abupdate" "devicecode-go/bus" "devicecode-go/services/hal" diff --git a/services/fabric/fabric.go b/services/fabric/fabric.go index bb54545..3841683 100644 --- a/services/fabric/fabric.go +++ b/services/fabric/fabric.go @@ -4,7 +4,7 @@ import ( "context" "sync/atomic" - "ab-bringup/abupdate" + "pico2-a-b/abupdate" "devicecode-go/bus" "devicecode-go/x/strconvx" diff --git a/services/fabric/session.go b/services/fabric/session.go index 
1ac37b7..ad3b499 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -6,7 +6,7 @@ import ( "errors" "time" - "ab-bringup/abupdate" + "pico2-a-b/abupdate" "devicecode-go/bus" "devicecode-go/types" diff --git a/services/fabric/transfer_sink_rp2350.go b/services/fabric/transfer_sink_rp2350.go index efbc58b..e628629 100644 --- a/services/fabric/transfer_sink_rp2350.go +++ b/services/fabric/transfer_sink_rp2350.go @@ -7,7 +7,7 @@ import ( "fmt" "time" - "ab-bringup/abupdate" + "pico2-a-b/abupdate" ) const rp2350TransferStageSize = 4096 From b115173034a7e7e46b8f1b331019d1532123f20b Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 16 Apr 2026 15:35:26 +0000 Subject: [PATCH 36/65] cleanup: remove activePartitionForLogs helper main.go already logs the partition at boot and hal/dump queries it directly. No need for a cached copy on the session struct. --- services/fabric/fabric.go | 10 ---------- services/fabric/session.go | 7 +++---- 2 files changed, 3 insertions(+), 14 deletions(-) diff --git a/services/fabric/fabric.go b/services/fabric/fabric.go index 3841683..0db1b6e 100644 --- a/services/fabric/fabric.go +++ b/services/fabric/fabric.go @@ -4,8 +4,6 @@ import ( "context" "sync/atomic" - "pico2-a-b/abupdate" - "devicecode-go/bus" "devicecode-go/x/strconvx" ) @@ -38,17 +36,9 @@ func Run(ctx context.Context, tr Transport, conn *bus.Connection, nodeID, peerID nodeID: nodeID, peerID: peerID, localSID: newLocalSID(), - activePartition: activePartitionForLogs(), tr: tr, conn: conn, transferFactory: newTransferFactory(), } s.run(ctx) } - -func activePartitionForLogs() string { - if pp, rc := abupdate.ActivePartition(); rc == 0 { - return abupdate.FormatPartition(pp) - } - return "unknown" -} diff --git a/services/fabric/session.go b/services/fabric/session.go index ad3b499..3eef3b7 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -138,9 +138,8 @@ type session struct { linkID string nodeID string peerID string - localSID string - 
activePartition string - tr Transport + localSID string + tr Transport conn *bus.Connection link linkState @@ -963,5 +962,5 @@ func (s *session) logWaiting() { if s.peerSID != "" { return } - s.logKV("waiting for connection start", "partition", s.activePartition) + s.log("waiting for connection start") } From 842fb8c89b11a8d437034d08571e565d8fc9f48f Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 16 Apr 2026 15:40:43 +0000 Subject: [PATCH 37/65] refactor: replace transferFactory interface with plain function Build-tagged beginTransfer() function replaces the factory interface and struct. Tests override via a function field on the session. --- services/fabric/fabric.go | 5 +- services/fabric/session.go | 6 +-- services/fabric/transfer.go | 11 ++-- services/fabric/transfer_sink_rp2350.go | 8 +-- services/fabric/transfer_sink_stub.go | 9 +--- services/fabric/transfer_test.go | 69 ++++++++++--------------- 6 files changed, 35 insertions(+), 73 deletions(-) diff --git a/services/fabric/fabric.go b/services/fabric/fabric.go index 0db1b6e..e9c7833 100644 --- a/services/fabric/fabric.go +++ b/services/fabric/fabric.go @@ -36,9 +36,8 @@ func Run(ctx context.Context, tr Transport, conn *bus.Connection, nodeID, peerID nodeID: nodeID, peerID: peerID, localSID: newLocalSID(), - tr: tr, - conn: conn, - transferFactory: newTransferFactory(), + tr: tr, + conn: conn, } s.run(ctx) } diff --git a/services/fabric/session.go b/services/fabric/session.go index 3eef3b7..d5ced3d 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -158,7 +158,7 @@ type session struct { outboundCalls []*outboundCall nextOutboundID uint64 incomingTransfer *incomingTransfer - transferFactory transferFactory + beginTransfer func(transferMeta) (transferSink, error) // Config state — tracks config/device → config/hal translation. configApplied bool @@ -176,10 +176,6 @@ func (s *session) logKV(msg, key, value string) { // run is the main loop. Blocks until ctx is cancelled. 
func (s *session) run(ctx context.Context) { - if s.transferFactory == nil { - s.transferFactory = newTransferFactory() - } - lines := make(chan readResult, lineQueueSize) go func() { diff --git a/services/fabric/transfer.go b/services/fabric/transfer.go index 64f80b6..581ae93 100644 --- a/services/fabric/transfer.go +++ b/services/fabric/transfer.go @@ -40,10 +40,6 @@ func (i transferInfo) isZero() bool { return i.BytesWritten == 0 && i.SlotXIPAddr == 0 } -type transferFactory interface { - Begin(meta transferMeta) (transferSink, error) -} - type transferSink interface { WriteChunk(seq, off uint32, data []byte) error Commit() (transferInfo, error) @@ -212,10 +208,11 @@ func (s *session) onTransferBegin(msg *protoMsg) { s.sendTransferReady(meta.ID, false, nil, "unsupported_encoding") return } - if s.transferFactory == nil { - s.transferFactory = newTransferFactory() + beginFn := s.beginTransfer + if beginFn == nil { + beginFn = beginTransfer } - sink, err := s.transferFactory.Begin(meta) + sink, err := beginFn(meta) if err != nil { s.sendTransferReady(meta.ID, false, nil, err.Error()) return diff --git a/services/fabric/transfer_sink_rp2350.go b/services/fabric/transfer_sink_rp2350.go index e628629..6ae14f9 100644 --- a/services/fabric/transfer_sink_rp2350.go +++ b/services/fabric/transfer_sink_rp2350.go @@ -14,8 +14,6 @@ const rp2350TransferStageSize = 4096 var errTransferUnsupported = errors.New("unsupported") -type rp2350TransferFactory struct{} - type rp2350TransferSink struct { updater *abupdate.Updater @@ -26,11 +24,7 @@ type rp2350TransferSink struct { accepted uint32 } -func newTransferFactory() transferFactory { - return rp2350TransferFactory{} -} - -func (rp2350TransferFactory) Begin(meta transferMeta) (transferSink, error) { +func beginTransfer(meta transferMeta) (transferSink, error) { if meta.Kind != "firmware.rp2350" || meta.Format != "bin" { return nil, errTransferUnsupported } diff --git a/services/fabric/transfer_sink_stub.go 
b/services/fabric/transfer_sink_stub.go index 038ce7f..6386f0a 100644 --- a/services/fabric/transfer_sink_stub.go +++ b/services/fabric/transfer_sink_stub.go @@ -6,13 +6,6 @@ import "errors" var errTransferUnsupported = errors.New("unsupported") -type unsupportedTransferFactory struct{} - -func newTransferFactory() transferFactory { - return unsupportedTransferFactory{} -} - -func (unsupportedTransferFactory) Begin(meta transferMeta) (transferSink, error) { - _ = meta +func beginTransfer(meta transferMeta) (transferSink, error) { return nil, errTransferUnsupported } diff --git a/services/fabric/transfer_test.go b/services/fabric/transfer_test.go index 7af5a59..4b06411 100644 --- a/services/fabric/transfer_test.go +++ b/services/fabric/transfer_test.go @@ -13,23 +13,6 @@ import ( "devicecode-go/bus" ) -type fakeTransferFactory struct { - beginMeta transferMeta - beginErr error - sink *fakeTransferSink -} - -func (f *fakeTransferFactory) Begin(meta transferMeta) (transferSink, error) { - f.beginMeta = meta - if f.beginErr != nil { - return nil, f.beginErr - } - if f.sink == nil { - f.sink = &fakeTransferSink{} - } - return f.sink, nil -} - type fakeTransferSink struct { seqs []uint32 offs []uint32 @@ -71,15 +54,17 @@ func (s *fakeTransferSink) Abort(reason string) error { return nil } -func runSessionWithFactory(ctx context.Context, tr Transport, conn *bus.Connection, factory transferFactory) { +func runSessionWithSink(ctx context.Context, tr Transport, conn *bus.Connection, sink *fakeTransferSink) { s := session{ - linkID: defaultLinkID, - nodeID: "mcu-1", - peerID: "cm5-local", - localSID: "mcu-sid-test", - tr: tr, - conn: conn, - transferFactory: factory, + linkID: defaultLinkID, + nodeID: "mcu-1", + peerID: "cm5-local", + localSID: "mcu-sid-test", + tr: tr, + conn: conn, + beginTransfer: func(meta transferMeta) (transferSink, error) { + return sink, nil + }, } s.run(ctx) } @@ -131,16 +116,14 @@ func TestTransferReceiveSuccess(t *testing.T) { ctx, cancel := 
context.WithCancel(context.Background()) defer cancel() - factory := &fakeTransferFactory{ - sink: &fakeTransferSink{ - commitInfo: transferInfo{ - BytesWritten: 10, - SlotXIPAddr: 0x10280000, - }, + sink := &fakeTransferSink{ + commitInfo: transferInfo{ + BytesWritten: 10, + SlotXIPAddr: 0x10280000, }, } - go runSessionWithFactory(ctx, mcu, b.NewConnection("fabric"), factory) + go runSessionWithSink(ctx, mcu, b.NewConnection("fabric"), sink) bringUp(t, cm5) payload := []byte("abcdefghij") @@ -204,15 +187,15 @@ func TestTransferReceiveSuccess(t *testing.T) { t.Fatalf("bad transfer info: %+v", info) } - time.Sleep(20 * time.Millisecond) + time.Sleep(postTransferDoneSettle + 50*time.Millisecond) - if got := string(factory.sink.writes[0]) + string(factory.sink.writes[1]) + string(factory.sink.writes[2]); got != string(payload) { + if got := string(sink.writes[0]) + string(sink.writes[1]) + string(sink.writes[2]); got != string(payload) { t.Fatalf("sink writes = %q, want %q", got, payload) } - if !factory.sink.committed { + if !sink.committed { t.Fatal("sink.Commit was not called") } - if !factory.sink.applied { + if !sink.applied { t.Fatal("sink.Apply was not called") } } @@ -223,8 +206,8 @@ func TestTransferChunkBadCRCRequestsResend(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - factory := &fakeTransferFactory{sink: &fakeTransferSink{}} - go runSessionWithFactory(ctx, mcu, b.NewConnection("fabric"), factory) + sink := &fakeTransferSink{} + go runSessionWithSink(ctx, mcu, b.NewConnection("fabric"), sink) bringUp(t, cm5) payload := []byte("abcd") @@ -256,8 +239,8 @@ func TestTransferChunkBadCRCRequestsResend(t *testing.T) { if need.Next != 0 || need.Err != "bad_crc" { t.Fatalf("bad xfer_need: %+v", need) } - if len(factory.sink.writes) != 0 { - t.Fatalf("sink received %d writes, want 0", len(factory.sink.writes)) + if len(sink.writes) != 0 { + t.Fatalf("sink received %d writes, want 0", len(sink.writes)) } } @@ -267,8 +250,8 @@ 
func TestTransferCommitHashMismatchReturnsDoneError(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - factory := &fakeTransferFactory{sink: &fakeTransferSink{}} - go runSessionWithFactory(ctx, mcu, b.NewConnection("fabric"), factory) + sink := &fakeTransferSink{} + go runSessionWithSink(ctx, mcu, b.NewConnection("fabric"), sink) bringUp(t, cm5) payload := []byte("abcd") @@ -308,7 +291,7 @@ func TestTransferCommitHashMismatchReturnsDoneError(t *testing.T) { if done.OK || done.Err != "sha256_mismatch" { t.Fatalf("bad xfer_done: %+v", done) } - if len(factory.sink.abortReasons) == 0 { + if len(sink.abortReasons) == 0 { t.Fatal("expected sink abort on hash mismatch") } } From ffdd1c4cc866a4d230335c519aa0e6b5f293561a Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 16 Apr 2026 15:42:17 +0000 Subject: [PATCH 38/65] revert fabric.go spacing to match fabric-protocol --- services/fabric/fabric.go | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/services/fabric/fabric.go b/services/fabric/fabric.go index e9c7833..d0e3cd9 100644 --- a/services/fabric/fabric.go +++ b/services/fabric/fabric.go @@ -32,12 +32,12 @@ func newLocalSID() string { // arrives within the timeout. func Run(ctx context.Context, tr Transport, conn *bus.Connection, nodeID, peerID string) { s := session{ - linkID: defaultLinkID, - nodeID: nodeID, - peerID: peerID, - localSID: newLocalSID(), - tr: tr, - conn: conn, + linkID: defaultLinkID, + nodeID: nodeID, + peerID: peerID, + localSID: newLocalSID(), + tr: tr, + conn: conn, } s.run(ctx) } From a2bdedf2fadead47f7fc93cf97ed11b8caea2967 Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 16 Apr 2026 15:49:32 +0000 Subject: [PATCH 39/65] refactor: replace protoMsg union with typed two-pass dispatch Unmarshal just the type header first, then dispatch into the correct typed struct via a generic helper. Each handler now receives its own protocol type instead of a shared superset union. 
Removes protoMsg and validateInbound. --- services/fabric/fabric_test.go | 6 +- services/fabric/protocol.go | 38 --------- services/fabric/session.go | 143 ++++++++++++++++++--------------- services/fabric/transfer.go | 10 +-- 4 files changed, 86 insertions(+), 111 deletions(-) diff --git a/services/fabric/fabric_test.go b/services/fabric/fabric_test.go index 48e2497..0ade6fc 100644 --- a/services/fabric/fabric_test.go +++ b/services/fabric/fabric_test.go @@ -1058,10 +1058,12 @@ func TestDumpCallDoesNotBlockPing(t *testing.T) { line []byte err error } - // Both should arrive — dump reply and pong, in either order. + type wireHeader struct { + T string `json:"t"` + } var gotReply, gotPong bool for i := 0; i < 2; i++ { - msg := readMsg[protoMsg](t, cm5) + msg := readMsg[wireHeader](t, cm5) switch msg.T { case msgReply: gotReply = true diff --git a/services/fabric/protocol.go b/services/fabric/protocol.go index 70aea53..09f32eb 100644 --- a/services/fabric/protocol.go +++ b/services/fabric/protocol.go @@ -150,44 +150,6 @@ type protoXferAbort struct { Reason string `json:"reason"` } -// protoMsg is a union struct for single-pass unmarshal in dispatch. -// Fields are the superset of all message types. Only the fields -// relevant to the T value are populated; the rest are zero. 
-type protoMsg struct { - T string `json:"t"` - Node string `json:"node,omitempty"` - Peer string `json:"peer,omitempty"` - SID string `json:"sid,omitempty"` - Proto int `json:"proto,omitempty"` - OK bool `json:"ok,omitempty"` - Caps *protoCaps `json:"caps,omitempty"` - TS int64 `json:"ts,omitempty"` - Topic []string `json:"topic,omitempty"` - Payload json.RawMessage `json:"payload,omitempty"` - Retain bool `json:"retain,omitempty"` - ID string `json:"id,omitempty"` - Corr string `json:"corr,omitempty"` - TimeoutMs int `json:"timeout_ms,omitempty"` - Err string `json:"err,omitempty"` - Kind string `json:"kind,omitempty"` - Name string `json:"name,omitempty"` - Format string `json:"format,omitempty"` - Enc string `json:"enc,omitempty"` - Size uint32 `json:"size,omitempty"` - ChunkRaw uint32 `json:"chunk_raw,omitempty"` - Chunks uint32 `json:"chunks,omitempty"` - SHA256 string `json:"sha256,omitempty"` - Meta json.RawMessage `json:"meta,omitempty"` - Seq uint32 `json:"seq,omitempty"` - Off uint32 `json:"off,omitempty"` - N uint32 `json:"n,omitempty"` - CRC32 string `json:"crc32,omitempty"` - Data string `json:"data,omitempty"` - Next uint32 `json:"next,omitempty"` - Reason string `json:"reason,omitempty"` - Info json.RawMessage `json:"info,omitempty"` -} - // ---- codec helpers ---- // marshal returns compact JSON with a trailing newline. diff --git a/services/fabric/session.go b/services/fabric/session.go index d5ced3d..518843c 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -367,82 +367,93 @@ func (s *session) promoteLink(reason string) { // ---- dispatch ---- -// validateInbound checks whether a message should be processed. -// Handshake messages (hello, hello_ack) are always accepted. -// All others require an established link and a matching session ID. 
-func (s *session) validateInbound(msg *protoMsg) bool { - if msg.T == msgHello || msg.T == msgHelloAck { - return true - } - if s.link != linkUp { - s.logKV("dropped before handshake", "type", msg.T) - return false - } - if s.peerSID != "" && msg.SID != "" && msg.SID != s.peerSID { - s.logKV("dropped: wrong session", "type", msg.T) - return false - } - return true -} - func (s *session) dispatch(line []byte) { - var msg protoMsg - if err := json.Unmarshal(line, &msg); err != nil { - if cur := s.incomingTransfer; cur != nil { - println( - "[fabric]", "sid", s.localSID, - "malformed frame dropped", - "transfer_id", cur.meta.ID, - "expected_next", strconvx.Itoa(int(cur.expectedNext)), - "line_len", strconvx.Itoa(len(line)), - "line_head", tracePreview(line), - "line_tail", traceTailPreview(line), - "err", err.Error(), - ) - return - } - println( - "[fabric]", "sid", s.localSID, - "malformed frame dropped", - "line_len", strconvx.Itoa(len(line)), - "line_head", tracePreview(line), - "line_tail", traceTailPreview(line), - "err", err.Error(), - ) + t := protoType(line) + if t == "" { + s.logMalformed(line, nil) return } s.markRx() - if !s.validateInbound(&msg) { - return - } - switch msg.T { + + switch t { case msgHello: - s.onHello(&msg) + typedDispatch(s, line, s.onHello) case msgHelloAck: - s.onHelloAck(&msg) + typedDispatch(s, line, s.onHelloAck) case msgPing: - s.onPing(&msg) + if s.requireLinkUp(t) { + typedDispatch(s, line, s.onPing) + } case msgPong: - s.onPong(&msg) + if s.requireLinkUp(t) { + typedDispatch(s, line, s.onPong) + } case msgPub: - s.onPub(&msg) + if s.requireLinkUp(t) { + typedDispatch(s, line, s.onPub) + } case msgUnretain: - s.onUnretain(&msg) + if s.requireLinkUp(t) { + typedDispatch(s, line, s.onUnretain) + } case msgCall: - s.onCall(&msg) + if s.requireLinkUp(t) { + typedDispatch(s, line, s.onCall) + } case msgReply: - s.onReply(&msg) + if s.requireLinkUp(t) { + typedDispatch(s, line, s.onReply) + } case msgXferBegin: - 
s.onTransferBegin(&msg) + if s.requireLinkUp(t) { + typedDispatch(s, line, s.onTransferBegin) + } case msgXferChunk: - s.onTransferChunk(&msg) + if s.requireLinkUp(t) { + typedDispatch(s, line, s.onTransferChunk) + } case msgXferCommit: - s.onTransferCommit(&msg) + if s.requireLinkUp(t) { + typedDispatch(s, line, s.onTransferCommit) + } case msgXferAbort: - s.onTransferAbort(&msg) + if s.requireLinkUp(t) { + typedDispatch(s, line, s.onTransferAbort) + } default: - s.logKV("unknown message type dropped", "type", msg.T) + s.logKV("unknown message type dropped", "type", t) + } +} + +func typedDispatch[T any](s *session, line []byte, handler func(*T)) { + var msg T + if err := json.Unmarshal(line, &msg); err != nil { + s.logMalformed(line, err) + return + } + handler(&msg) +} + +func (s *session) requireLinkUp(t string) bool { + if s.link != linkUp { + s.logKV("dropped before handshake", "type", t) + return false + } + return true +} + +func (s *session) logMalformed(line []byte, err error) { + errStr := "" + if err != nil { + errStr = err.Error() } + println( + "[fabric]", "sid", s.localSID, + "malformed frame dropped", + "line_len", strconvx.Itoa(len(line)), + "line_head", tracePreview(line), + "err", errStr, + ) } // notePeerIdentity records the remote peer's node, SID, and proto version. 
@@ -489,7 +500,7 @@ func hasWirePrefix(topic, prefix []string) bool { return true } -func (s *session) onHello(msg *protoMsg) { +func (s *session) onHello(msg *protoHello) { if msg.Peer != "" && msg.Peer != s.nodeID { s.log("hello dropped: wrong peer") return @@ -515,7 +526,7 @@ func (s *session) onHello(msg *protoMsg) { s.promoteLink(reason) } -func (s *session) onHelloAck(msg *protoMsg) { +func (s *session) onHelloAck(msg *protoHelloAck) { if s.isSelfControlFrame(msg.Node, msg.SID) { s.log("echoed hello_ack ignored") return @@ -530,7 +541,7 @@ func (s *session) onHelloAck(msg *protoMsg) { s.promoteLink(reason) } -func (s *session) onPing(msg *protoMsg) { +func (s *session) onPing(msg *protoPing) { s.logKV("ping rx", "peer_sid", msg.SID) if !s.sendFrame(marshal(protoPong{T: msgPong, TS: msg.TS, SID: s.localSID})) { return @@ -538,7 +549,7 @@ func (s *session) onPing(msg *protoMsg) { s.log("pong tx") } -func (s *session) onPong(msg *protoMsg) { +func (s *session) onPong(msg *protoPong) { if s.isSelfControlFrame("", msg.SID) { s.log("echoed pong ignored") return @@ -546,7 +557,7 @@ func (s *session) onPong(msg *protoMsg) { s.lastPongAt = s.lastRxAt } -func (s *session) onPub(msg *protoMsg) { +func (s *session) onPub(msg *protoPub) { localTopic := importPublishTopic(msg.Topic) if localTopic == nil { if hasWirePrefix(msg.Topic, []string{"state"}) { @@ -576,7 +587,7 @@ func (s *session) onPub(msg *protoMsg) { s.conn.Publish(s.conn.NewMessage(localTopic, msg.Payload, msg.Retain)) } -func (s *session) onUnretain(msg *protoMsg) { +func (s *session) onUnretain(msg *protoUnretain) { localTopic := importPublishTopic(msg.Topic) if localTopic == nil { s.log("incoming unretain dropped: no_route") @@ -585,7 +596,7 @@ func (s *session) onUnretain(msg *protoMsg) { s.conn.Publish(s.conn.NewMessage(localTopic, nil, true)) } -func (s *session) onCall(msg *protoMsg) { +func (s *session) onCall(msg *protoCall) { // rpc/hal/dump: handle directly — reply with config and HAL state. 
if slicesEqualStrings(msg.Topic, dumpCallTopic) { var halState *types.HALState @@ -640,7 +651,7 @@ func (s *session) onCall(msg *protoMsg) { }) } -func (s *session) onReply(msg *protoMsg) { +func (s *session) onReply(msg *protoReply) { for i, call := range s.outboundCalls { if call.id != msg.Corr { continue diff --git a/services/fabric/transfer.go b/services/fabric/transfer.go index 581ae93..0672253 100644 --- a/services/fabric/transfer.go +++ b/services/fabric/transfer.go @@ -149,7 +149,7 @@ func (s *session) abortTransfer(reason string) { } } -func validateTransferBegin(msg *protoMsg) (transferMeta, string) { +func validateTransferBegin(msg *protoXferBegin) (transferMeta, string) { if msg.ID == "" { return transferMeta{}, "xfer_begin.id" } @@ -191,7 +191,7 @@ func validateTransferBegin(msg *protoMsg) (transferMeta, string) { }, "" } -func (s *session) onTransferBegin(msg *protoMsg) { +func (s *session) onTransferBegin(msg *protoXferBegin) { meta, errStr := validateTransferBegin(msg) if errStr != "" { if msg.ID != "" { @@ -234,7 +234,7 @@ func (s *session) onTransferBegin(msg *protoMsg) { s.sendTransferReady(meta.ID, true, readyNext(0), "") } -func (s *session) onTransferChunk(msg *protoMsg) { +func (s *session) onTransferChunk(msg *protoXferChunk) { cur := s.incomingTransfer if cur == nil || cur.meta.ID != msg.ID { s.logKV("xfer_chunk dropped", "id", msg.ID) @@ -354,7 +354,7 @@ func (s *session) onTransferChunk(msg *protoMsg) { s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "") } -func (s *session) onTransferCommit(msg *protoMsg) { +func (s *session) onTransferCommit(msg *protoXferCommit) { cur := s.incomingTransfer if cur == nil || cur.meta.ID != msg.ID { s.logKV("xfer_commit dropped", "id", msg.ID) @@ -410,7 +410,7 @@ func (s *session) onTransferCommit(msg *protoMsg) { println("[fabric]", "sid", s.localSID, "transfer apply ok", "id", id) } -func (s *session) onTransferAbort(msg *protoMsg) { +func (s *session) onTransferAbort(msg *protoXferAbort) { cur := 
s.incomingTransfer if cur == nil || cur.meta.ID != msg.ID { s.logKV("xfer_abort dropped", "id", msg.ID) From b6201079fbfd053f83ded5c62811dd4b5c71090d Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 16 Apr 2026 16:04:17 +0000 Subject: [PATCH 40/65] refactor: split dispatch into pre- and post-handshake switches gofmt also realigns session struct fields. --- services/fabric/session.go | 57 +++++++++++++++----------------------- 1 file changed, 23 insertions(+), 34 deletions(-) diff --git a/services/fabric/session.go b/services/fabric/session.go index 518843c..065bce5 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -135,12 +135,12 @@ type linkStatePayload struct { // cooperative scheduler panics if multiple goroutines contend on // the bus's internal sync.Mutex. type session struct { - linkID string - nodeID string - peerID string + linkID string + nodeID string + peerID string localSID string tr Transport - conn *bus.Connection + conn *bus.Connection link linkState peerNode string @@ -378,48 +378,37 @@ func (s *session) dispatch(line []byte) { switch t { case msgHello: typedDispatch(s, line, s.onHello) + return case msgHelloAck: typedDispatch(s, line, s.onHelloAck) + return + } + + if !s.requireLinkUp(t) { + return + } + + switch t { case msgPing: - if s.requireLinkUp(t) { - typedDispatch(s, line, s.onPing) - } + typedDispatch(s, line, s.onPing) case msgPong: - if s.requireLinkUp(t) { - typedDispatch(s, line, s.onPong) - } + typedDispatch(s, line, s.onPong) case msgPub: - if s.requireLinkUp(t) { - typedDispatch(s, line, s.onPub) - } + typedDispatch(s, line, s.onPub) case msgUnretain: - if s.requireLinkUp(t) { - typedDispatch(s, line, s.onUnretain) - } + typedDispatch(s, line, s.onUnretain) case msgCall: - if s.requireLinkUp(t) { - typedDispatch(s, line, s.onCall) - } + typedDispatch(s, line, s.onCall) case msgReply: - if s.requireLinkUp(t) { - typedDispatch(s, line, s.onReply) - } + typedDispatch(s, line, s.onReply) case msgXferBegin: - if 
s.requireLinkUp(t) { - typedDispatch(s, line, s.onTransferBegin) - } + typedDispatch(s, line, s.onTransferBegin) case msgXferChunk: - if s.requireLinkUp(t) { - typedDispatch(s, line, s.onTransferChunk) - } + typedDispatch(s, line, s.onTransferChunk) case msgXferCommit: - if s.requireLinkUp(t) { - typedDispatch(s, line, s.onTransferCommit) - } + typedDispatch(s, line, s.onTransferCommit) case msgXferAbort: - if s.requireLinkUp(t) { - typedDispatch(s, line, s.onTransferAbort) - } + typedDispatch(s, line, s.onTransferAbort) default: s.logKV("unknown message type dropped", "type", t) } From 5de3f1dd53e6cfcda636147e585211a2566eb4b5 Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 16 Apr 2026 16:09:28 +0000 Subject: [PATCH 41/65] refactor: drop partition field from hal/dump reply Unused by CM5 tooling; abupdate import pulls an RP2350-specific dependency into board-agnostic fabric code. Boot-time log in main.go remains. --- services/fabric/session.go | 9 --------- 1 file changed, 9 deletions(-) diff --git a/services/fabric/session.go b/services/fabric/session.go index 065bce5..01407e5 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -6,8 +6,6 @@ import ( "errors" "time" - "pico2-a-b/abupdate" - "devicecode-go/bus" "devicecode-go/types" "devicecode-go/x/strconvx" @@ -90,7 +88,6 @@ type dumpReply struct { Applied bool `json:"applied"` ConfigCount int `json:"config_count,omitempty"` ConfigError string `json:"config_error,omitempty"` - Partition string `json:"partition,omitempty"` } type inboundCall struct { @@ -601,11 +598,6 @@ func (s *session) onCall(msg *protoCall) { } s.conn.Unsubscribe(sub) - partition := "" - if pp, rc := abupdate.ActivePartition(); rc == 0 { - partition = abupdate.FormatPartition(pp) - } - reply := dumpReply{ OK: true, Method: "dump", @@ -614,7 +606,6 @@ func (s *session) onCall(msg *protoCall) { Applied: s.configApplied, ConfigCount: s.configCount, ConfigError: s.lastConfigErr, - Partition: partition, } 
s.sendFrame(marshal(protoReply{T: msgReply, Corr: msg.ID, OK: true, Payload: mustMarshal(reply)})) return From ea394c1e3059ec0097787a204d6cb0de6a9cb91c Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 16 Apr 2026 16:16:34 +0000 Subject: [PATCH 42/65] refactor: drop traceTailPreview and data_tail log fields Diagnostic added while chasing mid-frame byte-drop truncation; that bug is fixed and data_head + seq/off/data_len/n give enough context for any re-occurrence. --- services/fabric/trace.go | 29 ----------------------------- services/fabric/transfer.go | 6 ------ 2 files changed, 35 deletions(-) diff --git a/services/fabric/trace.go b/services/fabric/trace.go index 4a89b50..4c2637b 100644 --- a/services/fabric/trace.go +++ b/services/fabric/trace.go @@ -36,35 +36,6 @@ func tracePreview(data []byte) string { return string(out) } -func traceTailPreview(data []byte) string { - const max = 200 - if len(data) > max { - data = data[len(data)-max:] - } - out := make([]byte, 0, len(data)*2+3) - for _, b := range data { - switch b { - case '\n': - out = append(out, '\\', 'n') - case '\r': - out = append(out, '\\', 'r') - case '\t': - out = append(out, '\\', 't') - default: - if b < 0x20 || b > 0x7e { - out = append(out, '\\', 'x') - out = append(out, hexNibble(b>>4), hexNibble(b)) - } else { - out = append(out, b) - } - } - } - if len(data) == max { - out = append([]byte("..."), out...) 
- } - return string(out) -} - func hexNibble(v byte) byte { v &= 0x0f if v < 10 { diff --git a/services/fabric/transfer.go b/services/fabric/transfer.go index 0672253..4e38dc2 100644 --- a/services/fabric/transfer.go +++ b/services/fabric/transfer.go @@ -81,10 +81,6 @@ func textPreview(s string) string { return tracePreview([]byte(s)) } -func textTailPreview(s string) string { - return traceTailPreview([]byte(s)) -} - func infoPayload(info transferInfo) json.RawMessage { if info.isZero() { return nil @@ -277,7 +273,6 @@ func (s *session) onTransferChunk(msg *protoXferChunk) { "off", u32s(msg.Off), "data_len", u32s(uint32(len(msg.Data))), "data_head", textPreview(msg.Data), - "data_tail", textTailPreview(msg.Data), ) s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "decode_failed") return @@ -295,7 +290,6 @@ func (s *session) onTransferChunk(msg *protoXferChunk) { "data_len", u32s(uint32(len(msg.Data))), "decoded", u32s(uint32(len(raw))), "data_head", textPreview(msg.Data), - "data_tail", textTailPreview(msg.Data), ) s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "size_mismatch") return From 39b9aad6ec9d0c230cce73093b2d4b98474f4435 Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 16 Apr 2026 16:19:21 +0000 Subject: [PATCH 43/65] refactor: drop firmwareVersion const and ActivePartition boot log Hardcoded version string rots and has no consumer; slot reporting belongs on the bus as telemetry (hal/cap/*), not as a board-specific import in board-agnostic main.go. 
--- main.go | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/main.go b/main.go index 5974bb9..74b705d 100644 --- a/main.go +++ b/main.go @@ -4,22 +4,17 @@ import ( "context" "time" - "pico2-a-b/abupdate" - "devicecode-go/bus" "devicecode-go/services/hal" "devicecode-go/services/reactor" "devicecode-go/types" "devicecode-go/utilities" - "devicecode-go/x/strconvx" ) // HAL const halTimeout = 5 * time.Second var halReadiness = bus.T("hal", "state") -const firmwareVersion = "2026-04-09-transfer-rxfix-1" - // ----------------------------------------------------------------------------- // Main // ----------------------------------------------------------------------------- @@ -52,13 +47,6 @@ func main() { } } - log.Println("[main] firmware version ", firmwareVersion) - if pp, rc := abupdate.ActivePartition(); rc == 0 { - log.Println("[main] active partition ", abupdate.FormatPartition(pp)) - } else { - log.Println("[main] active partition unknown rc=", strconvx.Itoa(int(rc))) - } - // Reactor r := reactor.NewReactor(b, uiConn) r.Run(ctx) From cf9e201b6551e925ac46418e367522f8e2f64e75 Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 16 Apr 2026 16:25:29 +0000 Subject: [PATCH 44/65] refactor: restore log.Println in emitMemSnapshot Keep the runtime.GC() drop (deliberate stop-the-world every 2 s corrupted in-flight transfers), but revert the one-site switch to println. Rest of codebase uses log.Println in hot paths; single-site divergence is noise without addressing the underlying concern. 
--- services/reactor/reactor.go | 13 +++++++------ 1 file changed, 7 insertions(+), 6 deletions(-) diff --git a/services/reactor/reactor.go b/services/reactor/reactor.go index 84158ab..f49bc64 100644 --- a/services/reactor/reactor.go +++ b/services/reactor/reactor.go @@ -471,12 +471,13 @@ func (r *Reactor) OnTempDeciC(label string, deci int, jsonKey string) { func (r *Reactor) emitMemSnapshot() { var ms runtime.MemStats runtime.ReadMemStats(&ms) - println( - "[mem]", - "alloc", int(ms.Alloc), - "heapSys", int(ms.HeapSys), - "mallocs", int(ms.Mallocs), - "frees", int(ms.Frees), + // log line + log.Println( + "[mem] ", + "alloc:", int(ms.Alloc), " ", + "heapSys:", int(ms.HeapSys), " ", + "mallocs:", int(ms.Mallocs), " ", + "frees:", int(ms.Frees), ) // JSON (minimal to keep overhead low) if r.jsonOut != nil { From 5df60c86d05650073e463ee72d008845a4fefdb1 Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 16 Apr 2026 16:33:36 +0000 Subject: [PATCH 45/65] tune: trim fabric TX shmring to 2048 (one max-line frame) TX drains at 115200 baud (~185 ms for 2048 B) and MCU does not produce back-to-back max-size frames, so sizing TX to exactly maxLineLen is principled and saves 2 KiB SRAM. RX stays at 4 KiB to match the uartx software ring. --- services/reactor/reactor.go | 8 +++++++- 1 file changed, 7 insertions(+), 1 deletion(-) diff --git a/services/reactor/reactor.go b/services/reactor/reactor.go index f49bc64..0df92dc 100644 --- a/services/reactor/reactor.go +++ b/services/reactor/reactor.go @@ -18,8 +18,14 @@ import ( const fabricWaitLogInterval = 2 * time.Second const ( + // RX matches the uartx software RX ring so neither becomes the + // bottleneck during firmware transfer bursts. fabricSerialRXSize = 4096 - fabricSerialTXSize = 4096 + // TX is sized to hold one max-size wire frame (fabric.maxLineLen + // = 2048 + \n). MCU-outbound traffic is dominated by small frames; + // the rare 2 KiB frame (e.g. 
hal/dump reply) drains in ~185 ms at + // 115200 baud — well inside the main-loop stall budget. + fabricSerialTXSize = 2048 ) // ----------------------------------------------------------------------------- From 0c6eca48c0181be9d2eccd6ff1083b9749828e69 Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 16 Apr 2026 16:39:46 +0000 Subject: [PATCH 46/65] revert: restore fabric TX shmring to 4096 MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Trimming to 2048 coincided with an MCU→CM5 decode_failed during hardware transfer test. Theoretical derivation said 2048 should be enough, but empirical beats analytical — revert to match RX. --- services/reactor/reactor.go | 8 +------- 1 file changed, 1 insertion(+), 7 deletions(-) diff --git a/services/reactor/reactor.go b/services/reactor/reactor.go index 0df92dc..f49bc64 100644 --- a/services/reactor/reactor.go +++ b/services/reactor/reactor.go @@ -18,14 +18,8 @@ import ( const fabricWaitLogInterval = 2 * time.Second const ( - // RX matches the uartx software RX ring so neither becomes the - // bottleneck during firmware transfer bursts. fabricSerialRXSize = 4096 - // TX is sized to hold one max-size wire frame (fabric.maxLineLen - // = 2048 + \n). MCU-outbound traffic is dominated by small frames; - // the rare 2 KiB frame (e.g. hal/dump reply) drains in ~185 ms at - // 115200 baud — well inside the main-loop stall budget. - fabricSerialTXSize = 2048 + fabricSerialTXSize = 4096 ) // ----------------------------------------------------------------------------- From 0aad351b82536555b834b88d6ce2fa2f35d7cc1c Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 16 Apr 2026 16:48:30 +0000 Subject: [PATCH 47/65] refactor: drop rxBytesTotal heartbeat from serial_raw MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit rxBytesTotal + 64 KiB quantum existed only to emit a liveness heartbeat; fabric-layer xfer_chunk logging covers the same signal. 
Keep rxRingFull — that is the back-pressure alarm and is not duplicated elsewhere. --- services/hal/devices/serial_raw/builder.go | 25 ++++++---------------- 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/services/hal/devices/serial_raw/builder.go b/services/hal/devices/serial_raw/builder.go index bb4fcdf..29db7dc 100644 --- a/services/hal/devices/serial_raw/builder.go +++ b/services/hal/devices/serial_raw/builder.go @@ -52,11 +52,9 @@ type session struct { txRing *shmring.Ring // Reactor-owned observability. Single writer only. - rxBytesTotal uint64 - rxRingFull uint32 - rxLogAt time.Time - rxLogHits uint32 - rxLogQuantum uint64 + rxRingFull uint32 + rxLogAt time.Time + rxLogHits uint32 // Single worker (reactor) for the port. ctx context.Context @@ -326,21 +324,16 @@ func (d *Device) stopSession() { // ---- Reactor (single goroutine) ---- func (d *Device) logRxCountersIfDue(s *session, force bool) { - const ( - rxLogMinInterval = 1 * time.Second - rxLogBytesQuantum = 64 * 1024 - ) + const rxLogMinInterval = 1 * time.Second hits := s.rxRingFull - bytes := s.rxBytesTotal - quantum := bytes / rxLogBytesQuantum if !force { now := time.Now() if now.Sub(s.rxLogAt) < rxLogMinInterval { return } - if hits == s.rxLogHits && quantum == s.rxLogQuantum { + if hits == s.rxLogHits { return } s.rxLogAt = now @@ -349,13 +342,11 @@ func (d *Device) logRxCountersIfDue(s *session, force bool) { } println( - "[serial-raw]", "rx", + "[serial-raw]", "rx_ring_full", "uart", d.a.Name, - "bytes_total", strconvx.Utoa64(bytes), - "ring_full", strconvx.Utoa64(uint64(hits)), + "hits", strconvx.Utoa64(uint64(hits)), ) s.rxLogHits = hits - s.rxLogQuantum = quantum } func (d *Device) reactor(s *session) { @@ -381,7 +372,6 @@ func (d *Device) reactor(s *session) { } if n1 < len(p1) { rxR.WriteCommit(n1) - s.rxBytesTotal += uint64(n1) made = true continue } @@ -390,7 +380,6 @@ func (d *Device) reactor(s *session) { n2 = u.TryRead(p2) } rxR.WriteCommit(n1 + n2) - s.rxBytesTotal += 
uint64(n1 + n2) made = true } From 3a2cc00fe87451db4f6743132fa4e50730e08342 Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 16 Apr 2026 16:54:19 +0000 Subject: [PATCH 48/65] refactor: rename logRxCountersIfDue -> logRingFullChange Describes the trigger (count changed) honestly and drops the awkward IfDue idiom. --- services/hal/devices/serial_raw/builder.go | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/services/hal/devices/serial_raw/builder.go b/services/hal/devices/serial_raw/builder.go index 29db7dc..f417760 100644 --- a/services/hal/devices/serial_raw/builder.go +++ b/services/hal/devices/serial_raw/builder.go @@ -323,7 +323,7 @@ func (d *Device) stopSession() { // ---- Reactor (single goroutine) ---- -func (d *Device) logRxCountersIfDue(s *session, force bool) { +func (d *Device) logRingFullChange(s *session, force bool) { const rxLogMinInterval = 1 * time.Second hits := s.rxRingFull @@ -411,10 +411,10 @@ func (d *Device) reactor(s *session) { } // Idle: wait for any edge, then re-check. - d.logRxCountersIfDue(s, false) + d.logRingFullChange(s, false) select { case <-s.ctx.Done(): - d.logRxCountersIfDue(s, true) + d.logRingFullChange(s, true) return case <-u.Readable(): case <-u.Writable(): From a1d1df2a9a33072a241452cd23d3fc80b9eb1790 Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 16 Apr 2026 17:12:44 +0000 Subject: [PATCH 49/65] revert: drop explicit fabric shmring sizing, use serial_raw default MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Per maintainer guidance (buffering should live in protocol/client layer, not the HAL): rp2350TransferSink.stage (4 KiB, flash-sector aligned) and the fabric session line queue are the real protocol buffers. Shmring goes back to the 512 B serial_raw default. The runtime.GC() drop in emitMemSnapshot stays — direct empirical cause of byte drops per FABRIC_TRANSFER_FIX.md. 
--- services/reactor/reactor.go | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/services/reactor/reactor.go b/services/reactor/reactor.go index f49bc64..e58ba13 100644 --- a/services/reactor/reactor.go +++ b/services/reactor/reactor.go @@ -17,11 +17,6 @@ import ( const fabricWaitLogInterval = 2 * time.Second -const ( - fabricSerialRXSize = 4096 - fabricSerialTXSize = 4096 -) - // ----------------------------------------------------------------------------- // Thresholds & timing // ----------------------------------------------------------------------------- @@ -509,14 +504,9 @@ func (r *Reactor) Run(ctx context.Context) { subSessClosedTele := r.uiConn.Subscribe(tSessClosed(uartTele)) subSessClosedFabric := r.uiConn.Subscribe(tSessClosed(uartFabric)) - fabricOpenReq := types.SerialSessionOpen{ - RXSize: fabricSerialRXSize, - TXSize: fabricSerialTXSize, - } - // Kick open requests (fire-and-forget; events carry handles) r.uiConn.Publish(r.uiConn.NewMessage(tSessOpen(uartTele), nil, false)) - r.uiConn.Publish(r.uiConn.NewMessage(tSessOpen(uartFabric), fabricOpenReq, false)) + r.uiConn.Publish(r.uiConn.NewMessage(tSessOpen(uartFabric), nil, false)) // Retry back-off guards var retryTeleAt, retryFabricAt time.Time @@ -592,7 +582,7 @@ func (r *Reactor) Run(ctx context.Context) { nextFabricWaitLog = time.Now() log.Println("[uart1] fabric session closed") if time.Now().After(retryFabricAt) { - r.uiConn.Publish(r.uiConn.NewMessage(tSessOpen(uartFabric), fabricOpenReq, false)) + r.uiConn.Publish(r.uiConn.NewMessage(tSessOpen(uartFabric), nil, false)) retryFabricAt = time.Now().Add(2 * time.Second) } From 6e3934995e024ef435ae26dd8033eaf6e71325c7 Mon Sep 17 00:00:00 2001 From: cpunt Date: Thu, 16 Apr 2026 19:53:45 +0000 Subject: [PATCH 50/65] refactor: trim transfer.go + drop rp2350 prefix - Inline readyNext (one-shot call) - Inline textPreview (only called from error logs; removed data_head) - Drop infoPayload dead error branch (json.Marshal 
of 2-uint32 can't fail) - Compact the six xfer_need error blocks (keep seq/off/data_len, drop data_head noise) - Route sink.Abort + clearTransfer through abortTransfer for consistent logging - Rename rp2350TransferSink/rp2350TransferStageSize -> transferSinkImpl/stageSize Net 58 lines removed, wire behaviour unchanged, tests pass. --- services/fabric/transfer.go | 138 +++++++----------------- services/fabric/transfer_sink_rp2350.go | 18 ++-- 2 files changed, 49 insertions(+), 107 deletions(-) diff --git a/services/fabric/transfer.go b/services/fabric/transfer.go index 4e38dc2..428a34d 100644 --- a/services/fabric/transfer.go +++ b/services/fabric/transfer.go @@ -69,26 +69,15 @@ func sha256Hex(h hash.Hash) string { return hex.EncodeToString(sum) } -func readyNext(v uint32) *uint32 { - return &v -} - func u32s(v uint32) string { return strconvx.Itoa(int(v)) } -func textPreview(s string) string { - return tracePreview([]byte(s)) -} - func infoPayload(info transferInfo) json.RawMessage { if info.isZero() { return nil } - b, err := json.Marshal(info) - if err != nil { - return nil - } + b, _ := json.Marshal(info) return json.RawMessage(b) } @@ -227,7 +216,7 @@ func (s *session) onTransferBegin(msg *protoXferBegin) { "chunks", u32s(meta.Chunks), "chunk_raw", u32s(meta.ChunkRaw), ) - s.sendTransferReady(meta.ID, true, readyNext(0), "") + s.sendTransferReady(meta.ID, true, new(uint32), "") } func (s *session) onTransferChunk(msg *protoXferChunk) { @@ -237,94 +226,57 @@ func (s *session) onTransferChunk(msg *protoXferChunk) { return } if msg.Seq != cur.expectedNext { - println( - "[fabric]", "sid", s.localSID, - "xfer_need sent", - "id", cur.meta.ID, - "next", u32s(cur.expectedNext), - "err", "unexpected_seq", - "seq", u32s(msg.Seq), - ) + println("[fabric]", "sid", s.localSID, "xfer_need sent", + "id", cur.meta.ID, "next", u32s(cur.expectedNext), + "err", "unexpected_seq", "seq", u32s(msg.Seq)) s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "unexpected_seq") return } if 
msg.Off != cur.bytesWritten { - println( - "[fabric]", "sid", s.localSID, - "xfer_need sent", - "id", cur.meta.ID, - "next", u32s(cur.expectedNext), - "err", "unexpected_offset", - "off", u32s(msg.Off), - "want_off", u32s(cur.bytesWritten), - ) + println("[fabric]", "sid", s.localSID, "xfer_need sent", + "id", cur.meta.ID, "next", u32s(cur.expectedNext), + "err", "unexpected_offset", "off", u32s(msg.Off), "want_off", u32s(cur.bytesWritten)) s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "unexpected_offset") return } raw, err := base64.RawURLEncoding.DecodeString(msg.Data) if err != nil { - println( - "[fabric]", "sid", s.localSID, - "xfer_need sent", - "id", cur.meta.ID, - "next", u32s(cur.expectedNext), - "err", "decode_failed", - "seq", u32s(msg.Seq), - "off", u32s(msg.Off), - "data_len", u32s(uint32(len(msg.Data))), - "data_head", textPreview(msg.Data), - ) + println("[fabric]", "sid", s.localSID, "xfer_need sent", + "id", cur.meta.ID, "next", u32s(cur.expectedNext), + "err", "decode_failed", "seq", u32s(msg.Seq), "off", u32s(msg.Off), + "data_len", u32s(uint32(len(msg.Data)))) s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "decode_failed") return } if uint32(len(raw)) != msg.N || msg.N == 0 { - println( - "[fabric]", "sid", s.localSID, - "xfer_need sent", - "id", cur.meta.ID, - "next", u32s(cur.expectedNext), - "err", "size_mismatch", - "seq", u32s(msg.Seq), - "off", u32s(msg.Off), - "n", u32s(msg.N), - "data_len", u32s(uint32(len(msg.Data))), - "decoded", u32s(uint32(len(raw))), - "data_head", textPreview(msg.Data), - ) + println("[fabric]", "sid", s.localSID, "xfer_need sent", + "id", cur.meta.ID, "next", u32s(cur.expectedNext), + "err", "size_mismatch", "seq", u32s(msg.Seq), "off", u32s(msg.Off), + "n", u32s(msg.N), "data_len", u32s(uint32(len(msg.Data))), "decoded", u32s(uint32(len(raw)))) s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "size_mismatch") return } if crc32Hex(raw) != lowerHex(msg.CRC32) { - println( - "[fabric]", "sid", s.localSID, - 
"xfer_need sent", - "id", cur.meta.ID, - "next", u32s(cur.expectedNext), - "err", "bad_crc", - "seq", u32s(msg.Seq), - ) + println("[fabric]", "sid", s.localSID, "xfer_need sent", + "id", cur.meta.ID, "next", u32s(cur.expectedNext), + "err", "bad_crc", "seq", u32s(msg.Seq)) s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "bad_crc") return } if cur.bytesWritten+uint32(len(raw)) > cur.meta.Size { - println( - "[fabric]", "sid", s.localSID, - "xfer_need sent", - "id", cur.meta.ID, - "next", u32s(cur.expectedNext), - "err", "size_mismatch", - "bytes_written", u32s(cur.bytesWritten), - "raw_len", u32s(uint32(len(raw))), - "total", u32s(cur.meta.Size), - ) + println("[fabric]", "sid", s.localSID, "xfer_need sent", + "id", cur.meta.ID, "next", u32s(cur.expectedNext), + "err", "size_mismatch", "bytes_written", u32s(cur.bytesWritten), + "raw_len", u32s(uint32(len(raw))), "total", u32s(cur.meta.Size)) s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "size_mismatch") return } if err := cur.sink.WriteChunk(msg.Seq, msg.Off, raw); err != nil { s.logKV("transfer write failed", "err", err.Error()) - _ = cur.sink.Abort(err.Error()) - s.clearTransfer() - s.sendTransferDone(cur.meta.ID, false, transferInfo{}, err.Error()) + id := cur.meta.ID + reason := err.Error() + s.abortTransfer(reason) + s.sendTransferDone(id, false, transferInfo{}, reason) return } _, _ = cur.hasher.Write(raw) @@ -354,38 +306,31 @@ func (s *session) onTransferCommit(msg *protoXferCommit) { s.logKV("xfer_commit dropped", "id", msg.ID) return } + id := cur.meta.ID if msg.Size != cur.meta.Size || cur.bytesWritten != cur.meta.Size { - println( - "[fabric]", "sid", s.localSID, - "xfer_commit failed", - "id", cur.meta.ID, - "err", "size_mismatch", + println("[fabric]", "sid", s.localSID, "xfer_commit failed", + "id", id, "err", "size_mismatch", "bytes_written", u32s(cur.bytesWritten), - "msg_size", u32s(msg.Size), - "meta_size", u32s(cur.meta.Size), - ) - _ = cur.sink.Abort("size_mismatch") - s.clearTransfer() - 
s.sendTransferDone(cur.meta.ID, false, transferInfo{}, "size_mismatch") + "msg_size", u32s(msg.Size), "meta_size", u32s(cur.meta.Size)) + s.abortTransfer("size_mismatch") + s.sendTransferDone(id, false, transferInfo{}, "size_mismatch") return } if lowerHex(msg.SHA256) != cur.meta.SHA256 || sha256Hex(cur.hasher) != cur.meta.SHA256 { - println("[fabric]", "sid", s.localSID, "xfer_commit failed", "id", cur.meta.ID, "err", "sha256_mismatch") - _ = cur.sink.Abort("sha256_mismatch") - s.clearTransfer() - s.sendTransferDone(cur.meta.ID, false, transferInfo{}, "sha256_mismatch") + s.logKV("xfer_commit failed: sha256_mismatch", "id", id) + s.abortTransfer("sha256_mismatch") + s.sendTransferDone(id, false, transferInfo{}, "sha256_mismatch") return } info, err := cur.sink.Commit() if err != nil { s.logKV("transfer commit failed", "err", err.Error()) - _ = cur.sink.Abort(err.Error()) - s.clearTransfer() - s.sendTransferDone(cur.meta.ID, false, transferInfo{}, err.Error()) + reason := err.Error() + s.abortTransfer(reason) + s.sendTransferDone(id, false, transferInfo{}, reason) return } sink := cur.sink - id := cur.meta.ID s.clearTransfer() println( "[fabric]", "sid", s.localSID, @@ -414,9 +359,6 @@ func (s *session) onTransferAbort(msg *protoXferAbort) { if reason == "" { reason = "remote_abort" } - if err := cur.sink.Abort(reason); err != nil { - s.logKV("transfer abort failed", "err", err.Error()) - } println("[fabric]", "sid", s.localSID, "xfer_abort received", "id", cur.meta.ID, "reason", reason) - s.clearTransfer() + s.abortTransfer(reason) } diff --git a/services/fabric/transfer_sink_rp2350.go b/services/fabric/transfer_sink_rp2350.go index 6ae14f9..f117f8f 100644 --- a/services/fabric/transfer_sink_rp2350.go +++ b/services/fabric/transfer_sink_rp2350.go @@ -10,16 +10,16 @@ import ( "pico2-a-b/abupdate" ) -const rp2350TransferStageSize = 4096 +const stageSize = 4096 var errTransferUnsupported = errors.New("unsupported") -type rp2350TransferSink struct { +type 
transferSinkImpl struct { updater *abupdate.Updater // Stage verified transfer bytes in protocol code so flash writes happen in // larger batches instead of directly on every UART chunk. - stage [rp2350TransferStageSize]byte + stage [stageSize]byte stageUsed uint32 accepted uint32 } @@ -37,10 +37,10 @@ func beginTransfer(meta transferMeta) (transferSink, error) { return nil, fmt.Errorf("begin_update:%d", rc) } - return &rp2350TransferSink{updater: &updater}, nil + return &transferSinkImpl{updater: &updater}, nil } -func (s *rp2350TransferSink) flushStage(seq uint32, force bool) error { +func (s *transferSinkImpl) flushStage(seq uint32, force bool) error { if s.stageUsed == 0 { return nil } @@ -74,7 +74,7 @@ func (s *rp2350TransferSink) flushStage(seq uint32, force bool) error { return nil } -func (s *rp2350TransferSink) WriteChunk(seq, off uint32, data []byte) error { +func (s *transferSinkImpl) WriteChunk(seq, off uint32, data []byte) error { if s.accepted != off { return fmt.Errorf("unexpected_offset:%d", s.accepted) } @@ -113,7 +113,7 @@ func (s *rp2350TransferSink) WriteChunk(seq, off uint32, data []byte) error { return nil } -func (s *rp2350TransferSink) Commit() (transferInfo, error) { +func (s *transferSinkImpl) Commit() (transferInfo, error) { if err := s.flushStage(0, true); err != nil { return transferInfo{}, err } @@ -126,14 +126,14 @@ func (s *rp2350TransferSink) Commit() (transferInfo, error) { }, nil } -func (s *rp2350TransferSink) Apply() error { +func (s *transferSinkImpl) Apply() error { if rc := s.updater.RebootIntoSlot(); rc != 0 { return fmt.Errorf("reboot:%d", rc) } return nil } -func (s *rp2350TransferSink) Abort(reason string) error { +func (s *transferSinkImpl) Abort(reason string) error { _ = reason s.stageUsed = 0 return nil From 5aa43e78bfd9ca3811ef6c018c62fd903d62c1b5 Mon Sep 17 00:00:00 2001 From: cpunt Date: Tue, 28 Apr 2026 09:23:31 +0000 Subject: [PATCH 51/65] fabric: align wire schema with devicecode-lua@2c88090 Implements W1, W2, 
W4, W8 from docs/firmware-alignment-protocol.md: - xxhash32 port at x/xxhash/ with 4 KAT vectors (empty/a/abc/123456789) - Frame discriminator t -> type; reply {corr,payload} -> {id,ok,value,err} - Transfer wire fields renamed (id->xfer_id, etc.); xfer_chunk shape reduced to {xfer_id,offset,data}; checksum moves to xfer_begin/commit - Wire integrity SHA-256 -> xxHash32 hex (no algorithm field) - xfer_need.next is byte offset (not seq) - maxLineLen 2048 -> 4096 (covers chunk_size=2048 + base64url + envelope) - protoXferBegin.Meta preserved as opaque RawMessage (transfer_mgr passes meta.receiver through) - onTransferChunk aborts on every chunk-level fault (matches Lua transfer_mgr.lua: unexpected_offset / decode_failed / empty_chunk / size_overflow / sink errors all clear active transfer + send xfer_abort) - RP2350 default sink now refuses transfers (signed-image receiver lands in fabric-update); direct abupdate flashing gated behind flash_unsafe build tag - FABRIC_TRANSFER_FIX GC fix preserved with regression-guard comment Out of scope (deferred): W3 link config + idle-chunk watchdog, W5 3-lane writer, W6 active ping/session_reset/bounded helper, W7 UART role swap. 
--- services/fabric/fabric_test.go | 176 +++++----- services/fabric/protocol.go | 129 +++---- services/fabric/session.go | 26 +- services/fabric/transfer.go | 273 +++++++-------- services/fabric/transfer_sink_rp2350.go | 149 +-------- .../fabric/transfer_sink_rp2350_unsafe.go | 148 ++++++++ services/fabric/transfer_test.go | 316 ++++++++++++------ services/fabric/transport_rw.go | 8 +- x/xxhash/xxhash.go | 167 +++++++++ x/xxhash/xxhash_test.go | 157 +++++++++ 10 files changed, 988 insertions(+), 561 deletions(-) create mode 100644 services/fabric/transfer_sink_rp2350_unsafe.go create mode 100644 x/xxhash/xxhash.go create mode 100644 x/xxhash/xxhash_test.go diff --git a/services/fabric/fabric_test.go b/services/fabric/fabric_test.go index 0ade6fc..8e57731 100644 --- a/services/fabric/fabric_test.go +++ b/services/fabric/fabric_test.go @@ -67,7 +67,7 @@ const testCM5SID = "s1" func bringUp(t *testing.T, cm5 Transport) protoHelloAck { t.Helper() sendMsg(t, cm5, protoHello{ - T: "hello", Node: "cm5-local", Peer: "mcu-1", SID: testCM5SID, Proto: protoVersion, + Type: "hello", Node: "cm5-local", Peer: "mcu-1", SID: testCM5SID, Proto: protoVersion, }) ack := readMsg[protoHelloAck](t, cm5) if !ack.OK || ack.Node != "mcu-1" || ack.SID == "" || ack.Proto != protoVersion { @@ -79,17 +79,17 @@ func bringUp(t *testing.T, cm5 Transport) protoHelloAck { func unlockExports(t *testing.T, cm5 Transport) { t.Helper() - sendMsg(t, cm5, protoPing{T: "ping", TS: 77, SID: testCM5SID}) + sendMsg(t, cm5, protoPing{Type: "ping", TS: 77, SID: testCM5SID}) pong := readMsg[protoPong](t, cm5) - if pong.T != "pong" { - t.Fatalf("expected pong, got %q", pong.T) + if pong.Type != "pong" { + t.Fatalf("expected pong, got %q", pong.Type) } } // ---- codec ---- func TestCodecRoundTrip(t *testing.T) { - orig := protoHello{T: "hello", Node: "mcu-1", Peer: "cm5-local", SID: "abc", Proto: protoVersion} + orig := protoHello{Type: "hello", Node: "mcu-1", Peer: "cm5-local", SID: "abc", Proto: protoVersion} 
data := marshal(orig) if !bytes.HasSuffix(data, []byte("\n")) { t.Error("marshal should end with newline") @@ -113,21 +113,21 @@ func TestCodecAllTypes(t *testing.T) { v any want string }{ - {protoHello{T: "hello"}, "hello"}, - {protoHelloAck{T: "hello_ack"}, "hello_ack"}, - {protoPing{T: "ping", TS: 1}, "ping"}, - {protoPong{T: "pong", TS: 2}, "pong"}, - {protoPub{T: "pub", Topic: []string{"a"}}, "pub"}, - {protoUnretain{T: "unretain", Topic: []string{"a"}}, "unretain"}, - {protoCall{T: "call", ID: "c1"}, "call"}, - {protoReply{T: "reply", Corr: "c1", OK: true}, "reply"}, - {protoXferBegin{T: "xfer_begin", ID: "x1"}, "xfer_begin"}, - {protoXferReady{T: "xfer_ready", ID: "x1", OK: true}, "xfer_ready"}, - {protoXferChunk{T: "xfer_chunk", ID: "x1"}, "xfer_chunk"}, - {protoXferNeed{T: "xfer_need", ID: "x1"}, "xfer_need"}, - {protoXferCommit{T: "xfer_commit", ID: "x1"}, "xfer_commit"}, - {protoXferDone{T: "xfer_done", ID: "x1", OK: true}, "xfer_done"}, - {protoXferAbort{T: "xfer_abort", ID: "x1", Reason: "aborted"}, "xfer_abort"}, + {protoHello{Type: "hello"}, "hello"}, + {protoHelloAck{Type: "hello_ack"}, "hello_ack"}, + {protoPing{Type: "ping", TS: 1}, "ping"}, + {protoPong{Type: "pong", TS: 2}, "pong"}, + {protoPub{Type: "pub", Topic: []string{"a"}}, "pub"}, + {protoUnretain{Type: "unretain", Topic: []string{"a"}}, "unretain"}, + {protoCall{Type: "call", ID: "c1"}, "call"}, + {protoReply{Type: "reply", Corr: "c1", OK: true}, "reply"}, + {protoXferBegin{Type: "xfer_begin", XferID: "x1"}, "xfer_begin"}, + {protoXferReady{Type: "xfer_ready", XferID: "x1"}, "xfer_ready"}, + {protoXferChunk{Type: "xfer_chunk", XferID: "x1"}, "xfer_chunk"}, + {protoXferNeed{Type: "xfer_need", XferID: "x1"}, "xfer_need"}, + {protoXferCommit{Type: "xfer_commit", XferID: "x1"}, "xfer_commit"}, + {protoXferDone{Type: "xfer_done", XferID: "x1"}, "xfer_done"}, + {protoXferAbort{Type: "xfer_abort", XferID: "x1", Err: "aborted"}, "xfer_abort"}, } { b := marshal(tc.v) if got := 
protoType(b[:len(b)-1]); got != tc.want { @@ -137,7 +137,7 @@ func TestCodecAllTypes(t *testing.T) { } func TestWireTypeBadInput(t *testing.T) { - for _, b := range [][]byte{[]byte("not json"), []byte(`{"no_t":true}`), nil} { + for _, b := range [][]byte{[]byte("not json"), []byte(`{"no_type":true}`), nil} { if got := protoType(b); got != "" { t.Errorf("protoType(%q) = %q, want empty", b, got) } @@ -156,11 +156,11 @@ func TestTransportRoundTrip(t *testing.T) { t.Errorf("ReadLine: %v", err) return } - if string(line) != `{"t":"ping","ts":99}` { + if string(line) != `{"type":"ping","ts":99}` { t.Errorf("got %q", line) } }() - sendMsg(t, a, protoPing{T: "ping", TS: 99}) + sendMsg(t, a, protoPing{Type: "ping", TS: 99}) select { case <-done: case <-time.After(2 * time.Second): @@ -169,8 +169,8 @@ func TestTransportRoundTrip(t *testing.T) { } func TestOversizeLineRecovery(t *testing.T) { - big := `{"t":"ping","ts":0,"x":"` + strings.Repeat("x", maxLineLen+100) + `"}` - input := big + "\n" + `{"t":"ping","ts":3}` + "\n" + big := `{"type":"ping","ts":0,"x":"` + strings.Repeat("x", maxLineLen+100) + `"}` + input := big + "\n" + `{"type":"ping","ts":3}` + "\n" tr := NewRWTransport(strings.NewReader(input), io.Discard) _, err := tr.ReadLine() if !errors.Is(err, ErrLineTooLong) { @@ -180,7 +180,7 @@ func TestOversizeLineRecovery(t *testing.T) { if err != nil { t.Fatalf("second ReadLine: %v", err) } - if string(line) != `{"t":"ping","ts":3}` { + if string(line) != `{"type":"ping","ts":3}` { t.Errorf("got %q", line) } } @@ -193,21 +193,21 @@ func TestShmringTransportRoundTrip(t *testing.T) { mcuTr := NewShmringTransport(rx, tx) defer mcuTr.Close() - rx.TryWriteFrom([]byte(`{"t":"ping","ts":42}` + "\n")) + rx.TryWriteFrom([]byte(`{"type":"ping","ts":42}` + "\n")) line, err := mcuTr.ReadLine() if err != nil { t.Fatalf("ReadLine: %v", err) } - if string(line) != `{"t":"ping","ts":42}` { + if string(line) != `{"type":"ping","ts":42}` { t.Errorf("got %q", line) } - if err := 
mcuTr.WriteLine([]byte(`{"t":"pong","ts":42}`)); err != nil { + if err := mcuTr.WriteLine([]byte(`{"type":"pong","ts":42}`)); err != nil { t.Fatalf("WriteLine: %v", err) } var out [128]byte n := tx.TryReadInto(out[:]) - if string(out[:n]) != `{"t":"pong","ts":42}`+"\n" { + if string(out[:n]) != `{"type":"pong","ts":42}`+"\n" { t.Errorf("tx got %q", out[:n]) } } @@ -216,13 +216,13 @@ func TestShmringTransportMultiLine(t *testing.T) { rx := shmring.New(256) tr := NewShmringTransport(rx, shmring.New(256)) defer tr.Close() - rx.TryWriteFrom([]byte(`{"t":"ping","ts":1}` + "\n" + `{"t":"ping","ts":2}` + "\n")) + rx.TryWriteFrom([]byte(`{"type":"ping","ts":1}` + "\n" + `{"type":"ping","ts":2}` + "\n")) line1, _ := tr.ReadLine() line2, _ := tr.ReadLine() - if string(line1) != `{"t":"ping","ts":1}` { + if string(line1) != `{"type":"ping","ts":1}` { t.Errorf("line1 = %q", line1) } - if string(line2) != `{"t":"ping","ts":2}` { + if string(line2) != `{"type":"ping","ts":2}` { t.Errorf("line2 = %q", line2) } } @@ -274,7 +274,11 @@ func TestShmringTransportWriteLineWrapsAcrossSegments(t *testing.T) { } func TestShmringTransportOversize(t *testing.T) { - rx := shmring.New(4096) + // Ring must be larger than maxLineLen+100 + newline + the trailing ping + // frame so the producer can deposit both lines without blocking. The rx + // ring used to be 4096 when maxLineLen=2048, leaving comfortable + // headroom; now that maxLineLen=4096, bump to 8192. 
+ rx := shmring.New(8192) tr := NewShmringTransport(rx, shmring.New(256)) defer tr.Close() big := make([]byte, maxLineLen+100) @@ -283,7 +287,7 @@ func TestShmringTransportOversize(t *testing.T) { } rx.TryWriteFrom(big) rx.TryWriteFrom([]byte("\n")) - rx.TryWriteFrom([]byte(`{"t":"ping","ts":7}` + "\n")) + rx.TryWriteFrom([]byte(`{"type":"ping","ts":7}` + "\n")) _, err := tr.ReadLine() if !errors.Is(err, ErrLineTooLong) { t.Fatalf("expected ErrLineTooLong, got %v", err) @@ -292,7 +296,7 @@ func TestShmringTransportOversize(t *testing.T) { if err != nil { t.Fatalf("second ReadLine: %v", err) } - if string(line) != `{"t":"ping","ts":7}` { + if string(line) != `{"type":"ping","ts":7}` { t.Errorf("got %q", line) } } @@ -319,14 +323,14 @@ func TestHandshake(t *testing.T) { go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") sendMsg(t, cm5, protoHello{ - T: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "s1", Proto: protoVersion, + Type: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "s1", Proto: protoVersion, }) ack := readMsg[protoHelloAck](t, cm5) if !ack.OK || ack.Node != "mcu-1" || ack.SID == "" || ack.Proto != protoVersion { t.Errorf("bad ack: %+v", ack) } time.Sleep(50 * time.Millisecond) - sendMsg(t, cm5, protoPing{T: "ping", TS: 99, SID: "s1"}) + sendMsg(t, cm5, protoPing{Type: "ping", TS: 99, SID: "s1"}) pong := readMsg[protoPong](t, cm5) if pong.TS != 99 || pong.SID != ack.SID { t.Errorf("bad pong: %+v ack=%+v", pong, ack) @@ -341,12 +345,12 @@ func TestSessionReset(t *testing.T) { go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") bringUp(t, cm5) - sendMsg(t, cm5, protoHello{T: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "s2", Proto: protoVersion}) + sendMsg(t, cm5, protoHello{Type: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "s2", Proto: protoVersion}) ack := readMsg[protoHelloAck](t, cm5) if !ack.OK || ack.SID == "" || ack.Proto != protoVersion { t.Error("hello_ack.OK = false") } - sendMsg(t, cm5, protoPing{T: "ping", 
TS: 55, SID: "s2"}) + sendMsg(t, cm5, protoPing{Type: "ping", TS: 55, SID: "s2"}) pong := readMsg[protoPong](t, cm5) if pong.TS != 55 || pong.SID != ack.SID { t.Errorf("bad pong: %+v ack=%+v", pong, ack) @@ -360,7 +364,7 @@ func TestRejectsWrongPeer(t *testing.T) { defer cancel() go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") - sendMsg(t, cm5, protoHello{T: "hello", Node: "cm5-local", Peer: "mcu-999", SID: "s1", Proto: protoVersion}) + sendMsg(t, cm5, protoHello{Type: "hello", Node: "cm5-local", Peer: "mcu-999", SID: "s1", Proto: protoVersion}) gotLine := make(chan readResult, 1) go func() { line, err := cm5.ReadLine() @@ -371,7 +375,7 @@ func TestRejectsWrongPeer(t *testing.T) { t.Fatal("got response to wrong-peer hello") case <-time.After(200 * time.Millisecond): } - sendMsg(t, cm5, protoHello{T: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "s2", Proto: protoVersion}) + sendMsg(t, cm5, protoHello{Type: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "s2", Proto: protoVersion}) select { case res := <-gotLine: if res.err != nil { @@ -402,14 +406,14 @@ func TestRejectsMissingNodeWhenPeerPinned(t *testing.T) { gotLine <- readResult{line: line, err: err} }() - sendMsg(t, cm5, protoHello{T: "hello", Peer: "mcu-1", SID: "s1", Proto: protoVersion}) + sendMsg(t, cm5, protoHello{Type: "hello", Peer: "mcu-1", SID: "s1", Proto: protoVersion}) select { case <-gotLine: t.Fatal("got response to hello without node") case <-time.After(200 * time.Millisecond): } - sendMsg(t, cm5, protoHello{T: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "s2", Proto: protoVersion}) + sendMsg(t, cm5, protoHello{Type: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "s2", Proto: protoVersion}) select { case res := <-gotLine: if res.err != nil { @@ -434,7 +438,7 @@ func TestPingPong(t *testing.T) { defer cancel() go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") ack := bringUp(t, cm5) - sendMsg(t, cm5, protoPing{T: "ping", TS: 42, SID: "s1"}) + sendMsg(t, cm5, 
protoPing{Type: "ping", TS: 42, SID: "s1"}) pong := readMsg[protoPong](t, cm5) if pong.TS != 42 || pong.SID != ack.SID { t.Errorf("bad pong: %+v ack=%+v", pong, ack) @@ -464,8 +468,8 @@ func TestUnknownTypeIgnored(t *testing.T) { defer cancel() go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") bringUp(t, cm5) - cm5.WriteLine([]byte(`{"t":"future_msg"}`)) - sendMsg(t, cm5, protoPing{T: "ping", TS: 1}) + cm5.WriteLine([]byte(`{"type":"future_msg"}`)) + sendMsg(t, cm5, protoPing{Type: "ping", TS: 1}) pong := readMsg[protoPong](t, cm5) if pong.TS != 1 { t.Errorf("pong.TS = %d", pong.TS) @@ -480,7 +484,7 @@ func TestMalformedJSONIgnored(t *testing.T) { go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") bringUp(t, cm5) cm5.WriteLine([]byte("not json")) - sendMsg(t, cm5, protoPing{T: "ping", TS: 2}) + sendMsg(t, cm5, protoPing{Type: "ping", TS: 2}) pong := readMsg[protoPong](t, cm5) if pong.TS != 2 { t.Errorf("pong.TS = %d", pong.TS) @@ -685,7 +689,7 @@ func TestPubImport(t *testing.T) { sub := reader.Subscribe(bus.T("config", "hal")) sendMsg(t, cm5, protoPub{ - T: "pub", + Type: "pub", Topic: []string{"config", "device"}, Payload: json.RawMessage(`{"devices":[],"pollers":[]}`), Retain: true, @@ -721,8 +725,8 @@ func TestPubExport(t *testing.T) { )) msg := readMsg[protoPub](t, cm5) - if msg.T != "pub" { - t.Fatalf("expected pub, got %q", msg.T) + if msg.Type != "pub" { + t.Fatalf("expected pub, got %q", msg.Type) } want := []string{"state", "env", "temperature", "core", "value"} if !slicesEqual(msg.Topic, want) { @@ -751,8 +755,8 @@ func TestUnretainExport(t *testing.T) { true, )) pub := readMsg[protoPub](t, cm5) - if pub.T != "pub" || !pub.Retain { - t.Fatalf("expected retained pub, got t=%q retain=%v", pub.T, pub.Retain) + if pub.Type != "pub" || !pub.Retain { + t.Fatalf("expected retained pub, got t=%q retain=%v", pub.Type, pub.Retain) } // Clear retained state (retain=true, payload=nil). 
@@ -762,8 +766,8 @@ func TestUnretainExport(t *testing.T) { true, )) unr := readMsg[protoUnretain](t, cm5) - if unr.T != "unretain" { - t.Fatalf("expected unretain, got %q", unr.T) + if unr.Type != "unretain" { + t.Fatalf("expected unretain, got %q", unr.Type) } want := []string{"state", "env", "temperature", "core", "value"} if !slicesEqual(unr.Topic, want) { @@ -840,7 +844,7 @@ func TestPubIgnoredBeforeHandshake(t *testing.T) { go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") sendMsg(t, cm5, protoPub{ - T: "pub", Topic: []string{"config", "device"}, + Type: "pub", Topic: []string{"config", "device"}, Payload: json.RawMessage(`{"v":1}`), Retain: true, }) time.Sleep(50 * time.Millisecond) @@ -877,7 +881,7 @@ func TestUnretainIgnoredBeforeHandshake(t *testing.T) { t.Fatal("timed out waiting for retained config/device") } - sendMsg(t, cm5, protoUnretain{T: "unretain", Topic: []string{"config", "device"}}) + sendMsg(t, cm5, protoUnretain{Type: "unretain", Topic: []string{"config", "device"}}) select { case m := <-sub.Channel(): t.Fatalf("unexpected pre-handshake unretain effect: %+v", m) @@ -895,11 +899,11 @@ func TestUnretain(t *testing.T) { bringUp(t, cm5) sendMsg(t, cm5, protoPub{ - T: "pub", Topic: []string{"config", "device"}, + Type: "pub", Topic: []string{"config", "device"}, Payload: json.RawMessage(`{"v":1}`), Retain: true, }) time.Sleep(50 * time.Millisecond) - sendMsg(t, cm5, protoUnretain{T: "unretain", Topic: []string{"config", "device"}}) + sendMsg(t, cm5, protoUnretain{Type: "unretain", Topic: []string{"config", "device"}}) time.Sleep(50 * time.Millisecond) reader := b.NewConnection("test") @@ -928,7 +932,7 @@ func TestCallIgnoredBeforeHandshake(t *testing.T) { defer handler.Unsubscribe(sub) sendMsg(t, cm5, protoCall{ - T: "call", ID: "pre-hello-1", Topic: []string{"rpc", "hal", "dump"}, + Type: "call", ID: "pre-hello-1", Topic: []string{"rpc", "hal", "dump"}, Payload: json.RawMessage(`{}`), TimeoutMs: 5000, }) @@ -957,7 +961,7 @@ func 
TestCallImport(t *testing.T) { }() sendMsg(t, cm5, protoCall{ - T: "call", ID: "test-corr-1", Topic: []string{"rpc", "hal", "dump"}, + Type: "call", ID: "test-corr-1", Topic: []string{"rpc", "hal", "dump"}, Payload: json.RawMessage(`{}`), TimeoutMs: 5000, }) @@ -979,7 +983,7 @@ func TestCallNoRoute(t *testing.T) { bringUp(t, cm5) sendMsg(t, cm5, protoCall{ - T: "call", ID: "no-route-1", Topic: []string{"unknown", "endpoint"}, + Type: "call", ID: "no-route-1", Topic: []string{"unknown", "endpoint"}, Payload: json.RawMessage(`{}`), TimeoutMs: 1000, }) @@ -1006,7 +1010,7 @@ func TestDumpCallReturnsConfigState(t *testing.T) { // Send config first so the session has state. sendMsg(t, cm5, protoPub{ - T: "pub", + Type: "pub", Topic: []string{"config", "device"}, Payload: json.RawMessage(`{"devices":[],"pollers":[]}`), Retain: true, @@ -1015,7 +1019,7 @@ func TestDumpCallReturnsConfigState(t *testing.T) { // Call dump. sendMsg(t, cm5, protoCall{ - T: "call", ID: "dump-1", Topic: []string{"rpc", "hal", "dump"}, + Type: "call", ID: "dump-1", Topic: []string{"rpc", "hal", "dump"}, Payload: json.RawMessage(`{"ask":"status"}`), TimeoutMs: 5000, }) @@ -1027,7 +1031,7 @@ func TestDumpCallReturnsConfigState(t *testing.T) { t.Errorf("expected ok=true, got err=%q", reply.Err) } var dump dumpReply - if err := json.Unmarshal(reply.Payload, &dump); err != nil { + if err := json.Unmarshal(reply.Value, &dump); err != nil { t.Fatalf("unmarshal dump reply: %v", err) } if !dump.Applied { @@ -1049,28 +1053,28 @@ func TestDumpCallDoesNotBlockPing(t *testing.T) { // Send dump call and ping back-to-back. 
sendMsg(t, cm5, protoCall{ - T: "call", ID: "dump-1", Topic: []string{"rpc", "hal", "dump"}, + Type: "call", ID: "dump-1", Topic: []string{"rpc", "hal", "dump"}, Payload: json.RawMessage(`{}`), TimeoutMs: 1000, }) - sendMsg(t, cm5, protoPing{T: "ping", TS: 77, SID: testCM5SID}) + sendMsg(t, cm5, protoPing{Type: "ping", TS: 77, SID: testCM5SID}) type readResult struct { line []byte err error } type wireHeader struct { - T string `json:"t"` + Type string `json:"type"` } var gotReply, gotPong bool for i := 0; i < 2; i++ { msg := readMsg[wireHeader](t, cm5) - switch msg.T { + switch msg.Type { case msgReply: gotReply = true case msgPong: gotPong = true default: - t.Fatalf("unexpected message type %q", msg.T) + t.Fatalf("unexpected message type %q", msg.Type) } } if !gotReply { @@ -1107,8 +1111,8 @@ func TestCallExport(t *testing.T) { }() call := readMsg[protoCall](t, cm5) - if call.T != "call" { - t.Fatalf("expected call, got %q", call.T) + if call.Type != "call" { + t.Fatalf("expected call, got %q", call.Type) } want := []string{"rpc", "hal", "dump"} if !slicesEqual(call.Topic, want) { @@ -1123,10 +1127,10 @@ func TestCallExport(t *testing.T) { } sendMsg(t, cm5, protoReply{ - T: "reply", - Corr: call.ID, - OK: true, - Payload: json.RawMessage(`{"ok":true,"remote":"cm5"}`), + Type: "reply", + Corr: call.ID, + OK: true, + Value: json.RawMessage(`{"ok":true,"remote":"cm5"}`), }) select { @@ -1430,12 +1434,12 @@ func TestCallExportPeerReset(t *testing.T) { }() call := readMsg[protoCall](t, cm5) - if call.T != "call" { - t.Fatalf("expected call, got %q", call.T) + if call.Type != "call" { + t.Fatalf("expected call, got %q", call.Type) } sendMsg(t, cm5, protoHello{ - T: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "fresh-session", Proto: protoVersion, + Type: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "fresh-session", Proto: protoVersion, }) _ = readMsg[protoHelloAck](t, cm5) @@ -1488,20 +1492,20 @@ func TestEchoedHelloAckIgnoredDuringOutgoingCall(t *testing.T) { 
}() call := readMsg[protoCall](t, cm5) - if call.T != "call" { - t.Fatalf("expected call, got %q", call.T) + if call.Type != "call" { + t.Fatalf("expected call, got %q", call.Type) } // Send an echoed hello_ack (our own SID) — should be ignored. sendMsg(t, cm5, protoHelloAck{ - T: "hello_ack", Node: "mcu-1", SID: ack.SID, Proto: protoVersion, OK: true, + Type: "hello_ack", Node: "mcu-1", SID: ack.SID, Proto: protoVersion, OK: true, }) sendMsg(t, cm5, protoReply{ - T: "reply", - Corr: call.ID, - OK: true, - Payload: json.RawMessage(`{"ok":true,"remote":"cm5"}`), + Type: "reply", + Corr: call.ID, + OK: true, + Value: json.RawMessage(`{"ok":true,"remote":"cm5"}`), }) select { diff --git a/services/fabric/protocol.go b/services/fabric/protocol.go index 09f32eb..5ca38a2 100644 --- a/services/fabric/protocol.go +++ b/services/fabric/protocol.go @@ -2,7 +2,14 @@ package fabric import "encoding/json" -// ---- Wire message type identifiers (fabric.md §4) ---- +// ---- Wire message type identifiers ---- +// +// Wire schema mirrors devicecode-lua/src/services/fabric/protocol.lua at +// update-migration tip (commit 2c88090). The frame discriminator field is +// "type" (not "t"). Reply frames carry {id, ok, value, err}. Transfer frames +// use xfer_id/offset/checksum/data with a minimal xfer_chunk shape and +// xxHash32 hex wire integrity (no algorithm field; Lua source treats checksum +// as opaque hex). 
const ( msgHello = "hello" @@ -32,7 +39,7 @@ type protoCaps struct { } type protoHello struct { - T string `json:"t"` + Type string `json:"type"` Node string `json:"node"` Peer string `json:"peer"` SID string `json:"sid"` @@ -41,7 +48,7 @@ type protoHello struct { } type protoHelloAck struct { - T string `json:"t"` + Type string `json:"type"` Node string `json:"node"` SID string `json:"sid,omitempty"` Proto int `json:"proto,omitempty"` @@ -49,105 +56,105 @@ type protoHelloAck struct { } type protoPing struct { - T string `json:"t"` - TS int64 `json:"ts"` - SID string `json:"sid,omitempty"` + Type string `json:"type"` + TS int64 `json:"ts"` + SID string `json:"sid,omitempty"` } type protoPong struct { - T string `json:"t"` - TS int64 `json:"ts"` - SID string `json:"sid,omitempty"` + Type string `json:"type"` + TS int64 `json:"ts"` + SID string `json:"sid,omitempty"` } -// Not wired yet — defined for forward compatibility. - type protoPub struct { - T string `json:"t"` + Type string `json:"type"` Topic []string `json:"topic"` Payload json.RawMessage `json:"payload"` Retain bool `json:"retain"` } type protoUnretain struct { - T string `json:"t"` + Type string `json:"type"` Topic []string `json:"topic"` } type protoCall struct { - T string `json:"t"` + Type string `json:"type"` ID string `json:"id"` Topic []string `json:"topic"` Payload json.RawMessage `json:"payload"` TimeoutMs int `json:"timeout_ms"` } +// protoReply mirrors Lua's reply frame: {type, id, ok, value, err}. The Go +// field for the correlation id keeps the name "Corr" for readability — the +// wire spelling is "id" because the reply correlates to a prior call.id. 
type protoReply struct { - T string `json:"t"` - Corr string `json:"corr"` - OK bool `json:"ok"` - Payload json.RawMessage `json:"payload,omitempty"` - Err string `json:"err,omitempty"` -} - + Type string `json:"type"` + Corr string `json:"id"` + OK bool `json:"ok"` + Value json.RawMessage `json:"value,omitempty"` + Err string `json:"err,omitempty"` +} + +// protoXferBegin (control lane) — required fields per protocol.lua +// validate_control: xfer_id, size, checksum (xxHash32 hex). meta is +// optional but source-used: transfer_mgr.lua sends it on xfer_begin and +// later does conn:call(meta.receiver, …) before xfer_done. Preserve the +// blob opaquely so fabric-update's receiver can pull meta.receiver out. type protoXferBegin struct { - T string `json:"t"` - ID string `json:"id"` - Kind string `json:"kind"` - Name string `json:"name"` - Format string `json:"format"` - Enc string `json:"enc"` + Type string `json:"type"` + XferID string `json:"xfer_id"` Size uint32 `json:"size"` - ChunkRaw uint32 `json:"chunk_raw"` - Chunks uint32 `json:"chunks"` - SHA256 string `json:"sha256"` + Checksum string `json:"checksum"` Meta json.RawMessage `json:"meta,omitempty"` } +// protoXferReady (control) carries only xfer_id; success/failure is implicit +// (failure is signalled via xfer_abort). type protoXferReady struct { - T string `json:"t"` - ID string `json:"id"` - OK bool `json:"ok"` - Next *uint32 `json:"next,omitempty"` - Err string `json:"err,omitempty"` + Type string `json:"type"` + XferID string `json:"xfer_id"` } +// protoXferChunk (bulk) — minimal {xfer_id, offset, data}. No chunk-level +// checksum, no sequence number; ack is by byte offset via xfer_need.next. 
type protoXferChunk struct { - T string `json:"t"` - ID string `json:"id"` - Seq uint32 `json:"seq"` - Off uint32 `json:"off"` - N uint32 `json:"n"` - CRC32 string `json:"crc32"` - Data string `json:"data"` + Type string `json:"type"` + XferID string `json:"xfer_id"` + Offset uint32 `json:"offset"` + Data string `json:"data"` } +// protoXferNeed (control) acks the receiver's expected next byte offset. type protoXferNeed struct { - T string `json:"t"` - ID string `json:"id"` - Next uint32 `json:"next"` - Err string `json:"err,omitempty"` + Type string `json:"type"` + XferID string `json:"xfer_id"` + Next uint32 `json:"next"` } +// protoXferCommit (control) carries the same wire-integrity shape as +// xfer_begin: xfer_id, size, checksum (xxHash32 hex over the payload bytes). type protoXferCommit struct { - T string `json:"t"` - ID string `json:"id"` - Size uint32 `json:"size"` - SHA256 string `json:"sha256"` + Type string `json:"type"` + XferID string `json:"xfer_id"` + Size uint32 `json:"size"` + Checksum string `json:"checksum"` } +// protoXferDone (control) carries only xfer_id; failure is signalled via +// xfer_abort. type protoXferDone struct { - T string `json:"t"` - ID string `json:"id"` - OK bool `json:"ok"` - Info json.RawMessage `json:"info,omitempty"` - Err string `json:"err,omitempty"` + Type string `json:"type"` + XferID string `json:"xfer_id"` } +// protoXferAbort (control) carries xfer_id plus an optional err string. type protoXferAbort struct { - T string `json:"t"` - ID string `json:"id"` - Reason string `json:"reason"` + Type string `json:"type"` + XferID string `json:"xfer_id"` + Err string `json:"err,omitempty"` } // ---- codec helpers ---- @@ -162,11 +169,11 @@ func marshal(v any) []byte { return append(b, '\n') } -// protoType extracts the "t" field from a JSON line. +// protoType extracts the "type" field from a JSON line. 
func protoType(line []byte) string { var env struct { - T string `json:"t"` + Type string `json:"type"` } json.Unmarshal(line, &env) - return env.T + return env.Type } diff --git a/services/fabric/session.go b/services/fabric/session.go index 01407e5..4927c87 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -499,7 +499,7 @@ func (s *session) onHello(msg *protoHello) { s.logKV("hello rx", "peer_sid", msg.SID) if !s.sendFrame(marshal(protoHelloAck{ - T: msgHelloAck, + Type: msgHelloAck, Node: s.nodeID, SID: s.localSID, Proto: protoVersion, @@ -529,7 +529,7 @@ func (s *session) onHelloAck(msg *protoHelloAck) { func (s *session) onPing(msg *protoPing) { s.logKV("ping rx", "peer_sid", msg.SID) - if !s.sendFrame(marshal(protoPong{T: msgPong, TS: msg.TS, SID: s.localSID})) { + if !s.sendFrame(marshal(protoPong{Type: msgPong, TS: msg.TS, SID: s.localSID})) { return } s.log("pong tx") @@ -607,14 +607,14 @@ func (s *session) onCall(msg *protoCall) { ConfigCount: s.configCount, ConfigError: s.lastConfigErr, } - s.sendFrame(marshal(protoReply{T: msgReply, Corr: msg.ID, OK: true, Payload: mustMarshal(reply)})) + s.sendFrame(marshal(protoReply{Type: msgReply, Corr: msg.ID, OK: true, Value: mustMarshal(reply)})) return } localTopic := importCallTopic(msg.Topic) if localTopic == nil { s.log("incoming call dropped: no_route") - s.sendFrame(marshal(protoReply{T: msgReply, Corr: msg.ID, OK: false, Err: reasonNoRoute})) + s.sendFrame(marshal(protoReply{Type: msgReply, Corr: msg.ID, OK: false, Err: reasonNoRoute})) return } @@ -644,7 +644,7 @@ func (s *session) onReply(msg *protoReply) { s.conn.Reply(call.req, types.ErrorReply{OK: false, Error: msg.Err}, false) return } - s.conn.Reply(call.req, decodePayload(msg.Payload), false) + s.conn.Reply(call.req, decodePayload(msg.Value), false) return } @@ -776,7 +776,7 @@ func (s *session) drainExports() { } if m.Retained && m.Payload == nil { if !s.sendFrame(marshal(protoUnretain{ - T: msgUnretain, + Type: 
msgUnretain, Topic: wire, })) { return @@ -790,7 +790,7 @@ func (s *session) drainExports() { continue } if !s.sendFrame(marshal(protoPub{ - T: msgPub, + Type: msgPub, Topic: wire, Payload: payload, Retain: m.Retained, @@ -818,25 +818,25 @@ func (s *session) drainInbound(now time.Time) { s.conn.Unsubscribe(call.sub) call.sub = nil // prevent double-unsubscribe in teardownInbound if !ok || reply == nil { - if !s.sendFrame(marshal(protoReply{T: msgReply, Corr: call.id, OK: false, Err: reasonTimeout})) { + if !s.sendFrame(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: reasonTimeout})) { return } continue } if errStr := checkBusError(reply.Payload); errStr != "" { - if !s.sendFrame(marshal(protoReply{T: msgReply, Corr: call.id, OK: false, Err: errStr})) { + if !s.sendFrame(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: errStr})) { return } continue } payload, err := marshalPayload(reply.Payload) if err != nil { - if !s.sendFrame(marshal(protoReply{T: msgReply, Corr: call.id, OK: false, Err: errPayloadMarshal})) { + if !s.sendFrame(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: errPayloadMarshal})) { return } continue } - if !s.sendFrame(marshal(protoReply{T: msgReply, Corr: call.id, OK: true, Payload: payload})) { + if !s.sendFrame(marshal(protoReply{Type: msgReply, Corr: call.id, OK: true, Value: payload})) { return } continue @@ -846,7 +846,7 @@ func (s *session) drainInbound(now time.Time) { if !now.Before(call.deadline) { s.conn.Unsubscribe(call.sub) call.sub = nil - if !s.sendFrame(marshal(protoReply{T: msgReply, Corr: call.id, OK: false, Err: reasonTimeout})) { + if !s.sendFrame(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: reasonTimeout})) { return } continue @@ -893,7 +893,7 @@ func (s *session) drainOutbound(now time.Time) { }) } if !s.sendFrame(marshal(protoCall{ - T: msgCall, + Type: msgCall, ID: corr, Topic: wireTopic, Payload: payload, diff --git a/services/fabric/transfer.go 
b/services/fabric/transfer.go index 428a34d..07d1385 100644 --- a/services/fabric/transfer.go +++ b/services/fabric/transfer.go @@ -1,47 +1,45 @@ package fabric import ( - "crypto/sha256" "encoding/base64" - "encoding/hex" "encoding/json" - "fmt" - "hash" - "hash/crc32" "runtime" "strings" "time" "devicecode-go/x/strconvx" + "devicecode-go/x/xxhash" ) const postTransferDoneSettle = 250 * time.Millisecond const transferProgressLogEvery = 32 +// transferMeta captures xfer_begin contents. The required Lua wire shape is +// {xfer_id, size, checksum}; meta is optional but source-used (transfer_mgr +// passes it through to the receiver, where meta.receiver names a local +// endpoint to call after xfer_commit and before xfer_done). Preserve meta +// as an opaque blob — interpretation lives in fabric-update. type transferMeta struct { ID string - Kind string - Name string - Format string - Enc string Size uint32 - ChunkRaw uint32 - Chunks uint32 - SHA256 string + Checksum string // xxHash32 hex (8 lower-case hex chars), no algorithm field Meta json.RawMessage } +// transferInfo is internal-only state returned by the sink on Commit. It is +// no longer wire-visible — xfer_done carries only xfer_id in the canonical +// schema; size/checksum reconciliation lives on xfer_commit. type transferInfo struct { - BytesWritten uint32 `json:"bytes_written,omitempty"` - SlotXIPAddr uint32 `json:"slot_xip_addr,omitempty"` -} - -func (i transferInfo) isZero() bool { - return i.BytesWritten == 0 && i.SlotXIPAddr == 0 + BytesWritten uint32 + SlotXIPAddr uint32 } +// transferSink is the firmware-side write target for an incoming transfer. +// WriteChunk receives bytes at the given byte offset (matching xfer_chunk's +// canonical wire fields). No sequence number is passed — the caller has +// already validated offset against expected progress. 
type transferSink interface { - WriteChunk(seq, off uint32, data []byte) error + WriteChunk(offset uint32, data []byte) error Commit() (transferInfo, error) Apply() error Abort(reason string) error @@ -50,71 +48,46 @@ type transferSink interface { type incomingTransfer struct { meta transferMeta sink transferSink - expectedNext uint32 bytesWritten uint32 chunksSeen uint32 - hasher hash.Hash + hasher *xxhash.Hasher } func lowerHex(s string) string { return strings.ToLower(strings.TrimSpace(s)) } -func crc32Hex(data []byte) string { - return fmt.Sprintf("%08x", crc32.ChecksumIEEE(data)) -} - -func sha256Hex(h hash.Hash) string { - sum := h.Sum(nil) - return hex.EncodeToString(sum) -} - func u32s(v uint32) string { return strconvx.Itoa(int(v)) } -func infoPayload(info transferInfo) json.RawMessage { - if info.isZero() { - return nil - } - b, _ := json.Marshal(info) - return json.RawMessage(b) -} - -func (s *session) sendTransferReady(id string, ok bool, next *uint32, errStr string) bool { +func (s *session) sendTransferReady(id string) bool { return s.sendFrame(marshal(protoXferReady{ - T: msgXferReady, - ID: id, - OK: ok, - Next: next, - Err: errStr, + Type: msgXferReady, + XferID: id, })) } -func (s *session) sendTransferNeed(id string, next uint32, errStr string) bool { +func (s *session) sendTransferNeed(id string, next uint32) bool { return s.sendFrame(marshal(protoXferNeed{ - T: msgXferNeed, - ID: id, - Next: next, - Err: errStr, + Type: msgXferNeed, + XferID: id, + Next: next, })) } -func (s *session) sendTransferDone(id string, ok bool, info transferInfo, errStr string) bool { +func (s *session) sendTransferDone(id string) bool { return s.sendFrame(marshal(protoXferDone{ - T: msgXferDone, - ID: id, - OK: ok, - Info: infoPayload(info), - Err: errStr, + Type: msgXferDone, + XferID: id, })) } func (s *session) sendTransferAbort(id, reason string) bool { return s.sendFrame(marshal(protoXferAbort{ - T: msgXferAbort, - ID: id, - Reason: reason, + Type: msgXferAbort, 
+ XferID: id, + Err: reason, })) } @@ -135,43 +108,19 @@ func (s *session) abortTransfer(reason string) { } func validateTransferBegin(msg *protoXferBegin) (transferMeta, string) { - if msg.ID == "" { - return transferMeta{}, "xfer_begin.id" - } - if msg.Kind == "" { - return transferMeta{}, "xfer_begin.kind" - } - if msg.Name == "" { - return transferMeta{}, "xfer_begin.name" - } - if msg.Format == "" { - return transferMeta{}, "xfer_begin.format" - } - if msg.Enc == "" { - return transferMeta{}, "xfer_begin.enc" + if msg.XferID == "" { + return transferMeta{}, "xfer_begin.xfer_id" } if msg.Size == 0 { return transferMeta{}, "xfer_begin.size" } - if msg.ChunkRaw == 0 { - return transferMeta{}, "xfer_begin.chunk_raw" - } - if msg.Chunks == 0 { - return transferMeta{}, "xfer_begin.chunks" - } - if msg.SHA256 == "" { - return transferMeta{}, "xfer_begin.sha256" + if msg.Checksum == "" { + return transferMeta{}, "xfer_begin.checksum" } return transferMeta{ - ID: msg.ID, - Kind: msg.Kind, - Name: msg.Name, - Format: msg.Format, - Enc: msg.Enc, + ID: msg.XferID, Size: msg.Size, - ChunkRaw: msg.ChunkRaw, - Chunks: msg.Chunks, - SHA256: lowerHex(msg.SHA256), + Checksum: lowerHex(msg.Checksum), Meta: append(json.RawMessage(nil), msg.Meta...), }, "" } @@ -179,18 +128,14 @@ func validateTransferBegin(msg *protoXferBegin) (transferMeta, string) { func (s *session) onTransferBegin(msg *protoXferBegin) { meta, errStr := validateTransferBegin(msg) if errStr != "" { - if msg.ID != "" { - s.sendTransferReady(msg.ID, false, nil, "bad_message: "+errStr) + if msg.XferID != "" { + s.sendTransferAbort(msg.XferID, "bad_message: "+errStr) } s.logKV("xfer_begin dropped", "err", errStr) return } if s.incomingTransfer != nil { - s.sendTransferReady(meta.ID, false, nil, "busy") - return - } - if meta.Enc != "b64url" { - s.sendTransferReady(meta.ID, false, nil, "unsupported_encoding") + s.sendTransferAbort(meta.ID, "busy") return } beginFn := s.beginTransfer @@ -199,88 +144,76 @@ func (s 
*session) onTransferBegin(msg *protoXferBegin) { } sink, err := beginFn(meta) if err != nil { - s.sendTransferReady(meta.ID, false, nil, err.Error()) + s.sendTransferAbort(meta.ID, err.Error()) return } s.incomingTransfer = &incomingTransfer{ meta: meta, sink: sink, - hasher: sha256.New(), + hasher: xxhash.New(0), } println( "[fabric]", "sid", s.localSID, "xfer_begin accepted", "id", meta.ID, - "kind", meta.Kind, "size", u32s(meta.Size), - "chunks", u32s(meta.Chunks), - "chunk_raw", u32s(meta.ChunkRaw), + "checksum", meta.Checksum, ) - s.sendTransferReady(meta.ID, true, new(uint32), "") + s.sendTransferReady(meta.ID) } func (s *session) onTransferChunk(msg *protoXferChunk) { cur := s.incomingTransfer - if cur == nil || cur.meta.ID != msg.ID { - s.logKV("xfer_chunk dropped", "id", msg.ID) + if cur == nil || cur.meta.ID != msg.XferID { + s.logKV("xfer_chunk dropped", "id", msg.XferID) return } - if msg.Seq != cur.expectedNext { - println("[fabric]", "sid", s.localSID, "xfer_need sent", - "id", cur.meta.ID, "next", u32s(cur.expectedNext), - "err", "unexpected_seq", "seq", u32s(msg.Seq)) - s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "unexpected_seq") - return - } - if msg.Off != cur.bytesWritten { - println("[fabric]", "sid", s.localSID, "xfer_need sent", - "id", cur.meta.ID, "next", u32s(cur.expectedNext), - "err", "unexpected_offset", "off", u32s(msg.Off), "want_off", u32s(cur.bytesWritten)) - s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "unexpected_offset") + // Lua transfer_mgr.lua aborts and clears the active transfer on any + // chunk-level fault (unexpected offset, decode failure, size mismatch). + // Match that — do not send xfer_need + keep alive. 
+ id := cur.meta.ID + if msg.Offset != cur.bytesWritten { + println("[fabric]", "sid", s.localSID, "xfer_chunk aborted", + "id", id, "err", "unexpected_offset", + "off", u32s(msg.Offset), "want_off", u32s(cur.bytesWritten)) + s.abortTransfer("unexpected_offset") + s.sendTransferAbort(id, "unexpected_offset") return } raw, err := base64.RawURLEncoding.DecodeString(msg.Data) if err != nil { - println("[fabric]", "sid", s.localSID, "xfer_need sent", - "id", cur.meta.ID, "next", u32s(cur.expectedNext), - "err", "decode_failed", "seq", u32s(msg.Seq), "off", u32s(msg.Off), - "data_len", u32s(uint32(len(msg.Data)))) - s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "decode_failed") + println("[fabric]", "sid", s.localSID, "xfer_chunk aborted", + "id", id, "err", "decode_failed", + "off", u32s(msg.Offset), "data_len", u32s(uint32(len(msg.Data)))) + s.abortTransfer("decode_failed") + s.sendTransferAbort(id, "decode_failed") return } - if uint32(len(raw)) != msg.N || msg.N == 0 { - println("[fabric]", "sid", s.localSID, "xfer_need sent", - "id", cur.meta.ID, "next", u32s(cur.expectedNext), - "err", "size_mismatch", "seq", u32s(msg.Seq), "off", u32s(msg.Off), - "n", u32s(msg.N), "data_len", u32s(uint32(len(msg.Data))), "decoded", u32s(uint32(len(raw)))) - s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "size_mismatch") - return - } - if crc32Hex(raw) != lowerHex(msg.CRC32) { - println("[fabric]", "sid", s.localSID, "xfer_need sent", - "id", cur.meta.ID, "next", u32s(cur.expectedNext), - "err", "bad_crc", "seq", u32s(msg.Seq)) - s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "bad_crc") + if len(raw) == 0 { + println("[fabric]", "sid", s.localSID, "xfer_chunk aborted", + "id", id, "err", "empty_chunk", "off", u32s(msg.Offset)) + s.abortTransfer("empty_chunk") + s.sendTransferAbort(id, "empty_chunk") return } if cur.bytesWritten+uint32(len(raw)) > cur.meta.Size { - println("[fabric]", "sid", s.localSID, "xfer_need sent", - "id", cur.meta.ID, "next", 
u32s(cur.expectedNext), - "err", "size_mismatch", "bytes_written", u32s(cur.bytesWritten), - "raw_len", u32s(uint32(len(raw))), "total", u32s(cur.meta.Size)) - s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "size_mismatch") + println("[fabric]", "sid", s.localSID, "xfer_chunk aborted", + "id", id, "err", "size_overflow", + "bytes_written", u32s(cur.bytesWritten), + "raw_len", u32s(uint32(len(raw))), + "total", u32s(cur.meta.Size)) + s.abortTransfer("size_overflow") + s.sendTransferAbort(id, "size_overflow") return } - if err := cur.sink.WriteChunk(msg.Seq, msg.Off, raw); err != nil { - s.logKV("transfer write failed", "err", err.Error()) - id := cur.meta.ID + if err := cur.sink.WriteChunk(msg.Offset, raw); err != nil { reason := err.Error() + s.logKV("transfer write failed", "err", reason) s.abortTransfer(reason) - s.sendTransferDone(id, false, transferInfo{}, reason) + s.sendTransferAbort(id, reason) return } _, _ = cur.hasher.Write(raw) - cur.expectedNext++ cur.bytesWritten += uint32(len(raw)) cur.chunksSeen++ if cur.chunksSeen == 1 || (cur.chunksSeen%transferProgressLogEvery) == 0 { @@ -288,22 +221,23 @@ func (s *session) onTransferChunk(msg *protoXferChunk) { "[fabric]", "sid", s.localSID, "xfer_chunk accepted", "id", cur.meta.ID, - "seq", u32s(msg.Seq), - "off", u32s(msg.Off), - "n", u32s(msg.N), - "data_len", u32s(uint32(len(msg.Data))), + "off", u32s(msg.Offset), + "data_len", u32s(uint32(len(raw))), "bytes_written", u32s(cur.bytesWritten), ) } raw = nil + // Forced GC after each absorbed chunk eliminates firmware-transfer byte + // drops on the safe-window allocator. Do NOT remove this without + // reproducing the regression in firmware-mono/docs/old/FABRIC_TRANSFER_FIX.md. 
runtime.GC() - s.sendTransferNeed(cur.meta.ID, cur.expectedNext, "") + s.sendTransferNeed(cur.meta.ID, cur.bytesWritten) } func (s *session) onTransferCommit(msg *protoXferCommit) { cur := s.incomingTransfer - if cur == nil || cur.meta.ID != msg.ID { - s.logKV("xfer_commit dropped", "id", msg.ID) + if cur == nil || cur.meta.ID != msg.XferID { + s.logKV("xfer_commit dropped", "id", msg.XferID) return } id := cur.meta.ID @@ -313,13 +247,20 @@ func (s *session) onTransferCommit(msg *protoXferCommit) { "bytes_written", u32s(cur.bytesWritten), "msg_size", u32s(msg.Size), "meta_size", u32s(cur.meta.Size)) s.abortTransfer("size_mismatch") - s.sendTransferDone(id, false, transferInfo{}, "size_mismatch") + s.sendTransferAbort(id, "size_mismatch") return } - if lowerHex(msg.SHA256) != cur.meta.SHA256 || sha256Hex(cur.hasher) != cur.meta.SHA256 { - s.logKV("xfer_commit failed: sha256_mismatch", "id", id) - s.abortTransfer("sha256_mismatch") - s.sendTransferDone(id, false, transferInfo{}, "sha256_mismatch") + streamedHex := xxhashHex(cur.hasher.Sum32()) + commitChecksum := lowerHex(msg.Checksum) + if commitChecksum != cur.meta.Checksum || streamedHex != cur.meta.Checksum { + println("[fabric]", "sid", s.localSID, "xfer_commit failed", + "id", id, "err", "checksum_mismatch", + "begin", cur.meta.Checksum, + "commit", commitChecksum, + "streamed", streamedHex, + ) + s.abortTransfer("checksum_mismatch") + s.sendTransferAbort(id, "checksum_mismatch") return } info, err := cur.sink.Commit() @@ -327,7 +268,7 @@ func (s *session) onTransferCommit(msg *protoXferCommit) { s.logKV("transfer commit failed", "err", err.Error()) reason := err.Error() s.abortTransfer(reason) - s.sendTransferDone(id, false, transferInfo{}, reason) + s.sendTransferAbort(id, reason) return } sink := cur.sink @@ -338,7 +279,7 @@ func (s *session) onTransferCommit(msg *protoXferCommit) { "id", id, "bytes_written", u32s(info.BytesWritten), ) - if !s.sendTransferDone(id, true, info, "") { + if 
!s.sendTransferDone(id) { return } time.Sleep(postTransferDoneSettle) @@ -351,14 +292,26 @@ func (s *session) onTransferCommit(msg *protoXferCommit) { func (s *session) onTransferAbort(msg *protoXferAbort) { cur := s.incomingTransfer - if cur == nil || cur.meta.ID != msg.ID { - s.logKV("xfer_abort dropped", "id", msg.ID) + if cur == nil || cur.meta.ID != msg.XferID { + s.logKV("xfer_abort dropped", "id", msg.XferID) return } - reason := msg.Reason + reason := msg.Err if reason == "" { reason = "remote_abort" } println("[fabric]", "sid", s.localSID, "xfer_abort received", "id", cur.meta.ID, "reason", reason) s.abortTransfer(reason) } + +// xxhashHex formats a uint32 xxHash32 digest as 8 lower-case hex characters, +// matching the wire format used by the Lua reference's M.digest_hex. +func xxhashHex(v uint32) string { + const digits = "0123456789abcdef" + var buf [8]byte + for i := 7; i >= 0; i-- { + buf[i] = digits[v&0xf] + v >>= 4 + } + return string(buf[:]) +} diff --git a/services/fabric/transfer_sink_rp2350.go b/services/fabric/transfer_sink_rp2350.go index f117f8f..285870a 100644 --- a/services/fabric/transfer_sink_rp2350.go +++ b/services/fabric/transfer_sink_rp2350.go @@ -1,140 +1,23 @@ -//go:build tinygo && rp2350 +//go:build tinygo && rp2350 && !flash_unsafe + +// Default RP2350 transfer sink for the fabric-protocol baseline. Rejects all +// transfers at xfer_begin: signed-image verification and staged flash writes +// land in fabric-update via the receiver topic +// `raw/member/mcu/cap/updater/main/rpc/receive` and `pico2-a-b/imagev1/`. Until +// that path lands, the safe default is to refuse incoming transfers rather +// than flash unverified bytes directly into the inactive slot. +// +// To re-enable direct (unverified) abupdate flashing for development or +// hardware bring-up, build with `-tags=flash_unsafe`. See +// transfer_sink_rp2350_unsafe.go. 
package fabric -import ( - "errors" - "fmt" - "time" +import "errors" - "pico2-a-b/abupdate" -) - -const stageSize = 4096 - -var errTransferUnsupported = errors.New("unsupported") - -type transferSinkImpl struct { - updater *abupdate.Updater - - // Stage verified transfer bytes in protocol code so flash writes happen in - // larger batches instead of directly on every UART chunk. - stage [stageSize]byte - stageUsed uint32 - accepted uint32 -} +var errTransferUnsupported = errors.New("staging_unavailable: signed-image receiver not present in this build") func beginTransfer(meta transferMeta) (transferSink, error) { - if meta.Kind != "firmware.rp2350" || meta.Format != "bin" { - return nil, errTransferUnsupported - } - - var updater abupdate.Updater - if rc := updater.Init(); rc != 0 { - return nil, fmt.Errorf("updater_init:%d", rc) - } - if rc := updater.BeginUpdate(meta.Size); rc != 0 { - return nil, fmt.Errorf("begin_update:%d", rc) - } - - return &transferSinkImpl{updater: &updater}, nil -} - -func (s *transferSinkImpl) flushStage(seq uint32, force bool) error { - if s.stageUsed == 0 { - return nil - } - - before := s.updater.BytesWritten() - expected := s.accepted - s.stageUsed - if before != expected { - return fmt.Errorf("unexpected_offset:%d", before) - } - - flushed := s.stageUsed - start := time.Now() - if rc := s.updater.WriteChunk(s.stage[:flushed]); rc != 0 { - return fmt.Errorf("write_chunk:%d", rc) - } - after := s.updater.BytesWritten() - s.stageUsed = 0 - - dt := time.Since(start) - if force || seq == 0 || (seq%32) == 31 || dt >= 2*time.Millisecond { - println( - "[fabric]", "xfer_sink_flush", - "seq", u32s(seq), - "stage_n", u32s(flushed), - "bytes_before", u32s(before), - "bytes_after", u32s(after), - "dt_us", u32s(uint32(dt/time.Microsecond)), - ) - } - - return nil -} - -func (s *transferSinkImpl) WriteChunk(seq, off uint32, data []byte) error { - if s.accepted != off { - return fmt.Errorf("unexpected_offset:%d", s.accepted) - } - - remaining := 
data - for len(remaining) > 0 { - if s.stageUsed == uint32(len(s.stage)) { - if err := s.flushStage(seq, false); err != nil { - return err - } - } - - n := copy(s.stage[s.stageUsed:], remaining) - s.stageUsed += uint32(n) - s.accepted += uint32(n) - remaining = remaining[n:] - - if s.stageUsed == uint32(len(s.stage)) { - if err := s.flushStage(seq, false); err != nil { - return err - } - } - } - - if seq == 0 || (seq%32) == 31 { - println( - "[fabric]", "xfer_sink_stage", - "seq", u32s(seq), - "off", u32s(off), - "n", u32s(uint32(len(data))), - "stage_used", u32s(s.stageUsed), - "accepted", u32s(s.accepted), - ) - } - - return nil -} - -func (s *transferSinkImpl) Commit() (transferInfo, error) { - if err := s.flushStage(0, true); err != nil { - return transferInfo{}, err - } - if rc := s.updater.FlushFinal(); rc != 0 { - return transferInfo{}, fmt.Errorf("flush_final:%d", rc) - } - return transferInfo{ - BytesWritten: s.updater.BytesWritten(), - SlotXIPAddr: s.updater.SlotStorageAddr(), - }, nil -} - -func (s *transferSinkImpl) Apply() error { - if rc := s.updater.RebootIntoSlot(); rc != 0 { - return fmt.Errorf("reboot:%d", rc) - } - return nil -} - -func (s *transferSinkImpl) Abort(reason string) error { - _ = reason - s.stageUsed = 0 - return nil + _ = meta + return nil, errTransferUnsupported } diff --git a/services/fabric/transfer_sink_rp2350_unsafe.go b/services/fabric/transfer_sink_rp2350_unsafe.go new file mode 100644 index 0000000..add267b --- /dev/null +++ b/services/fabric/transfer_sink_rp2350_unsafe.go @@ -0,0 +1,148 @@ +//go:build tinygo && rp2350 && flash_unsafe + +// Direct abupdate flashing on incoming transfers. Gated behind the +// `flash_unsafe` build tag because this path flashes raw bytes without +// signed-image verification (signature verify and staging belong in +// fabric-update's receiver). Use only for development or hardware bring-up +// where unsigned images are acceptable. 
The default fabric-protocol build +// rejects transfers at xfer_begin — see transfer_sink_rp2350.go. + +package fabric + +import ( + "fmt" + "time" + + "pico2-a-b/abupdate" +) + +const stageSize = 4096 + +type transferSinkImpl struct { + updater *abupdate.Updater + + // Stage verified transfer bytes in protocol code so flash writes happen + // in larger batches instead of directly on every UART chunk. + stage [stageSize]byte + stageUsed uint32 + accepted uint32 + chunksSeen uint32 +} + +// beginTransfer creates an MCU-side sink for incoming firmware bytes. In the +// fabric-protocol baseline this accepts any well-formed transfer; image-format +// validation (kind/format/target/signature) is the receiver's job in +// fabric-update. +func beginTransfer(meta transferMeta) (transferSink, error) { + var updater abupdate.Updater + if rc := updater.Init(); rc != 0 { + return nil, fmt.Errorf("updater_init:%d", rc) + } + if rc := updater.BeginUpdate(meta.Size); rc != 0 { + return nil, fmt.Errorf("begin_update:%d", rc) + } + + return &transferSinkImpl{updater: &updater}, nil +} + +func (s *transferSinkImpl) flushStage(force bool) error { + if s.stageUsed == 0 { + return nil + } + + before := s.updater.BytesWritten() + expected := s.accepted - s.stageUsed + if before != expected { + return fmt.Errorf("unexpected_offset:%d", before) + } + + flushed := s.stageUsed + start := time.Now() + if rc := s.updater.WriteChunk(s.stage[:flushed]); rc != 0 { + return fmt.Errorf("write_chunk:%d", rc) + } + after := s.updater.BytesWritten() + s.stageUsed = 0 + + dt := time.Since(start) + chunk := s.chunksSeen + if force || chunk == 0 || (chunk%32) == 31 || dt >= 2*time.Millisecond { + println( + "[fabric]", "xfer_sink_flush", + "chunk", u32s(chunk), + "stage_n", u32s(flushed), + "bytes_before", u32s(before), + "bytes_after", u32s(after), + "dt_us", u32s(uint32(dt/time.Microsecond)), + ) + } + + return nil +} + +func (s *transferSinkImpl) WriteChunk(off uint32, data []byte) error { + if 
s.accepted != off { + return fmt.Errorf("unexpected_offset:%d", s.accepted) + } + + remaining := data + for len(remaining) > 0 { + if s.stageUsed == uint32(len(s.stage)) { + if err := s.flushStage(false); err != nil { + return err + } + } + + n := copy(s.stage[s.stageUsed:], remaining) + s.stageUsed += uint32(n) + s.accepted += uint32(n) + remaining = remaining[n:] + + if s.stageUsed == uint32(len(s.stage)) { + if err := s.flushStage(false); err != nil { + return err + } + } + } + + chunk := s.chunksSeen + if chunk == 0 || (chunk%32) == 31 { + println( + "[fabric]", "xfer_sink_stage", + "chunk", u32s(chunk), + "off", u32s(off), + "n", u32s(uint32(len(data))), + "stage_used", u32s(s.stageUsed), + "accepted", u32s(s.accepted), + ) + } + s.chunksSeen++ + + return nil +} + +func (s *transferSinkImpl) Commit() (transferInfo, error) { + if err := s.flushStage(true); err != nil { + return transferInfo{}, err + } + if rc := s.updater.FlushFinal(); rc != 0 { + return transferInfo{}, fmt.Errorf("flush_final:%d", rc) + } + return transferInfo{ + BytesWritten: s.updater.BytesWritten(), + SlotXIPAddr: s.updater.SlotStorageAddr(), + }, nil +} + +func (s *transferSinkImpl) Apply() error { + if rc := s.updater.RebootIntoSlot(); rc != 0 { + return fmt.Errorf("reboot:%d", rc) + } + return nil +} + +func (s *transferSinkImpl) Abort(reason string) error { + _ = reason + s.stageUsed = 0 + return nil +} diff --git a/services/fabric/transfer_test.go b/services/fabric/transfer_test.go index 4b06411..03fe986 100644 --- a/services/fabric/transfer_test.go +++ b/services/fabric/transfer_test.go @@ -2,19 +2,17 @@ package fabric import ( "context" - "crypto/sha256" "encoding/base64" - "encoding/hex" "encoding/json" "strings" "testing" "time" "devicecode-go/bus" + "devicecode-go/x/xxhash" ) type fakeTransferSink struct { - seqs []uint32 offs []uint32 writes [][]byte writeErr error @@ -26,11 +24,10 @@ type fakeTransferSink struct { abortReasons []string } -func (s *fakeTransferSink) 
WriteChunk(seq, off uint32, data []byte) error { +func (s *fakeTransferSink) WriteChunk(off uint32, data []byte) error { if s.writeErr != nil { return s.writeErr } - s.seqs = append(s.seqs, seq) s.offs = append(s.offs, off) s.writes = append(s.writes, append([]byte(nil), data...)) return nil @@ -73,40 +70,54 @@ func rawURL(data []byte) string { return base64.RawURLEncoding.EncodeToString(data) } -func sha256String(data []byte) string { - sum := sha256.Sum256(data) - return hex.EncodeToString(sum[:]) +// xxhashStr is the wire-format checksum: lower-case hex, 8 chars, no algorithm +// field. Mirrors the Lua reference's M.digest_hex. +func xxhashStr(data []byte) string { + return xxhash.SumHex(data) } -func TestTransferBeginUnsupportedOnHost(t *testing.T) { +func TestTransferBeginPreservesMeta(t *testing.T) { + // xfer_begin's meta is opaque to fabric-protocol but must be preserved + // for fabric-update's receiver, which pulls meta.receiver out of it. b := newBus() cm5, mcu := pipePair() ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") + var captured transferMeta + sink := &fakeTransferSink{} + s := session{ + linkID: defaultLinkID, + nodeID: "mcu-1", + peerID: "cm5-local", + localSID: "mcu-sid-test", + tr: mcu, + conn: b.NewConnection("fabric"), + beginTransfer: func(meta transferMeta) (transferSink, error) { + captured = meta + return sink, nil + }, + } + go s.run(ctx) bringUp(t, cm5) - payload := []byte("firmware") + payload := []byte("abcd") + metaBlob := json.RawMessage(`{"receiver":["raw","member","mcu","cap","updater","main","rpc","receive"],"version":"1.2.3"}`) + sendMsg(t, cm5, protoXferBegin{ - T: msgXferBegin, - ID: "xfer-1", - Kind: "firmware.rp2350", - Name: "fw.bin", - Format: "bin", - Enc: "b64url", + Type: msgXferBegin, + XferID: "xfer-meta", Size: uint32(len(payload)), - ChunkRaw: 4, - Chunks: 2, - SHA256: sha256String(payload), + Checksum: xxhashStr(payload), + 
Meta: metaBlob, }) + _ = readMsg[protoXferReady](t, cm5) - ready := readMsg[protoXferReady](t, cm5) - if ready.T != msgXferReady || ready.ID != "xfer-1" || ready.OK { - t.Fatalf("bad xfer_ready: %+v", ready) + if string(captured.Meta) != string(metaBlob) { + t.Fatalf("transferMeta.Meta = %q, want %q", captured.Meta, metaBlob) } - if ready.Err != "unsupported" { - t.Fatalf("xfer_ready.Err = %q, want unsupported", ready.Err) + if captured.ID != "xfer-meta" || captured.Size != uint32(len(payload)) { + t.Fatalf("transferMeta basic fields wrong: %+v", captured) } } @@ -127,65 +138,48 @@ func TestTransferReceiveSuccess(t *testing.T) { bringUp(t, cm5) payload := []byte("abcdefghij") + checksum := xxhashStr(payload) + sendMsg(t, cm5, protoXferBegin{ - T: msgXferBegin, - ID: "xfer-2", - Kind: "firmware.rp2350", - Name: "fw.bin", - Format: "bin", - Enc: "b64url", + Type: msgXferBegin, + XferID: "xfer-2", Size: uint32(len(payload)), - ChunkRaw: 4, - Chunks: 3, - SHA256: sha256String(payload), + Checksum: checksum, }) ready := readMsg[protoXferReady](t, cm5) - if !ready.OK || ready.Next == nil || *ready.Next != 0 { + if ready.Type != msgXferReady || ready.XferID != "xfer-2" { t.Fatalf("bad xfer_ready: %+v", ready) } - parts := [][]byte{ - payload[:4], - payload[4:8], - payload[8:], - } + parts := [][]byte{payload[:4], payload[4:8], payload[8:]} off := uint32(0) for i, part := range parts { sendMsg(t, cm5, protoXferChunk{ - T: msgXferChunk, - ID: "xfer-2", - Seq: uint32(i), - Off: off, - N: uint32(len(part)), - CRC32: crc32Hex(part), - Data: rawURL(part), + Type: msgXferChunk, + XferID: "xfer-2", + Offset: off, + Data: rawURL(part), }) need := readMsg[protoXferNeed](t, cm5) - if need.Next != uint32(i+1) || need.Err != "" { - t.Fatalf("bad xfer_need[%d]: %+v", i, need) + want := off + uint32(len(part)) + if need.Next != want { + t.Fatalf("xfer_need[%d].next = %d, want %d", i, need.Next, want) } - off += uint32(len(part)) + off = want } sendMsg(t, cm5, protoXferCommit{ - T: 
msgXferCommit, - ID: "xfer-2", - Size: uint32(len(payload)), - SHA256: sha256String(payload), + Type: msgXferCommit, + XferID: "xfer-2", + Size: uint32(len(payload)), + Checksum: checksum, }) done := readMsg[protoXferDone](t, cm5) - if !done.OK || done.ID != "xfer-2" { + if done.Type != msgXferDone || done.XferID != "xfer-2" { t.Fatalf("bad xfer_done: %+v", done) } - var info transferInfo - if err := json.Unmarshal(done.Info, &info); err != nil { - t.Fatalf("unmarshal info: %v", err) - } - if info.BytesWritten != 10 || info.SlotXIPAddr != 0x10280000 { - t.Fatalf("bad transfer info: %+v", info) - } time.Sleep(postTransferDoneSettle + 50*time.Millisecond) @@ -200,7 +194,10 @@ func TestTransferReceiveSuccess(t *testing.T) { } } -func TestTransferChunkBadCRCRequestsResend(t *testing.T) { +func TestTransferChunkBadOffsetAborts(t *testing.T) { + // Lua transfer_mgr aborts and clears the active transfer on chunk faults + // (unexpected_offset, decode_failed, size_overflow). Match that — do not + // keep the transfer alive with an xfer_need. b := newBus() cm5, mcu := pipePair() ctx, cancel := context.WithCancel(context.Background()) @@ -212,39 +209,71 @@ func TestTransferChunkBadCRCRequestsResend(t *testing.T) { payload := []byte("abcd") sendMsg(t, cm5, protoXferBegin{ - T: msgXferBegin, - ID: "xfer-3", - Kind: "firmware.rp2350", - Name: "fw.bin", - Format: "bin", - Enc: "b64url", + Type: msgXferBegin, + XferID: "xfer-3", Size: uint32(len(payload)), - ChunkRaw: 4, - Chunks: 1, - SHA256: sha256String(payload), + Checksum: xxhashStr(payload), }) _ = readMsg[protoXferReady](t, cm5) + // Send a chunk at the wrong byte offset; expect xfer_abort and + // sink.Abort, not an xfer_need retry. 
sendMsg(t, cm5, protoXferChunk{ - T: msgXferChunk, - ID: "xfer-3", - Seq: 0, - Off: 0, - N: uint32(len(payload)), - CRC32: "deadbeef", - Data: rawURL(payload), + Type: msgXferChunk, + XferID: "xfer-3", + Offset: 7, + Data: rawURL(payload), }) - need := readMsg[protoXferNeed](t, cm5) - if need.Next != 0 || need.Err != "bad_crc" { - t.Fatalf("bad xfer_need: %+v", need) + abort := readMsg[protoXferAbort](t, cm5) + if abort.Type != msgXferAbort || abort.XferID != "xfer-3" || abort.Err != "unexpected_offset" { + t.Fatalf("bad xfer_abort: %+v", abort) } if len(sink.writes) != 0 { t.Fatalf("sink received %d writes, want 0", len(sink.writes)) } + if len(sink.abortReasons) == 0 { + t.Fatal("expected sink.Abort to be called on chunk fault") + } +} + +func TestTransferChunkDecodeFailureAborts(t *testing.T) { + b := newBus() + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + sink := &fakeTransferSink{} + go runSessionWithSink(ctx, mcu, b.NewConnection("fabric"), sink) + bringUp(t, cm5) + + payload := []byte("abcd") + sendMsg(t, cm5, protoXferBegin{ + Type: msgXferBegin, + XferID: "xfer-d1", + Size: uint32(len(payload)), + Checksum: xxhashStr(payload), + }) + _ = readMsg[protoXferReady](t, cm5) + + // Bogus base64 (uses non-base64url chars). 
+ sendMsg(t, cm5, protoXferChunk{ + Type: msgXferChunk, + XferID: "xfer-d1", + Offset: 0, + Data: "!!!not-base64!!!", + }) + + abort := readMsg[protoXferAbort](t, cm5) + if abort.Err != "decode_failed" { + t.Fatalf("bad xfer_abort: %+v", abort) + } + if len(sink.abortReasons) == 0 { + t.Fatal("expected sink.Abort on decode failure") + } } -func TestTransferCommitHashMismatchReturnsDoneError(t *testing.T) { +func TestTransferChunkSizeOverflowAborts(t *testing.T) { b := newBus() cm5, mcu := pipePair() ctx, cancel := context.WithCancel(context.Background()) @@ -255,43 +284,116 @@ func TestTransferCommitHashMismatchReturnsDoneError(t *testing.T) { bringUp(t, cm5) payload := []byte("abcd") + // Advertise size=4 but send 6 bytes in the first chunk. sendMsg(t, cm5, protoXferBegin{ - T: msgXferBegin, - ID: "xfer-4", - Kind: "firmware.rp2350", - Name: "fw.bin", - Format: "bin", - Enc: "b64url", + Type: msgXferBegin, + XferID: "xfer-d2", Size: uint32(len(payload)), - ChunkRaw: 4, - Chunks: 1, - SHA256: sha256String(payload), + Checksum: xxhashStr(payload), }) _ = readMsg[protoXferReady](t, cm5) sendMsg(t, cm5, protoXferChunk{ - T: msgXferChunk, - ID: "xfer-4", - Seq: 0, - Off: 0, - N: uint32(len(payload)), - CRC32: crc32Hex(payload), - Data: rawURL(payload), + Type: msgXferChunk, + XferID: "xfer-d2", + Offset: 0, + Data: rawURL([]byte("abcdef")), + }) + + abort := readMsg[protoXferAbort](t, cm5) + if abort.Err != "size_overflow" { + t.Fatalf("bad xfer_abort: %+v", abort) + } +} + +func TestTransferCommitChecksumMismatchAborts(t *testing.T) { + b := newBus() + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + sink := &fakeTransferSink{} + go runSessionWithSink(ctx, mcu, b.NewConnection("fabric"), sink) + bringUp(t, cm5) + + payload := []byte("abcd") + // Begin with the wrong-checksum advertised. 
The only way to surface a + // commit-time mismatch is for begin/commit checksums to disagree, OR for + // the streamed bytes to disagree with the begin checksum. Use the + // latter: claim a bogus begin/commit checksum but stream the real bytes. + bogus := strings.Repeat("0", 8) + sendMsg(t, cm5, protoXferBegin{ + Type: msgXferBegin, + XferID: "xfer-4", + Size: uint32(len(payload)), + Checksum: bogus, + }) + _ = readMsg[protoXferReady](t, cm5) + + sendMsg(t, cm5, protoXferChunk{ + Type: msgXferChunk, + XferID: "xfer-4", + Offset: 0, + Data: rawURL(payload), }) _ = readMsg[protoXferNeed](t, cm5) sendMsg(t, cm5, protoXferCommit{ - T: msgXferCommit, - ID: "xfer-4", - Size: uint32(len(payload)), - SHA256: strings.Repeat("0", 64), + Type: msgXferCommit, + XferID: "xfer-4", + Size: uint32(len(payload)), + Checksum: bogus, }) - done := readMsg[protoXferDone](t, cm5) - if done.OK || done.Err != "sha256_mismatch" { - t.Fatalf("bad xfer_done: %+v", done) + abort := readMsg[protoXferAbort](t, cm5) + if abort.Type != msgXferAbort || abort.Err != "checksum_mismatch" { + t.Fatalf("bad xfer_abort: %+v", abort) } if len(sink.abortReasons) == 0 { - t.Fatal("expected sink abort on hash mismatch") + t.Fatal("expected sink abort on checksum mismatch") + } +} + +func TestTransferCommitChecksumMismatchOnCommitFrameAborts(t *testing.T) { + // xfer_begin and xfer_commit must agree on the checksum. If they + // disagree (even when the streamed bytes match begin), commit aborts. 
+ b := newBus() + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + sink := &fakeTransferSink{} + go runSessionWithSink(ctx, mcu, b.NewConnection("fabric"), sink) + bringUp(t, cm5) + + payload := []byte("abcd") + good := xxhashStr(payload) + sendMsg(t, cm5, protoXferBegin{ + Type: msgXferBegin, + XferID: "xfer-5", + Size: uint32(len(payload)), + Checksum: good, + }) + _ = readMsg[protoXferReady](t, cm5) + + sendMsg(t, cm5, protoXferChunk{ + Type: msgXferChunk, + XferID: "xfer-5", + Offset: 0, + Data: rawURL(payload), + }) + _ = readMsg[protoXferNeed](t, cm5) + + // Commit advertises a different checksum than begin: must abort. + sendMsg(t, cm5, protoXferCommit{ + Type: msgXferCommit, + XferID: "xfer-5", + Size: uint32(len(payload)), + Checksum: strings.Repeat("0", 8), + }) + + abort := readMsg[protoXferAbort](t, cm5) + if abort.Type != msgXferAbort || abort.Err != "checksum_mismatch" { + t.Fatalf("bad xfer_abort: %+v", abort) } } diff --git a/services/fabric/transport_rw.go b/services/fabric/transport_rw.go index 871316e..252b3a5 100644 --- a/services/fabric/transport_rw.go +++ b/services/fabric/transport_rw.go @@ -9,7 +9,13 @@ import ( // Used for USB serial (fabric-test) and host-side unit tests. -const maxLineLen = 2048 +// maxLineLen caps a single fabric frame (line-delimited JSON) end-to-end. It +// must clear the worst-case encoded transfer chunk: release chunk_size = 2048 +// raw → ~2731 chars base64url-encoded + ~150-byte JSON envelope + newline +// ≈ 2900 bytes. 4096 is the tightest round power-of-2 above that with ~1.1 KB +// headroom. See devicecode-lua/src/services/fabric/protocol.lua at +// update-migration tip for the canonical encoding. 
+const maxLineLen = 4096 var ErrLineTooLong = fmt.Errorf("line exceeds %d bytes", maxLineLen) diff --git a/x/xxhash/xxhash.go b/x/xxhash/xxhash.go new file mode 100644 index 0000000..7b9fc63 --- /dev/null +++ b/x/xxhash/xxhash.go @@ -0,0 +1,167 @@ +// Package xxhash implements the xxHash32 algorithm — a fast, non-cryptographic +// 32-bit hash. +// +// This package mirrors devicecode-lua/src/shared/hash/xxhash32.lua at +// update-migration tip (commit 2c88090). It is used for fabric wire-protocol +// integrity (xfer_begin / xfer_commit checksum field) and for HAL artefact +// hashing. It is not a security primitive. +package xxhash + +import "math/bits" + +// xxHash32 round constants. These match the canonical xxHash32 spec and the +// Lua reference at src/shared/hash/xxhash32.lua. +const ( + prime32_1 uint32 = 0x9E3779B1 + prime32_2 uint32 = 0x85EBCA77 + prime32_3 uint32 = 0xC2B2AE3D + prime32_4 uint32 = 0x27D4EB2F + prime32_5 uint32 = 0x165667B1 +) + +// Hasher is a streaming xxHash32 state. +type Hasher struct { + seed uint32 + totalLen uint32 + v1, v2, v3, v4 uint32 + mem [16]byte + memN uint8 // 0..15 + large bool // true once a 16-byte block has been absorbed +} + +// New returns a streaming xxHash32 hasher seeded with seed. +func New(seed uint32) *Hasher { + h := &Hasher{} + h.reset(seed) + return h +} + +// Reset re-initialises the hasher with seed 0. To re-seed with a different +// value, allocate a new Hasher with New. +func (h *Hasher) Reset() { h.reset(0) } + +func (h *Hasher) reset(seed uint32) { + h.seed = seed + h.totalLen = 0 + h.v1 = seed + prime32_1 + prime32_2 + h.v2 = seed + prime32_2 + h.v3 = seed + h.v4 = seed - prime32_1 + h.memN = 0 + h.large = false +} + +// Write absorbs p into the running hash. Always returns (len(p), nil). +func (h *Hasher) Write(p []byte) (int, error) { + n := len(p) + if n == 0 { + return 0, nil + } + h.totalLen += uint32(n) + + // Top up the partial-block buffer if it has any bytes. 
+ if h.memN > 0 { + need := 16 - int(h.memN) + if n < need { + copy(h.mem[h.memN:], p) + h.memN += uint8(n) + return n, nil + } + copy(h.mem[h.memN:], p[:need]) + h.absorbBlock(h.mem[:]) + p = p[need:] + h.memN = 0 + } + + // Absorb aligned 16-byte blocks directly from p. + for len(p) >= 16 { + h.absorbBlock(p[:16]) + p = p[16:] + } + + // Stash the trailing remainder (0..15 bytes). + if len(p) > 0 { + copy(h.mem[:], p) + h.memN = uint8(len(p)) + } + return n, nil +} + +func (h *Hasher) absorbBlock(b []byte) { + h.large = true + h.v1 = round(h.v1, leU32(b[0:4])) + h.v2 = round(h.v2, leU32(b[4:8])) + h.v3 = round(h.v3, leU32(b[8:12])) + h.v4 = round(h.v4, leU32(b[12:16])) +} + +// Sum32 returns the xxHash32 of all bytes absorbed so far. It does not modify +// the hasher state; calling Write afterwards continues the hash. +func (h *Hasher) Sum32() uint32 { + var x uint32 + if h.large { + x = bits.RotateLeft32(h.v1, 1) + + bits.RotateLeft32(h.v2, 7) + + bits.RotateLeft32(h.v3, 12) + + bits.RotateLeft32(h.v4, 18) + } else { + x = h.seed + prime32_5 + } + + x += h.totalLen + + rem := h.mem[:h.memN] + for len(rem) >= 4 { + x += leU32(rem) * prime32_3 + x = bits.RotateLeft32(x, 17) * prime32_4 + rem = rem[4:] + } + for _, b := range rem { + x += uint32(b) * prime32_5 + x = bits.RotateLeft32(x, 11) * prime32_1 + } + + x ^= x >> 15 + x *= prime32_2 + x ^= x >> 13 + x *= prime32_3 + x ^= x >> 16 + return x +} + +// Sum32 computes the xxHash32 of p with the given seed in one pass. +func Sum32(p []byte, seed uint32) uint32 { + var h Hasher + h.reset(seed) + _, _ = h.Write(p) + return h.Sum32() +} + +// SumHex returns the xxHash32 of p (seed 0) as 8 lower-case hex characters, +// matching the wire format used by the Lua reference's M.digest_hex. +func SumHex(p []byte) string { return hex8(Sum32(p, 0)) } + +// VerifyHex compares SumHex(p) to expected for case-sensitive equality. 
+func VerifyHex(p []byte, expected string) bool { return SumHex(p) == expected } + +func round(acc, lane uint32) uint32 { + acc += lane * prime32_2 + acc = bits.RotateLeft32(acc, 13) + acc *= prime32_1 + return acc +} + +func leU32(b []byte) uint32 { + return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 +} + +const hexdigits = "0123456789abcdef" + +func hex8(v uint32) string { + var buf [8]byte + for i := 7; i >= 0; i-- { + buf[i] = hexdigits[v&0xf] + v >>= 4 + } + return string(buf[:]) +} diff --git a/x/xxhash/xxhash_test.go b/x/xxhash/xxhash_test.go new file mode 100644 index 0000000..00fef90 --- /dev/null +++ b/x/xxhash/xxhash_test.go @@ -0,0 +1,157 @@ +package xxhash + +import ( + "bytes" + "testing" +) + +// Reference vectors validated against +// devicecode-lua/src/shared/hash/xxhash32.lua at update-migration tip +// (commit 2c88090) using `print(M.digest_hex(input))` with seed 0. +var refVectors = []struct { + name string + input string + hex string +}{ + {"empty", "", "02cc5d05"}, + {"a", "a", "550d7456"}, + {"abc", "abc", "32d153ff"}, + {"123456789", "123456789", "937bad67"}, +} + +func TestSumHex_KnownAnswer(t *testing.T) { + for _, v := range refVectors { + got := SumHex([]byte(v.input)) + if got != v.hex { + t.Errorf("SumHex(%q): got %s, want %s", v.input, got, v.hex) + } + } +} + +func TestSum32_KnownAnswer(t *testing.T) { + // Sum32(_, 0) must agree with SumHex (which forces seed 0). 
+ for _, v := range refVectors { + want := SumHex([]byte(v.input)) + got := hex8(Sum32([]byte(v.input), 0)) + if got != want { + t.Errorf("Sum32(%q, 0): got %s, want %s", v.input, got, want) + } + } +} + +func TestVerifyHex(t *testing.T) { + for _, v := range refVectors { + if !VerifyHex([]byte(v.input), v.hex) { + t.Errorf("VerifyHex(%q, %s) returned false", v.input, v.hex) + } + if VerifyHex([]byte(v.input), "deadbeef") { + t.Errorf("VerifyHex(%q, deadbeef) returned true", v.input) + } + } +} + +func TestStreaming_ByteByByte(t *testing.T) { + for _, v := range refVectors { + h := New(0) + for _, b := range []byte(v.input) { + h.Write([]byte{b}) + } + got := hex8(h.Sum32()) + if got != v.hex { + t.Errorf("byte-stream %q: got %s, want %s", v.input, got, v.hex) + } + } +} + +func TestStreaming_OddSplits(t *testing.T) { + // A 32-byte input spans two 16-byte blocks, so splits at 1, 7, 15, 16, + // 17, and 31 exercise mem-buffer top-up, exact block boundary, and tail + // bytes. + in := []byte("0123456789abcdef0123456789abcdef") + want := SumHex(in) + + for _, split := range []int{0, 1, 7, 15, 16, 17, 31, 32} { + h := New(0) + h.Write(in[:split]) + h.Write(in[split:]) + got := hex8(h.Sum32()) + if got != want { + t.Errorf("split=%d: got %s, want %s", split, got, want) + } + } +} + +func TestStreaming_EmptyWritesNoOp(t *testing.T) { + h := New(0) + h.Write(nil) + h.Write([]byte{}) + h.Write([]byte("abc")) + h.Write([]byte{}) + if got := hex8(h.Sum32()); got != "32d153ff" { + t.Errorf("with empty writes interleaved: got %s, want 32d153ff", got) + } +} + +func TestReset(t *testing.T) { + h := New(0) + h.Write([]byte("abc")) + if hex8(h.Sum32()) != "32d153ff" { + t.Fatalf("first sum mismatch") + } + h.Reset() + h.Write([]byte("abc")) + if hex8(h.Sum32()) != "32d153ff" { + t.Fatalf("post-reset sum mismatch") + } +} + +func TestSeedNonZero(t *testing.T) { + in := []byte("the quick brown fox jumps over the lazy dog") + if Sum32(in, 0) == Sum32(in, 1) { + t.Fatalf("seeds 0 
and 1 produced same hash") + } + h := New(42) + h.Write(in) + if h.Sum32() != Sum32(in, 42) { + t.Fatalf("streaming with seed=42 != one-shot") + } +} + +func TestSum32Idempotent(t *testing.T) { + // Sum32 should not mutate state; calling it twice must give the same result. + h := New(0) + h.Write([]byte("abc")) + a := h.Sum32() + b := h.Sum32() + if a != b { + t.Errorf("Sum32 not idempotent: %x != %x", a, b) + } +} + +func TestSum32ContinuesAfter(t *testing.T) { + // Calling Sum32, then Write, then Sum32 again must reflect the new bytes. + h := New(0) + h.Write([]byte("a")) + h.Sum32() + h.Write([]byte("bc")) + got := hex8(h.Sum32()) + if got != "32d153ff" { + t.Errorf("post-Sum32 continuation: got %s, want 32d153ff", got) + } +} + +func TestLargeBuffer(t *testing.T) { + // Confirm one-shot and streaming agree on a buffer comfortably larger + // than the 16-byte block size; this exercises the hot loop in Write. + in := bytes.Repeat([]byte("0123456789abcdef"), 64) // 1024 bytes + want := Sum32(in, 0) + + h := New(0) + for i := 0; i < len(in); i += 7 { + end := min(i+7, len(in)) + h.Write(in[i:end]) + } + if got := h.Sum32(); got != want { + t.Errorf("1024-byte streaming vs one-shot: got %x, want %x", got, want) + } +} From 0b201976f08a70b9e95dcab4c9dda76e403bad19 Mon Sep 17 00:00:00 2001 From: cpunt Date: Tue, 28 Apr 2026 09:30:29 +0000 Subject: [PATCH 52/65] fabric: W3 link config + idle-chunk watchdog - New LinkConfig{ChunkSize, PhaseTimeout} threaded via fabric.Run with release defaults from bigbox-v1-cm-2.json (2048 / 15s); zero-value config falls back to defaults via applyDefaults so direct session{} test construction stays safe. - incomingTransfer.deadline armed on xfer_begin accept and refreshed on every accepted chunk, mirroring transfer_mgr.lua. - New checkTransferTimeout fires from the existing 50ms drain tick; on expiry it aborts the local sink and emits xfer_abort{err="timeout"} to match Lua's clear_active('timeout') + outbound abort. 
- Reactor caller now passes fabric.DefaultLinkConfig(). - Test fabric_test.go callers updated to pass DefaultLinkConfig(). - New TestTransferIdleChunkWatchdog with PhaseTimeout=100ms. --- services/fabric/fabric.go | 38 +++++++++++++++++++++++- services/fabric/fabric_test.go | 50 ++++++++++++++++---------------- services/fabric/session.go | 8 +++-- services/fabric/transfer.go | 32 ++++++++++++++++++-- services/fabric/transfer_test.go | 46 +++++++++++++++++++++++++++++ services/reactor/reactor.go | 2 +- 6 files changed, 144 insertions(+), 32 deletions(-) diff --git a/services/fabric/fabric.go b/services/fabric/fabric.go index d0e3cd9..dbdc5ca 100644 --- a/services/fabric/fabric.go +++ b/services/fabric/fabric.go @@ -3,6 +3,7 @@ package fabric import ( "context" "sync/atomic" + "time" "devicecode-go/bus" "devicecode-go/x/strconvx" @@ -18,6 +19,40 @@ type Transport interface { const protoVersion = 1 const defaultLinkID = "mcu0" +// LinkConfig carries the fabric link parameters that the CM5 publishes +// alongside its own session/transfer-mgr instances. Mirrors the relevant +// keys in `bigbox-v1-cm-2.json` `service.fabric.links.` for the +// MCU-facing link. Missing fields fall back to release defaults via +// applyDefaults so callers can pass `LinkConfig{}` to mean "release". +type LinkConfig struct { + // ChunkSize is the expected raw-byte payload per xfer_chunk. The MCU + // is receive-only for transfers, so this is informational/validation + // only on the Go side. Release: 2048 bytes. + ChunkSize uint32 + // PhaseTimeout is the idle-chunk watchdog: an active inbound transfer + // is aborted with reason="timeout" if no xfer_chunk arrives within + // this window. Mirrors transfer_mgr.lua's `phase_timeout`. + // Release: 15s. 
+ PhaseTimeout time.Duration +} + +func DefaultLinkConfig() LinkConfig { + return LinkConfig{ + ChunkSize: 2048, + PhaseTimeout: 15 * time.Second, + } +} + +func (c *LinkConfig) applyDefaults() { + d := DefaultLinkConfig() + if c.ChunkSize == 0 { + c.ChunkSize = d.ChunkSize + } + if c.PhaseTimeout == 0 { + c.PhaseTimeout = d.PhaseTimeout + } +} + var nextSessionID atomic.Uint64 func newLocalSID() string { @@ -30,7 +65,7 @@ func newLocalSID() string { // and replies with hello_ack; it responds to ping with pong. The CM5 // owns heartbeat cadence — the MCU marks the link stale if nothing // arrives within the timeout. -func Run(ctx context.Context, tr Transport, conn *bus.Connection, nodeID, peerID string) { +func Run(ctx context.Context, tr Transport, conn *bus.Connection, nodeID, peerID string, cfg LinkConfig) { s := session{ linkID: defaultLinkID, nodeID: nodeID, @@ -38,6 +73,7 @@ func Run(ctx context.Context, tr Transport, conn *bus.Connection, nodeID, peerID localSID: newLocalSID(), tr: tr, conn: conn, + cfg: cfg, } s.run(ctx) } diff --git a/services/fabric/fabric_test.go b/services/fabric/fabric_test.go index 8e57731..4ea8658 100644 --- a/services/fabric/fabric_test.go +++ b/services/fabric/fabric_test.go @@ -320,7 +320,7 @@ func TestHandshake(t *testing.T) { b := newBus() ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) sendMsg(t, cm5, protoHello{ Type: "hello", Node: "cm5-local", Peer: "mcu-1", SID: "s1", Proto: protoVersion, @@ -342,7 +342,7 @@ func TestSessionReset(t *testing.T) { b := newBus() ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) bringUp(t, cm5) sendMsg(t, cm5, protoHello{Type: "hello", Node: 
"cm5-local", Peer: "mcu-1", SID: "s2", Proto: protoVersion}) @@ -362,7 +362,7 @@ func TestRejectsWrongPeer(t *testing.T) { b := newBus() ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) sendMsg(t, cm5, protoHello{Type: "hello", Node: "cm5-local", Peer: "mcu-999", SID: "s1", Proto: protoVersion}) gotLine := make(chan readResult, 1) @@ -398,7 +398,7 @@ func TestRejectsMissingNodeWhenPeerPinned(t *testing.T) { b := newBus() ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) gotLine := make(chan readResult, 1) go func() { @@ -436,7 +436,7 @@ func TestPingPong(t *testing.T) { b := newBus() ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) ack := bringUp(t, cm5) sendMsg(t, cm5, protoPing{Type: "ping", TS: 42, SID: "s1"}) pong := readMsg[protoPong](t, cm5) @@ -450,7 +450,7 @@ func TestMCUNeverInitiates(t *testing.T) { b := newBus() ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) gotLine := make(chan struct{}) go func() { cm5.ReadLine(); close(gotLine) }() select { @@ -466,7 +466,7 @@ func TestUnknownTypeIgnored(t *testing.T) { b := newBus() ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) 
bringUp(t, cm5) cm5.WriteLine([]byte(`{"type":"future_msg"}`)) sendMsg(t, cm5, protoPing{Type: "ping", TS: 1}) @@ -481,7 +481,7 @@ func TestMalformedJSONIgnored(t *testing.T) { b := newBus() ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) bringUp(t, cm5) cm5.WriteLine([]byte("not json")) sendMsg(t, cm5, protoPing{Type: "ping", TS: 2}) @@ -496,7 +496,7 @@ func TestCancelClosesCleanly(t *testing.T) { b := newBus() ctx, cancel := context.WithCancel(context.Background()) done := make(chan struct{}) - go func() { Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local"); close(done) }() + go func() { Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()); close(done) }() bringUp(t, cm5) cancel() select { @@ -515,7 +515,7 @@ func TestLinkStatePublishedOnHandshake(t *testing.T) { ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) ack := bringUp(t, cm5) @@ -682,7 +682,7 @@ func TestPubImport(t *testing.T) { conn := b.NewConnection("fabric") ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, conn, "mcu-1", "cm5-local") + go Run(ctx, mcu, conn, "mcu-1", "cm5-local", DefaultLinkConfig()) bringUp(t, cm5) reader := b.NewConnection("test") @@ -714,7 +714,7 @@ func TestPubExport(t *testing.T) { publishConn := b.NewConnection("hal") ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local") + go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local", DefaultLinkConfig()) bringUp(t, cm5) unlockExports(t, cm5) @@ -744,7 +744,7 @@ func TestUnretainExport(t *testing.T) { publishConn := 
b.NewConnection("hal") ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local") + go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local", DefaultLinkConfig()) bringUp(t, cm5) unlockExports(t, cm5) @@ -841,7 +841,7 @@ func TestPubIgnoredBeforeHandshake(t *testing.T) { b := newBus() ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) sendMsg(t, cm5, protoPub{ Type: "pub", Topic: []string{"config", "device"}, @@ -864,7 +864,7 @@ func TestUnretainIgnoredBeforeHandshake(t *testing.T) { b := newBus() ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) writer := b.NewConnection("writer") writer.Publish(writer.NewMessage(bus.T("config", "device"), json.RawMessage(`{"v":1}`), true)) @@ -895,7 +895,7 @@ func TestUnretain(t *testing.T) { conn := b.NewConnection("fabric") ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, conn, "mcu-1", "cm5-local") + go Run(ctx, mcu, conn, "mcu-1", "cm5-local", DefaultLinkConfig()) bringUp(t, cm5) sendMsg(t, cm5, protoPub{ @@ -925,7 +925,7 @@ func TestCallIgnoredBeforeHandshake(t *testing.T) { fabricConn := b.NewConnection("fabric") ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local") + go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local", DefaultLinkConfig()) handler := b.NewConnection("handler") sub := handler.Subscribe(bus.T("rpc", "hal", "dump")) @@ -949,7 +949,7 @@ func TestCallImport(t *testing.T) { fabricConn := b.NewConnection("fabric") ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, 
mcu, fabricConn, "mcu-1", "cm5-local") + go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local", DefaultLinkConfig()) bringUp(t, cm5) handler := b.NewConnection("handler") @@ -979,7 +979,7 @@ func TestCallNoRoute(t *testing.T) { b := newBus() ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local") + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) bringUp(t, cm5) sendMsg(t, cm5, protoCall{ @@ -1005,7 +1005,7 @@ func TestDumpCallReturnsConfigState(t *testing.T) { fabricConn := b.NewConnection("fabric") ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local") + go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local", DefaultLinkConfig()) bringUp(t, cm5) // Send config first so the session has state. @@ -1048,7 +1048,7 @@ func TestDumpCallDoesNotBlockPing(t *testing.T) { fabricConn := b.NewConnection("fabric") ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local") + go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local", DefaultLinkConfig()) bringUp(t, cm5) // Send dump call and ping back-to-back. 
@@ -1092,7 +1092,7 @@ func TestCallExport(t *testing.T) { reqConn := b.NewConnection("caller") ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local") + go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local", DefaultLinkConfig()) bringUp(t, cm5) unlockExports(t, cm5) @@ -1163,7 +1163,7 @@ func TestCallExportOnlyConfiguredRule(t *testing.T) { reqConn := b.NewConnection("caller") ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local") + go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local", DefaultLinkConfig()) bringUp(t, cm5) unlockExports(t, cm5) @@ -1415,7 +1415,7 @@ func TestCallExportPeerReset(t *testing.T) { reqConn := b.NewConnection("caller") ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local") + go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local", DefaultLinkConfig()) bringUp(t, cm5) unlockExports(t, cm5) @@ -1473,7 +1473,7 @@ func TestEchoedHelloAckIgnoredDuringOutgoingCall(t *testing.T) { reqConn := b.NewConnection("caller") ctx, cancel := context.WithCancel(context.Background()) defer cancel() - go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local") + go Run(ctx, mcu, fabricConn, "mcu-1", "cm5-local", DefaultLinkConfig()) ack := bringUp(t, cm5) unlockExports(t, cm5) diff --git a/services/fabric/session.go b/services/fabric/session.go index 4927c87..82dbb50 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -138,6 +138,7 @@ type session struct { localSID string tr Transport conn *bus.Connection + cfg LinkConfig link linkState peerNode string @@ -173,6 +174,7 @@ func (s *session) logKV(msg, key, value string) { // run is the main loop. Blocks until ctx is cancelled. 
func (s *session) run(ctx context.Context) { + s.cfg.applyDefaults() lines := make(chan readResult, lineQueueSize) go func() { @@ -239,9 +241,11 @@ func (s *session) run(ctx context.Context) { resetTimer(stale, staleTimeout) case <-exportTick.C: + now := time.Now() s.drainExports() - s.drainInbound(time.Now()) - s.drainOutbound(time.Now()) + s.drainInbound(now) + s.drainOutbound(now) + s.checkTransferTimeout(now) case <-waitTick.C: s.logWaiting() diff --git a/services/fabric/transfer.go b/services/fabric/transfer.go index 07d1385..130d923 100644 --- a/services/fabric/transfer.go +++ b/services/fabric/transfer.go @@ -51,6 +51,10 @@ type incomingTransfer struct { bytesWritten uint32 chunksSeen uint32 hasher *xxhash.Hasher + // deadline is the idle-chunk watchdog expiry: set on xfer_begin and + // pushed out on every accepted chunk. checkTransferTimeout fires if now > deadline. + // Mirrors transfer_mgr.lua: `active.deadline = runtime.now() + phase_timeout`. + deadline time.Time } func lowerHex(s string) string { @@ -107,6 +111,26 @@ func (s *session) abortTransfer(reason string) { } } +// checkTransferTimeout enforces the idle-chunk watchdog. Fires once per +// drain tick from the session run loop; cheap when no transfer is active. +// On expiry the local sink is aborted and an xfer_abort frame is sent to +// the peer (matching Lua transfer_mgr.lua's `clear_active('timeout')` +// followed by an outbound xfer_abort). 
+func (s *session) checkTransferTimeout(now time.Time) { + cur := s.incomingTransfer + if cur == nil { + return + } + if !now.After(cur.deadline) { + return + } + id := cur.meta.ID + println("[fabric]", "sid", s.localSID, "xfer_phase_timeout", + "id", id, "phase_s", u32s(uint32(s.cfg.PhaseTimeout/time.Second))) + s.abortTransfer("timeout") + s.sendTransferAbort(id, "timeout") +} + func validateTransferBegin(msg *protoXferBegin) (transferMeta, string) { if msg.XferID == "" { return transferMeta{}, "xfer_begin.xfer_id" @@ -148,9 +172,10 @@ func (s *session) onTransferBegin(msg *protoXferBegin) { return } s.incomingTransfer = &incomingTransfer{ - meta: meta, - sink: sink, - hasher: xxhash.New(0), + meta: meta, + sink: sink, + hasher: xxhash.New(0), + deadline: time.Now().Add(s.cfg.PhaseTimeout), } println( "[fabric]", "sid", s.localSID, @@ -216,6 +241,7 @@ func (s *session) onTransferChunk(msg *protoXferChunk) { _, _ = cur.hasher.Write(raw) cur.bytesWritten += uint32(len(raw)) cur.chunksSeen++ + cur.deadline = time.Now().Add(s.cfg.PhaseTimeout) if cur.chunksSeen == 1 || (cur.chunksSeen%transferProgressLogEvery) == 0 { println( "[fabric]", "sid", s.localSID, diff --git a/services/fabric/transfer_test.go b/services/fabric/transfer_test.go index 03fe986..4e74692 100644 --- a/services/fabric/transfer_test.go +++ b/services/fabric/transfer_test.go @@ -354,6 +354,52 @@ func TestTransferCommitChecksumMismatchAborts(t *testing.T) { } } +func TestTransferIdleChunkWatchdog(t *testing.T) { + // transfer_mgr.lua refreshes active.deadline = now + phase_timeout on + // each accepted chunk and aborts with reason="timeout" if the deadline + // passes. With a tight PhaseTimeout, sending no chunks after xfer_begin + // must produce an unsolicited xfer_abort within PhaseTimeout + ~one drain tick. 
+ b := newBus() + cm5, mcu := pipePair() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + + sink := &fakeTransferSink{} + s := session{ + linkID: defaultLinkID, + nodeID: "mcu-1", + peerID: "cm5-local", + localSID: "mcu-sid-test", + tr: mcu, + conn: b.NewConnection("fabric"), + cfg: LinkConfig{PhaseTimeout: 100 * time.Millisecond}, + beginTransfer: func(meta transferMeta) (transferSink, error) { + return sink, nil + }, + } + go s.run(ctx) + bringUp(t, cm5) + + payload := []byte("abcd") + sendMsg(t, cm5, protoXferBegin{ + Type: msgXferBegin, + XferID: "xfer-wd", + Size: uint32(len(payload)), + Checksum: xxhashStr(payload), + }) + _ = readMsg[protoXferReady](t, cm5) + + // Stop sending chunks; watchdog should fire within ~PhaseTimeout + + // one exportTickInterval (50ms). + abort := readMsg[protoXferAbort](t, cm5) + if abort.Type != msgXferAbort || abort.XferID != "xfer-wd" || abort.Err != "timeout" { + t.Fatalf("bad xfer_abort: %+v", abort) + } + if len(sink.abortReasons) == 0 || sink.abortReasons[0] != "timeout" { + t.Fatalf("sink.Abort reasons = %v, want [\"timeout\"]", sink.abortReasons) + } +} + func TestTransferCommitChecksumMismatchOnCommitFrameAborts(t *testing.T) { // xfer_begin and xfer_commit must agree on the checksum. If they // disagree (even when the streamed bytes match begin), commit aborts. 
diff --git a/services/reactor/reactor.go b/services/reactor/reactor.go index e58ba13..450a28f 100644 --- a/services/reactor/reactor.go +++ b/services/reactor/reactor.go @@ -559,7 +559,7 @@ func (r *Reactor) Run(ctx context.Context) { fabricSessionOpen = true go func() { defer close(done) - fabric.Run(fabricCtx, tr, fabricConn, "mcu-1", "cm5-local") + fabric.Run(fabricCtx, tr, fabricConn, "mcu-1", "cm5-local", fabric.DefaultLinkConfig()) }() log.Println("[uart1] fabric session opened") } From ea93ffa5b86b2a5f1926699fc0476ad2cb0cc1f8 Mon Sep 17 00:00:00 2001 From: cpunt Date: Tue, 28 Apr 2026 09:38:48 +0000 Subject: [PATCH 53/65] fabric: W6 active ping, session_reset, bounded inbound helpers MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LinkConfig grows three fields with release defaults pulled from bigbox-v1-cm-2.json `service.fabric.links.`: - PingInterval (10s) - LivenessTimeout (30s) — replaces hardcoded staleTimeout=45s - MaxInboundHelpers (64) — Lua's `max_pending_calls` fallback Session lifecycle changes mirror session_ctl.lua / rpc_bridge.lua at update-migration tip: - Active outbound ping cadence: tickPing fires from the existing 50ms drain tick; sends `ping` and resets nextPingAt = now + PingInterval unconditionally (no TX-activity dependency). nextPingAt is armed in promoteLink so the first ping fires PingInterval after link-up. - Stale timer now runs on cfg.LivenessTimeout (was 45s). - Pending outbound calls now fail with err="session_reset" on peer SID change (renamed reasonPeerSessionChanged -> reasonSessionReset); matches rpc_bridge.lua's fail_pending(pending, 'session_reset'). - onCall now enforces capacity: if len(inboundCalls) >= cfg.MaxInboundHelpers, reply {ok=false, err="busy"} before route resolution. Mirrors rpc_bridge.lua's spawn_local_call_helper. 
New tests: - TestSessionPingsUnconditionally: 3 pings within 500ms at PingInterval=150ms - TestInboundCallBusyAtCapacity: second concurrent call hits busy with MaxInboundHelpers=1 (uses test-scoped importCallRules entry so both calls actually route). - TestCallExportPeerReset updated for the new session_reset string. The receiver-half of "imported retained facts unretained on session-gen change" is L2/fabric-update territory (it lives in the bridge's imported-fact cache), so out of scope here. The pending-call cancel + xfer abort halves of the session-reset semantics are now in place. --- services/fabric/fabric.go | 41 +++++++++++++++---- services/fabric/fabric_test.go | 73 +++++++++++++++++++++++++++++++++- services/fabric/session.go | 68 +++++++++++++++++++++---------- 3 files changed, 153 insertions(+), 29 deletions(-) diff --git a/services/fabric/fabric.go b/services/fabric/fabric.go index dbdc5ca..1e92e5c 100644 --- a/services/fabric/fabric.go +++ b/services/fabric/fabric.go @@ -34,12 +34,29 @@ type LinkConfig struct { // this window. Mirrors transfer_mgr.lua's `phase_timeout`. // Release: 15s. PhaseTimeout time.Duration + // PingInterval drives the unconditional outbound ping cadence after + // the link is established (`session_ctl.lua` resets next_ping_at = + // now + ping_interval after every send; not TX-activity-based). + // Release: 10s. + PingInterval time.Duration + // LivenessTimeout tears the link down if no frame arrives within + // this window once established. Mirrors session_ctl.lua's + // liveness_timeout_s. Release: 30s. + LivenessTimeout time.Duration + // MaxInboundHelpers caps the number of in-flight inbound RPC calls. + // Excess inbound calls reply `{ok=false, err="busy"}` per + // rpc_bridge.lua's `spawn_local_call_helper`. Lua default is 64 + // (falls back to max_pending_calls); we keep that for parity. 
+ MaxInboundHelpers int } func DefaultLinkConfig() LinkConfig { return LinkConfig{ - ChunkSize: 2048, - PhaseTimeout: 15 * time.Second, + ChunkSize: 2048, + PhaseTimeout: 15 * time.Second, + PingInterval: 10 * time.Second, + LivenessTimeout: 30 * time.Second, + MaxInboundHelpers: 64, } } @@ -51,6 +68,15 @@ func (c *LinkConfig) applyDefaults() { if c.PhaseTimeout == 0 { c.PhaseTimeout = d.PhaseTimeout } + if c.PingInterval == 0 { + c.PingInterval = d.PingInterval + } + if c.LivenessTimeout == 0 { + c.LivenessTimeout = d.LivenessTimeout + } + if c.MaxInboundHelpers == 0 { + c.MaxInboundHelpers = d.MaxInboundHelpers + } } var nextSessionID atomic.Uint64 @@ -60,11 +86,12 @@ func newLocalSID() string { } // Run starts the fabric session. Blocks until ctx is cancelled or the -// transport returns an unrecoverable error. The MCU is respond-only: -// it never initiates hello or ping. It waits for hello from the CM5 -// and replies with hello_ack; it responds to ping with pong. The CM5 -// owns heartbeat cadence — the MCU marks the link stale if nothing -// arrives within the timeout. +// transport returns an unrecoverable error. The MCU is a hello +// responder (CM5 always initiates hello/hello_ack), but otherwise +// runs the symmetric session_ctl semantics: once established, it +// sends pings every PingInterval and tears the link down if no frame +// arrives within LivenessTimeout. Mirrors session_ctl.lua at +// devicecode-lua@2c88090. func Run(ctx context.Context, tr Transport, conn *bus.Connection, nodeID, peerID string, cfg LinkConfig) { s := session{ linkID: defaultLinkID, diff --git a/services/fabric/fabric_test.go b/services/fabric/fabric_test.go index 4ea8658..b533703 100644 --- a/services/fabric/fabric_test.go +++ b/services/fabric/fabric_test.go @@ -446,6 +446,9 @@ func TestPingPong(t *testing.T) { } func TestMCUNeverInitiates(t *testing.T) { + // Pre-handshake the MCU is silent; tickPing only fires once the link + // is up. 
Active outbound pings post-handshake are covered by + // TestSessionPingsUnconditionally. mcu, cm5 := pipePair() b := newBus() ctx, cancel := context.WithCancel(context.Background()) @@ -461,6 +464,72 @@ func TestMCUNeverInitiates(t *testing.T) { cancel() } +func TestSessionPingsUnconditionally(t *testing.T) { + // session_ctl.lua resets next_ping_at = now + ping_interval after + // every send, with no TX-activity dependency. Once the link is up, + // pings must keep flowing even if neither side talks otherwise. + mcu, cm5 := pipePair() + b := newBus() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", LinkConfig{PingInterval: 150 * time.Millisecond}) + bringUp(t, cm5) + + for i := 0; i < 3; i++ { + ping := readMsg[protoPing](t, cm5) + if ping.Type != msgPing { + t.Fatalf("ping[%d] type = %q, want %q", i, ping.Type, msgPing) + } + } +} + +func TestInboundCallBusyAtCapacity(t *testing.T) { + // rpc_bridge.lua's spawn_local_call_helper rejects with err="busy" + // when inbound_helpers >= max_inbound_helpers, before the route check. + // With MaxInboundHelpers=1, the second concurrent inbound call must + // reply busy without going through routing. + prev := importCallRules + importCallRules = append([]importRule{}, prev...) + importCallRules = append(importCallRules, importRule{ + wire: []string{"rpc", "test", "noop"}, + local: []string{"rpc", "test", "noop"}, + }) + t.Cleanup(func() { importCallRules = prev }) + + mcu, cm5 := pipePair() + b := newBus() + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", LinkConfig{MaxInboundHelpers: 1}) + bringUp(t, cm5) + + // First call holds the only helper slot. The bus has no handler, so + // the call sits as a pending request until timeout. 
+ sendMsg(t, cm5, protoCall{ + Type: msgCall, + ID: "c1", + Topic: []string{"rpc", "test", "noop"}, + Payload: json.RawMessage(`{}`), + TimeoutMs: 5000, + }) + + // Second call arrives while the helper is full → busy reply. + sendMsg(t, cm5, protoCall{ + Type: msgCall, + ID: "c2", + Topic: []string{"rpc", "test", "noop"}, + Payload: json.RawMessage(`{}`), + }) + + reply := readMsg[protoReply](t, cm5) + if reply.Corr != "c2" { + t.Fatalf("first reply corr = %q, want c2", reply.Corr) + } + if reply.OK || reply.Err != "busy" { + t.Fatalf("expected busy reply for c2, got %+v", reply) + } +} + func TestUnknownTypeIgnored(t *testing.T) { mcu, cm5 := pipePair() b := newBus() @@ -1458,8 +1527,8 @@ func TestCallExportPeerReset(t *testing.T) { if out.OK { t.Fatal("expected ok=false") } - if out.Error != "peer_session_changed" { - t.Fatalf("error = %q, want peer_session_changed", out.Error) + if out.Error != "session_reset" { + t.Fatalf("error = %q, want session_reset", out.Error) } case <-time.After(2 * time.Second): t.Fatal("timeout waiting for peer-reset reply") diff --git a/services/fabric/session.go b/services/fabric/session.go index 82dbb50..f4dc016 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -31,15 +31,14 @@ const ( // ---- timeouts (local policy) ---- // -// Timing relationships: -// staleTimeout (45s) > callTimeoutDef (5s) -// -// The CM5 sends pings every 15s of TX inactivity. The MCU marks the -// peer stale after 45s without any RX, giving a 30s margin. -// Exports are enabled immediately on link-up (after exportStartHoldoff). +// LinkConfig drives the ping cadence (PingInterval) and liveness-stale +// detection (LivenessTimeout). Mirrors session_ctl.lua at +// devicecode-lua@2c88090: pings fire unconditionally every +// ping_interval_s; the link is torn down if no frame arrives within +// liveness_timeout_s. Exports are enabled immediately on link-up +// (after exportStartHoldoff). 
const ( - staleTimeout = 45 * time.Second callTimeoutDef = 5 * time.Second waitLogEvery = 2 * time.Second exportStartHoldoff = 1 * time.Second @@ -60,15 +59,16 @@ const ( // ---- link reasons and error strings ---- const ( - reasonLinkDown = "link_down" - reasonPeerStale = "peer_stale" - reasonPeerReset = "peer_reset" - reasonPeerSessionChanged = "peer_session_changed" - reasonHelloRejected = "hello_rejected" - reasonTransportDown = "transport_down" - reasonTransportWrite = "transport_write_failed" - reasonNoRoute = "no_route" - reasonTimeout = "timeout" + reasonLinkDown = "link_down" + reasonPeerStale = "peer_stale" + reasonPeerReset = "peer_reset" + reasonSessionReset = "session_reset" + reasonHelloRejected = "hello_rejected" + reasonTransportDown = "transport_down" + reasonTransportWrite = "transport_write_failed" + reasonNoRoute = "no_route" + reasonBusy = "busy" + reasonTimeout = "timeout" ) // ---- bus topics for config handling ---- @@ -155,6 +155,7 @@ type session struct { inboundCalls []*inboundCall outboundCalls []*outboundCall nextOutboundID uint64 + nextPingAt time.Time incomingTransfer *incomingTransfer beginTransfer func(transferMeta) (transferSink, error) @@ -209,7 +210,7 @@ func (s *session) run(ctx context.Context) { defer s.abortTransfer(reasonLinkDown) defer s.log("run stop") - stale := time.NewTimer(staleTimeout) + stale := time.NewTimer(s.cfg.LivenessTimeout) defer stale.Stop() waitTick := time.NewTicker(waitLogEvery) @@ -238,7 +239,7 @@ func (s *session) run(ctx context.Context) { return } s.dispatch(res.line) - resetTimer(stale, staleTimeout) + resetTimer(stale, s.cfg.LivenessTimeout) case <-exportTick.C: now := time.Now() @@ -246,6 +247,7 @@ func (s *session) run(ctx context.Context) { s.drainInbound(now) s.drainOutbound(now) s.checkTransferTimeout(now) + s.tickPing(now) case <-waitTick.C: s.logWaiting() @@ -254,7 +256,7 @@ func (s *session) run(ctx context.Context) { if s.link == linkUp { s.handleLinkDown(reasonPeerStale, "") } else { - 
stale.Reset(staleTimeout) + stale.Reset(s.cfg.LivenessTimeout) } } } @@ -348,6 +350,9 @@ func (s *session) handleLinkDown(reason, err string) { } // promoteLink transitions to linkUp, tearing down any prior session state. +// `reason` carries the link-state telemetry tag (e.g. session_reset) and +// is also used as the err string on any pending outbound calls cancelled +// by the transition, matching rpc_bridge.lua's session-replace behaviour. func (s *session) promoteLink(reason string) { if s.link == linkUp { if reason == "" { @@ -362,6 +367,7 @@ func (s *session) promoteLink(reason string) { s.setupExports() s.exportsEnabled = true s.exportReadyAt = time.Now().Add(exportStartHoldoff) + s.nextPingAt = time.Now().Add(s.cfg.PingInterval) s.log("exports enabled") s.publishLinkState(reason, "") } @@ -454,7 +460,7 @@ func (s *session) logMalformed(line []byte, err error) { func (s *session) notePeerIdentity(node, sid string, proto int) string { reason := "" if s.link == linkUp && s.peerSID != "" && sid != "" && s.peerSID != sid { - reason = reasonPeerSessionChanged + reason = reasonSessionReset } if node != "" { s.peerNode = node @@ -539,6 +545,22 @@ func (s *session) onPing(msg *protoPing) { s.log("pong tx") } +// tickPing sends an outbound ping if the link is established and the +// PingInterval cadence has elapsed. Mirrors session_ctl.lua: pings fire +// unconditionally every ping_interval after each send (NOT TX-activity-based). 
+func (s *session) tickPing(now time.Time) { + if s.link != linkUp { + return + } + if s.nextPingAt.IsZero() || now.Before(s.nextPingAt) { + return + } + if !s.sendFrame(marshal(protoPing{Type: msgPing, TS: now.UnixMilli(), SID: s.localSID})) { + return + } + s.nextPingAt = now.Add(s.cfg.PingInterval) +} + func (s *session) onPong(msg *protoPong) { if s.isSelfControlFrame("", msg.SID) { s.log("echoed pong ignored") @@ -615,6 +637,12 @@ func (s *session) onCall(msg *protoCall) { return } + if len(s.inboundCalls) >= s.cfg.MaxInboundHelpers { + s.log("incoming call dropped: busy") + s.sendFrame(marshal(protoReply{Type: msgReply, Corr: msg.ID, OK: false, Err: reasonBusy})) + return + } + localTopic := importCallTopic(msg.Topic) if localTopic == nil { s.log("incoming call dropped: no_route") From 77c966d5add753a5619afbb36edd64b6a00056fe Mon Sep 17 00:00:00 2001 From: cpunt Date: Tue, 28 Apr 2026 09:57:21 +0000 Subject: [PATCH 54/65] fabric: W5 3-lane writer (control / weighted RR rpc-bulk) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Mirrors src/services/fabric/writer.lua at update-migration tip. Frame priority class follows protocol.lua's FRAME_CLASS: control: hello, hello_ack, ping, pong, xfer_{begin,ready,need,commit, done,abort} rpc: pub, unretain, call, reply bulk: xfer_chunk (MCU does not originate; bulk lane is wired in for symmetry but currently unused on MCU) Implementation notes: - new writer.go: txLane FIFO + enqueueFrame(lane, data) + flushWriter (drains controlQ fully, then weighted RR between rpcQ and bulkQ). - session.go: per-lane buffers on session; sendFrame replaced by sendControl + sendRPC wrappers at every call site (transfer.go's xfer_* senders likewise). - LinkConfig grows RPCQuantum (default 4) and BulkQuantum (default 1), matching writer.lua's release tuning. applyDefaults sets them. 
- flushWriter floors zero quantums to 1 defensively so unit tests that construct session{} directly (without applyDefaults) still make forward progress instead of spinning the outer loop. Test: - TestWriterControlPreemptsRPCAndBulk pre-loads all 3 lanes and asserts the drain order: 2 control, then 4 rpc, 1 bulk, 1 rpc, 1 bulk. The MCU's actual outbound traffic profile (1 frame per drain tick from the existing single producer) doesn't currently exercise the RR fairness — but the structure is in place for fabric-update's retained state publishers, which will queue rpc-lane frames concurrently with control-lane xfer_need / ping. --- services/fabric/fabric.go | 13 ++++ services/fabric/fabric_test.go | 41 ++++++++++++ services/fabric/session.go | 58 ++++++++--------- services/fabric/transfer.go | 8 +-- services/fabric/writer.go | 113 +++++++++++++++++++++++++++++++++ 5 files changed, 198 insertions(+), 35 deletions(-) create mode 100644 services/fabric/writer.go diff --git a/services/fabric/fabric.go b/services/fabric/fabric.go index 1e92e5c..0a2ab70 100644 --- a/services/fabric/fabric.go +++ b/services/fabric/fabric.go @@ -48,6 +48,11 @@ type LinkConfig struct { // rpc_bridge.lua's `spawn_local_call_helper`. Lua default is 64 // (falls back to max_pending_calls); we keep that for parity. MaxInboundHelpers int + // RPCQuantum and BulkQuantum control the writer's weighted + // round-robin between the rpc and bulk lanes after the control + // lane drains. Mirrors writer.lua's lane scheduler. Release: 4 and 1. 
+ RPCQuantum int + BulkQuantum int } func DefaultLinkConfig() LinkConfig { @@ -57,6 +62,8 @@ func DefaultLinkConfig() LinkConfig { PingInterval: 10 * time.Second, LivenessTimeout: 30 * time.Second, MaxInboundHelpers: 64, + RPCQuantum: 4, + BulkQuantum: 1, } } @@ -77,6 +84,12 @@ func (c *LinkConfig) applyDefaults() { if c.MaxInboundHelpers == 0 { c.MaxInboundHelpers = d.MaxInboundHelpers } + if c.RPCQuantum == 0 { + c.RPCQuantum = d.RPCQuantum + } + if c.BulkQuantum == 0 { + c.BulkQuantum = d.BulkQuantum + } } var nextSessionID atomic.Uint64 diff --git a/services/fabric/fabric_test.go b/services/fabric/fabric_test.go index b533703..96acd0a 100644 --- a/services/fabric/fabric_test.go +++ b/services/fabric/fabric_test.go @@ -483,6 +483,47 @@ func TestSessionPingsUnconditionally(t *testing.T) { } } +func TestWriterControlPreemptsRPCAndBulk(t *testing.T) { + // writer.lua drains the control lane first (no fairness); then + // weighted RR between rpc and bulk. Pre-load all three lanes and + // assert the drain order is: all control, then 4 rpc, then 1 bulk, + // then any remaining rpc/bulk (default rpc_quantum=4, bulk_quantum=1). + tr := &captureTransport{} + s := session{tr: tr, cfg: DefaultLinkConfig()} + s.txBulk.push([]byte(`{"type":"xfer_chunk","i":0}`)) + s.txBulk.push([]byte(`{"type":"xfer_chunk","i":1}`)) + for i := 0; i < 5; i++ { + s.txRPC.push([]byte(`{"type":"pub","i":` + string(rune('0'+i)) + `}`)) + } + s.txControl.push([]byte(`{"type":"ping"}`)) + s.txControl.push([]byte(`{"type":"xfer_need"}`)) + + if !s.flushWriter() { + t.Fatal("flushWriter returned false") + } + if len(tr.writes) != 9 { + t.Fatalf("writes = %d, want 9", len(tr.writes)) + } + // Control drains first. + want := []string{ + `{"type":"ping"}`, + `{"type":"xfer_need"}`, + // Then RR: 4 rpc, 1 bulk, 1 rpc, 1 bulk, 0 (no more bulk; remaining rpc). 
+ `{"type":"pub","i":0}`, + `{"type":"pub","i":1}`, + `{"type":"pub","i":2}`, + `{"type":"pub","i":3}`, + `{"type":"xfer_chunk","i":0}`, + `{"type":"pub","i":4}`, + `{"type":"xfer_chunk","i":1}`, + } + for i, w := range want { + if string(tr.writes[i]) != w { + t.Fatalf("write[%d] = %q, want %q", i, tr.writes[i], w) + } + } +} + func TestInboundCallBusyAtCapacity(t *testing.T) { // rpc_bridge.lua's spawn_local_call_helper rejects with err="busy" // when inbound_helpers >= max_inbound_helpers, before the route check. diff --git a/services/fabric/session.go b/services/fabric/session.go index f4dc016..4823b64 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -156,6 +156,9 @@ type session struct { outboundCalls []*outboundCall nextOutboundID uint64 nextPingAt time.Time + txControl txLane + txRPC txLane + txBulk txLane incomingTransfer *incomingTransfer beginTransfer func(transferMeta) (transferSink, error) @@ -508,7 +511,7 @@ func (s *session) onHello(msg *protoHello) { reason := s.notePeerIdentity(msg.Node, msg.SID, msg.Proto) s.logKV("hello rx", "peer_sid", msg.SID) - if !s.sendFrame(marshal(protoHelloAck{ + if !s.sendControl(marshal(protoHelloAck{ Type: msgHelloAck, Node: s.nodeID, SID: s.localSID, @@ -539,7 +542,7 @@ func (s *session) onHelloAck(msg *protoHelloAck) { func (s *session) onPing(msg *protoPing) { s.logKV("ping rx", "peer_sid", msg.SID) - if !s.sendFrame(marshal(protoPong{Type: msgPong, TS: msg.TS, SID: s.localSID})) { + if !s.sendControl(marshal(protoPong{Type: msgPong, TS: msg.TS, SID: s.localSID})) { return } s.log("pong tx") @@ -555,7 +558,7 @@ func (s *session) tickPing(now time.Time) { if s.nextPingAt.IsZero() || now.Before(s.nextPingAt) { return } - if !s.sendFrame(marshal(protoPing{Type: msgPing, TS: now.UnixMilli(), SID: s.localSID})) { + if !s.sendControl(marshal(protoPing{Type: msgPing, TS: now.UnixMilli(), SID: s.localSID})) { return } s.nextPingAt = now.Add(s.cfg.PingInterval) @@ -633,20 +636,20 @@ func (s 
*session) onCall(msg *protoCall) { ConfigCount: s.configCount, ConfigError: s.lastConfigErr, } - s.sendFrame(marshal(protoReply{Type: msgReply, Corr: msg.ID, OK: true, Value: mustMarshal(reply)})) + s.sendRPC(marshal(protoReply{Type: msgReply, Corr: msg.ID, OK: true, Value: mustMarshal(reply)})) return } if len(s.inboundCalls) >= s.cfg.MaxInboundHelpers { s.log("incoming call dropped: busy") - s.sendFrame(marshal(protoReply{Type: msgReply, Corr: msg.ID, OK: false, Err: reasonBusy})) + s.sendRPC(marshal(protoReply{Type: msgReply, Corr: msg.ID, OK: false, Err: reasonBusy})) return } localTopic := importCallTopic(msg.Topic) if localTopic == nil { s.log("incoming call dropped: no_route") - s.sendFrame(marshal(protoReply{Type: msgReply, Corr: msg.ID, OK: false, Err: reasonNoRoute})) + s.sendRPC(marshal(protoReply{Type: msgReply, Corr: msg.ID, OK: false, Err: reasonNoRoute})) return } @@ -807,7 +810,7 @@ func (s *session) drainExports() { continue } if m.Retained && m.Payload == nil { - if !s.sendFrame(marshal(protoUnretain{ + if !s.sendRPC(marshal(protoUnretain{ Type: msgUnretain, Topic: wire, })) { @@ -821,7 +824,7 @@ func (s *session) drainExports() { s.logKV("export payload dropped", "err", err.Error()) continue } - if !s.sendFrame(marshal(protoPub{ + if !s.sendRPC(marshal(protoPub{ Type: msgPub, Topic: wire, Payload: payload, @@ -850,25 +853,25 @@ func (s *session) drainInbound(now time.Time) { s.conn.Unsubscribe(call.sub) call.sub = nil // prevent double-unsubscribe in teardownInbound if !ok || reply == nil { - if !s.sendFrame(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: reasonTimeout})) { + if !s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: reasonTimeout})) { return } continue } if errStr := checkBusError(reply.Payload); errStr != "" { - if !s.sendFrame(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: errStr})) { + if !s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: 
errStr})) { return } continue } payload, err := marshalPayload(reply.Payload) if err != nil { - if !s.sendFrame(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: errPayloadMarshal})) { + if !s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: errPayloadMarshal})) { return } continue } - if !s.sendFrame(marshal(protoReply{Type: msgReply, Corr: call.id, OK: true, Value: payload})) { + if !s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: true, Value: payload})) { return } continue @@ -878,7 +881,7 @@ func (s *session) drainInbound(now time.Time) { if !now.Before(call.deadline) { s.conn.Unsubscribe(call.sub) call.sub = nil - if !s.sendFrame(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: reasonTimeout})) { + if !s.sendRPC(marshal(protoReply{Type: msgReply, Corr: call.id, OK: false, Err: reasonTimeout})) { return } continue @@ -924,7 +927,7 @@ func (s *session) drainOutbound(now time.Time) { deadline: now.Add(callTimeoutDef), }) } - if !s.sendFrame(marshal(protoCall{ + if !s.sendRPC(marshal(protoCall{ Type: msgCall, ID: corr, Topic: wireTopic, @@ -959,23 +962,16 @@ func (s *session) drainOutbound(now time.Time) { // ---- transport write ---- -func (s *session) sendFrame(data []byte) bool { - if len(data) > 0 && data[len(data)-1] == '\n' { - data = data[:len(data)-1] - } - if err := s.tr.WriteLine(data); err != nil { - if errors.Is(err, ErrLineTooLong) { - // Oversized frame is dropped but the transport is still - // healthy — return true so the session continues. - s.log("oversized write dropped") - return true - } - s.handleLinkDown(reasonTransportWrite, err.Error()) - return false - } - s.markTx() - return true -} +// sendControl and sendRPC are the lane-tagged enqueue entry points +// used at every send site. They wrap enqueueFrame (defined in +// writer.go) so the lane intent is explicit at the call site.
+// +// Lane assignment per protocol.lua's FRAME_CLASS: +// control: hello, hello_ack, ping, pong, xfer_{begin,ready,need,commit,done,abort} +// rpc: pub, unretain, call, reply +// bulk: xfer_chunk (MCU does not originate; bulk lane unused on MCU) +func (s *session) sendControl(data []byte) bool { return s.enqueueFrame(laneControl, data) } +func (s *session) sendRPC(data []byte) bool { return s.enqueueFrame(laneRPC, data) } func (s *session) logWaiting() { if s.peerSID != "" { diff --git a/services/fabric/transfer.go b/services/fabric/transfer.go index 130d923..59919b8 100644 --- a/services/fabric/transfer.go +++ b/services/fabric/transfer.go @@ -66,14 +66,14 @@ func u32s(v uint32) string { } func (s *session) sendTransferReady(id string) bool { - return s.sendFrame(marshal(protoXferReady{ + return s.sendControl(marshal(protoXferReady{ Type: msgXferReady, XferID: id, })) } func (s *session) sendTransferNeed(id string, next uint32) bool { - return s.sendFrame(marshal(protoXferNeed{ + return s.sendControl(marshal(protoXferNeed{ Type: msgXferNeed, XferID: id, Next: next, @@ -81,14 +81,14 @@ func (s *session) sendTransferNeed(id string, next uint32) bool { } func (s *session) sendTransferDone(id string) bool { - return s.sendFrame(marshal(protoXferDone{ + return s.sendControl(marshal(protoXferDone{ Type: msgXferDone, XferID: id, })) } func (s *session) sendTransferAbort(id, reason string) bool { - return s.sendFrame(marshal(protoXferAbort{ + return s.sendControl(marshal(protoXferAbort{ Type: msgXferAbort, XferID: id, Err: reason, diff --git a/services/fabric/writer.go b/services/fabric/writer.go new file mode 100644 index 0000000..5d3f596 --- /dev/null +++ b/services/fabric/writer.go @@ -0,0 +1,113 @@ +package fabric + +import "errors" + +// Outbound frame scheduler — control / rpc / bulk lanes per +// devicecode-lua@2c88090 src/services/fabric/writer.lua. 
Control bypasses +// fairness and drains first; rpc and bulk share remaining bandwidth via +// weighted round-robin (defaults rpc_quantum=4, bulk_quantum=1). +// +// Lane assignment for outbound MCU frames mirrors protocol.lua's +// FRAME_CLASS map. The MCU never originates xfer_chunk so the bulk lane +// is currently unused on the MCU side; it is wired in for symmetry and +// for future fabric-update telemetry that may want to route bulk frames. + +type lane uint8 + +const ( + laneControl lane = iota + laneRPC + laneBulk +) + +// txLane is a single FIFO of pending wire frames. +type txLane struct { + frames [][]byte +} + +func (l *txLane) push(data []byte) { l.frames = append(l.frames, data) } +func (l *txLane) len() int { return len(l.frames) } +func (l *txLane) pop() []byte { + f := l.frames[0] + l.frames = l.frames[1:] + if len(l.frames) == 0 { + l.frames = nil + } + return f +} + +// enqueueFrame routes data into the lane and immediately drains the +// writer in priority order. Because every call flushes synchronously, +// the lanes are normally empty again before the next caller; the lane +// discipline only reorders frames that are already queued when +// flushWriter runs (e.g. lanes pre-loaded before a single flush). +func (s *session) enqueueFrame(l lane, data []byte) bool { + s.lane(l).push(data) + return s.flushWriter() +} + +func (s *session) lane(l lane) *txLane { + switch l { + case laneControl: + return &s.txControl + case laneRPC: + return &s.txRPC + case laneBulk: + return &s.txBulk + default: + return &s.txRPC + } +} + +// flushWriter writes queued frames to the transport in priority order: +// 1. drain txControl fully (no fairness), +// 2. weighted RR between txRPC and txBulk until both are empty. +// Returns false on transport-write failure (link torn down). +func (s *session) flushWriter() bool { + rpcQ, bulkQ := s.cfg.RPCQuantum, s.cfg.BulkQuantum + // Defensive: guarantee forward progress even if a caller bypasses + // applyDefaults (e.g.
unit tests constructing session{} directly). + // Without this, a zero quantum would spin the outer loop forever. + if rpcQ <= 0 { + rpcQ = 1 + } + if bulkQ <= 0 { + bulkQ = 1 + } + for s.txControl.len() > 0 { + if !s.writeFrame(s.txControl.pop()) { + return false + } + } + for s.txRPC.len() > 0 || s.txBulk.len() > 0 { + for i := 0; i < rpcQ && s.txRPC.len() > 0; i++ { + if !s.writeFrame(s.txRPC.pop()) { + return false + } + } + for i := 0; i < bulkQ && s.txBulk.len() > 0; i++ { + if !s.writeFrame(s.txBulk.pop()) { + return false + } + } + } + return true +} + +// writeFrame is the actual transport write. Mirrors what the prior +// sendFrame did inline; isolated so flushWriter can call it per-frame. +func (s *session) writeFrame(data []byte) bool { + if len(data) > 0 && data[len(data)-1] == '\n' { + data = data[:len(data)-1] + } + if err := s.tr.WriteLine(data); err != nil { + if errors.Is(err, ErrLineTooLong) { + s.log("oversized write dropped") + return true + } + s.handleLinkDown(reasonTransportWrite, err.Error()) + return false + } + s.markTx() + return true +} From fc3b62d573c728b09deb69de99aff9331c5ff6a0 Mon Sep 17 00:00:00 2001 From: cpunt Date: Tue, 28 Apr 2026 10:02:43 +0000 Subject: [PATCH 55/65] fabric: W7 swap UART roles, drop legacy CM5 telemetry JSON path MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit bigbox-v1-cm-2.json binds the CM5-facing fabric link to uart0; mirror that on the MCU. Atomic cutover — no dual-run. reactor.go (production / !qa_reactor): - uart0 now carries fabric (was: legacy telemetry JSON). - uart1 now carries the log mirror via log.SetUART1 (was: fabric link). - OnCharger / OnBattery / OnTempDeciC / emitMemSnapshot strip their inline JSONWriter blocks; FSM state updates and human-readable log lines stay. - humidSub / evSub handlers in Run drop their JSON branches. - jsonOut / droppedUART0Bytes fields and jsonWrite helper deleted. 
Retained-state publishers in fabric-update will replace the old JSON-over-uart0 telemetry. qa_reactor.go (//go:build qa_reactor) is intentionally unchanged — it remains the hardware bring-up path with uart0=telemetry / uart1=log. Build sizes (pico_bb_proto_1): default : code 282892 -> 280388 (-2504 B) flash_unsafe: code 287028 -> 284500 (-2528 B) --- services/reactor/reactor.go | 178 ++++++------------------------------ 1 file changed, 26 insertions(+), 152 deletions(-) diff --git a/services/reactor/reactor.go b/services/reactor/reactor.go index 450a28f..d55f91c 100644 --- a/services/reactor/reactor.go +++ b/services/reactor/reactor.go @@ -134,9 +134,6 @@ type Reactor struct { bus *bus.Bus uiConn *bus.Connection - // UART - jsonOut *shmring.Ring // telemetry (JSON UART TX) - // inputs (latest) vin_mV, vbat_mV int32 iin_mA, ibat_mA int32 @@ -165,9 +162,6 @@ type Reactor struct { // misc now time.Time - - // telemetry drop counters (bytes) - droppedUART0Bytes int } func NewReactor(b *bus.Bus, uiConn *bus.Connection) *Reactor { @@ -377,96 +371,23 @@ func (r *Reactor) OnCharger(v types.ChargerValue) { r.vin_mV = v.VIN_mV r.iin_mA = v.IIn_mA r.tsVIN = r.now - - // JSON: {"power/charger/internal/vin":..,"vsys":..,"iin":..} - if r.jsonOut != nil { - var w utilities.JSONWriter - w.Write = r.jsonWrite - w.Begin() - w.KvInt("power/charger/internal/vin", int(v.VIN_mV)) - w.KvInt("power/charger/internal/vsys", int(v.VSYS_mV)) - w.KvInt("power/charger/internal/iin", int(v.IIn_mA)) - // Full bitfield maps (0/1) for LOCF pipelines - { - it := types.NewBitIter(types.SystemStatus(v.Sys), types.SystemStatusTable[:]) - for { - bitName, set, ok := it.NextAny() - if !ok { - break - } - if set { - w.KvInt("power/charger/internal/system/"+bitName, 1) - } else { - w.KvInt("power/charger/internal/system/"+bitName, 0) - } - } - } - { - it := types.NewBitIter(types.ChargeStatusBits(v.Status), types.ChargeStatusTable[:]) - for { - bitName, set, ok := it.NextAny() - if !ok { - break - } - if 
set { - w.KvInt("power/charger/internal/status/"+bitName, 1) - } else { - w.KvInt("power/charger/internal/status/"+bitName, 0) - } - } - } - { - it := types.NewBitIter(types.ChargerStateBits(v.State), types.ChargerStateTable[:]) - for { - bitName, set, ok := it.NextAny() - if !ok { - break - } - if set { - w.KvInt("power/charger/internal/state/"+bitName, 1) - } else { - w.KvInt("power/charger/internal/state/"+bitName, 0) - } - } - } - w.End() - } } func (r *Reactor) OnBattery(v types.BatteryValue) { r.vbat_mV = v.PackMilliV r.ibat_mA = v.IBatMilliA r.tsVBAT = r.now - - // JSON: {"power/battery/internal/vbat":..,"ibat":..} - if r.jsonOut != nil { - var w utilities.JSONWriter - w.Write = r.jsonWrite - w.Begin() - w.KvInt("power/battery/internal/vbat", int(v.PackMilliV)) - w.KvInt("power/battery/internal/ibat", int(v.IBatMilliA)) - w.KvInt("power/battery/internal/bsr", int(v.BSR_uOhmPerCell)) - w.End() - } } -func (r *Reactor) OnTempDeciC(label string, deci int, jsonKey string) { +func (r *Reactor) OnTempDeciC(label string, deci int, _ string) { log.Deci(label, deci) - if r.jsonOut != nil { - var w utilities.JSONWriter - w.Write = r.jsonWrite - w.Begin() - w.KvInt(jsonKey, deci) - w.End() - } } -// ---- memory snapshot telemetry (every ~2 s in main loop) ---- +// ---- memory snapshot (every ~3 s in main loop) ---- func (r *Reactor) emitMemSnapshot() { var ms runtime.MemStats runtime.ReadMemStats(&ms) - // log line log.Println( "[mem] ", "alloc:", int(ms.Alloc), " ", @@ -474,14 +395,6 @@ func (r *Reactor) emitMemSnapshot() { "mallocs:", int(ms.Mallocs), " ", "frees:", int(ms.Frees), ) - // JSON (minimal to keep overhead low) - if r.jsonOut != nil { - var w utilities.JSONWriter - w.Write = r.jsonWrite - w.Begin() - w.KvInt("sys/mem/alloc", int(ms.Alloc)) - w.End() - } } func (r *Reactor) Run(ctx context.Context) { @@ -494,22 +407,26 @@ func (r *Reactor) Run(ctx context.Context) { stSub := r.uiConn.Subscribe(stTopic) evSub := r.uiConn.Subscribe(evTopic) - // UART 
sessions + // UART sessions — fabric on uart0, log mirror on uart1. Mirrors + // devicecode-lua@2c88090 `bigbox-v1-cm-2.json`, where the CM5-facing + // fabric link binds to uart0. The legacy CM5 telemetry-over-JSON path + // on uart0 has been removed; retained-state publishers in + // fabric-update will replace it. const ( - uartTele = "uart0" // telemetry JSON - uartFabric = "uart1" // fabric link to CM5 + uartFabric = "uart0" // fabric link to CM5 + uartLog = "uart1" // debug/log mirror only ) - subSessOpenTele := r.uiConn.Subscribe(tSessOpened(uartTele)) subSessOpenFabric := r.uiConn.Subscribe(tSessOpened(uartFabric)) - subSessClosedTele := r.uiConn.Subscribe(tSessClosed(uartTele)) + subSessOpenLog := r.uiConn.Subscribe(tSessOpened(uartLog)) subSessClosedFabric := r.uiConn.Subscribe(tSessClosed(uartFabric)) + subSessClosedLog := r.uiConn.Subscribe(tSessClosed(uartLog)) // Kick open requests (fire-and-forget; events carry handles) - r.uiConn.Publish(r.uiConn.NewMessage(tSessOpen(uartTele), nil, false)) r.uiConn.Publish(r.uiConn.NewMessage(tSessOpen(uartFabric), nil, false)) + r.uiConn.Publish(r.uiConn.NewMessage(tSessOpen(uartLog), nil, false)) // Retry back-off guards - var retryTeleAt, retryFabricAt time.Time + var retryFabricAt, retryLogAt time.Time // Fabric session lifecycle state var fabricCancel context.CancelFunc @@ -539,11 +456,6 @@ func (r *Reactor) Run(ctx context.Context) { for { select { // ---- UART session opened/closed ---- - case m := <-subSessOpenTele.Channel(): - if ev, ok := m.Payload.(types.SerialSessionOpened); ok { - r.jsonOut = shmring.Get(shmring.Handle(ev.TXHandle)) - log.Println("[uart0] telemetry session opened") - } case m := <-subSessOpenFabric.Channel(): if ev, ok := m.Payload.(types.SerialSessionOpened); ok { // Tear down any previous fabric session before starting a new one. 
@@ -561,15 +473,12 @@ func (r *Reactor) Run(ctx context.Context) { defer close(done) fabric.Run(fabricCtx, tr, fabricConn, "mcu-1", "cm5-local", fabric.DefaultLinkConfig()) }() - log.Println("[uart1] fabric session opened") + log.Println("[uart0] fabric session opened") } - case <-subSessClosedTele.Channel(): - r.jsonOut = nil - log.Println("[uart0] telemetry session closed") - // Auto-reopen with back-off - if time.Now().After(retryTeleAt) { - r.uiConn.Publish(r.uiConn.NewMessage(tSessOpen(uartTele), nil, false)) - retryTeleAt = time.Now().Add(2 * time.Second) + case m := <-subSessOpenLog.Channel(): + if ev, ok := m.Payload.(types.SerialSessionOpened); ok { + log.SetUART1(shmring.Get(shmring.Handle(ev.TXHandle))) + log.Println("[uart1] log session opened") } case <-subSessClosedFabric.Channel(): // Ignore stale close events — the open handler already tears down @@ -580,11 +489,18 @@ func (r *Reactor) Run(ctx context.Context) { stopFabricSession() fabricSessionOpen = false nextFabricWaitLog = time.Now() - log.Println("[uart1] fabric session closed") + log.Println("[uart0] fabric session closed") if time.Now().After(retryFabricAt) { r.uiConn.Publish(r.uiConn.NewMessage(tSessOpen(uartFabric), nil, false)) retryFabricAt = time.Now().Add(2 * time.Second) } + case <-subSessClosedLog.Channel(): + log.SetUART1(nil) + log.Println("[uart1] log session closed") + if time.Now().After(retryLogAt) { + r.uiConn.Publish(r.uiConn.NewMessage(tSessOpen(uartLog), nil, false)) + retryLogAt = time.Now().Add(2 * time.Second) + } // ---- Env prints ---- case m := <-tempSub.Channel(): @@ -601,14 +517,6 @@ func (r *Reactor) Run(ctx context.Context) { case m := <-humidSub.Channel(): if v, ok := m.Payload.(types.HumidityValue); ok { log.Hundredths("[value] env/humidity/core %RH=", int(v.RHx100)) - // JSON - if r.jsonOut != nil { - var w utilities.JSONWriter - w.Write = r.jsonWrite - w.Begin() - w.KvInt("env/humidity/core", int(v.RHx100)) - w.End() - } } // ---- Die Temp Backup ---- @@ 
-643,20 +551,6 @@ func (r *Reactor) Run(ctx context.Context) { case m := <-evSub.Channel(): printCapEvent(m) - // JSON: {"///event":""} - if r.jsonOut != nil { - dom, _ := m.Topic.At(2).(string) - kind, _ := m.Topic.At(3).(string) - name, _ := m.Topic.At(4).(string) - tag, _ := m.Topic.At(6).(string) - if dom != "" && kind != "" && name != "" && tag != "" { - var w utilities.JSONWriter - w.Write = r.jsonWrite - w.Begin() - w.KvStr(dom+"/"+kind+"/"+name+"/event", tag) - w.End() - } - } // ---- Supervisory tick ---- case <-ticker.C: @@ -687,26 +581,6 @@ func (r *Reactor) Run(ctx context.Context) { } } -// ----------------------------------------------------------------------------- -// Centralised UART write helpers (handle partial writes) -// ----------------------------------------------------------------------------- - -// uart0 (telemetry JSON) — returns bytes written; tracks dropped bytes on partial writes. -func (r *Reactor) jsonWrite(b []byte) int { - if r == nil || r.jsonOut == nil || len(b) == 0 { - return 0 - } - n := r.jsonOut.TryWriteFrom(b) - if n < len(b) { - r.droppedUART0Bytes += (len(b) - n) - // Rate-limited note - if r.droppedUART0Bytes == (len(b)-n) || (r.droppedUART0Bytes%1024) == 0 { - log.Println("[uart0] dropped bytes =", r.droppedUART0Bytes) - } - } - return n -} - // ----------------------------------------------------------------------------- // Printing helpers (via Logger) // ----------------------------------------------------------------------------- From 95eaf55b33319bd531fcf8e2bf44089a585e1200 Mon Sep 17 00:00:00 2001 From: cpunt Date: Tue, 28 Apr 2026 10:53:49 +0000 Subject: [PATCH 56/65] fabric: close W6/W7 gaps from review MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit W6 — gate Ready on rpc_ready, unretain imported facts on session reset: - Track imported retained local topics (s.importedRetained). onPub appends, onUnretain removes; trackImportedRetain dedups. 
- New teardownImportedRetained: nil-payload retained publish on every tracked topic. Called from promoteLink (session-reset path) and from handleLinkDown. Mirrors rpc_bridge.lua's invalidate_imported_retained on generation bump. - New rpcReady flag, gated on the post-handshake export holdoff. New tickReady fires from the 50ms drain tick once exportReadyAt elapses, sets rpcReady=true, and republishes link state so consumers observe the ready edge. linkStatePayload.Ready is now `linkUp && rpcReady`; currentStatus() likewise returns "ready" only when rpcReady. Mirrors session_ctl.lua + rpc_bridge.lua's `ready == established and rpc_ready`. W7 — debug uart policy gate: - New debug_uart build tag. debug_uart_release.go (no tag) provides a no-op debugUARTLog stub; debug_uart_dev.go (with tag) opens uart1 and routes log.SetUART1 through the existing shmring drop-on-overflow mirror. reactor.go's Run uses the helper, so the uart1 lifecycle is compiled out of release builds entirely. - log.SetUART1 path's existing TryWriteFrom drop-on-full is the rate-limit; documented in the dev-build comment. New tests: - TestReadyHeldUntilExportHoldoff asserts the Established+!Ready edge precedes Ready=true. - TestSessionResetUnretainsImports forces a session-gen bump (new SID hello) and asserts a nil-payload retained publish lands on the imported topic. 
Build sizes (pico_bb_proto_1): release default: code 280388 -> 279892 (-496 B; uart1 code stripped) release unsafe : code 284500 -> 284012 (-488 B) dev (debug_uart): code 281644 (uart1 code retained) --- services/fabric/fabric_test.go | 106 +++++++++++++++++++++++++ services/fabric/session.go | 66 ++++++++++++++- services/reactor/debug_uart_dev.go | 63 +++++++++++++++ services/reactor/debug_uart_release.go | 19 +++++ services/reactor/reactor.go | 35 +++----- 5 files changed, 264 insertions(+), 25 deletions(-) create mode 100644 services/reactor/debug_uart_dev.go create mode 100644 services/reactor/debug_uart_release.go diff --git a/services/fabric/fabric_test.go b/services/fabric/fabric_test.go index 96acd0a..34deed7 100644 --- a/services/fabric/fabric_test.go +++ b/services/fabric/fabric_test.go @@ -483,6 +483,112 @@ func TestSessionPingsUnconditionally(t *testing.T) { } } +func TestReadyHeldUntilExportHoldoff(t *testing.T) { + // session_ctl.lua / rpc_bridge.lua: ready == established and rpc_ready, + // where rpc_ready edges true only after retained replay completes. + // The Go side gates rpcReady on exportReadyAt elapsing post-handshake. 
+ mcu, cm5 := pipePair() + b := newBus() + observer := b.NewConnection("observer") + sub := observer.Subscribe(bus.T("state", "fabric", "link", "mcu0")) + defer observer.Unsubscribe(sub) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) + bringUp(t, cm5) + + var sawNotReady, sawReady bool + deadline := time.After(3 * time.Second) + for !sawReady { + select { + case msg := <-sub.Channel(): + payload, ok := msg.Payload.(linkStatePayload) + if !ok { + t.Fatalf("payload type = %T", msg.Payload) + } + if payload.Established && !payload.Ready { + sawNotReady = true + } + if payload.Ready { + if !sawNotReady { + t.Fatalf("Ready edge raised without prior Established+!Ready state") + } + sawReady = true + } + case <-deadline: + t.Fatal("timeout waiting for Ready=true") + } + } +} + +func TestSessionResetUnretainsImports(t *testing.T) { + // rpc_bridge.lua's invalidate_imported_retained clears every imported + // retained slot on session-generation bump. The Go side mirrors this + // in promoteLink/teardownImportedRetained: each tracked local topic + // gets a nil-payload retained publish that clears the bus's retain + // store, so consumers don't see stale CM5-session data. + mcu, cm5 := pipePair() + b := newBus() + observer := b.NewConnection("observer") + cfgSub := observer.Subscribe(tConfigHAL) + defer observer.Unsubscribe(cfgSub) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) + bringUp(t, cm5) + + // Push a config via the import pub path so config/hal becomes a + // tracked imported retain. + sendMsg(t, cm5, protoPub{ + Type: msgPub, + Topic: []string{"config", "device"}, + Payload: json.RawMessage(`{"devices":[]}`), + Retain: true, + }) + + // Observe the local retain (non-nil payload). 
+ deadline := time.After(2 * time.Second) + gotInitial := false + for !gotInitial { + select { + case msg := <-cfgSub.Channel(): + if msg.Retained && msg.Payload != nil { + gotInitial = true + } + case <-deadline: + t.Fatal("timeout waiting for initial config/hal retain") + } + } + + // Force a session reset: hello with a new SID. Concurrent reader + // drains the new hello_ack the MCU sends back; pipePair is + // synchronous so without this the MCU's sendControl would block, + // promoteLink would never fire, and teardownImportedRetained would + // not run. + go func() { _ = readMsg[protoHelloAck](t, cm5) }() + sendMsg(t, cm5, protoHello{ + Type: msgHello, + Node: "cm5-local", + Peer: "mcu-1", + SID: "cm5-sid-new", + }) + + // Expect a nil-payload retained publish on config/hal. + deadline = time.After(2 * time.Second) + for { + select { + case msg := <-cfgSub.Channel(): + if msg.Retained && msg.Payload == nil { + return + } + case <-deadline: + t.Fatal("timeout waiting for unretain after session reset") + } + } +} + func TestWriterControlPreemptsRPCAndBulk(t *testing.T) { // writer.lua drains the control lane first (no fairness); then // weighted RR between rpc and bulk. 
Pre-load all three lanes and diff --git a/services/fabric/session.go b/services/fabric/session.go index 4823b64..82a5975 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -159,6 +159,8 @@ type session struct { txControl txLane txRPC txLane txBulk txLane + importedRetained []bus.Topic // local topics currently retained on the bus due to wire imports + rpcReady bool // bridge replay complete; gates linkStatePayload.Ready incomingTransfer *incomingTransfer beginTransfer func(transferMeta) (transferSink, error) @@ -251,6 +253,7 @@ func (s *session) run(ctx context.Context) { s.drainOutbound(now) s.checkTransferTimeout(now) s.tickPing(now) + s.tickReady(now) case <-waitTick.C: s.logWaiting() @@ -283,7 +286,7 @@ func unixMilli(t time.Time) int64 { } func (s *session) currentStatus() string { - if s.link == linkUp { + if s.link == linkUp && s.rpcReady { return statusReady } return statusOpening @@ -302,7 +305,7 @@ func (s *session) publishLinkState(reason, err string) { linkStatePayload{ LinkID: s.linkID, Status: status, - Ready: s.link == linkUp, + Ready: s.link == linkUp && s.rpcReady, Established: s.link == linkUp, PeerID: s.peerID, LocalSID: s.localSID, @@ -340,9 +343,11 @@ func (s *session) handleLinkDown(reason, err string) { s.peerProto = 0 s.exportReadyAt = time.Time{} s.exportsEnabled = false + s.rpcReady = false s.teardownExports() s.teardownInbound() s.teardownOutbound(pendingReason) + s.teardownImportedRetained() s.abortTransfer(pendingReason) s.publishLinkState(reason, err) if err != "" { @@ -356,6 +361,13 @@ func (s *session) handleLinkDown(reason, err string) { // `reason` carries the link-state telemetry tag (e.g. session_reset) and // is also used as the err string on any pending outbound calls cancelled // by the transition, matching rpc_bridge.lua's session-replace behaviour. 
+// +// On a session-reset transition (re-promote with the link already up), +// imported retained facts are unretained locally so consumers don't see +// stale data from the previous CM5 session — mirrors rpc_bridge.lua's +// invalidate_imported_retained on generation bump. rpcReady is held low +// until the export holdoff elapses (see tickReady), gating +// linkStatePayload.Ready. func (s *session) promoteLink(reason string) { if s.link == linkUp { if reason == "" { @@ -365,8 +377,10 @@ func (s *session) promoteLink(reason string) { s.teardownExports() s.teardownInbound() s.teardownOutbound(reason) + s.teardownImportedRetained() } s.link = linkUp + s.rpcReady = false s.setupExports() s.exportsEnabled = true s.exportReadyAt = time.Now().Add(exportStartHoldoff) @@ -375,6 +389,47 @@ func (s *session) promoteLink(reason string) { s.publishLinkState(reason, "") } +// teardownImportedRetained clears every local retained slot we populated +// from a wire import. Mirrors rpc_bridge.lua's invalidate_imported_retained. +func (s *session) teardownImportedRetained() { + for _, t := range s.importedRetained { + s.conn.Publish(s.conn.NewMessage(t, nil, true)) + } + s.importedRetained = nil +} + +func (s *session) trackImportedRetain(t bus.Topic) { + for _, ex := range s.importedRetained { + if topicEquals(ex, t) { + return + } + } + s.importedRetained = append(s.importedRetained, t) +} + +func (s *session) untrackImportedRetain(t bus.Topic) { + for i, ex := range s.importedRetained { + if topicEquals(ex, t) { + s.importedRetained = append(s.importedRetained[:i], s.importedRetained[i+1:]...) + return + } + } +} + +// tickReady promotes rpcReady once the post-handshake export holdoff has +// elapsed, mirroring rpc_bridge.lua's emit_rpc_ready(true) after retained +// replay. Re-publishes link state so consumers observe the ready edge. 
+func (s *session) tickReady(now time.Time) { + if s.link != linkUp || s.rpcReady { + return + } + if s.exportReadyAt.IsZero() || now.Before(s.exportReadyAt) { + return + } + s.rpcReady = true + s.publishLinkState("", "") +} + // ---- dispatch ---- func (s *session) dispatch(line []byte) { @@ -596,10 +651,16 @@ func (s *session) onPub(msg *protoPub) { s.lastConfigErr = "" s.log("config/device applied to config/hal") s.conn.Publish(s.conn.NewMessage(localTopic, cfg, true)) + s.trackImportedRetain(localTopic) return } s.conn.Publish(s.conn.NewMessage(localTopic, msg.Payload, msg.Retain)) + if msg.Retain { + s.trackImportedRetain(localTopic) + } else { + s.untrackImportedRetain(localTopic) + } } func (s *session) onUnretain(msg *protoUnretain) { @@ -609,6 +670,7 @@ func (s *session) onUnretain(msg *protoUnretain) { return } s.conn.Publish(s.conn.NewMessage(localTopic, nil, true)) + s.untrackImportedRetain(localTopic) } func (s *session) onCall(msg *protoCall) { diff --git a/services/reactor/debug_uart_dev.go b/services/reactor/debug_uart_dev.go new file mode 100644 index 0000000..484f5b2 --- /dev/null +++ b/services/reactor/debug_uart_dev.go @@ -0,0 +1,63 @@ +//go:build debug_uart && !qa_reactor + +package reactor + +import ( + "time" + + "devicecode-go/bus" + "devicecode-go/types" + "devicecode-go/x/shmring" +) + +// debugUARTLog opens uart1 as a log mirror and routes log.Println output +// through it. Enabled with `-tags debug_uart`. The shmring write path +// inside utilities/Logger.logWrite is non-blocking and drops bytes on a +// full ring; that drop policy is the rate-limit for this debug stream. +// +// debug_uart MUST NOT be set in release builds — fabric on uart0 +// (W7 acceptance) is the only allowed CM5-facing traffic, and any +// uart1 mirror is for development/bring-up only. 
+type debugUARTLog struct { + subOpened *bus.Subscription + subClosed *bus.Subscription + retryAt time.Time +} + +const debugUARTLogID = "uart1" + +func (d *debugUARTLog) init(uiConn *bus.Connection) { + d.subOpened = uiConn.Subscribe(tSessOpened(debugUARTLogID)) + d.subClosed = uiConn.Subscribe(tSessClosed(debugUARTLogID)) + uiConn.Publish(uiConn.NewMessage(tSessOpen(debugUARTLogID), nil, false)) +} + +func (d *debugUARTLog) openedChan() <-chan *bus.Message { + if d.subOpened == nil { + return nil + } + return d.subOpened.Channel() +} + +func (d *debugUARTLog) closedChan() <-chan *bus.Message { + if d.subClosed == nil { + return nil + } + return d.subClosed.Channel() +} + +func (d *debugUARTLog) handleOpened(m *bus.Message) { + if ev, ok := m.Payload.(types.SerialSessionOpened); ok { + log.SetUART1(shmring.Get(shmring.Handle(ev.TXHandle))) + log.Println("[uart1] log session opened") + } +} + +func (d *debugUARTLog) handleClosed(uiConn *bus.Connection) { + log.SetUART1(nil) + log.Println("[uart1] log session closed") + if time.Now().After(d.retryAt) { + uiConn.Publish(uiConn.NewMessage(tSessOpen(debugUARTLogID), nil, false)) + d.retryAt = time.Now().Add(2 * time.Second) + } +} diff --git a/services/reactor/debug_uart_release.go b/services/reactor/debug_uart_release.go new file mode 100644 index 0000000..7379965 --- /dev/null +++ b/services/reactor/debug_uart_release.go @@ -0,0 +1,19 @@ +//go:build !debug_uart && !qa_reactor + +package reactor + +import "devicecode-go/bus" + +// debugUARTLog is a no-op in release builds: the uart1 log mirror is +// disabled by default per docs/firmware-alignment-protocol.md (off in +// release, uart1-only in dev, rate-limited, never on uart0). Build with +// `-tags debug_uart` to enable; see debug_uart_dev.go. 
+type debugUARTLog struct{} + +func (d *debugUARTLog) init(uiConn *bus.Connection) { _ = uiConn } +func (d *debugUARTLog) openedChan() <-chan *bus.Message { return nil } +func (d *debugUARTLog) closedChan() <-chan *bus.Message { return nil } +func (d *debugUARTLog) handleOpened(m *bus.Message) { _ = m } +func (d *debugUARTLog) handleClosed(uiConn *bus.Connection) { + _ = uiConn +} diff --git a/services/reactor/reactor.go b/services/reactor/reactor.go index d55f91c..b7652b7 100644 --- a/services/reactor/reactor.go +++ b/services/reactor/reactor.go @@ -407,26 +407,23 @@ func (r *Reactor) Run(ctx context.Context) { stSub := r.uiConn.Subscribe(stTopic) evSub := r.uiConn.Subscribe(evTopic) - // UART sessions — fabric on uart0, log mirror on uart1. Mirrors + // UART sessions — fabric on uart0; uart1 is debug-only and gated by + // the `debug_uart` build tag (off in release per + // docs/firmware-alignment-protocol.md). Mirrors // devicecode-lua@2c88090 `bigbox-v1-cm-2.json`, where the CM5-facing // fabric link binds to uart0. The legacy CM5 telemetry-over-JSON path // on uart0 has been removed; retained-state publishers in // fabric-update will replace it. 
- const ( - uartFabric = "uart0" // fabric link to CM5 - uartLog = "uart1" // debug/log mirror only - ) + const uartFabric = "uart0" subSessOpenFabric := r.uiConn.Subscribe(tSessOpened(uartFabric)) - subSessOpenLog := r.uiConn.Subscribe(tSessOpened(uartLog)) subSessClosedFabric := r.uiConn.Subscribe(tSessClosed(uartFabric)) - subSessClosedLog := r.uiConn.Subscribe(tSessClosed(uartLog)) - - // Kick open requests (fire-and-forget; events carry handles) r.uiConn.Publish(r.uiConn.NewMessage(tSessOpen(uartFabric), nil, false)) - r.uiConn.Publish(r.uiConn.NewMessage(tSessOpen(uartLog), nil, false)) + + var dbgLog debugUARTLog + dbgLog.init(r.uiConn) // Retry back-off guards - var retryFabricAt, retryLogAt time.Time + var retryFabricAt time.Time // Fabric session lifecycle state var fabricCancel context.CancelFunc @@ -475,11 +472,8 @@ func (r *Reactor) Run(ctx context.Context) { }() log.Println("[uart0] fabric session opened") } - case m := <-subSessOpenLog.Channel(): - if ev, ok := m.Payload.(types.SerialSessionOpened); ok { - log.SetUART1(shmring.Get(shmring.Handle(ev.TXHandle))) - log.Println("[uart1] log session opened") - } + case m := <-dbgLog.openedChan(): + dbgLog.handleOpened(m) case <-subSessClosedFabric.Channel(): // Ignore stale close events — the open handler already tears down // the previous session before starting a new one. 
@@ -494,13 +488,8 @@ func (r *Reactor) Run(ctx context.Context) { r.uiConn.Publish(r.uiConn.NewMessage(tSessOpen(uartFabric), nil, false)) retryFabricAt = time.Now().Add(2 * time.Second) } - case <-subSessClosedLog.Channel(): - log.SetUART1(nil) - log.Println("[uart1] log session closed") - if time.Now().After(retryLogAt) { - r.uiConn.Publish(r.uiConn.NewMessage(tSessOpen(uartLog), nil, false)) - retryLogAt = time.Now().Add(2 * time.Second) - } + case <-dbgLog.closedChan(): + dbgLog.handleClosed(r.uiConn) // ---- Env prints ---- case m := <-tempSub.Channel(): From d0e719f7e78fd8293b12c8b8b52c73e1f36bda47 Mon Sep 17 00:00:00 2001 From: cpunt Date: Tue, 28 Apr 2026 11:10:40 +0000 Subject: [PATCH 57/65] fabric: don't untrack imported retains on transient non-retained pubs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex review caught a parity bug: the previous fix untracked the local topic on any non-retained pub arriving via the same import rule. Lua's rpc_bridge.lua only mutates imported_retained on retain set/clear, and the Go bus likewise only clears retained storage on an explicit retained-nil publish. So a stale retained value could survive a session reset because the tracking entry had been silently dropped. Fix: drop the else-untrack branch in onPub. Untracking only happens on onUnretain (the explicit retain clear). Regression test: - TestSessionResetUnretainsImportsAfterTransientPub adds a temp import rule, sequences retain=true → retain=false on the same topic, then forces a session reset and asserts the nil-payload retained edge still lands on the imported subscriber. Verified to fail with the buggy else-untrack restored. 
--- services/fabric/fabric_test.go | 87 ++++++++++++++++++++++++++++++++++ services/fabric/session.go | 7 ++- 2 files changed, 92 insertions(+), 2 deletions(-) diff --git a/services/fabric/fabric_test.go b/services/fabric/fabric_test.go index 34deed7..f4424ba 100644 --- a/services/fabric/fabric_test.go +++ b/services/fabric/fabric_test.go @@ -589,6 +589,93 @@ func TestSessionResetUnretainsImports(t *testing.T) { } } +func TestSessionResetUnretainsImportsAfterTransientPub(t *testing.T) { + // Regression: a non-retained pub arriving on the same imported topic + // after an earlier retained pub must NOT untrack — the bus retain + // store still holds the prior retained value (the bus only clears it + // on explicit unretain/retained-nil). Without this, the stale retain + // would survive a session-reset because we'd think nothing was tracked. + prev := importPublishRules + importPublishRules = append([]importRule{}, prev...) + importPublishRules = append(importPublishRules, importRule{ + wire: []string{"telem", "device", "fast"}, + local: []string{"telem", "hal", "fast"}, + }) + t.Cleanup(func() { importPublishRules = prev }) + + mcu, cm5 := pipePair() + b := newBus() + observer := b.NewConnection("observer") + subFast := observer.Subscribe(bus.T("telem", "hal", "fast")) + defer observer.Unsubscribe(subFast) + + ctx, cancel := context.WithCancel(context.Background()) + defer cancel() + go Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) + bringUp(t, cm5) + + // 1) Retained import — establishes the bus retain + tracking entry. + sendMsg(t, cm5, protoPub{ + Type: msgPub, + Topic: []string{"telem", "device", "fast"}, + Payload: json.RawMessage(`{"v":1}`), + Retain: true, + }) + + // Drain until we see the retained payload. 
+ deadline := time.After(2 * time.Second) + gotRetain := false + for !gotRetain { + select { + case msg := <-subFast.Channel(): + if msg.Retained && msg.Payload != nil { + gotRetain = true + } + case <-deadline: + t.Fatal("timeout waiting for initial retained pub") + } + } + + // 2) Non-retained pub on same topic — must not untrack. + sendMsg(t, cm5, protoPub{ + Type: msgPub, + Topic: []string{"telem", "device", "fast"}, + Payload: json.RawMessage(`{"v":2}`), + Retain: false, + }) + // Best-effort drain so the next subFast read sees the unretain edge. + deadline = time.After(500 * time.Millisecond) + draining := true + for draining { + select { + case <-subFast.Channel(): + case <-deadline: + draining = false + } + } + + // 3) Session reset → expect the original retain to be cleared. + go func() { _ = readMsg[protoHelloAck](t, cm5) }() + sendMsg(t, cm5, protoHello{ + Type: msgHello, + Node: "cm5-local", + Peer: "mcu-1", + SID: "cm5-sid-new", + }) + + deadline = time.After(2 * time.Second) + for { + select { + case msg := <-subFast.Channel(): + if msg.Retained && msg.Payload == nil { + return + } + case <-deadline: + t.Fatal("timeout waiting for unretain after session reset") + } + } +} + func TestWriterControlPreemptsRPCAndBulk(t *testing.T) { // writer.lua drains the control lane first (no fairness); then // weighted RR between rpc and bulk. 
Pre-load all three lanes and diff --git a/services/fabric/session.go b/services/fabric/session.go index 82a5975..4115b63 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -658,9 +658,12 @@ func (s *session) onPub(msg *protoPub) { s.conn.Publish(s.conn.NewMessage(localTopic, msg.Payload, msg.Retain)) if msg.Retain { s.trackImportedRetain(localTopic) - } else { - s.untrackImportedRetain(localTopic) } + // A non-retained pub on the same topic must NOT untrack: the bus + // retain store is only cleared by an explicit unretain (or a + // retained-nil publish), so the prior retained value is still live + // and must be cleaned up on session reset. Mirrors rpc_bridge.lua, + // which only mutates imported_retained on retain set/clear. } func (s *session) onUnretain(msg *protoUnretain) { From 3200e3a4e163560bd4c07878f4ff4bf087c1ef88 Mon Sep 17 00:00:00 2001 From: cpunt Date: Tue, 28 Apr 2026 11:47:12 +0000 Subject: [PATCH 58/65] reactor: fix CM5 peer node id "cm5-local" -> "cm5" MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The hardcoded "cm5-local" peer-id in fabric.Run never matched what any CM5 actually sends. mcu-dev.json and bigbox-v1-cm-2.json both publish the link with `"node_id": "cm5"`, and session_ctl.lua sends that exact string in the hello frame's `node` field. The MCU's onHello rejects hellos when `msg.Node != s.peerID`, so every hello was being dropped with "wrong node" before promoteLink could fire. The DEVICECODE_NODE_ID env var the dev command line was setting is not read by any Lua code — it's a no-op on the CM5 side, so changing it there does nothing. Aligning the MCU literal to the CM5 canon is the correct fix. Tests / fabric_test.go / cmd/fabric-test keep their internal "cm5-local" pairings — they're self-consistent harness fixtures and don't talk to a real CM5.
--- services/reactor/reactor.go | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/services/reactor/reactor.go b/services/reactor/reactor.go index b7652b7..280a6d1 100644 --- a/services/reactor/reactor.go +++ b/services/reactor/reactor.go @@ -468,7 +468,7 @@ func (r *Reactor) Run(ctx context.Context) { fabricSessionOpen = true go func() { defer close(done) - fabric.Run(fabricCtx, tr, fabricConn, "mcu-1", "cm5-local", fabric.DefaultLinkConfig()) + fabric.Run(fabricCtx, tr, fabricConn, "mcu-1", "cm5", fabric.DefaultLinkConfig()) }() log.Println("[uart0] fabric session opened") } From e445862a4c66f89f108f8f69836caf3970bed83f Mon Sep 17 00:00:00 2001 From: cpunt Date: Tue, 28 Apr 2026 11:51:25 +0000 Subject: [PATCH 59/65] fabric: scan protoType manually to dodge TinyGo json reflect quirk MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Hardware bring-up against a real CM5 surfaced every inbound frame arriving as "malformed" with err empty: malformed frame dropped line_len 74 line_head {"sid":"…","node":"cm5","type":"hello"} err The empty err is dispatch's "protoType returned empty" path. Standard Go's json.Unmarshal happily extracts Type from this envelope (every fabric_test.go exercises identical shapes); TinyGo's reflect path silently leaves Type as the zero value when the anonymous-struct target has the {Type string `json:"type"`} layout and the JSON has preceding sibling keys. The send side (json.Marshal) is unaffected and the named-struct decode in typedDispatch is also unaffected — only the wire-discriminator probe was broken in production. Replacing protoType with a manual byte scan that finds the first top-level "type":"…" pair. The heuristic guard ("type must be preceded by {, ',' or whitespace") rejects matches inside string values; we trust the wire to use one of the well-known msg* constants for the value, none of which contain escapes. 
Tests still pass; the change does not alter typedDispatch's named-type unmarshal, only the type probe. --- services/fabric/protocol.go | 68 +++++++++++++++++++++++++++++++++---- 1 file changed, 62 insertions(+), 6 deletions(-) diff --git a/services/fabric/protocol.go b/services/fabric/protocol.go index 5ca38a2..83dcf11 100644 --- a/services/fabric/protocol.go +++ b/services/fabric/protocol.go @@ -1,6 +1,9 @@ package fabric -import "encoding/json" +import ( + "bytes" + "encoding/json" +) // ---- Wire message type identifiers ---- // @@ -169,11 +172,64 @@ func marshal(v any) []byte { return append(b, '\n') } -// protoType extracts the "type" field from a JSON line. +// protoType extracts the "type" field from a JSON line via a manual +// byte scan rather than json.Unmarshal. TinyGo's encoding/json reflect +// path was observed silently dropping the field for envelopes with +// preceding sibling keys (e.g. {"sid":"…","node":"…","type":"hello"}) +// during real-CM5 traffic — frames parsed cleanly under standard Go +// in tests but came back with Type="" on hardware. This scan finds the +// first top-level "type":"…" pair, tolerates whitespace, and assumes +// the value contains no escape sequences (true for every wire type +// constant defined above). func protoType(line []byte) string { - var env struct { - Type string `json:"type"` + const key = `"type"` + rest := line + for { + idx := bytes.Index(rest, []byte(key)) + if idx < 0 { + return "" + } + // Reject matches inside another string value (e.g. someone + // publishing a payload that contains the literal "type"). + // Simple heuristic: the byte preceding the key must be one of + // '{', ',' or whitespace at the top level. Good enough for our + // flat envelopes. 
+ if idx > 0 { + b := rest[idx-1] + if b != '{' && b != ',' && b != ' ' && b != '\t' && b != '\n' && b != '\r' { + rest = rest[idx+len(key):] + continue + } + } + v := skipColonAndQuote(rest[idx+len(key):]) + if v == nil { + return "" + } + end := bytes.IndexByte(v, '"') + if end < 0 { + return "" + } + return string(v[:end]) + } +} + +// skipColonAndQuote consumes the `:` plus an opening `"` (with any +// surrounding whitespace) and returns the slice starting at the first +// byte of the value's contents. Returns nil if the shape is wrong. +func skipColonAndQuote(s []byte) []byte { + i := 0 + for i < len(s) && (s[i] == ' ' || s[i] == '\t' || s[i] == '\n' || s[i] == '\r') { + i++ + } + if i >= len(s) || s[i] != ':' { + return nil + } + i++ + for i < len(s) && (s[i] == ' ' || s[i] == '\t' || s[i] == '\n' || s[i] == '\r') { + i++ + } + if i >= len(s) || s[i] != '"' { + return nil } - json.Unmarshal(line, &env) - return env.Type + return s[i+1:] } From 7523093857951a0abe9351e61b4ac70f84e1c107 Mon Sep 17 00:00:00 2001 From: cpunt Date: Tue, 28 Apr 2026 12:00:43 +0000 Subject: [PATCH 60/65] fabric: depth-aware protoType + fix cmd/fabric-test build MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Codex review of e445862 caught two issues: 1. The previous protoType heuristic ("preceded by '{', ',' or whitespace") would mis-route any envelope whose payload happened to contain a "type" key, since it scanned the whole line for the first match regardless of nesting. Example shape: {"payload":{"type":"x"},"type":"pub"} …would dispatch as "x" rather than "pub". 
Replacing the scanner with a depth-aware top-level walker: - skipJSONString honours backslash escapes - skipJSONContainer balances {/[/}/] while ignoring brace-like bytes inside string values - skipJSONValue dispatches by leading byte (string / container / literal-or-number) - protoType walks ONLY the top-level key/value pairs and returns the first depth-1 "type" string it sees New tests in TestWireTypeIgnoresNestedTypeKeys cover the nested payload, nested meta, type-as-array-element, type-as-substring, and the real-CM5 hello shape that was the original regression. 2. cmd/fabric-test/main.go didn't build after the W3 LinkConfig signature change and still hard-coded "cm5-local" as the peer id. Updated to pass fabric.DefaultLinkConfig() and to use the canonical "cm5" identity (matches the production reactor + every CM5 config). Verified: - go test ./... pass - tinygo build -target=pico2 -tags pico_bb_proto_1 pass - tinygo build -target=pico2 -tags 'pico_bb_proto_1 flash_unsafe' pass - tinygo build -target=pico2 -tags 'pico_bb_proto_1 debug_uart' pass (unchanged) - tinygo build -target=pico2 -tags 'qa_reactor pico_bb_proto_1' pass (unchanged) - tinygo build -target=pico2 -tags pico_bb_proto_1 ./cmd/fabric-test pass --- cmd/fabric-test/main.go | 2 +- services/fabric/fabric_test.go | 27 +++++ services/fabric/protocol.go | 200 +++++++++++++++++++++++++-------- 3 files changed, 182 insertions(+), 47 deletions(-) diff --git a/cmd/fabric-test/main.go b/cmd/fabric-test/main.go index 245a678..3748dc3 100644 --- a/cmd/fabric-test/main.go +++ b/cmd/fabric-test/main.go @@ -35,7 +35,7 @@ func main() { conn := b.NewConnection("fabric") tr := fabric.NewRWTransport(&serialRW{}, &serialRW{}) - fabric.Run(ctx, tr, conn, "mcu-1", "cm5-local") + fabric.Run(ctx, tr, conn, "mcu-1", "cm5", fabric.DefaultLinkConfig()) } func waitHALReady(ctx context.Context, c *bus.Connection, d time.Duration) bool { diff --git a/services/fabric/fabric_test.go b/services/fabric/fabric_test.go index 
f4424ba..32e2354 100644 --- a/services/fabric/fabric_test.go +++ b/services/fabric/fabric_test.go @@ -144,6 +144,33 @@ func TestWireTypeBadInput(t *testing.T) { } } +func TestWireTypeIgnoresNestedTypeKeys(t *testing.T) { + // protoType must return the top-level discriminator, not a nested + // payload.type / meta.type key. The previous heuristic-only scan + // would mis-route e.g. a `pub` with a payload that happened to + // contain its own "type" field. Examples below exercise the cases + // Codex flagged on the post-flash review. + for _, tc := range []struct { + line []byte + want string + }{ + // Nested payload object with its own "type": + {[]byte(`{"payload":{"type":"x"},"type":"pub"}`), "pub"}, + // Nested type appears before the real top-level type: + {[]byte(`{"meta":{"type":"firmware"},"type":"xfer_begin","xfer_id":"a"}`), "xfer_begin"}, + // Type buried inside an array element: + {[]byte(`{"topic":["a","type","b"],"type":"unretain"}`), "unretain"}, + // Type as a substring of a value (must NOT match): + {[]byte(`{"id":"my-type-here","type":"call"}`), "call"}, + // Real-world hello shape from CM5 (regression for the malformed-frame bug): + {[]byte(`{"sid":"a08590c4-afb8-4a23-ae39-ded871a3d433","node":"cm5","type":"hello"}`), "hello"}, + } { + if got := protoType(tc.line); got != tc.want { + t.Errorf("protoType(%s) = %q, want %q", tc.line, got, tc.want) + } + } +} + // ---- transport ---- func TestTransportRoundTrip(t *testing.T) { diff --git a/services/fabric/protocol.go b/services/fabric/protocol.go index 83dcf11..023b0ec 100644 --- a/services/fabric/protocol.go +++ b/services/fabric/protocol.go @@ -1,9 +1,6 @@ package fabric -import ( - "bytes" - "encoding/json" -) +import "encoding/json" // ---- Wire message type identifiers ---- // @@ -172,64 +169,175 @@ func marshal(v any) []byte { return append(b, '\n') } -// protoType extracts the "type" field from a JSON line via a manual -// byte scan rather than json.Unmarshal. 
TinyGo's encoding/json reflect -// path was observed silently dropping the field for envelopes with -// preceding sibling keys (e.g. {"sid":"…","node":"…","type":"hello"}) -// during real-CM5 traffic — frames parsed cleanly under standard Go -// in tests but came back with Type="" on hardware. This scan finds the -// first top-level "type":"…" pair, tolerates whitespace, and assumes -// the value contains no escape sequences (true for every wire type -// constant defined above). +// protoType extracts the wire-discriminator "type" field from a JSON +// envelope via a depth-aware scan. We avoid json.Unmarshal here because +// TinyGo's reflect path was observed silently leaving the field empty +// for tagged anonymous-struct targets when the envelope had preceding +// sibling keys. +// +// Returns the value of the FIRST top-level (object-depth 1) "type" key, +// ignoring any nested "type" keys inside payload/meta sub-objects — +// e.g. for `{"payload":{"type":"x"},"type":"pub"}` the result is "pub". +// Returns "" if the line isn't a JSON object, the top-level "type" key +// is missing, or its value isn't a string. func protoType(line []byte) string { - const key = `"type"` - rest := line + n := len(line) + i := skipJSONSpace(line, 0) + if i >= n || line[i] != '{' { + return "" + } + i++ for { - idx := bytes.Index(rest, []byte(key)) - if idx < 0 { + i = skipJSONSpace(line, i) + if i >= n { return "" } - // Reject matches inside another string value (e.g. someone - // publishing a payload that contains the literal "type"). - // Simple heuristic: the byte preceding the key must be one of - // '{', ',' or whitespace at the top level. Good enough for our - // flat envelopes. 
- if idx > 0 { - b := rest[idx-1] - if b != '{' && b != ',' && b != ' ' && b != '\t' && b != '\n' && b != '\r' { - rest = rest[idx+len(key):] - continue - } + switch line[i] { + case '}': + return "" + case ',': + i++ + continue + } + if line[i] != '"' { + return "" + } + keyStart := i + 1 + keyEnd, ok := scanJSONString(line, i) + if !ok { + return "" } - v := skipColonAndQuote(rest[idx+len(key):]) - if v == nil { + i = keyEnd + i = skipJSONSpace(line, i) + if i >= n || line[i] != ':' { return "" } - end := bytes.IndexByte(v, '"') - if end < 0 { + i++ + i = skipJSONSpace(line, i) + if i >= n { + return "" + } + isType := keyEnd-1-keyStart == 4 && + line[keyStart] == 't' && line[keyStart+1] == 'y' && + line[keyStart+2] == 'p' && line[keyStart+3] == 'e' + if isType { + if line[i] != '"' { + return "" + } + valStart := i + 1 + valEnd, ok := scanJSONString(line, i) + if !ok { + return "" + } + return string(line[valStart : valEnd-1]) + } + i, ok = skipJSONValue(line, i) + if !ok { return "" } - return string(v[:end]) } } -// skipColonAndQuote consumes the `:` plus an opening `"` (with any -// surrounding whitespace) and returns the slice starting at the first -// byte of the value's contents. Returns nil if the shape is wrong. -func skipColonAndQuote(s []byte) []byte { - i := 0 - for i < len(s) && (s[i] == ' ' || s[i] == '\t' || s[i] == '\n' || s[i] == '\r') { - i++ +func skipJSONSpace(line []byte, i int) int { + for i < len(line) { + switch line[i] { + case ' ', '\t', '\n', '\r': + i++ + default: + return i + } } - if i >= len(s) || s[i] != ':' { - return nil + return i +} + +// scanJSONString walks an opening-`"` at line[i] to its closing `"`, +// honouring backslash escapes. Returns the index immediately after the +// closing quote, or false on a malformed string. 
+func scanJSONString(line []byte, i int) (int, bool) { + n := len(line) + if i >= n || line[i] != '"' { + return 0, false } i++ - for i < len(s) && (s[i] == ' ' || s[i] == '\t' || s[i] == '\n' || s[i] == '\r') { + for i < n { + switch line[i] { + case '\\': + if i+1 >= n { + return 0, false + } + i += 2 + case '"': + return i + 1, true + default: + i++ + } + } + return 0, false +} + +// skipJSONValue advances past a value starting at line[i], whatever +// its kind (string, number, bool, null, object, array). Returns the +// index past the value, or false on parse error. +func skipJSONValue(line []byte, i int) (int, bool) { + n := len(line) + if i >= n { + return 0, false + } + switch line[i] { + case '"': + return scanJSONString(line, i) + case '{', '[': + return skipJSONContainer(line, i) + } + // number / true / false / null — walk to the next structural byte. + for i < n { + switch line[i] { + case ',', '}', ']', ' ', '\t', '\n', '\r': + return i, true + } i++ } - if i >= len(s) || s[i] != '"' { - return nil + return i, true +} + +// skipJSONContainer walks past a balanced { … } or [ … ] block starting +// at line[i], tracking string state so quoted braces don't disturb the +// depth count. Returns the index past the closing brace, or false. 
+func skipJSONContainer(line []byte, i int) (int, bool) { + n := len(line) + if i >= n { + return 0, false + } + depth := 0 + inString := false + for i < n { + c := line[i] + if inString { + if c == '\\' { + if i+1 >= n { + return 0, false + } + i += 2 + continue + } + if c == '"' { + inString = false + } + i++ + continue + } + switch c { + case '"': + inString = true + case '{', '[': + depth++ + case '}', ']': + depth-- + if depth == 0 { + return i + 1, true + } + } + i++ } - return s[i+1:] + return 0, false } From f8d02bab15bd8dcbbcfaef424594946c1b870e0d Mon Sep 17 00:00:00 2001 From: cpunt Date: Tue, 28 Apr 2026 12:50:14 +0000 Subject: [PATCH 61/65] fabric: instrument protoType bail paths + add build-tag banner Hardware report after 7523093 still shows malformed-frame drops on exact same shape, just with `node` first instead of `sid` first. My host test (TestProtoTypeExactFailingInput, exact bytes from log) passes under standard Go, so either: (a) TinyGo is taking a different code path through the same scanner (b) the binary on hardware isn't the one we think it is Debug instrumentation: - Every bail point in protoType now emits "[fabric-debug] protoType bail at " with the index, total length, and a 96-byte head. The names are `no_opening_brace`, `eof_after_brace`, `close_before_type`, `non_quote_at_key_start`, `scanstring_key_failed`, `missing_colon_after_key`, `eof_after_colon`, `type_value_not_string`, `scanstring_value_failed`, `skipvalue_failed`. After reflash, the hardware log will pinpoint which one fires. - Session "run start" log now includes `build_tag=protoTypeScanV3`. If we don't see that string in the boot log after picotool load, the wrong .elf is on the device. Both will be removed once the bug is diagnosed. 
--- services/fabric/protocol.go | 30 +++++++++++++++++++++++++++++- services/fabric/session.go | 2 +- 2 files changed, 30 insertions(+), 2 deletions(-) diff --git a/services/fabric/protocol.go b/services/fabric/protocol.go index 023b0ec..6fd76f5 100644 --- a/services/fabric/protocol.go +++ b/services/fabric/protocol.go @@ -1,6 +1,10 @@ package fabric -import "encoding/json" +import ( + "encoding/json" + + "devicecode-go/x/strconvx" +) // ---- Wire message type identifiers ---- // @@ -180,41 +184,52 @@ func marshal(v any) []byte { // e.g. for `{"payload":{"type":"x"},"type":"pub"}` the result is "pub". // Returns "" if the line isn't a JSON object, the top-level "type" key // is missing, or its value isn't a string. +// +// DEBUG: every bail path emits a one-line `protoType bail` print so we +// can see on hardware exactly where the scanner gives up. This noise +// will be removed once the hardware bring-up is complete. func protoType(line []byte) string { n := len(line) i := skipJSONSpace(line, 0) if i >= n || line[i] != '{' { + debugBail("no_opening_brace", line, i) return "" } i++ for { i = skipJSONSpace(line, i) if i >= n { + debugBail("eof_after_brace", line, i) return "" } switch line[i] { case '}': + debugBail("close_before_type", line, i) return "" case ',': i++ continue } if line[i] != '"' { + debugBail("non_quote_at_key_start", line, i) return "" } keyStart := i + 1 keyEnd, ok := scanJSONString(line, i) if !ok { + debugBail("scanstring_key_failed", line, i) return "" } i = keyEnd i = skipJSONSpace(line, i) if i >= n || line[i] != ':' { + debugBail("missing_colon_after_key", line, i) return "" } i++ i = skipJSONSpace(line, i) if i >= n { + debugBail("eof_after_colon", line, i) return "" } isType := keyEnd-1-keyStart == 4 && @@ -222,22 +237,35 @@ func protoType(line []byte) string { line[keyStart+2] == 'p' && line[keyStart+3] == 'e' if isType { if line[i] != '"' { + debugBail("type_value_not_string", line, i) return "" } valStart := i + 1 valEnd, ok := 
scanJSONString(line, i) if !ok { + debugBail("scanstring_value_failed", line, i) return "" } return string(line[valStart : valEnd-1]) } i, ok = skipJSONValue(line, i) if !ok { + debugBail("skipvalue_failed", line, i) return "" } } } +func debugBail(where string, line []byte, i int) { + preview := line + if len(preview) > 96 { + preview = preview[:96] + } + println("[fabric-debug] protoType bail at", where, + "i=", strconvx.Itoa(i), "len=", strconvx.Itoa(len(line)), + "head=", string(preview)) +} + func skipJSONSpace(line []byte, i int) int { for i < len(line) { switch line[i] { diff --git a/services/fabric/session.go b/services/fabric/session.go index 4115b63..d5b83dc 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -228,7 +228,7 @@ func (s *session) run(ctx context.Context) { defer exportTick.Stop() s.publishLinkState("", "") - s.log("run start") + s.log("run start build_tag=protoTypeScanV3") for { select { From 03e730f2c4a5e148d482848023b59cb1c677378d Mon Sep 17 00:00:00 2001 From: cpunt Date: Tue, 28 Apr 2026 13:12:23 +0000 Subject: [PATCH 62/65] reactor: keep fabric on uart1 (match proto_1 hardware wiring) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit W7 (fc3b62d) moved fabric from uart1 to uart0 on the MCU. On the proto_1 hardware the CM5 link is physically wired to GP4/GP5 (uart1) — not GP0/GP1 (uart0) — so the post-W7 image was listening on the wrong pins and seeing zero RX. Earlier hardware logs that showed "malformed frame dropped" were the pre-W7 image still booting from slot A (because every flash landed on slot B at the same `--major 10` and lost the version tie-break). When `--major 11` finally let slot B win, fabric moved to uart0 and the harness wires were left behind. 
Until the harness moves, the MCU side has to mirror the wires: - fabric stays on uart1 - the legacy CM5 telemetry JSON rip-out from W7 is preserved - the optional debug log mirror moves to uart0 - doc plan W7 wording updated to reflect this hardware reality; CM5-side `bigbox-v1-cm-2.json` continues to bind its own end as `uart-0` (`/dev/ttyAMA0`) — the labels on the two sides are independent. A future hardware revision can swap the wires and undo this; the wire schema is unaffected. Plan/doc: docs/firmware-alignment-protocol.md sections describing the UART role swap and the W7 acceptance updated to match. --- services/reactor/debug_uart_dev.go | 14 +++++++------- services/reactor/reactor.go | 22 ++++++++++++---------- 2 files changed, 19 insertions(+), 17 deletions(-) diff --git a/services/reactor/debug_uart_dev.go b/services/reactor/debug_uart_dev.go index 484f5b2..ba4377a 100644 --- a/services/reactor/debug_uart_dev.go +++ b/services/reactor/debug_uart_dev.go @@ -10,21 +10,21 @@ import ( "devicecode-go/x/shmring" ) -// debugUARTLog opens uart1 as a log mirror and routes log.Println output +// debugUARTLog opens uart0 as a log mirror and routes log.Println output // through it. Enabled with `-tags debug_uart`. The shmring write path // inside utilities/Logger.logWrite is non-blocking and drops bytes on a // full ring; that drop policy is the rate-limit for this debug stream. // -// debug_uart MUST NOT be set in release builds — fabric on uart0 -// (W7 acceptance) is the only allowed CM5-facing traffic, and any -// uart1 mirror is for development/bring-up only. +// debug_uart MUST NOT be set in release builds — fabric (uart1 on the +// proto_1 hardware) is the only allowed CM5-facing traffic; the uart0 +// log mirror is for development/bring-up only.
type debugUARTLog struct { subOpened *bus.Subscription subClosed *bus.Subscription retryAt time.Time } -const debugUARTLogID = "uart1" +const debugUARTLogID = "uart0" func (d *debugUARTLog) init(uiConn *bus.Connection) { d.subOpened = uiConn.Subscribe(tSessOpened(debugUARTLogID)) @@ -49,13 +49,13 @@ func (d *debugUARTLog) closedChan() <-chan *bus.Message { func (d *debugUARTLog) handleOpened(m *bus.Message) { if ev, ok := m.Payload.(types.SerialSessionOpened); ok { log.SetUART1(shmring.Get(shmring.Handle(ev.TXHandle))) - log.Println("[uart1] log session opened") + log.Println("[uart0] log session opened") } } func (d *debugUARTLog) handleClosed(uiConn *bus.Connection) { log.SetUART1(nil) - log.Println("[uart1] log session closed") + log.Println("[uart0] log session closed") if time.Now().After(d.retryAt) { uiConn.Publish(uiConn.NewMessage(tSessOpen(debugUARTLogID), nil, false)) d.retryAt = time.Now().Add(2 * time.Second) diff --git a/services/reactor/reactor.go b/services/reactor/reactor.go index 280a6d1..72c7263 100644 --- a/services/reactor/reactor.go +++ b/services/reactor/reactor.go @@ -407,14 +407,16 @@ func (r *Reactor) Run(ctx context.Context) { stSub := r.uiConn.Subscribe(stTopic) evSub := r.uiConn.Subscribe(evTopic) - // UART sessions — fabric on uart0; uart1 is debug-only and gated by - // the `debug_uart` build tag (off in release per - // docs/firmware-alignment-protocol.md). Mirrors - // devicecode-lua@2c88090 `bigbox-v1-cm-2.json`, where the CM5-facing - // fabric link binds to uart0. The legacy CM5 telemetry-over-JSON path - // on uart0 has been removed; retained-state publishers in - // fabric-update will replace it. - const uartFabric = "uart0" + // UART sessions — fabric on uart1 (where the CM5 link physically + // terminates on the proto_1 hardware), uart0 reserved for the + // optional debug log mirror. 
The plan originally aspired to put + // fabric on uart0 to match `bigbox-v1-cm-2.json`'s CM5-side label + // (`uart-0` → `/dev/ttyAMA0`); on the MCU side the uart0/uart1 + // labels are independent of the CM5's labels — they're a function + // of which physical pins the harness wires connect to. The + // legacy CM5 telemetry-over-JSON path is gone either way; retained- + // state publishers in fabric-update will replace it. + const uartFabric = "uart1" subSessOpenFabric := r.uiConn.Subscribe(tSessOpened(uartFabric)) subSessClosedFabric := r.uiConn.Subscribe(tSessClosed(uartFabric)) r.uiConn.Publish(r.uiConn.NewMessage(tSessOpen(uartFabric), nil, false)) @@ -470,7 +472,7 @@ func (r *Reactor) Run(ctx context.Context) { defer close(done) fabric.Run(fabricCtx, tr, fabricConn, "mcu-1", "cm5", fabric.DefaultLinkConfig()) }() - log.Println("[uart0] fabric session opened") + log.Println("[uart1] fabric session opened") } case m := <-dbgLog.openedChan(): dbgLog.handleOpened(m) @@ -483,7 +485,7 @@ func (r *Reactor) Run(ctx context.Context) { stopFabricSession() fabricSessionOpen = false nextFabricWaitLog = time.Now() - log.Println("[uart0] fabric session closed") + log.Println("[uart1] fabric session closed") if time.Now().After(retryFabricAt) { r.uiConn.Publish(r.uiConn.NewMessage(tSessOpen(uartFabric), nil, false)) retryFabricAt = time.Now().Add(2 * time.Second) From 55a5bf7bffd65650329485c27df165f22c6d9b1d Mon Sep 17 00:00:00 2001 From: cpunt Date: Tue, 28 Apr 2026 13:19:45 +0000 Subject: [PATCH 63/65] fabric: drop hardware-bring-up debug instrumentation Hardware handshake confirmed working in practice (hello rx / hello_ack tx / exports enabled / ping rx + pong tx all flowing on both sides), so the diagnostic prints from f8d02ba are no longer needed. 
Removed: - debugBail() and per-bail println at every protoType failure path - the strconvx import in protocol.go (no longer used here) - the build_tag=protoTypeScanV3 suffix on the session "run start" log protoType's depth-aware logic is unchanged; this is purely log-noise removal. --- services/fabric/protocol.go | 30 +----------------------------- services/fabric/session.go | 2 +- 2 files changed, 2 insertions(+), 30 deletions(-) diff --git a/services/fabric/protocol.go b/services/fabric/protocol.go index 6fd76f5..023b0ec 100644 --- a/services/fabric/protocol.go +++ b/services/fabric/protocol.go @@ -1,10 +1,6 @@ package fabric -import ( - "encoding/json" - - "devicecode-go/x/strconvx" -) +import "encoding/json" // ---- Wire message type identifiers ---- // @@ -184,52 +180,41 @@ func marshal(v any) []byte { // e.g. for `{"payload":{"type":"x"},"type":"pub"}` the result is "pub". // Returns "" if the line isn't a JSON object, the top-level "type" key // is missing, or its value isn't a string. -// -// DEBUG: every bail path emits a one-line `protoType bail` print so we -// can see on hardware exactly where the scanner gives up. This noise -// will be removed once the hardware bring-up is complete. 
func protoType(line []byte) string { n := len(line) i := skipJSONSpace(line, 0) if i >= n || line[i] != '{' { - debugBail("no_opening_brace", line, i) return "" } i++ for { i = skipJSONSpace(line, i) if i >= n { - debugBail("eof_after_brace", line, i) return "" } switch line[i] { case '}': - debugBail("close_before_type", line, i) return "" case ',': i++ continue } if line[i] != '"' { - debugBail("non_quote_at_key_start", line, i) return "" } keyStart := i + 1 keyEnd, ok := scanJSONString(line, i) if !ok { - debugBail("scanstring_key_failed", line, i) return "" } i = keyEnd i = skipJSONSpace(line, i) if i >= n || line[i] != ':' { - debugBail("missing_colon_after_key", line, i) return "" } i++ i = skipJSONSpace(line, i) if i >= n { - debugBail("eof_after_colon", line, i) return "" } isType := keyEnd-1-keyStart == 4 && @@ -237,35 +222,22 @@ func protoType(line []byte) string { line[keyStart+2] == 'p' && line[keyStart+3] == 'e' if isType { if line[i] != '"' { - debugBail("type_value_not_string", line, i) return "" } valStart := i + 1 valEnd, ok := scanJSONString(line, i) if !ok { - debugBail("scanstring_value_failed", line, i) return "" } return string(line[valStart : valEnd-1]) } i, ok = skipJSONValue(line, i) if !ok { - debugBail("skipvalue_failed", line, i) return "" } } } -func debugBail(where string, line []byte, i int) { - preview := line - if len(preview) > 96 { - preview = preview[:96] - } - println("[fabric-debug] protoType bail at", where, - "i=", strconvx.Itoa(i), "len=", strconvx.Itoa(len(line)), - "head=", string(preview)) -} - func skipJSONSpace(line []byte, i int) int { for i < len(line) { switch line[i] { diff --git a/services/fabric/session.go b/services/fabric/session.go index d5b83dc..4115b63 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -228,7 +228,7 @@ func (s *session) run(ctx context.Context) { defer exportTick.Stop() s.publishLinkState("", "") - s.log("run start build_tag=protoTypeScanV3") + s.log("run start") for 
{ select { From d751b038dde9a914b95ba2e9b7c045f942c11871 Mon Sep 17 00:00:00 2001 From: cpunt Date: Mon, 18 May 2026 18:59:48 +0000 Subject: [PATCH 64/65] fabric: tighten protocol changes --- .devcontainer/devcontainer.json | 25 ++++++-- .gitignore | 1 - cmd/fabric-test/main.go | 84 -------------------------- services/fabric/fabric_test.go | 12 ++-- services/fabric/session.go | 14 ++--- services/fabric/transport_rw.go | 2 +- services/reactor/debug_uart_dev.go | 63 ------------------- services/reactor/debug_uart_release.go | 19 ------ services/reactor/reactor.go | 21 +------ 9 files changed, 33 insertions(+), 208 deletions(-) delete mode 100644 cmd/fabric-test/main.go delete mode 100644 services/reactor/debug_uart_dev.go delete mode 100644 services/reactor/debug_uart_release.go diff --git a/.devcontainer/devcontainer.json b/.devcontainer/devcontainer.json index 20224e9..ea3f884 100644 --- a/.devcontainer/devcontainer.json +++ b/.devcontainer/devcontainer.json @@ -1,10 +1,23 @@ +// For format details, see https://aka.ms/devcontainer.json. For config options, see the +// README at: https://github.com/devcontainers/templates/tree/main/src/ubuntu { "name": "Ubuntu", + // Or use a Dockerfile or Docker Compose file. More info: https://containers.dev/guide/dockerfile "image": "mcr.microsoft.com/devcontainers/base:noble", - "features": { - "ghcr.io/devcontainers/features/node:1": { - "version": "20" - } - }, + + // Features to add to the dev container. More info: https://containers.dev/features. + // "features": {}, + + // Use 'forwardPorts' to make a list of ports inside the container available locally. + // "forwardPorts": [], + + // Use 'postCreateCommand' to run commands after the container is created. "postCreateCommand": "sh .devcontainer/postCreateCommand.sh" -} \ No newline at end of file + + + // Configure tool-specific properties. + // "customizations": {}, + + // Uncomment to connect as root instead. More info: https://aka.ms/dev-containers-non-root. 
+ // "remoteUser": "root" +} diff --git a/.gitignore b/.gitignore index 01fc028..71e0d97 100644 --- a/.gitignore +++ b/.gitignore @@ -1,3 +1,2 @@ build/ .vscode/settings.json -.DS_Store \ No newline at end of file diff --git a/cmd/fabric-test/main.go b/cmd/fabric-test/main.go deleted file mode 100644 index 3748dc3..0000000 --- a/cmd/fabric-test/main.go +++ /dev/null @@ -1,84 +0,0 @@ -//go:build tinygo && rp2350 - -// fabric-test: exercises the fabric protocol over USB serial with real HAL sensors. -// -// tinygo build -target=pico2 -tags "pico_bb_proto_1" -stack-size=8KB -o build/fabric-test.elf ./cmd/fabric-test - -package main - -import ( - "context" - "machine" - "time" - - "devicecode-go/bus" - "devicecode-go/services/fabric" - "devicecode-go/services/hal" - "devicecode-go/types" -) - -const halTimeout = 5 * time.Second - -var halReadiness = bus.T("hal", "state") - -func main() { - time.Sleep(3 * time.Second) - - ctx := context.Background() - b := bus.NewBus(3, "+", "#") - halConn := b.NewConnection("hal") - - go hal.Run(ctx, halConn) - if !waitHALReady(ctx, halConn, halTimeout) { - return - } - - conn := b.NewConnection("fabric") - tr := fabric.NewRWTransport(&serialRW{}, &serialRW{}) - fabric.Run(ctx, tr, conn, "mcu-1", "cm5", fabric.DefaultLinkConfig()) -} - -func waitHALReady(ctx context.Context, c *bus.Connection, d time.Duration) bool { - sub := c.Subscribe(halReadiness) - defer c.Unsubscribe(sub) - ctx2, cancel := context.WithTimeout(ctx, d) - defer cancel() - for { - select { - case m := <-sub.Channel(): - if st, ok := m.Payload.(types.HALState); ok && st.Level == "ready" { - return true - } - case <-ctx2.Done(): - return false - } - } -} - -type serialRW struct{} - -func (s *serialRW) Read(p []byte) (int, error) { - if len(p) == 0 { - return 0, nil - } - for machine.Serial.Buffered() == 0 { - time.Sleep(time.Millisecond) - } - n := 0 - for n < len(p) && machine.Serial.Buffered() > 0 { - b, err := machine.Serial.ReadByte() - if err != nil { - if n > 0 { 
- return n, nil - } - return 0, err - } - p[n] = b - n++ - } - return n, nil -} - -func (s *serialRW) Write(p []byte) (int, error) { - return machine.Serial.Write(p) -} diff --git a/services/fabric/fabric_test.go b/services/fabric/fabric_test.go index 32e2354..269d04d 100644 --- a/services/fabric/fabric_test.go +++ b/services/fabric/fabric_test.go @@ -148,8 +148,7 @@ func TestWireTypeIgnoresNestedTypeKeys(t *testing.T) { // protoType must return the top-level discriminator, not a nested // payload.type / meta.type key. The previous heuristic-only scan // would mis-route e.g. a `pub` with a payload that happened to - // contain its own "type" field. Examples below exercise the cases - // Codex flagged on the post-flash review. + // contain its own "type" field. for _, tc := range []struct { line []byte want string @@ -826,7 +825,10 @@ func TestCancelClosesCleanly(t *testing.T) { b := newBus() ctx, cancel := context.WithCancel(context.Background()) done := make(chan struct{}) - go func() { Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()); close(done) }() + go func() { + Run(ctx, mcu, b.NewConnection("fabric"), "mcu-1", "cm5-local", DefaultLinkConfig()) + close(done) + }() bringUp(t, cm5) cancel() select { @@ -1019,7 +1021,7 @@ func TestPubImport(t *testing.T) { sub := reader.Subscribe(bus.T("config", "hal")) sendMsg(t, cm5, protoPub{ - Type: "pub", + Type: "pub", Topic: []string{"config", "device"}, Payload: json.RawMessage(`{"devices":[],"pollers":[]}`), Retain: true, @@ -1340,7 +1342,7 @@ func TestDumpCallReturnsConfigState(t *testing.T) { // Send config first so the session has state. 
sendMsg(t, cm5, protoPub{ - Type: "pub", + Type: "pub", Topic: []string{"config", "device"}, Payload: json.RawMessage(`{"devices":[],"pollers":[]}`), Retain: true, diff --git a/services/fabric/session.go b/services/fabric/session.go index 4115b63..91a4908 100644 --- a/services/fabric/session.go +++ b/services/fabric/session.go @@ -42,12 +42,6 @@ const ( callTimeoutDef = 5 * time.Second waitLogEvery = 2 * time.Second exportStartHoldoff = 1 * time.Second - // postHelloAckSettle gives the serial reactor goroutine a chance - // to drain the hello_ack bytes from the TX shmring before - // promoteLink publishes bus state and triggers export work. - // TinyGo's cooperative scheduler does not preempt, so without - // this yield the reactor may not run until the next tick. - postHelloAckSettle = 10 * time.Millisecond // exportMaxPerTick caps the total export messages sent per drain // cycle across all subscriptions, keeping UART throughput within // the 115200-baud link capacity. @@ -576,7 +570,6 @@ func (s *session) onHello(msg *protoHello) { return } s.log("hello_ack tx") - time.Sleep(postHelloAckSettle) s.promoteLink(reason) } @@ -1032,9 +1025,10 @@ func (s *session) drainOutbound(now time.Time) { // writer.go) so the lane intent is explicit at the call site. 
// // Lane assignment per protocol.lua's FRAME_CLASS: -// control: hello, hello_ack, ping, pong, xfer_{begin,ready,need,commit,done,abort} -// rpc: pub, unretain, call, reply -// bulk: xfer_chunk (MCU does not originate; bulk lane unused on MCU) +// +// control: hello, hello_ack, ping, pong, xfer_{begin,ready,need,commit,done,abort} +// rpc: pub, unretain, call, reply +// bulk: xfer_chunk (MCU does not originate; bulk lane unused on MCU) func (s *session) sendControl(data []byte) bool { return s.enqueueFrame(laneControl, data) } func (s *session) sendRPC(data []byte) bool { return s.enqueueFrame(laneRPC, data) } diff --git a/services/fabric/transport_rw.go b/services/fabric/transport_rw.go index 252b3a5..4c0a640 100644 --- a/services/fabric/transport_rw.go +++ b/services/fabric/transport_rw.go @@ -7,7 +7,7 @@ import ( "sync" ) -// Used for USB serial (fabric-test) and host-side unit tests. +// Used by host-side unit tests and any stream-backed Fabric transport. // maxLineLen caps a single fabric frame (line-delimited JSON) end-to-end. It // must clear the worst-case encoded transfer chunk: release chunk_size = 2048 diff --git a/services/reactor/debug_uart_dev.go b/services/reactor/debug_uart_dev.go deleted file mode 100644 index ba4377a..0000000 --- a/services/reactor/debug_uart_dev.go +++ /dev/null @@ -1,63 +0,0 @@ -//go:build debug_uart && !qa_reactor - -package reactor - -import ( - "time" - - "devicecode-go/bus" - "devicecode-go/types" - "devicecode-go/x/shmring" -) - -// debugUARTLog opens uart0 as a log mirror and routes log.Println output -// through it. Enabled with `-tags debug_uart`. The shmring write path -// inside utilities/Logger.logWrite is non-blocking and drops bytes on a -// full ring; that drop policy is the rate-limit for this debug stream. -// -// debug_uart MUST NOT be set in release builds — fabric (uart1 on the -// proto_1 hardware) is the only allowed CM5-facing traffic; the uart0 -// log mirror is for development/bring-up only. 
-type debugUARTLog struct { - subOpened *bus.Subscription - subClosed *bus.Subscription - retryAt time.Time -} - -const debugUARTLogID = "uart0" - -func (d *debugUARTLog) init(uiConn *bus.Connection) { - d.subOpened = uiConn.Subscribe(tSessOpened(debugUARTLogID)) - d.subClosed = uiConn.Subscribe(tSessClosed(debugUARTLogID)) - uiConn.Publish(uiConn.NewMessage(tSessOpen(debugUARTLogID), nil, false)) -} - -func (d *debugUARTLog) openedChan() <-chan *bus.Message { - if d.subOpened == nil { - return nil - } - return d.subOpened.Channel() -} - -func (d *debugUARTLog) closedChan() <-chan *bus.Message { - if d.subClosed == nil { - return nil - } - return d.subClosed.Channel() -} - -func (d *debugUARTLog) handleOpened(m *bus.Message) { - if ev, ok := m.Payload.(types.SerialSessionOpened); ok { - log.SetUART1(shmring.Get(shmring.Handle(ev.TXHandle))) - log.Println("[uart0] log session opened") - } -} - -func (d *debugUARTLog) handleClosed(uiConn *bus.Connection) { - log.SetUART1(nil) - log.Println("[uart0] log session closed") - if time.Now().After(d.retryAt) { - uiConn.Publish(uiConn.NewMessage(tSessOpen(debugUARTLogID), nil, false)) - d.retryAt = time.Now().Add(2 * time.Second) - } -} diff --git a/services/reactor/debug_uart_release.go b/services/reactor/debug_uart_release.go deleted file mode 100644 index 7379965..0000000 --- a/services/reactor/debug_uart_release.go +++ /dev/null @@ -1,19 +0,0 @@ -//go:build !debug_uart && !qa_reactor - -package reactor - -import "devicecode-go/bus" - -// debugUARTLog is a no-op in release builds: the uart1 log mirror is -// disabled by default per docs/firmware-alignment-protocol.md (off in -// release, uart1-only in dev, rate-limited, never on uart0). Build with -// `-tags debug_uart` to enable; see debug_uart_dev.go. 
-type debugUARTLog struct{} - -func (d *debugUARTLog) init(uiConn *bus.Connection) { _ = uiConn } -func (d *debugUARTLog) openedChan() <-chan *bus.Message { return nil } -func (d *debugUARTLog) closedChan() <-chan *bus.Message { return nil } -func (d *debugUARTLog) handleOpened(m *bus.Message) { _ = m } -func (d *debugUARTLog) handleClosed(uiConn *bus.Connection) { - _ = uiConn -} diff --git a/services/reactor/reactor.go b/services/reactor/reactor.go index 72c7263..9429b2c 100644 --- a/services/reactor/reactor.go +++ b/services/reactor/reactor.go @@ -398,7 +398,7 @@ func (r *Reactor) emitMemSnapshot() { } func (r *Reactor) Run(ctx context.Context) { -// Subscriptions (env + power) + // Subscriptions (env + power) log.Println("[main] subscribing env + power …") tempSub := r.uiConn.Subscribe(tTempValue) tempDieSub := r.uiConn.Subscribe(tDieTempValue) @@ -407,23 +407,12 @@ func (r *Reactor) Run(ctx context.Context) { stSub := r.uiConn.Subscribe(stTopic) evSub := r.uiConn.Subscribe(evTopic) - // UART sessions — fabric on uart1 (where the CM5 link physically - // terminates on the proto_1 hardware), uart0 reserved for the - // optional debug log mirror. The plan originally aspired to put - // fabric on uart0 to match `bigbox-v1-cm-2.json`'s CM5-side label - // (`uart-0` → `/dev/ttyAMA0`); on the MCU side the uart0/uart1 - // labels are independent of the CM5's labels — they're a function - // of which physical pins the harness wires connect to. The - // legacy CM5 telemetry-over-JSON path is gone either way; retained- - // state publishers in fabric-update will replace it. + // UART session for the CM5 Fabric link on proto_1 hardware. 
const uartFabric = "uart1" subSessOpenFabric := r.uiConn.Subscribe(tSessOpened(uartFabric)) subSessClosedFabric := r.uiConn.Subscribe(tSessClosed(uartFabric)) r.uiConn.Publish(r.uiConn.NewMessage(tSessOpen(uartFabric), nil, false)) - var dbgLog debugUARTLog - dbgLog.init(r.uiConn) - // Retry back-off guards var retryFabricAt time.Time @@ -445,7 +434,6 @@ func (r *Reactor) Run(ctx context.Context) { } } - // Supervisory ticker ticker := time.NewTicker(TICK) defer ticker.Stop() @@ -474,8 +462,6 @@ func (r *Reactor) Run(ctx context.Context) { }() log.Println("[uart1] fabric session opened") } - case m := <-dbgLog.openedChan(): - dbgLog.handleOpened(m) case <-subSessClosedFabric.Channel(): // Ignore stale close events — the open handler already tears down // the previous session before starting a new one. @@ -490,9 +476,6 @@ func (r *Reactor) Run(ctx context.Context) { r.uiConn.Publish(r.uiConn.NewMessage(tSessOpen(uartFabric), nil, false)) retryFabricAt = time.Now().Add(2 * time.Second) } - case <-dbgLog.closedChan(): - dbgLog.handleClosed(r.uiConn) - // ---- Env prints ---- case m := <-tempSub.Channel(): if v, ok := m.Payload.(types.TemperatureValue); ok { From c0dc2c3db923219710cce475c8744dd0f148e3f5 Mon Sep 17 00:00:00 2001 From: cpunt Date: Tue, 19 May 2026 09:06:32 +0000 Subject: [PATCH 65/65] fabric: remove unused protocol helpers --- services/fabric/fabric_test.go | 6 +- services/fabric/transfer_sink_rp2350.go | 6 +- .../fabric/transfer_sink_rp2350_unsafe.go | 148 ------------------ services/fabric/transfer_test.go | 2 +- services/fabric/transport_limits.go | 11 ++ .../{transport_rw.go => transport_rw_test.go} | 33 +--- x/xxhash/xxhash.go | 32 +--- x/xxhash/xxhash_test.go | 62 +++----- 8 files changed, 47 insertions(+), 253 deletions(-) delete mode 100644 services/fabric/transfer_sink_rp2350_unsafe.go create mode 100644 services/fabric/transport_limits.go rename services/fabric/{transport_rw.go => transport_rw_test.go} (52%) diff --git 
a/services/fabric/fabric_test.go b/services/fabric/fabric_test.go index 269d04d..361495c 100644 --- a/services/fabric/fabric_test.go +++ b/services/fabric/fabric_test.go @@ -15,10 +15,10 @@ import ( "devicecode-go/x/shmring" ) -func pipePair() (*RWTransport, *RWTransport) { +func pipePair() (*rwTransport, *rwTransport) { r1, w1 := io.Pipe() r2, w2 := io.Pipe() - return NewRWTransport(r2, w1), NewRWTransport(r1, w2) + return newRWTransport(r2, w1), newRWTransport(r1, w2) } func newBus() *bus.Bus { return bus.NewBus(3, "+", "#") } @@ -197,7 +197,7 @@ func TestTransportRoundTrip(t *testing.T) { func TestOversizeLineRecovery(t *testing.T) { big := `{"type":"ping","ts":0,"x":"` + strings.Repeat("x", maxLineLen+100) + `"}` input := big + "\n" + `{"type":"ping","ts":3}` + "\n" - tr := NewRWTransport(strings.NewReader(input), io.Discard) + tr := newRWTransport(strings.NewReader(input), io.Discard) _, err := tr.ReadLine() if !errors.Is(err, ErrLineTooLong) { t.Fatalf("expected ErrLineTooLong, got %v", err) diff --git a/services/fabric/transfer_sink_rp2350.go b/services/fabric/transfer_sink_rp2350.go index 285870a..3360aa6 100644 --- a/services/fabric/transfer_sink_rp2350.go +++ b/services/fabric/transfer_sink_rp2350.go @@ -1,4 +1,4 @@ -//go:build tinygo && rp2350 && !flash_unsafe +//go:build tinygo && rp2350 // Default RP2350 transfer sink for the fabric-protocol baseline. Rejects all // transfers at xfer_begin: signed-image verification and staged flash writes @@ -6,10 +6,6 @@ // `raw/member/mcu/cap/updater/main/rpc/receive` and `pico2-a-b/imagev1/`. Until // that path lands, the safe default is to refuse incoming transfers rather // than flash unverified bytes directly into the inactive slot. -// -// To re-enable direct (unverified) abupdate flashing for development or -// hardware bring-up, build with `-tags=flash_unsafe`. See -// transfer_sink_rp2350_unsafe.go. 
package fabric diff --git a/services/fabric/transfer_sink_rp2350_unsafe.go b/services/fabric/transfer_sink_rp2350_unsafe.go deleted file mode 100644 index add267b..0000000 --- a/services/fabric/transfer_sink_rp2350_unsafe.go +++ /dev/null @@ -1,148 +0,0 @@ -//go:build tinygo && rp2350 && flash_unsafe - -// Direct abupdate flashing on incoming transfers. Gated behind the -// `flash_unsafe` build tag because this path flashes raw bytes without -// signed-image verification (signature verify and staging belong in -// fabric-update's receiver). Use only for development or hardware bring-up -// where unsigned images are acceptable. The default fabric-protocol build -// rejects transfers at xfer_begin — see transfer_sink_rp2350.go. - -package fabric - -import ( - "fmt" - "time" - - "pico2-a-b/abupdate" -) - -const stageSize = 4096 - -type transferSinkImpl struct { - updater *abupdate.Updater - - // Stage verified transfer bytes in protocol code so flash writes happen - // in larger batches instead of directly on every UART chunk. - stage [stageSize]byte - stageUsed uint32 - accepted uint32 - chunksSeen uint32 -} - -// beginTransfer creates an MCU-side sink for incoming firmware bytes. In the -// fabric-protocol baseline this accepts any well-formed transfer; image-format -// validation (kind/format/target/signature) is the receiver's job in -// fabric-update. 
-func beginTransfer(meta transferMeta) (transferSink, error) { - var updater abupdate.Updater - if rc := updater.Init(); rc != 0 { - return nil, fmt.Errorf("updater_init:%d", rc) - } - if rc := updater.BeginUpdate(meta.Size); rc != 0 { - return nil, fmt.Errorf("begin_update:%d", rc) - } - - return &transferSinkImpl{updater: &updater}, nil -} - -func (s *transferSinkImpl) flushStage(force bool) error { - if s.stageUsed == 0 { - return nil - } - - before := s.updater.BytesWritten() - expected := s.accepted - s.stageUsed - if before != expected { - return fmt.Errorf("unexpected_offset:%d", before) - } - - flushed := s.stageUsed - start := time.Now() - if rc := s.updater.WriteChunk(s.stage[:flushed]); rc != 0 { - return fmt.Errorf("write_chunk:%d", rc) - } - after := s.updater.BytesWritten() - s.stageUsed = 0 - - dt := time.Since(start) - chunk := s.chunksSeen - if force || chunk == 0 || (chunk%32) == 31 || dt >= 2*time.Millisecond { - println( - "[fabric]", "xfer_sink_flush", - "chunk", u32s(chunk), - "stage_n", u32s(flushed), - "bytes_before", u32s(before), - "bytes_after", u32s(after), - "dt_us", u32s(uint32(dt/time.Microsecond)), - ) - } - - return nil -} - -func (s *transferSinkImpl) WriteChunk(off uint32, data []byte) error { - if s.accepted != off { - return fmt.Errorf("unexpected_offset:%d", s.accepted) - } - - remaining := data - for len(remaining) > 0 { - if s.stageUsed == uint32(len(s.stage)) { - if err := s.flushStage(false); err != nil { - return err - } - } - - n := copy(s.stage[s.stageUsed:], remaining) - s.stageUsed += uint32(n) - s.accepted += uint32(n) - remaining = remaining[n:] - - if s.stageUsed == uint32(len(s.stage)) { - if err := s.flushStage(false); err != nil { - return err - } - } - } - - chunk := s.chunksSeen - if chunk == 0 || (chunk%32) == 31 { - println( - "[fabric]", "xfer_sink_stage", - "chunk", u32s(chunk), - "off", u32s(off), - "n", u32s(uint32(len(data))), - "stage_used", u32s(s.stageUsed), - "accepted", u32s(s.accepted), - ) - } - 
s.chunksSeen++ - - return nil -} - -func (s *transferSinkImpl) Commit() (transferInfo, error) { - if err := s.flushStage(true); err != nil { - return transferInfo{}, err - } - if rc := s.updater.FlushFinal(); rc != 0 { - return transferInfo{}, fmt.Errorf("flush_final:%d", rc) - } - return transferInfo{ - BytesWritten: s.updater.BytesWritten(), - SlotXIPAddr: s.updater.SlotStorageAddr(), - }, nil -} - -func (s *transferSinkImpl) Apply() error { - if rc := s.updater.RebootIntoSlot(); rc != 0 { - return fmt.Errorf("reboot:%d", rc) - } - return nil -} - -func (s *transferSinkImpl) Abort(reason string) error { - _ = reason - s.stageUsed = 0 - return nil -} diff --git a/services/fabric/transfer_test.go b/services/fabric/transfer_test.go index 4e74692..7837980 100644 --- a/services/fabric/transfer_test.go +++ b/services/fabric/transfer_test.go @@ -73,7 +73,7 @@ func rawURL(data []byte) string { // xxhashStr is the wire-format checksum: lower-case hex, 8 chars, no algorithm // field. Mirrors the Lua reference's M.digest_hex. func xxhashStr(data []byte) string { - return xxhash.SumHex(data) + return xxhashHex(xxhash.Sum32(data, 0)) } func TestTransferBeginPreservesMeta(t *testing.T) { diff --git a/services/fabric/transport_limits.go b/services/fabric/transport_limits.go new file mode 100644 index 0000000..7f5afec --- /dev/null +++ b/services/fabric/transport_limits.go @@ -0,0 +1,11 @@ +package fabric + +import "fmt" + +// maxLineLen caps a single fabric frame (line-delimited JSON) end-to-end. +// It must clear the release transfer chunk: 1024 raw bytes becomes about +// 1366 base64url chars, plus JSON envelope and newline. 4096 leaves margin +// while keeping malformed lines bounded. 
+const maxLineLen = 4096 + +var ErrLineTooLong = fmt.Errorf("line exceeds %d bytes", maxLineLen) diff --git a/services/fabric/transport_rw.go b/services/fabric/transport_rw_test.go similarity index 52% rename from services/fabric/transport_rw.go rename to services/fabric/transport_rw_test.go index 4c0a640..bcbeb1e 100644 --- a/services/fabric/transport_rw.go +++ b/services/fabric/transport_rw_test.go @@ -2,33 +2,19 @@ package fabric import ( "bufio" - "fmt" "io" "sync" ) -// Used by host-side unit tests and any stream-backed Fabric transport. - -// maxLineLen caps a single fabric frame (line-delimited JSON) end-to-end. It -// must clear the worst-case encoded transfer chunk: release chunk_size = 2048 -// raw → ~2731 chars base64url-encoded + ~150-byte JSON envelope + newline -// ≈ 2900 bytes. 4096 is the tightest round power-of-2 above that with ~1.1 KB -// headroom. See devicecode-lua/src/services/fabric/protocol.lua at -// update-migration tip for the canonical encoding. -const maxLineLen = 4096 - -var ErrLineTooLong = fmt.Errorf("line exceeds %d bytes", maxLineLen) - -// RWTransport implements Transport over an io.Reader + io.Writer. 
-type RWTransport struct { +type rwTransport struct { r *bufio.Reader mu sync.Mutex w *bufio.Writer closers []io.Closer } -func NewRWTransport(r io.Reader, w io.Writer) *RWTransport { - t := &RWTransport{ +func newRWTransport(r io.Reader, w io.Writer) *rwTransport { + t := &rwTransport{ r: bufio.NewReaderSize(r, maxLineLen), w: bufio.NewWriter(w), } @@ -45,7 +31,7 @@ func NewRWTransport(r io.Reader, w io.Writer) *RWTransport { return t } -func (t *RWTransport) ReadLine() ([]byte, error) { +func (t *rwTransport) ReadLine() ([]byte, error) { var buf []byte for { seg, more, err := t.r.ReadLine() @@ -69,11 +55,10 @@ func (t *RWTransport) ReadLine() ([]byte, error) { if len(buf) > maxLineLen { return nil, ErrLineTooLong } - traceLine("rx", buf) return buf, nil } -func (t *RWTransport) WriteLine(data []byte) error { +func (t *rwTransport) WriteLine(data []byte) error { if len(data) > maxLineLen { return ErrLineTooLong } @@ -85,14 +70,10 @@ func (t *RWTransport) WriteLine(data []byte) error { if err := t.w.WriteByte('\n'); err != nil { return err } - if err := t.w.Flush(); err != nil { - return err - } - traceLine("tx", data) - return nil + return t.w.Flush() } -func (t *RWTransport) Close() error { +func (t *rwTransport) Close() error { var first error for _, c := range t.closers { if err := c.Close(); err != nil && first == nil { diff --git a/x/xxhash/xxhash.go b/x/xxhash/xxhash.go index 7b9fc63..9e319c1 100644 --- a/x/xxhash/xxhash.go +++ b/x/xxhash/xxhash.go @@ -21,12 +21,12 @@ const ( // Hasher is a streaming xxHash32 state. type Hasher struct { - seed uint32 - totalLen uint32 + seed uint32 + totalLen uint32 v1, v2, v3, v4 uint32 - mem [16]byte - memN uint8 // 0..15 - large bool // true once a 16-byte block has been absorbed + mem [16]byte + memN uint8 // 0..15 + large bool // true once a 16-byte block has been absorbed } // New returns a streaming xxHash32 hasher seeded with seed. 
@@ -36,10 +36,6 @@ func New(seed uint32) *Hasher { return h } -// Reset re-initialises the hasher with seed 0. To re-seed with a different -// value, allocate a new Hasher with New. -func (h *Hasher) Reset() { h.reset(0) } - func (h *Hasher) reset(seed uint32) { h.seed = seed h.totalLen = 0 @@ -137,13 +133,6 @@ func Sum32(p []byte, seed uint32) uint32 { return h.Sum32() } -// SumHex returns the xxHash32 of p (seed 0) as 8 lower-case hex characters, -// matching the wire format used by the Lua reference's M.digest_hex. -func SumHex(p []byte) string { return hex8(Sum32(p, 0)) } - -// VerifyHex compares SumHex(p) to expected for case-sensitive equality. -func VerifyHex(p []byte, expected string) bool { return SumHex(p) == expected } - func round(acc, lane uint32) uint32 { acc += lane * prime32_2 acc = bits.RotateLeft32(acc, 13) @@ -154,14 +143,3 @@ func round(acc, lane uint32) uint32 { func leU32(b []byte) uint32 { return uint32(b[0]) | uint32(b[1])<<8 | uint32(b[2])<<16 | uint32(b[3])<<24 } - -const hexdigits = "0123456789abcdef" - -func hex8(v uint32) string { - var buf [8]byte - for i := 7; i >= 0; i-- { - buf[i] = hexdigits[v&0xf] - v >>= 4 - } - return string(buf[:]) -} diff --git a/x/xxhash/xxhash_test.go b/x/xxhash/xxhash_test.go index 00fef90..a005b59 100644 --- a/x/xxhash/xxhash_test.go +++ b/x/xxhash/xxhash_test.go @@ -19,33 +19,11 @@ var refVectors = []struct { {"123456789", "123456789", "937bad67"}, } -func TestSumHex_KnownAnswer(t *testing.T) { - for _, v := range refVectors { - got := SumHex([]byte(v.input)) - if got != v.hex { - t.Errorf("SumHex(%q): got %s, want %s", v.input, got, v.hex) - } - } -} - func TestSum32_KnownAnswer(t *testing.T) { - // Sum32(_, 0) must agree with SumHex (which forces seed 0). 
for _, v := range refVectors { - want := SumHex([]byte(v.input)) - got := hex8(Sum32([]byte(v.input), 0)) - if got != want { - t.Errorf("Sum32(%q, 0): got %s, want %s", v.input, got, want) - } - } -} - -func TestVerifyHex(t *testing.T) { - for _, v := range refVectors { - if !VerifyHex([]byte(v.input), v.hex) { - t.Errorf("VerifyHex(%q, %s) returned false", v.input, v.hex) - } - if VerifyHex([]byte(v.input), "deadbeef") { - t.Errorf("VerifyHex(%q, deadbeef) returned true", v.input) + got := testHex8(Sum32([]byte(v.input), 0)) + if got != v.hex { + t.Errorf("Sum32(%q, 0): got %s, want %s", v.input, got, v.hex) } } } @@ -56,7 +34,7 @@ func TestStreaming_ByteByByte(t *testing.T) { for _, b := range []byte(v.input) { h.Write([]byte{b}) } - got := hex8(h.Sum32()) + got := testHex8(h.Sum32()) if got != v.hex { t.Errorf("byte-stream %q: got %s, want %s", v.input, got, v.hex) } @@ -68,13 +46,13 @@ func TestStreaming_OddSplits(t *testing.T) { // 17, and 31 exercise mem-buffer top-up, exact block boundary, and tail // bytes. 
in := []byte("0123456789abcdef0123456789abcdef") - want := SumHex(in) + want := testHex8(Sum32(in, 0)) for _, split := range []int{0, 1, 7, 15, 16, 17, 31, 32} { h := New(0) h.Write(in[:split]) h.Write(in[split:]) - got := hex8(h.Sum32()) + got := testHex8(h.Sum32()) if got != want { t.Errorf("split=%d: got %s, want %s", split, got, want) } @@ -87,24 +65,11 @@ func TestStreaming_EmptyWritesNoOp(t *testing.T) { h.Write([]byte{}) h.Write([]byte("abc")) h.Write([]byte{}) - if got := hex8(h.Sum32()); got != "32d153ff" { + if got := testHex8(h.Sum32()); got != "32d153ff" { t.Errorf("with empty writes interleaved: got %s, want 32d153ff", got) } } -func TestReset(t *testing.T) { - h := New(0) - h.Write([]byte("abc")) - if hex8(h.Sum32()) != "32d153ff" { - t.Fatalf("first sum mismatch") - } - h.Reset() - h.Write([]byte("abc")) - if hex8(h.Sum32()) != "32d153ff" { - t.Fatalf("post-reset sum mismatch") - } -} - func TestSeedNonZero(t *testing.T) { in := []byte("the quick brown fox jumps over the lazy dog") if Sum32(in, 0) == Sum32(in, 1) { @@ -134,12 +99,23 @@ func TestSum32ContinuesAfter(t *testing.T) { h.Write([]byte("a")) h.Sum32() h.Write([]byte("bc")) - got := hex8(h.Sum32()) + got := testHex8(h.Sum32()) if got != "32d153ff" { t.Errorf("post-Sum32 continuation: got %s, want 32d153ff", got) } } +const hexdigits = "0123456789abcdef" + +func testHex8(v uint32) string { + var buf [8]byte + for i := 7; i >= 0; i-- { + buf[i] = hexdigits[v&0xf] + v >>= 4 + } + return string(buf[:]) +} + func TestLargeBuffer(t *testing.T) { // Confirm one-shot and streaming agree on a buffer comfortably larger // than the 16-byte block size; this exercises the hot loop in Write.