From 06148f7c9cd3731ea2e0ec9ef6e24e5c4cf18a91 Mon Sep 17 00:00:00 2001 From: Corvid Agent <0xopenbytes@gmail.com> Date: Mon, 18 May 2026 15:54:53 -0700 Subject: [PATCH] feat(permanent): chunking for values > 600 bytes (book/page envelope) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit `fledge-plugin-memory` previously rejected any permanent save whose encrypted envelope exceeded ~882 bytes (the @corvidlabs/ts-algochat hard cap), and any tx note > 1024 bytes (Algorand's per-tx note limit). For larger content the user got "Permanent value too large for tx note" or "EncryptionError: Message too large". This change adds transparent multi-tx chunking on the permanent tier: - New `src/chunking.ts` — splits values into ≤600-byte UTF-8-safe chunks. UTF-8 multi-byte codepoints are never cut mid-character; the splitter walks back to the prior leading byte when a cut lands on a continuation byte (`0b10xxxxxx`). - `permanentSave` now emits N transactions for an N-chunk value. Each carries the same key + ISO-8601 `created` timestamp plus envelope fields `book` (= key today), `page` (1..N), `total` (N). Single- chunk saves stay on the legacy envelope shape so existing readers and indexers see no change. - `permanentRecall` / `permanentList` group by (key, created) inside a new `reassemble` step, require all `total` pages to be present (a save with missing pages is silently dropped, not partially reassembled), and join in page order. Tombstones cover all pages for a key without needing per-page tombstones because the latest-round-wins rule still picks the tombstone over the older multi-page write. - `permanentDelete` is unchanged — a single tombstone tx covers any number of chunks under that key. 19 unit tests: - `test/chunking.test.ts` — boundary cases (exactly N bytes, N+1 bytes), round-trip preservation including emoji + CJK + RTL, needsChunking heuristic. 
- `test/permanent-reassemble.test.ts` — legacy single-chunk pass- through, multi-page join, missing-page drop, two-save dedup, contiguous-page enforcement, mixed single+multi handling. Mutable (ARC-69 ASA) tier chunking is a follow-up — single PR is enough surface to review at once. Co-Authored-By: Claude Opus 4.7 (1M context) --- src/chunking.ts | 60 ++++++++++++++++++ src/permanent.ts | Bin 7159 -> 11310 bytes test/chunking.test.ts | 83 +++++++++++++++++++++++++ test/permanent-reassemble.test.ts | 99 ++++++++++++++++++++++++++++++ 4 files changed, 242 insertions(+) create mode 100644 src/chunking.ts create mode 100644 test/chunking.test.ts create mode 100644 test/permanent-reassemble.test.ts diff --git a/src/chunking.ts b/src/chunking.ts new file mode 100644 index 0000000..afed7ac --- /dev/null +++ b/src/chunking.ts @@ -0,0 +1,60 @@ +/** + * Chunking for memories that don't fit in a single Algorand note. + * + * Algorand caps tx notes at 1024 bytes. Subtract the JSON envelope + * (~150 bytes with book/page/total fields) and the encryption layer + * (envelope adds ~40 bytes, base64 inflates by 4/3) and you have + * ~600 bytes of plaintext per chunk that reliably fit. + * + * On save: the caller decides whether to chunk. `chunkValue` cuts at a + * fixed byte budget, backing up to a codepoint start; `joinChunks` only + * concatenates, so the caller must pass pages sorted by `page` ascending. + * + * On recall: callers collect all txs/ASAs for a given key, group by the + * save's `created` timestamp (one save = one book), require all pages + * to be present, and concatenate. + */ + +/** + * Max plaintext bytes per chunk. Conservative — leaves headroom for + * UTF-8 multi-byte expansion, envelope JSON, and the encryption + * envelope overhead in `@corvidlabs/ts-algochat`. + */ +export const MAX_CLEARTEXT_PER_CHUNK = 600; + +/** + * Split `value` into N chunks of at most `MAX_CLEARTEXT_PER_CHUNK` + * bytes each.
Operates on UTF-8 byte boundaries — if a multi-byte + * codepoint straddles the cut, the chunk boundary is pushed back to + * the prior codepoint start so we never produce invalid UTF-8. + */ +export function chunkValue(value: string): string[] { + const bytes = Buffer.from(value, "utf-8"); + if (bytes.length <= MAX_CLEARTEXT_PER_CHUNK) return [value]; + + const chunks: string[] = []; + let offset = 0; + while (offset < bytes.length) { + let end = Math.min(offset + MAX_CLEARTEXT_PER_CHUNK, bytes.length); + // Walk back if we landed in the middle of a UTF-8 continuation byte + // (0b10xxxxxx, i.e. (byte & 0xC0) === 0x80). We stop walking once + // we hit a leading byte; this caps regression at 3 bytes. + while (end < bytes.length && (bytes[end] & 0xc0) === 0x80) end--; + chunks.push(bytes.slice(offset, end).toString("utf-8")); + offset = end; + } + return chunks; +} + +/** + * Reassemble pages back into the original value. Caller is responsible + * for passing the pages in correct order (page 1..N). + */ +export function joinChunks(chunks: string[]): string { + return chunks.join(""); +} + +/** Heuristic: does this value need chunking? */ +export function needsChunking(value: string): boolean { + return Buffer.byteLength(value, "utf-8") > MAX_CLEARTEXT_PER_CHUNK; +} diff --git a/src/permanent.ts b/src/permanent.ts index 3a862618d01a08da7ac5b94bb4f5ceb49005ba4a..2b7e64c7e0cbbc3c3e7b9beb2461c1b270423a04 100644 GIT binary patch literal 11310 zcmeHNZEqXLcJ62WiV+dqyL8BvY@`*?(EE&Gw1a=XEs}{%c?duUzC1!Vdho2GQIHB zd$O15>ucj(F*~lRvNAVeQhKni%DSAEdHRd*PIbk!+OF%LZN6~>Gjr`zXKk2tYw2|9 zrWZ%xWi~JiSD$Vc3+HP$duuDZ@&n^HlT}t96|=wCtR5Iw_)X=0*c3CT3tr;amFx1z zqg1w-?QB$U*UnsDOku+!Pn#K);k(@q*93vrZ;t#X{HZP0Pa9 z=I!yxtE1P)ug}iU{`U6RJT<+wt5&wauEUjEmDLtvyEFC8tM5+F-n_nd{;)1r6JM8w zYgV2dpPl^e{Oskc<2Ucl&R?Bkn}?4M&kqj|pO}LK^JuuricRf|o%k~6k4u~9!)abl zFHD`ST)CRa z@bJ+$ec1t$URLF$5SM_%GxBfzl15tCd}JCoY7TGQ)zmqJ0ik3j+uC`Pe0AN~<)&}! 
z928XL^i?x9UkBnKe`0Z1^XllY=P!ObJ~}x&{_EK}Jox;@KfinZqggsTt4ajNtY`_; zJ{?n<#>la5@KXiEXo;PkHu621%_`@;2asUf$By50Lnq$uDkNC)dJ_Zm*nGZtAj+;*G+Tj@lJ$wH}%8vIWGN9QJkC?*?2* zS_7|8r7St^ro*Yv76pX<6z)-^d;o{-$v6N(|LC@|q;mD9DvZ5E@ZJsOiPFI=__({$ zl(cK{jm}T)N0)S5ePAT~L`NUoWAS&4+5CzyljmyFw8nj9s?9%1Kbm8c`I6oXTR;c~ zv23~&)y`7$(i_B5n->l*HU-c7+??X!=%+uw`TqRf$xk%*jmNmLSh}L`6y?mNJrRn* zn23c8(K#Ri6J*rQZ>?1C+DnKB(7n-bj2`a;E|Ej zNCIQd^(MoAnr)gPz%w{=~%|Y#lOuo~lt;c3* z8@fv4-yT1HG^CrFsa^L61}Pej?;s4wGKbwEEQBcSmBS1!wWbLdb-q<16NfnM3^Ovt z%k@npPccQkNsi_r9Qh$xJQ0%-`Rg3Db2^DcGt$-q7H|L`dNLy&EK;Z>G6;mUg#62u;C?|{zm-dYr9$-`UryT7F;bHm1aQ;WP&+QZDAIG z2Vw>*BpzabvB^9p%m5l}QJYHyb~DBXV<3;2-I@uInGm7ssRHv?{$Hf}RrLj=psY!Wkc9=KBw@i1#0;3<94dNBGJx1PGa#Z`E&n zFnA1o{dE+vI#1;4f6da=1IPql!vlYX0-#|51J{f94P;1CSd zG}|blQV$^zf=9Rxl^?@2qlnG`i|m`(jkm4>&S9d=HQ{X zB?ek?rub^KKvT{M!jLkq!wK{axWkhAD#Z_hkSzn;zrK3U1MtnT!&@&fqgZ_(Gy0S% z($K~!oAG>#B(X(+l9-x?F_1rOY>b&RjPE05Ho)^!bR%vBWs+s}oxt%6Vl({oQbr6+Z(_as(>FaLzL4g6n|eNc9PwpKe&uQl+ESbW@pzYn z4aEjKmZqhDJV)d%^MIBlKsu>3QHXwa!Nc7}XR&ATxs|c&(JrUD_QRHc_=CaEhR|JM zs-2Q0xPh?8OieBJR8GD7)~L`0+km%rFCn1A&m#)`nLUoAebY%rW@LK3mhs@a8rQFb zGg1j)Y1A+m(+yw9R^0&x+!38Nn{|hqo`bLvQ3yPu*XvFTeTdi6NI_;van5;Gt=#N{ z9g67o&FD}T^gi%KtOUsx5>!+ja%3z1?p}ly;ZXe9eWkgzs}spfG?|_@d3xr#%0pg^ zVBIq}&rncy$%K5|r6WV;K0^BDUP@ZF@>CHlwz>gQd$}lBOVn59a#^AgK&Qi@i$9_{ zN(CXquVZg4bnhA%oHud**@pv*5&);7^fDAYmpJ zEjzSZN~Cy)qH~dV{@t;_(HMt*VM2B44 zg}!kqrz(J*Yz?dLK?K1d^Y8y990>@qb4({k+FbW)@g#K$H9bwcq5Wd z&58ThO$JS%3{)y(VRtO7d$UF&1y_Kfb7oB$C6Gp|iZ%8iR*@7+=9L0EH|sURY67eR zy^74xpshC3x-W-55h)OjOGw}gOpk|GWG+rRMx0=|iMIh%VTXggrZ0yQ&3m_+277y|U}jq8r?y`z(Nw4pMLg3W2wX6&PwA z(nsi4G7~!B1ynNhvBoQPymh|jMWka4xYxMnn#2#5JFtN@R?H`Ij2{uFpPDfsX$rHt z`3L?8W^pstpsI(q6&q@L^c09MiOa)IJOt(-l~vU)4=#s4hlRegaA4lM53*W%E*&B9 z>}Z0_<0{cTuwgu?aug(?h6#8K38G=WuS}C2Arl(~Ohz)HlZjX&2Ez^JO-J(=+1hx3 z`OxWz^76Q;5pbY%eV$F(=;3gZ*nye!&C_QPf~amEo;=ZCvE;>OKf4P3*szy{mpx3C zYbigWJ+rWDDHHE61?2!cC7@p)Uct%!+wVY~`2M3W=&VJV}0B)_ujBw33D*Qw0vH#MfF<8Vgm z8R4^;3a<` 
zFfCWWYnjKj5@OT}mlYVQ5UuK%dW7T%!JuBID|?lssn~^^;i+>lnbtX2bomjgM|er( zS+ja`qHbpA;OJJhtjjdPiYtYz0<}oVh!eyJOEKi=N3-iywAWd)ew$rmbQ)+&1hmjs zsmnc3u~SH5h3*J*vv$BBHgxh|AX~7tk3P#95f~|ZoCA&^A)vk+OWOu{^OBgKbjXF$ zW%7cNv3YW}hCH#-qfRb30;q^0pK``nl4 zd`d?K*$ErM+3-|wh1@ZvpRJ^M>k%uIwvwix%XN*i@EQOhp0B+{3yd^C8Hew@B-f}Z zn;SJC5;OvD2Hne^+nK3*?Gx5x5qUsKCgg}3AJca=D9+HN%#k25i;d_M#TpNe6m!_6~*dgZcjI-y4oFbBX` zVxQ2>`BZx&WF4@Nxt+8x%fm}D?roiUbaUmBq>9xu84@A3T5LmQcM0gjkyn5djO~&m zB`M!pY*-<6Qg~pX4Wcq%&9rMBzhl$8M7a6q_HNFzBW;SHRb(`#A#;X`p5P|jK_*E3 z&)|Xenci3@)xbRZ$HPO-Y&<1g5yq}J1>%GfXf&c~xFzP zqrpImBSDET1J=&Bup8(ULA5dW$@WJ9FrPD9^|ENQZj~aKXO<( zl$r(KK;#I656n87;tM&w@$78Iy@Xhq^9vc-13_1c9_hs#d@0#{FDdx%NH&^fcCv=# zQVqNRH$M6Jd%yUg!Gvb~txv5qx--BcI--LsU+#5U^s&*duRnKJ-&D55Hbl?2KPn-K z?#Do4#h?8wEV8Y*O(Vl^`t%}J7tQTX_(V1R{|~F6gb*YHj_7_PyYpcc5YDjxIXD~( dE4CXL*Ya%|olaG9e$H12*ViB&8=B0w{{w~CW=a46 delta 305 zcmZ1%@!fobEaT)*MxD)**o+yuQ}dFGDhovG9WUaM&=RF;@inrfv0m(t;y z{D)I+axj<7WNEF$$)zlEX~n5|DXv9D`9&HD0jWi~iFv7cB?@4D3MKjZ3OR{I>8T27 z`9%sP6$*LzB|y_uszJ0~PHJ9yNk*+gQe{bMv4VzUPI`V(VqS_ua$-R-SY@$7Vu^yG zfsu)(Uc%&jO{vL@oZ6F@YuQikVO>4BU)zi)IX|zs1ZZw1$jPU*t=I$4d`d$T^@4d%%*Lf { + test("short input returns a single chunk", () => { + const v = "hello world"; + const chunks = chunkValue(v); + expect(chunks).toEqual([v]); + }); + + test("input at exactly MAX_CLEARTEXT_PER_CHUNK returns single chunk", () => { + const v = "a".repeat(MAX_CLEARTEXT_PER_CHUNK); + const chunks = chunkValue(v); + expect(chunks.length).toBe(1); + expect(chunks[0].length).toBe(MAX_CLEARTEXT_PER_CHUNK); + }); + + test("input one byte over the limit produces two chunks", () => { + const v = "a".repeat(MAX_CLEARTEXT_PER_CHUNK + 1); + const chunks = chunkValue(v); + expect(chunks.length).toBe(2); + expect(chunks[0].length).toBe(MAX_CLEARTEXT_PER_CHUNK); + expect(chunks[1].length).toBe(1); + }); + + test("3000-byte input produces 5 chunks of 600 bytes", () => { + const v = "a".repeat(3000); + const chunks = chunkValue(v); + expect(chunks.length).toBe(5); + 
for (const c of chunks) { + expect(Buffer.byteLength(c, "utf-8")).toBeLessThanOrEqual(MAX_CLEARTEXT_PER_CHUNK); + } + }); + + test("round-trip: chunkValue → joinChunks preserves ASCII content", () => { + const v = "x".repeat(2500); + expect(joinChunks(chunkValue(v))).toBe(v); + }); + + test("multi-byte UTF-8 codepoints are never split mid-character", () => { + // 🌟 is 4 bytes in UTF-8. Repeat enough to span chunk boundaries. + const v = "🌟".repeat(200); // 800 bytes + const chunks = chunkValue(v); + for (const c of chunks) { + // Decoding shouldn't throw or insert U+FFFD replacement chars. + const decoded = Buffer.from(c, "utf-8").toString("utf-8"); + expect(decoded).toBe(c); + expect(decoded).not.toContain("�"); + } + expect(joinChunks(chunks)).toBe(v); + }); + + test("round-trip preserves mixed ASCII + emoji + CJK + accented", () => { + const segment = "Hello 世界 ñoño 🚀 — مرحبا — "; + const v = segment.repeat(60); + expect(joinChunks(chunkValue(v))).toBe(v); + }); + + test("empty string returns one empty chunk", () => { + expect(chunkValue("")).toEqual([""]); + }); +}); + +describe("needsChunking", () => { + test("short ASCII does not need chunking", () => { + expect(needsChunking("hello")).toBe(false); + }); + + test("input at boundary does not need chunking", () => { + expect(needsChunking("a".repeat(MAX_CLEARTEXT_PER_CHUNK))).toBe(false); + }); + + test("input over boundary needs chunking", () => { + expect(needsChunking("a".repeat(MAX_CLEARTEXT_PER_CHUNK + 1))).toBe(true); + }); + + test("UTF-8 multi-byte expansion can push a short string over", () => { + // 200 emoji = 800 bytes (each is 4 bytes UTF-8) + expect(needsChunking("🌟".repeat(200))).toBe(true); + // But the same JS .length is 400, well under MAX + expect("🌟".repeat(200).length).toBe(400); + }); +}); diff --git a/test/permanent-reassemble.test.ts b/test/permanent-reassemble.test.ts new file mode 100644 index 0000000..60e1433 --- /dev/null +++ b/test/permanent-reassemble.test.ts @@ -0,0 +1,99 @@ 
+import { test, expect, describe } from "bun:test"; +import { __test } from "../src/permanent.js"; + +const { reassemble } = __test; + +interface E { + key: string; + value: string; + txid: string; + created: string; + round: number; + tombstone: boolean; + book?: string; + page?: number; + total?: number; +} + +const ent = (over: Partial<E>): E => ({ + key: "k", + value: "", + txid: "tx", + created: "2026-05-18T00:00:00Z", + round: 1, + tombstone: false, + ...over, +}); + +describe("permanent reassemble", () => { + test("legacy single-chunk entries pass through unchanged", () => { + const input: E[] = [ent({ key: "a", value: "hello", round: 5 })]; + expect(reassemble(input)).toEqual(input); + }); + + test("tombstones pass through unchanged", () => { + const input: E[] = [ent({ key: "a", tombstone: true, round: 10 })]; + expect(reassemble(input)).toEqual(input); + }); + + test("two pages with matching key+created are joined in page order", () => { + const input: E[] = [ + ent({ key: "k", value: "World", round: 2, book: "k", page: 2, total: 2, txid: "tx2" }), + ent({ key: "k", value: "Hello ", round: 1, book: "k", page: 1, total: 2, txid: "tx1" }), + ]; + const out = reassemble(input); + expect(out.length).toBe(1); + expect(out[0].value).toBe("Hello World"); + expect(out[0].round).toBe(2); // max round across pages + expect(out[0].total).toBe(2); + }); + + test("missing pages drop the whole record (not partial)", () => { + // total=3 but only 2 pages present + const input: E[] = [ + ent({ key: "k", value: "A", page: 1, total: 3 }), + ent({ key: "k", value: "C", page: 3, total: 3 }), + ]; + const out = reassemble(input); + expect(out.length).toBe(0); + }); + + test("two separate saves of same key produce two reassembled records", () => { + // Save 1 (older): 2 chunks + // Save 2 (newer): 2 chunks at a different timestamp + const input: E[] = [ + ent({ key: "k", value: "OldA", created: "2026-05-17T00:00:00Z", round: 1, page: 1, total: 2 }), + ent({ key: "k", value:
"OldB", created: "2026-05-17T00:00:00Z", round: 2, page: 2, total: 2 }), + ent({ key: "k", value: "NewA", created: "2026-05-18T00:00:00Z", round: 5, page: 1, total: 2 }), + ent({ key: "k", value: "NewB", created: "2026-05-18T00:00:00Z", round: 6, page: 2, total: 2 }), + ]; + const out = reassemble(input); + expect(out.length).toBe(2); + // Both reassembled — caller's "latest by round" picks the newer one + const values = out.map(e => e.value).sort(); + expect(values).toEqual(["NewANewB", "OldAOldB"]); + }); + + test("mixed single-chunk and multi-chunk entries are both preserved", () => { + const input: E[] = [ + ent({ key: "single", value: "lonely" }), + ent({ key: "multi", value: "X", page: 1, total: 2 }), + ent({ key: "multi", value: "Y", page: 2, total: 2 }), + ]; + const out = reassemble(input); + expect(out.length).toBe(2); + const byKey = Object.fromEntries(out.map(e => [e.key, e.value])); + expect(byKey).toEqual({ single: "lonely", multi: "XY" }); + }); + + test("page numbering must be contiguous 1..total — gap drops the record", () => { + // total=3 but pages [1, 1, 3] — duplicate page 1, missing page 2 + const input: E[] = [ + ent({ key: "k", value: "A", page: 1, total: 3, txid: "t1" }), + ent({ key: "k", value: "A'", page: 1, total: 3, txid: "t2" }), + ent({ key: "k", value: "C", page: 3, total: 3, txid: "t3" }), + ]; + const out = reassemble(input); + expect(out.length).toBe(0); + }); +});