diff --git a/src/chunking.ts b/src/chunking.ts new file mode 100644 index 0000000..afed7ac --- /dev/null +++ b/src/chunking.ts @@ -0,0 +1,60 @@ +/** + * Chunking for memories that don't fit in a single Algorand note. + * + * Algorand caps tx notes at 1024 bytes. Subtract the JSON envelope + * (~150 bytes with book/page/total fields) and the encryption layer + * (envelope adds ~40 bytes, base64 inflates by 4/3) and you have + * ~600 bytes of plaintext per chunk that reliably fit. + * + * On save: the caller decides whether to chunk. `chunkValue` splits on + * a fixed byte boundary; chunks are reassembled in `joinChunks` by + * sorting on `page` ascending. + * + * On recall: callers collect all txs/ASAs for a given key, group by the + * save's `created` timestamp (one save = one book), require all pages + * to be present, and concatenate. + */ + +/** + * Max plaintext bytes per chunk. Conservative β€” leaves headroom for + * UTF-8 multi-byte expansion, envelope JSON, and the encryption + * envelope overhead in `@corvidlabs/ts-algochat`. + */ +export const MAX_CLEARTEXT_PER_CHUNK = 600; + +/** + * Split `value` into N chunks of at most `MAX_CLEARTEXT_PER_CHUNK` + * bytes each. Operates on UTF-8 byte boundaries β€” if a multi-byte + * codepoint straddles the cut, the chunk boundary is pushed back to + * the prior codepoint start so we never produce invalid UTF-8. + */ +export function chunkValue(value: string): string[] { + const bytes = Buffer.from(value, "utf-8"); + if (bytes.length <= MAX_CLEARTEXT_PER_CHUNK) return [value]; + + const chunks: string[] = []; + let offset = 0; + while (offset < bytes.length) { + let end = Math.min(offset + MAX_CLEARTEXT_PER_CHUNK, bytes.length); + // Walk back if we landed in the middle of a UTF-8 continuation byte + // (0b10xxxxxx, i.e. (byte & 0xC0) === 0x80). We stop walking once + // we hit a leading byte; this caps regression at 3 bytes. + while (end < bytes.length && (bytes[end] & 0xc0) === 0x80) end--; + chunks.push(bytes.slice(offset, end).toString("utf-8")); + offset = end; + } + return chunks; +} + +/** + * Reassemble pages back into the original value. Caller is responsible + * for passing the pages in correct order (page 1..N). + */ +export function joinChunks(chunks: string[]): string { + return chunks.join(""); +} + +/** Heuristic: does this value need chunking? */ +export function needsChunking(value: string): boolean { + return Buffer.byteLength(value, "utf-8") > MAX_CLEARTEXT_PER_CHUNK; +} diff --git a/src/permanent.ts b/src/permanent.ts index 3a86261..2b7e64c 100644 Binary files a/src/permanent.ts and b/src/permanent.ts differ diff --git a/test/chunking.test.ts b/test/chunking.test.ts new file mode 100644 index 0000000..f4d735c --- /dev/null +++ b/test/chunking.test.ts @@ -0,0 +1,83 @@ +import { test, expect, describe } from "bun:test"; +import { chunkValue, joinChunks, needsChunking, MAX_CLEARTEXT_PER_CHUNK } from "../src/chunking.js"; + +describe("chunkValue", () => { + test("short input returns a single chunk", () => { + const v = "hello world"; + const chunks = chunkValue(v); + expect(chunks).toEqual([v]); + }); + + test("input at exactly MAX_CLEARTEXT_PER_CHUNK returns single chunk", () => { + const v = "a".repeat(MAX_CLEARTEXT_PER_CHUNK); + const chunks = chunkValue(v); + expect(chunks.length).toBe(1); + expect(chunks[0].length).toBe(MAX_CLEARTEXT_PER_CHUNK); + }); + + test("input one byte over the limit produces two chunks", () => { + const v = "a".repeat(MAX_CLEARTEXT_PER_CHUNK + 1); + const chunks = chunkValue(v); + expect(chunks.length).toBe(2); + expect(chunks[0].length).toBe(MAX_CLEARTEXT_PER_CHUNK); + expect(chunks[1].length).toBe(1); + }); + + test("3000-byte input produces 5 chunks of 600 bytes", () => { + const v = "a".repeat(3000); + const chunks = chunkValue(v); + expect(chunks.length).toBe(5); + for (const c of chunks) { + expect(Buffer.byteLength(c, "utf-8")).toBeLessThanOrEqual(MAX_CLEARTEXT_PER_CHUNK); + } + }); + + test("round-trip: chunkValue β†’ joinChunks preserves ASCII content", () => { + const v = "x".repeat(2500); + expect(joinChunks(chunkValue(v))).toBe(v); + }); + + test("multi-byte UTF-8 codepoints are never split mid-character", () => { + // 🌟 is 4 bytes in UTF-8. Repeat enough to span chunk boundaries. + const v = "🌟".repeat(200); // 800 bytes + const chunks = chunkValue(v); + for (const c of chunks) { + // Decoding shouldn't throw or insert U+FFFD replacement chars. + const decoded = Buffer.from(c, "utf-8").toString("utf-8"); + expect(decoded).toBe(c); + expect(decoded).not.toContain("οΏ½"); + } + expect(joinChunks(chunks)).toBe(v); + }); + + test("round-trip preserves mixed ASCII + emoji + CJK + accented", () => { + const segment = "Hello δΈ–η•Œ Γ±oΓ±o πŸš€ β€” Ω…Ψ±Ψ­Ψ¨Ψ§ β€” "; + const v = segment.repeat(60); + expect(joinChunks(chunkValue(v))).toBe(v); + }); + + test("empty string returns one empty chunk", () => { + expect(chunkValue("")).toEqual([""]); + }); +}); + +describe("needsChunking", () => { + test("short ASCII does not need chunking", () => { + expect(needsChunking("hello")).toBe(false); + }); + + test("input at boundary does not need chunking", () => { + expect(needsChunking("a".repeat(MAX_CLEARTEXT_PER_CHUNK))).toBe(false); + }); + + test("input over boundary needs chunking", () => { + expect(needsChunking("a".repeat(MAX_CLEARTEXT_PER_CHUNK + 1))).toBe(true); + }); + + test("UTF-8 multi-byte expansion can push a short string over", () => { + // 200 emoji = 800 bytes (each is 4 bytes UTF-8) + expect(needsChunking("🌟".repeat(200))).toBe(true); + // But the same JS .length is 400, well under MAX + expect("🌟".repeat(200).length).toBe(400); + }); +}); diff --git a/test/permanent-reassemble.test.ts b/test/permanent-reassemble.test.ts new file mode 100644 index 0000000..60e1433 --- /dev/null +++ b/test/permanent-reassemble.test.ts @@ -0,0 +1,99 @@ +import { test, expect, describe } from "bun:test"; +import { __test } from "../src/permanent.js"; + +const { reassemble } = __test; + +interface E { + key: string; + value: string; + txid: string; + created: string; + round: number; + tombstone: boolean; + book?: string; + page?: number; + total?: number; +} + +const ent = (over: Partial): E => ({ + key: "k", + value: "", + txid: "tx", + created: "2026-05-18T00:00:00Z", + round: 1, + tombstone: false, + ...over, +}); + +describe("permanent reassemble", () => { + test("legacy single-chunk entries pass through unchanged", () => { + const input: E[] = [ent({ key: "a", value: "hello", round: 5 })]; + expect(reassemble(input)).toEqual(input); + }); + + test("tombstones pass through unchanged", () => { + const input: E[] = [ent({ key: "a", tombstone: true, round: 10 })]; + expect(reassemble(input)).toEqual(input); + }); + + test("two pages with matching key+created are joined in page order", () => { + const input: E[] = [ + ent({ key: "k", value: "World", round: 2, book: "k", page: 2, total: 2, txid: "tx2" }), + ent({ key: "k", value: "Hello ", round: 1, book: "k", page: 1, total: 2, txid: "tx1" }), + ]; + const out = reassemble(input); + expect(out.length).toBe(1); + expect(out[0].value).toBe("Hello World"); + expect(out[0].round).toBe(2); // max round across pages + expect(out[0].total).toBe(2); + }); + + test("missing pages drop the whole record (not partial)", () => { + // total=3 but only 2 pages present + const input: E[] = [ + ent({ key: "k", value: "A", page: 1, total: 3 }), + ent({ key: "k", value: "C", page: 3, total: 3 }), + ]; + const out = reassemble(input); + expect(out.length).toBe(0); + }); + + test("two separate saves of same key produce two reassembled records", () => { + // Save 1 (older): 2 chunks + // Save 2 (newer): 2 chunks at a different timestamp + const input: E[] = [ + ent({ key: "k", value: "OldA", created: "2026-05-17T00:00:00Z", round: 1, page: 1, total: 2 }), + ent({ key: "k", value: "OldB", created: "2026-05-17T00:00:00Z", round: 2, page: 2, total: 2 }), + ent({ key: "k", value: "NewA", created: "2026-05-18T00:00:00Z", round: 5, page: 1, total: 2 }), + ent({ key: "k", value: "NewB", created: "2026-05-18T00:00:00Z", round: 6, page: 2, total: 2 }), + ]; + const out = reassemble(input); + expect(out.length).toBe(2); + // Both reassembled β€” caller's "latest by round" picks the newer one + const values = out.map(e => e.value).sort(); + expect(values).toEqual(["NewANewB", "OldAOldB"]); + }); + + test("mixed single-chunk and multi-chunk entries are both preserved", () => { + const input: E[] = [ + ent({ key: "single", value: "lonely" }), + ent({ key: "multi", value: "X", page: 1, total: 2 }), + ent({ key: "multi", value: "Y", page: 2, total: 2 }), + ]; + const out = reassemble(input); + expect(out.length).toBe(2); + const byKey = Object.fromEntries(out.map(e => [e.key, e.value])); + expect(byKey).toEqual({ single: "lonely", multi: "XY" }); + }); + + test("page numbering must be contiguous 1..total β€” gap drops the record", () => { + // total=3 but pages [1, 1, 3] β€” duplicate page 1, missing page 2 + const input: E[] = [ + ent({ key: "k", value: "A", page: 1, total: 3, txid: "t1" }), + ent({ key: "k", value: "A'", page: 1, total: 3, txid: "t2" }), + ent({ key: "k", value: "C", page: 3, total: 3, txid: "t3" }), + ]; + const out = reassemble(input); + expect(out.length).toBe(0); + }); +});