diff --git a/README.md b/README.md index 964e2ab..ff2a966 100644 --- a/README.md +++ b/README.md @@ -166,6 +166,29 @@ lazy proxy directly to `cjson.encode` (cjson bypasses metamethods in C); use `qjson.encode` instead, or call `qjson.materialize(t)` to get a plain Lua table that any third-party encoder can handle. +`qjson.materialize(t, { keep_origin = true })` keeps lightweight provenance on +the returned plain Lua tables so `qjson.encode` can preserve key order and +reuse selected original tokens. Recording is intentionally threshold-based: + +- String children are recorded only when their raw JSON token (including + quotes) is longer than 24 bytes. +- Table children are recorded in the parent only when the child origin is + complete and its raw subtree span is longer than 64 bytes. +- Numbers, booleans, null, and short strings are not recorded. + +Each recorded container tracks whether its provenance is complete: + +- `complete = true`: every child needed to prove byte-for-byte identity is + recorded, so an unchanged container can be emitted as the original slice. +- `complete = false`: provenance is partial. Objects still preserve original + key order for existing keys and can reuse recorded large children, but arrays + fall back to normal array/object encoding. + +Because materialized tables are ordinary Lua tables (no dirty-tracking +metatable), `keep_origin` with partial provenance preserves JSON-equivalent +output rather than guaranteeing byte-identical re-emission of every unchanged +small token. + **Native `next` caveat.** `next(t)` is not proxy-aware: it bypasses the `__pairs` / `__ipairs` hooks and may see qjson implementation fields instead of JSON fields. Do not use native `next` to iterate a lazy proxy or test whether it diff --git a/lua/qjson/table.lua b/lua/qjson/table.lua index e85dc93..381e683 100644 --- a/lua/qjson/table.lua +++ b/lua/qjson/table.lua @@ -30,6 +30,8 @@ local TABLE_TYPE_HINT = setmetatable({}, { __mode = "k" }) -- Weak side-table for keep_origin materialization metadata. -- Maps materialized table -> provenance record used by qjson.encode. local TABLE_ORIGIN = setmetatable({}, { __mode = "k" }) +local ORIGIN_STRING_MIN_RAW = 24 +local ORIGIN_TABLE_MIN_RAW = 64 -- Box scratch used for one-shot FFI returns. Reused across calls to avoid -- per-call allocation; safe because the parent Doc / lazy view holds the @@ -548,29 +550,34 @@ local function materialize_plain(v) return v end -local function cursor_raw_token(ctx, cursor) +local function cursor_token_span(ctx, cursor) local rc = C.qjson_cursor_bytes(cursor, sz_a, sz_b) check(ctx, rc) local bs = tonumber(sz_a[0]) local be = tonumber(sz_b[0]) - return ctx._doc._hold:sub(bs + 1, be), bs, be + return bs, be end -local function scalar_origin_record(v, raw_token) - if rawequal(v, _M.null) then - return { tag = "null", raw = raw_token } - end +local function origin_child_record(v, source, bs, be) + local raw_len = be - bs local tv = type(v) if tv == "string" then - return { tag = "string", value = v, raw = raw_token } - end - if tv == "number" then - return { tag = "number", value = v, raw = raw_token } + if raw_len > ORIGIN_STRING_MIN_RAW then + return { tag = "string", value = v, raw = source:sub(bs + 1, be) }, true + end + return nil, false end - if tv == "boolean" then - return { tag = "boolean", value = v, raw = raw_token } + if tv == "table" then + local child_origin = TABLE_ORIGIN[v] + if child_origin ~= nil + and child_origin.complete == true + and (child_origin.be - child_origin.bs) > ORIGIN_TABLE_MIN_RAW + then + return { tag = "table", origin = child_origin }, true + end + return nil, false end - return nil + return nil, false end local materialize_with_origin @@ -581,6 +588,7 @@ local function materialize_object_with_origin(view) local records = {} local seen = {} local had_duplicates = false + local complete = true local it = new_object_iter(view) while true do @@ -596,7 +604,7 @@ local function materialize_object_with_origin(view) had_duplicates = true end - local raw_token = cursor_raw_token(view, child_box[0]) + local bs, be = cursor_token_span(view, child_box[0]) local child if count == 1 then local cached = cached_child(view, key) @@ -614,16 +622,16 @@ local function materialize_object_with_origin(view) local materialized_child = materialize_with_origin(child) out[key] = materialized_child - local record = scalar_origin_record(materialized_child, raw_token) - local child_origin = type(materialized_child) == "table" and TABLE_ORIGIN[materialized_child] or nil - if record == nil and child_origin ~= nil then - record = { tag = "table", origin = child_origin } + local record, captured = origin_child_record(materialized_child, view._doc._hold, bs, be) + if not captured then + complete = false end records[key] = record end TABLE_ORIGIN[out] = { kind = "object", + complete = complete, source = view._doc._hold, bs = view._bs, be = view._be, @@ -638,22 +646,22 @@ end local function materialize_array_with_origin(view) local out = {} local records = {} + local complete = true local i = 0 while true do local rc = C.qjson_cursor_index(view._cur, i, child_box) if rc == QJSON_NOT_FOUND then break end check(view, rc, T_ARR) - local raw_token = cursor_raw_token(view, child_box[0]) + local bs, be = cursor_token_span(view, child_box[0]) local idx = i + 1 local cached = rawget(view, idx) local child = cached or decode_cursor(view, child_box) local materialized_child = materialize_with_origin(child) out[idx] = materialized_child - local record = scalar_origin_record(materialized_child, raw_token) - local child_origin = type(materialized_child) == "table" and TABLE_ORIGIN[materialized_child] or nil - if record == nil and child_origin ~= nil then - record = { tag = "table", origin = child_origin } + local record, captured = origin_child_record(materialized_child, view._doc._hold, bs, be) + if not captured then + complete = false end records[idx] = record i = idx @@ -663,6 +671,7 @@ local function materialize_array_with_origin(view) end TABLE_ORIGIN[out] = { kind = "array", + complete = complete, source = view._doc._hold, bs = view._bs, be = view._be, @@ -973,13 +982,7 @@ local function origin_record_matches(record, value, depth, active) return false end local tag = record.tag - if tag == "null" then - return rawequal(value, _M.null) - elseif tag == "boolean" then - return type(value) == "boolean" and value == record.value - elseif tag == "number" then - return type(value) == "number" and value == record.value - elseif tag == "string" then + if tag == "string" then return type(value) == "string" and value == record.value elseif tag == "table" then if type(value) ~= "table" then @@ -1015,6 +1018,9 @@ local function origin_table_slice(origin) end origin_object_fully_matches = function(t, origin, depth, active) + if origin.complete ~= true then + return false + end if origin.had_duplicates then return false end @@ -1036,6 +1042,9 @@ origin_object_fully_matches = function(t, origin, depth, active) end origin_array_fully_matches = function(t, origin, depth, active) + if origin.complete ~= true then + return false + end if depth > ENCODE_MAX_DEPTH then error(ENCODE_DEPTH_ERROR) end @@ -1067,18 +1076,8 @@ local function encode_origin_child(value, depth, active, record) then return record.raw end - if record.tag == "null" and rawequal(value, _M.null) then - return record.raw - end - if record.tag == "boolean" - and type(value) == "boolean" - and value == record.value - then - return record.raw - end end - -- Numeric scalars intentionally do not reuse raw lexical form when a - -- parent container is being walked; use the normal number encoder. + -- Small scalars and incomplete child tables are re-encoded. return encode(value, depth + 1, active) end @@ -1086,7 +1085,7 @@ local function encode_object_with_origin(t, depth, active, origin) if depth > ENCODE_MAX_DEPTH then error(ENCODE_DEPTH_ERROR) end - if origin_object_fully_matches(t, origin, depth, active) then + if origin.complete == true and origin_object_fully_matches(t, origin, depth, active) then return origin_table_slice(origin) end @@ -1124,7 +1123,7 @@ local function encode_array_with_origin(t, depth, active, origin) if depth > ENCODE_MAX_DEPTH then error(ENCODE_DEPTH_ERROR) end - if origin_array_fully_matches(t, origin, depth, active) then + if origin.complete == true and origin_array_fully_matches(t, origin, depth, active) then return origin_table_slice(origin) end local kind, max = classify_plain_table(t) diff --git a/tests/lua/origin_materialize_spec.lua b/tests/lua/origin_materialize_spec.lua index cd6c9b4..536d935 100644 --- a/tests/lua/origin_materialize_spec.lua +++ b/tests/lua/origin_materialize_spec.lua @@ -1,5 +1,24 @@ local qjson = require("qjson") local cjson = require("cjson") +local LONG_ESC_A = "\\u0061\\u0062\\u0063\\u0064\\u0065" +local LONG_ESC_B = "\\u0066\\u0067\\u0068\\u0069\\u006A" +local EXACT_24_ESC = "\\u0061\\u0062abcdefghij" +local EXACT_64_CHILD_VALUE = string.rep("a", 56) + +local function count_string_sub_calls(fn) + local original = string.sub + local calls = 0 + rawset(string, "sub", function(...) + calls = calls + 1 + return original(...) + end) + local ok, err = pcall(fn) + rawset(string, "sub", original) + if not ok then + error(err, 0) + end + return calls +end describe("qjson.materialize keep_origin", function() it("keeps default materialize semantics when keep_origin is not set", function() @@ -25,11 +44,37 @@ describe("qjson.materialize keep_origin", function() end, "qjson.materialize: opts.keep_origin must be a boolean") end) - it("reuses unchanged escaped string token when parent is changed", function() + it("does not guarantee reuse for short escaped strings when parent is changed", function() local t = qjson.materialize(qjson.decode('{"blob":"\\u0061","x":1}'), { keep_origin = true }) t.x = 2 - assert.are.equal('{"blob":"\\u0061","x":2}', qjson.encode(t)) + assert.are.equal('{"blob":"a","x":2}', qjson.encode(t)) + end) + + it("does not slice raw tokens for dropped provenance records", function() + local doc = qjson.decode('{"n":1,"b":true,"u":null,"s":"x","arr":[1,2],"obj":{"x":1}}') + local sub_calls = count_string_sub_calls(function() + qjson.materialize(doc, { keep_origin = true }) + end) + + assert.are.equal(0, sub_calls) + end) + + it("does not treat an exact 24-byte string token as above threshold", function() + assert.are.equal(24, #('"' .. EXACT_24_ESC .. '"')) + + local t = qjson.materialize(qjson.decode('{"blob":"' .. EXACT_24_ESC .. '","x":1}'), { keep_origin = true }) + t.x = 2 + + assert.are.equal('{"blob":"ababcdefghij","x":2}', qjson.encode(t)) + end) + + it("reuses unchanged escaped string token when raw token is above threshold", function() + local src = '{"blob":"' .. LONG_ESC_A .. '","x":1}' + local t = qjson.materialize(qjson.decode(src), { keep_origin = true }) + t.x = 2 + + assert.are.equal('{"blob":"' .. LONG_ESC_A .. '","x":2}', qjson.encode(t)) end) it("falls back to normal escaping for changed string children", function() @@ -39,15 +84,30 @@ describe("qjson.materialize keep_origin", function() assert.are.equal('{"blob":"line1\\nline2","x":1}', qjson.encode(t)) end) - it("reuses unchanged nested object and array siblings when parent is changed", function() - local src = '{"x":0,"obj":{"k":"\\u0061"},"arr":[1, 2 ,3]}' + it("re-emits small-scalar containers field-by-field when unmodified", function() + local src = '{ "n":1.0, "s":"\\u0061", "b":true, "u":null }' local t = qjson.materialize(qjson.decode(src), { keep_origin = true }) - t.x = 9 - local out = qjson.encode(t) - assert.is_truthy(string.find(out, '"obj":{"k":"\\u0061"}', 1, true)) - assert.is_truthy(string.find(out, '"arr":[1, 2 ,3]', 1, true)) - assert.are.equal(9, cjson.decode(out).x) + + assert.are.equal('{"n":1,"s":"a","b":true,"u":null}', out) + assert.are_not.equal(src, out) + end) + + it("returns original slice for unmodified containers with complete large children", function() + local src = '{ "a":"' .. LONG_ESC_A .. '" , "b":"' .. LONG_ESC_B .. '" }' + local t = qjson.materialize(qjson.decode(src), { keep_origin = true }) + + assert.are.equal(src, qjson.encode(t)) + end) + + it("does not treat an exact 64-byte child container as above threshold", function() + local child = '{"a":"' .. EXACT_64_CHILD_VALUE .. '"}' + assert.are.equal(64, #child) + + local src = '{ "child" : ' .. child .. ' }' + local t = qjson.materialize(qjson.decode(src), { keep_origin = true }) + + assert.are.equal('{"child":' .. child .. '}', qjson.encode(t)) end) it("does not reintroduce duplicate keys after materialization", function() @@ -67,13 +127,20 @@ describe("qjson.materialize keep_origin", function() assert.are.equal('{"n":1,"e":1000,"z":0,"x":2}', qjson.encode(t)) end) - it("does not hide nested table mutations behind a parent raw slice", function() + it("partial origins do not hide nested table mutations behind a parent raw slice", function() local t = qjson.materialize(qjson.decode('{"a":{"x":1},"b":2}'), { keep_origin = true }) t.a.x = 9 assert.are.equal('{"a":{"x":9},"b":2}', qjson.encode(t)) end) + it("falls back to normal array/object classification for incomplete arrays", function() + local src = '[ 1 , 2 , 3 ]' + local t = qjson.materialize(qjson.decode(src), { keep_origin = true }) + + assert.are.equal("[1,2,3]", qjson.encode(t)) + end) + it("still reports circular references after materialization", function() local t = qjson.materialize(qjson.decode('{"a":1}'), { keep_origin = true }) t.self = t @@ -128,13 +195,23 @@ describe("qjson.materialize keep_origin", function() it("keeps source bytes alive for provenance-backed reuse", function() local function materialized() - local src = '{"blob":"\\u0061","x":1}' + local src = '{"blob":"' .. LONG_ESC_A .. '","x":1}' return qjson.materialize(qjson.decode(src), { keep_origin = true }) end local t = materialized() collectgarbage("collect") t.x = 2 - assert.are.equal('{"blob":"\\u0061","x":2}', qjson.encode(t)) + assert.are.equal('{"blob":"' .. LONG_ESC_A .. '","x":2}', qjson.encode(t)) + end) + + it("reuses large complete child subtrees when parent is modified", function() + local src = '{"x":0,"big": { "a":"' .. LONG_ESC_A .. '" , "b":"' .. LONG_ESC_B .. '" }}' + local t = qjson.materialize(qjson.decode(src), { keep_origin = true }) + t.x = 9 + + local out = qjson.encode(t) + assert.are.equal(9, cjson.decode(out).x) + assert.is_truthy(string.find(out, '"big":{ "a":"' .. LONG_ESC_A .. '" , "b":"' .. LONG_ESC_B .. '" }', 1, true)) end) end)