Skip to content

Commit dffb769

Browse files
committed
unix-ffi/json: scanner: fix memo leak by returning wrapper function
py_make_scanner() returned _scan_once (the inner function) instead of scan_once (the wrapper that clears memo in a finally clause). This caused the decoder's string-intern table to accumulate every unique JSON key across calls to json.loads(), leading to unbounded memory growth.
1 parent ce011a3 commit dffb769

2 files changed

Lines changed: 164 additions & 1 deletion

File tree

unix-ffi/json/json/scanner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -71,7 +71,7 @@ def scan_once(string, idx):
7171
finally:
7272
memo.clear()
7373

74-
return _scan_once
74+
return scan_once
7575

7676

7777
make_scanner = c_make_scanner or py_make_scanner

unix-ffi/json/test_json_memleak.py

Lines changed: 163 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,163 @@
1+
"""Unit tests for json module memory leak fix.
2+
3+
The primary bug: scanner.py py_make_scanner() returned _scan_once (the inner
4+
function) instead of scan_once (the wrapper that clears the memo dict).
5+
This caused the decoder's memo dict to grow unboundedly across calls,
6+
leaking every unique JSON object key string ever parsed.
7+
"""
8+
9+
import sys
10+
import gc
11+
import json
12+
from json.decoder import JSONDecoder
13+
from json import scanner
14+
15+
16+
def test_memo_cleared_after_loads():
    """The shared decoder's memo must be empty once json.loads() returns."""
    json.loads('{"a": 1, "b": 2, "c": 3}')
    memo = json._default_decoder.memo
    assert not memo, "memo not cleared after loads()"
21+
22+
23+
def test_memo_cleared_after_repeated_loads():
    """Repeated loads() must not accumulate memo entries.

    The memo is checked inside the loop so a failure pinpoints the first
    call that leaked, instead of only reporting the accumulated result
    of all 100 calls at the end.
    """
    for i in range(100):
        json.loads('{"key_%d": %d}' % (i, i))
        # The scan_once wrapper's finally clause must have cleared the
        # memo before loads() returned.
        assert len(json._default_decoder.memo) == 0, \
            "memo accumulated entries across loads() calls"
29+
30+
31+
def test_memo_cleared_after_nested_objects():
    """Memo must be cleared even with deeply nested objects."""
    document = '{"outer": {"middle": {"inner": "value"}}}'
    json.loads(document)
    leftover = len(json._default_decoder.memo)
    assert leftover == 0, "memo not cleared after nested object parse"
36+
37+
38+
def test_memo_cleared_after_array_of_objects():
    """Memo must be cleared after parsing arrays of objects."""
    json.loads('[{"k1": 1}, {"k2": 2}, {"k3": 3}]')
    remaining = json._default_decoder.memo
    assert len(remaining) == 0, \
        "memo not cleared after array-of-objects parse"
43+
44+
45+
def test_memo_cleared_with_custom_decoder():
    """A fresh JSONDecoder instance must also clear its memo."""
    decoder = JSONDecoder()
    decoder.decode('{"x": 1, "y": 2}')
    assert not decoder.memo, \
        "custom decoder memo not cleared after decode()"
51+
52+
53+
def test_no_leak_under_repeated_parsing():
    """Memory must not grow when parsing the same structure repeatedly.

    Uses gc.mem_alloc() where available (MicroPython; CPython lacks the
    attribute, so the byte-growth check is skipped there) to measure heap
    growth across 1000 parses.
    """
    document = '{"sensor": 42, "status": "ok", "values": [1,2,3]}'

    # Warm up once before taking the baseline, so one-time allocations on
    # the first parse (interned key strings, lazily created caches) are
    # not misread as per-iteration growth and do not trip the threshold.
    json.loads(document)

    gc.collect()
    baseline = gc.mem_alloc() if hasattr(gc, 'mem_alloc') else None

    for _ in range(1000):
        json.loads(document)

    gc.collect()
    after = gc.mem_alloc() if hasattr(gc, 'mem_alloc') else None

    assert len(json._default_decoder.memo) == 0, \
        "memo leaked after 1000 iterations"

    if baseline is not None and after is not None:
        growth = after - baseline
        # With the memo cleared per call, steady-state growth should be
        # essentially zero; 4096 bytes allows for allocator slack.
        assert growth < 4096, \
            "memory grew by %d bytes over 1000 iterations" % growth
74+
75+
76+
def test_scan_once_is_wrapper():
    """make_scanner must return the wrapper that clears memo, not _scan_once."""
    decoder = JSONDecoder()
    # Indirect verification: only the scan_once wrapper clears the memo in
    # a finally clause, so an empty memo after a call proves the wrapper
    # (and not the inner _scan_once) was returned by make_scanner.
    decoder.scan_once('{"test": 1}', 0)
    assert not decoder.memo, \
        "scan_once did not clear memo — wrong function returned by make_scanner"
85+
86+
87+
def test_basic_parsing_still_works():
    """Verify the fix doesn't break normal JSON parsing."""
    # Singletons must come back as the actual singleton objects.
    assert json.loads('null') is None
    assert json.loads('true') is True
    assert json.loads('false') is False
    # Scalars and containers cover the rest of the basic grammar.
    for text, expected in (
        ('42', 42),
        ('3.14', 3.14),
        ('"hello"', "hello"),
        ('[1, 2, 3]', [1, 2, 3]),
        ('{"a": 1}', {"a": 1}),
    ):
        assert json.loads(text) == expected
97+
98+
99+
def test_nested_parsing_still_works():
    """Verify complex nested structures parse correctly."""
    document = (
        '{"users": [{"name": "alice", "age": 30}, '
        '{"name": "bob", "age": 25}], "count": 2}'
    )
    data = json.loads(document)
    users = data["users"]
    assert data["count"] == 2
    assert len(users) == 2
    assert users[0]["name"] == "alice"
    assert users[1]["age"] == 25
109+
110+
111+
def test_encoding_still_works():
    """Verify encoding is unaffected by the fix."""
    cases = (
        ({"a": 1}, '{"a": 1}'),
        ([1, 2, 3], '[1, 2, 3]'),
        (None, 'null'),
        (True, 'true'),
    )
    for value, expected in cases:
        assert json.dumps(value) == expected
117+
118+
119+
def test_roundtrip():
    """Verify encode/decode roundtrip works correctly."""
    original = {"key": [1, 2.5, "three", None, True, False, {"nested": "obj"}]}
    decoded = json.loads(json.dumps(original))
    assert decoded == original, \
        "roundtrip failed: %r != %r" % (decoded, original)
126+
127+
128+
def test_key_interning_within_single_parse():
    """Within a single parse, duplicate keys should still be interned via memo.

    The memo is only cleared AFTER the top-level scan_once returns, so
    within a single document, key deduplication still works.
    """
    data = json.loads('[{"id": 1}, {"id": 2}, {"id": 3}]')
    keys = []
    for obj in data:
        keys.append(next(iter(obj.keys())))
    assert all(k == "id" for k in keys)
    # After the parse, memo should be clean
    assert len(json._default_decoder.memo) == 0
139+
140+
141+
# --- run all tests ---
142+
143+
def run_tests():
    """Discover and run every module-level test_* function.

    Prints a PASS/FAIL line per test and a final summary, and returns
    the number of failed tests (0 means success).
    """
    # Filter with callable() so a stray module-level variable or imported
    # module whose name starts with 'test_' cannot crash the runner with
    # a TypeError; sorted() keeps the run order deterministic.
    tests = [v for k, v in sorted(globals().items())
             if k.startswith('test_') and callable(v)]
    passed = 0
    failed = 0
    for test in tests:
        name = test.__name__
        try:
            test()
            print(" PASS:", name)
            passed += 1
        except Exception as e:
            # Broad catch is deliberate: one failing test must not
            # abort the remaining tests.
            print(" FAIL:", name, "-", e)
            failed += 1
    print("\n%d passed, %d failed" % (passed, failed))
    return failed
158+
159+
160+
# Script entry point: run the suite and exit non-zero on any failure.
if __name__ == '__main__':
    print("Running json memory leak tests...")
    raise SystemExit(1 if run_tests() else 0)

0 commit comments

Comments
 (0)