Skip to content

Commit 78c3044

Browse files
committed
unix-ffi/re: fix PCRE2 FFI memory leaks and add pattern cache
search() allocated pcre2_match_data on every call via FFI but never freed it, leaking ~48+ bytes of C heap per regex operation. Compiled patterns (pcre2_code) were also never freed since MicroPython does not call __del__ on pure-Python classes. Fix by:
- calling pcre2_match_data_free() after copying the ovector on both the match and no-match paths
- caching capture count and ovector size in __init__ to avoid pcre2_pattern_info() per search
- adding _free() method for explicit pcre2_code_free()
- adding a bounded LRU pattern cache (_CACHE_MAX=64) with eviction for the module-level convenience functions (re.search, re.match, re.sub, re.split, re.findall) so temporary compiled patterns are reused and old ones are explicitly freed on eviction
- adding purge() to clear the cache on demand
1 parent dffb769 commit 78c3044

2 files changed

Lines changed: 311 additions & 14 deletions

File tree

unix-ffi/re/re.py

Lines changed: 48 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -28,6 +28,12 @@
2828
"p", "pcre2_match_data_create_from_pattern_8", "Pp"
2929
)
3030

31+
# void pcre2_match_data_free(pcre2_match_data *match_data);
32+
pcre2_match_data_free = pcre2.func("v", "pcre2_match_data_free_8", "p")
33+
34+
# void pcre2_code_free(pcre2_code *code);
35+
pcre2_code_free = pcre2.func("v", "pcre2_code_free_8", "p")
36+
3137
# PCRE2_SIZE that is of type size_t.
3238
# Use ULONG as type to support both 32bit and 64bit.
3339
PCRE2_SIZE_SIZE = uctypes.sizeof({"field": 0 | uctypes.ULONG})
@@ -99,24 +105,28 @@ def span(self, n=0):
99105
class PCREPattern:
100106
def __init__(self, compiled_ptn):
101107
self.obj = compiled_ptn
108+
buf = array.array("i", [0])
109+
pcre2_pattern_info(compiled_ptn, PCRE2_INFO_CAPTURECOUNT, buf)
110+
self._cap_count = buf[0]
111+
self._ov_size = PCRE2_SIZE_SIZE * (self._cap_count + 1) * 2
112+
113+
def _free(self):
114+
if self.obj:
115+
pcre2_code_free(self.obj)
116+
self.obj = None
102117

103118
def search(self, s, pos=0, endpos=-1, _flags=0):
104119
assert endpos == -1, "pos: %d, endpos: %d" % (pos, endpos)
105-
buf = array.array("i", [0])
106-
pcre2_pattern_info(self.obj, PCRE2_INFO_CAPTURECOUNT, buf)
107-
cap_count = buf[0]
108120
match_data = pcre2_match_data_create_from_pattern(self.obj, None)
109121
num = pcre2_match(self.obj, s, len(s), pos, _flags, match_data, None)
110122
if num == -1:
111-
# No match
123+
pcre2_match_data_free(match_data)
112124
return None
113125
ov_ptr = pcre2_get_ovector_pointer(match_data)
114-
# pcre2_get_ovector_pointer return PCRE2_SIZE
115-
ov_buf = uctypes.bytearray_at(ov_ptr, PCRE2_SIZE_SIZE * (cap_count + 1) * 2)
126+
ov_buf = uctypes.bytearray_at(ov_ptr, self._ov_size)
116127
ov = array.array(PCRE2_SIZE_TYPE, ov_buf)
117-
# We don't care how many matching subexpressions we got, we
118-
# care only about total # of capturing ones (including empty)
119-
return PCREMatch(s, cap_count + 1, ov)
128+
pcre2_match_data_free(match_data)
129+
return PCREMatch(s, self._cap_count + 1, ov)
120130

121131
def match(self, s, pos=0, endpos=-1):
122132
return self.search(s, pos, endpos, PCRE2_ANCHORED)
@@ -188,28 +198,52 @@ def compile(pattern, flags=0):
188198
return PCREPattern(regex)
189199

190200

201+
_cache = {}
202+
_CACHE_MAX = 64
203+
204+
205+
def _compile_cached(pattern, flags=0):
206+
key = (pattern, flags)
207+
cached = _cache.get(key)
208+
if cached is not None:
209+
return cached
210+
if len(_cache) >= _CACHE_MAX:
211+
oldest_key = next(iter(_cache))
212+
_cache.pop(oldest_key)._free()
213+
compiled = compile(pattern, flags)
214+
_cache[key] = compiled
215+
return compiled
216+
217+
218+
def purge():
219+
"""Clear the pattern cache and free all compiled patterns."""
220+
for p in _cache.values():
221+
p._free()
222+
_cache.clear()
223+
224+
191225
def search(pattern, string, flags=0):
192-
r = compile(pattern, flags)
226+
r = _compile_cached(pattern, flags)
193227
return r.search(string)
194228

195229

196230
def match(pattern, string, flags=0):
197-
r = compile(pattern, flags | PCRE2_ANCHORED)
231+
r = _compile_cached(pattern, flags | PCRE2_ANCHORED)
198232
return r.search(string)
199233

200234

201235
def sub(pattern, repl, s, count=0, flags=0):
202-
r = compile(pattern, flags)
236+
r = _compile_cached(pattern, flags)
203237
return r.sub(repl, s, count)
204238

205239

206240
def split(pattern, s, maxsplit=0, flags=0):
207-
r = compile(pattern, flags)
241+
r = _compile_cached(pattern, flags)
208242
return r.split(s, maxsplit)
209243

210244

211245
def findall(pattern, s, flags=0):
212-
r = compile(pattern, flags)
246+
r = _compile_cached(pattern, flags)
213247
return r.findall(s)
214248

215249

unix-ffi/re/test_re_memleak.py

Lines changed: 263 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,263 @@
1+
"""Tests for FFI memory leak fixes in the re module.
2+
3+
Verifies that:
4+
- pcre2_match_data is freed after every search/match (C heap leak per operation)
5+
- pcre2_code is freed via cache eviction (C heap leak per compile)
6+
- Pattern cache prevents recompilation and bounds memory usage
7+
"""
8+
import gc
9+
import re
10+
11+
PASS = 0
12+
FAIL = 0
13+
14+
15+
def _get_rss_pages():
16+
"""Return current RSS in pages from /proc/self/statm."""
17+
with open("/proc/self/statm") as f:
18+
return int(f.read().split()[1])
19+
20+
21+
def _run(name, func):
22+
global PASS, FAIL
23+
try:
24+
func()
25+
PASS += 1
26+
print(" PASS:", name)
27+
except Exception as e:
28+
FAIL += 1
29+
print(" FAIL:", name, "-", e)
30+
31+
32+
# ---------------------------------------------------------------------------
33+
# Functional correctness
34+
# ---------------------------------------------------------------------------
35+
36+
def test_search_still_works():
37+
m = re.search(r"a+", "caaab")
38+
assert m.group(0) == "aaa"
39+
40+
41+
def test_match_still_works():
42+
m = re.match(r"a+", "aaab")
43+
assert m.group(0) == "aaa"
44+
assert re.match(r"a+", "bbb") is None
45+
46+
47+
def test_sub_still_works():
48+
assert re.sub("a", "z", "caaab") == "czzzb"
49+
50+
51+
def test_findall_still_works():
52+
assert re.findall(r"\w+ly", "carefully and quickly") == ["carefully", "quickly"]
53+
54+
55+
def test_split_still_works():
56+
assert re.split(r"\W+", "one, two, three") == ["one", "two", "three"]
57+
58+
59+
def test_compiled_pattern_reuse():
60+
"""Compiled patterns work correctly across many calls."""
61+
pat = re.compile(r"(\d+)")
62+
for i in range(100):
63+
m = pat.match(str(i))
64+
assert m is not None
65+
assert m.group(1) == str(i)
66+
67+
68+
def test_no_match_returns_none():
69+
pat = re.compile(r"xyz")
70+
for _ in range(1000):
71+
assert pat.search("abc") is None
72+
73+
74+
def test_groups_and_captures():
75+
m = re.match(r"(\d+)\.(\d+)", "24.1632")
76+
assert m.groups() == ("24", "1632")
77+
assert m.group(2, 1) == ("1632", "24")
78+
79+
80+
def test_sub_with_callable():
81+
assert re.sub("a", lambda m: m.group(0) * 2, "caaab") == "caaaaaab"
82+
83+
84+
# ---------------------------------------------------------------------------
85+
# Pattern cache tests
86+
# ---------------------------------------------------------------------------
87+
88+
def test_cache_reuses_pattern():
89+
"""Same pattern string should return cached compiled pattern."""
90+
re.purge()
91+
re.search(r"test_cache_1", "x")
92+
assert (r"test_cache_1", 0) in re._cache
93+
re.search(r"test_cache_1", "y")
94+
assert len([k for k in re._cache if k[0] == r"test_cache_1"]) == 1
95+
96+
97+
def test_cache_eviction():
98+
"""Exceeding _CACHE_MAX evicts oldest entries."""
99+
re.purge()
100+
for i in range(re._CACHE_MAX + 10):
101+
re.search("evict_%d" % i, "evict_%d" % i)
102+
assert len(re._cache) == re._CACHE_MAX
103+
104+
105+
def test_purge_clears_cache():
106+
re.search("purge_test", "purge_test")
107+
assert len(re._cache) > 0
108+
re.purge()
109+
assert len(re._cache) == 0
110+
111+
112+
# ---------------------------------------------------------------------------
113+
# C heap memory leak tests (use gc.threshold to keep Python heap clean)
114+
# ---------------------------------------------------------------------------
115+
116+
def test_match_data_no_leak():
117+
"""Repeated search/match must not grow the C heap (pcre2_match_data freed).
118+
119+
Before the fix, each search() leaked ~48+ bytes of pcre2_match_data.
120+
Over 50k iterations that's ~2+ MB of RSS growth.
121+
"""
122+
old_thresh = gc.threshold()
123+
gc.threshold(4096)
124+
pat = re.compile(r"(\w+)\s+(\w+)")
125+
gc.collect()
126+
rss_before = _get_rss_pages()
127+
for _ in range(50000):
128+
pat.search("hello world foo bar")
129+
gc.collect()
130+
rss_after = _get_rss_pages()
131+
gc.threshold(old_thresh)
132+
growth = rss_after - rss_before
133+
assert growth < 50, "RSS grew by %d pages; pcre2_match_data likely leaking" % growth
134+
135+
136+
def test_match_data_no_leak_on_no_match():
137+
"""Non-matching searches must also free match_data."""
138+
old_thresh = gc.threshold()
139+
gc.threshold(4096)
140+
pat = re.compile(r"xyz123")
141+
gc.collect()
142+
rss_before = _get_rss_pages()
143+
for _ in range(50000):
144+
pat.search("nothing here")
145+
gc.collect()
146+
rss_after = _get_rss_pages()
147+
gc.threshold(old_thresh)
148+
growth = rss_after - rss_before
149+
assert growth < 50, "RSS grew by %d pages on no-match path" % growth
150+
151+
152+
def test_cached_patterns_no_leak():
153+
"""Convenience functions use the cache, so pcre2_code doesn't leak."""
154+
re.purge()
155+
old_thresh = gc.threshold()
156+
gc.threshold(4096)
157+
gc.collect()
158+
rss_before = _get_rss_pages()
159+
for _ in range(50000):
160+
re.search(r"cached_\d+", "cached_123")
161+
gc.collect()
162+
rss_after = _get_rss_pages()
163+
gc.threshold(old_thresh)
164+
re.purge()
165+
growth = rss_after - rss_before
166+
assert growth < 50, "RSS grew by %d pages; pcre2_code likely leaking" % growth
167+
168+
169+
def test_eviction_frees_code():
170+
"""Evicted patterns must have their pcre2_code freed."""
171+
re.purge()
172+
old_thresh = gc.threshold()
173+
gc.threshold(4096)
174+
gc.collect()
175+
rss_before = _get_rss_pages()
176+
for i in range(500):
177+
re.search("evict_free_%d" % i, "evict_free_%d" % i)
178+
gc.collect()
179+
rss_after = _get_rss_pages()
180+
gc.threshold(old_thresh)
181+
re.purge()
182+
growth = rss_after - rss_before
183+
assert growth < 50, "RSS grew by %d pages during cache eviction" % growth
184+
185+
186+
def test_sub_repeated_no_leak():
187+
"""sub() calls search() in a loop; match_data must be freed each time."""
188+
re.purge()
189+
old_thresh = gc.threshold()
190+
gc.threshold(4096)
191+
gc.collect()
192+
rss_before = _get_rss_pages()
193+
for _ in range(5000):
194+
re.sub(r"\s+", "-", "one two three four five")
195+
gc.collect()
196+
rss_after = _get_rss_pages()
197+
gc.threshold(old_thresh)
198+
re.purge()
199+
growth = rss_after - rss_before
200+
assert growth < 50, "RSS grew by %d pages during repeated sub()" % growth
201+
202+
203+
def test_json_like_workload_no_leak():
204+
"""Simulate JSON parsing regex workload (STRINGCHUNK, WHITESPACE, NUMBER)."""
205+
STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', re.VERBOSE | re.MULTILINE | re.DOTALL)
206+
WHITESPACE = re.compile(r"[ \t\n\r]*", re.VERBOSE | re.MULTILINE | re.DOTALL)
207+
NUMBER_RE = re.compile(
208+
r"(-?(?:0|[1-9]\d*))(\.\d+)?([eE][-+]?\d+)?",
209+
re.VERBOSE | re.MULTILINE | re.DOTALL,
210+
)
211+
test_strings = ['"hello"', '"world"', ' \n\t ', "12345", "-3.14e10"]
212+
213+
old_thresh = gc.threshold()
214+
gc.threshold(4096)
215+
gc.collect()
216+
rss_before = _get_rss_pages()
217+
for _ in range(10000):
218+
for s in test_strings:
219+
STRINGCHUNK.match(s)
220+
WHITESPACE.match(s)
221+
NUMBER_RE.match(s)
222+
gc.collect()
223+
rss_after = _get_rss_pages()
224+
gc.threshold(old_thresh)
225+
growth = rss_after - rss_before
226+
assert growth < 50, (
227+
"RSS grew by %d pages during JSON-like regex workload" % growth
228+
)
229+
230+
231+
def test_explicit_free():
232+
"""_free() releases pcre2_code and invalidates the pattern."""
233+
p = re.compile(r"explicit_free_test")
234+
assert p.obj is not None
235+
p._free()
236+
assert p.obj is None
237+
238+
239+
if __name__ == "__main__":
240+
print("Running re FFI memory leak tests...")
241+
_run("test_search_still_works", test_search_still_works)
242+
_run("test_match_still_works", test_match_still_works)
243+
_run("test_sub_still_works", test_sub_still_works)
244+
_run("test_findall_still_works", test_findall_still_works)
245+
_run("test_split_still_works", test_split_still_works)
246+
_run("test_compiled_pattern_reuse", test_compiled_pattern_reuse)
247+
_run("test_no_match_returns_none", test_no_match_returns_none)
248+
_run("test_groups_and_captures", test_groups_and_captures)
249+
_run("test_sub_with_callable", test_sub_with_callable)
250+
_run("test_cache_reuses_pattern", test_cache_reuses_pattern)
251+
_run("test_cache_eviction", test_cache_eviction)
252+
_run("test_purge_clears_cache", test_purge_clears_cache)
253+
_run("test_match_data_no_leak", test_match_data_no_leak)
254+
_run("test_match_data_no_leak_on_no_match", test_match_data_no_leak_on_no_match)
255+
_run("test_cached_patterns_no_leak", test_cached_patterns_no_leak)
256+
_run("test_eviction_frees_code", test_eviction_frees_code)
257+
_run("test_sub_repeated_no_leak", test_sub_repeated_no_leak)
258+
_run("test_json_like_workload_no_leak", test_json_like_workload_no_leak)
259+
_run("test_explicit_free", test_explicit_free)
260+
print()
261+
print("%d passed, %d failed" % (PASS, FAIL))
262+
if FAIL:
263+
raise SystemExit(1)

0 commit comments

Comments (0)