|
| 1 | +"""Tests for FFI memory leak fixes in the re module. |
| 2 | +
|
| 3 | +Verifies that: |
| 4 | +- pcre2_match_data is freed after every search/match (C heap leak per operation) |
| 5 | +- pcre2_code is freed via cache eviction (C heap leak per compile) |
| 6 | +- Pattern cache prevents recompilation and bounds memory usage |
| 7 | +""" |
| 8 | +import gc |
| 9 | +import re |
| 10 | + |
# Running tallies of tests that passed / failed in this script run.
PASS = FAIL = 0
| 13 | + |
| 14 | + |
def _get_rss_pages():
    """Report the process's current RSS, measured in pages.

    The value is the second whitespace-separated field of
    ``/proc/self/statm``, converted to ``int``.
    """
    with open("/proc/self/statm") as statm:
        fields = statm.read().split()
    resident = fields[1]
    return int(resident)
| 19 | + |
| 20 | + |
def _run(name, func):
    """Execute one test callable and record the outcome.

    Calls ``func()`` with no arguments.  If it returns normally the global
    ``PASS`` counter is bumped and a PASS line is printed; if any
    ``Exception`` is raised the global ``FAIL`` counter is bumped and a FAIL
    line carrying the exception is printed.  The exception is not re-raised.
    """
    global PASS, FAIL
    try:
        func()
        PASS = PASS + 1
        print(" PASS:", name)
    except Exception as exc:
        FAIL = FAIL + 1
        print(" FAIL:", name, "-", exc)
| 30 | + |
| 31 | + |
| 32 | +# --------------------------------------------------------------------------- |
| 33 | +# Functional correctness |
| 34 | +# --------------------------------------------------------------------------- |
| 35 | + |
def test_search_still_works():
    """re.search() finds the first run of ``a`` inside the subject."""
    m = re.search(r"a+", "caaab")
    # Check for a miss explicitly so a regression reports a clear assertion
    # instead of an AttributeError on None.
    assert m is not None, "re.search returned None for a matching subject"
    assert m.group(0) == "aaa"
| 39 | + |
| 40 | + |
def test_match_still_works():
    """re.match() anchors at the start: matches ``aaab``, rejects ``bbb``."""
    m = re.match(r"a+", "aaab")
    # Check for a miss explicitly so a regression reports a clear assertion
    # instead of an AttributeError on None.
    assert m is not None, "re.match returned None for a matching subject"
    assert m.group(0) == "aaa"
    assert re.match(r"a+", "bbb") is None
| 45 | + |
| 46 | + |
def test_sub_still_works():
    """re.sub() replaces every ``a`` in the subject with ``z``."""
    replaced = re.sub("a", "z", "caaab")
    assert replaced == "czzzb"
| 49 | + |
| 50 | + |
def test_findall_still_works():
    """re.findall() returns every word ending in ``ly``, in order."""
    adverbs = re.findall(r"\w+ly", "carefully and quickly")
    expected = ["carefully", "quickly"]
    assert adverbs == expected
| 53 | + |
| 54 | + |
def test_split_still_works():
    """re.split() breaks the subject on runs of non-word characters."""
    pieces = re.split(r"\W+", "one, two, three")
    expected = ["one", "two", "three"]
    assert pieces == expected
| 57 | + |
| 58 | + |
def test_compiled_pattern_reuse():
    """One compiled pattern keeps matching correctly over 100 calls."""
    digits = re.compile(r"(\d+)")
    for n in range(100):
        text = str(n)
        found = digits.match(text)
        assert found is not None
        assert found.group(1) == text
| 66 | + |
| 67 | + |
def test_no_match_returns_none():
    """A non-matching search() returns None on each of 1000 attempts."""
    never = re.compile(r"xyz")
    attempts = 0
    while attempts < 1000:
        outcome = never.search("abc")
        assert outcome is None
        attempts += 1
| 72 | + |
| 73 | + |
def test_groups_and_captures():
    """groups() and multi-argument group() return the captured substrings."""
    m = re.match(r"(\d+)\.(\d+)", "24.1632")
    # Check for a miss explicitly so a regression reports a clear assertion
    # instead of an AttributeError on None.
    assert m is not None, "re.match returned None for a matching subject"
    assert m.groups() == ("24", "1632")
    assert m.group(2, 1) == ("1632", "24")
| 78 | + |
| 79 | + |
def test_sub_with_callable():
    """re.sub() accepts a callable replacement that receives each match."""

    def doubled(match):
        # Replace each matched text with two copies of itself.
        return match.group(0) * 2

    replaced = re.sub("a", doubled, "caaab")
    assert replaced == "caaaaaab"
| 82 | + |
| 83 | + |
| 84 | +# --------------------------------------------------------------------------- |
| 85 | +# Pattern cache tests |
| 86 | +# --------------------------------------------------------------------------- |
| 87 | + |
def test_cache_reuses_pattern():
    """Searching twice with one pattern string leaves a single cache entry.

    After a purge, the first search must register the key
    ``(pattern, 0)`` in ``re._cache``; a second search with the same
    pattern must not add another entry for it.
    """
    pattern = r"test_cache_1"
    re.purge()
    re.search(pattern, "x")
    assert (pattern, 0) in re._cache
    re.search(pattern, "y")
    entries = 0
    for key in re._cache:
        if key[0] == pattern:
            entries += 1
    assert entries == 1
| 95 | + |
| 96 | + |
def test_cache_eviction():
    """The cache holds exactly ``_CACHE_MAX`` entries after overfilling it.

    Starting from an empty cache, searches with ``_CACHE_MAX + 10`` distinct
    patterns; the cache size must end up equal to ``re._CACHE_MAX``.
    """
    re.purge()
    total = re._CACHE_MAX + 10
    for index in range(total):
        text = "evict_%d" % index
        re.search(text, text)
    cached = len(re._cache)
    assert cached == re._CACHE_MAX
| 103 | + |
| 104 | + |
def test_purge_clears_cache():
    """re.purge() empties a cache that a prior search had populated."""
    re.search("purge_test", "purge_test")
    populated = len(re._cache)
    assert populated > 0
    re.purge()
    remaining = len(re._cache)
    assert remaining == 0
| 110 | + |
| 111 | + |
| 112 | +# --------------------------------------------------------------------------- |
| 113 | +# C heap memory leak tests (use gc.threshold to keep Python heap clean) |
| 114 | +# --------------------------------------------------------------------------- |
| 115 | + |
def test_match_data_no_leak():
    """Repeated search/match must not grow the C heap (pcre2_match_data freed).

    Runs 50000 matching searches on one compiled pattern and asserts that
    RSS grew by fewer than 50 pages.  Before the fix, each search() leaked
    ~48+ bytes of pcre2_match_data; over 50k iterations that's ~2+ MB of
    RSS growth.

    The GC threshold is restored in a ``finally`` block so that an exception
    raised inside the measured loop cannot leave the modified threshold in
    effect for whatever runs next.
    """
    old_thresh = gc.threshold()
    gc.threshold(4096)
    try:
        pat = re.compile(r"(\w+)\s+(\w+)")
        # Collect before sampling so Python-heap garbage does not skew RSS.
        gc.collect()
        rss_before = _get_rss_pages()
        for _ in range(50000):
            pat.search("hello world foo bar")
        gc.collect()
        rss_after = _get_rss_pages()
    finally:
        gc.threshold(old_thresh)
    growth = rss_after - rss_before
    assert growth < 50, "RSS grew by %d pages; pcre2_match_data likely leaking" % growth
| 134 | + |
| 135 | + |
def test_match_data_no_leak_on_no_match():
    """Non-matching searches must also free match_data.

    Runs 50000 searches that never match and asserts that RSS grew by fewer
    than 50 pages.

    The GC threshold is restored in a ``finally`` block so that an exception
    raised inside the measured loop cannot leave the modified threshold in
    effect for whatever runs next.
    """
    old_thresh = gc.threshold()
    gc.threshold(4096)
    try:
        pat = re.compile(r"xyz123")
        # Collect before sampling so Python-heap garbage does not skew RSS.
        gc.collect()
        rss_before = _get_rss_pages()
        for _ in range(50000):
            pat.search("nothing here")
        gc.collect()
        rss_after = _get_rss_pages()
    finally:
        gc.threshold(old_thresh)
    growth = rss_after - rss_before
    assert growth < 50, "RSS grew by %d pages on no-match path" % growth
| 150 | + |
| 151 | + |
def test_cached_patterns_no_leak():
    """Convenience functions use the cache, so pcre2_code doesn't leak.

    Calls the module-level ``re.search`` 50000 times with the same pattern
    string and asserts that RSS grew by fewer than 50 pages.

    Cleanup (restoring the GC threshold and purging the pattern cache) runs
    in a ``finally`` block so it still happens when the measured loop raises.
    """
    re.purge()
    old_thresh = gc.threshold()
    gc.threshold(4096)
    try:
        # Collect before sampling so Python-heap garbage does not skew RSS.
        gc.collect()
        rss_before = _get_rss_pages()
        for _ in range(50000):
            re.search(r"cached_\d+", "cached_123")
        gc.collect()
        rss_after = _get_rss_pages()
    finally:
        gc.threshold(old_thresh)
        re.purge()
    growth = rss_after - rss_before
    assert growth < 50, "RSS grew by %d pages; pcre2_code likely leaking" % growth
| 167 | + |
| 168 | + |
def test_eviction_frees_code():
    """Evicted patterns must have their pcre2_code freed.

    Searches with 500 distinct pattern strings and asserts that RSS grew by
    fewer than 50 pages.
    NOTE(review): this only exercises eviction if ``re._CACHE_MAX`` is below
    500 -- confirm against the re module.

    Cleanup (restoring the GC threshold and purging the pattern cache) runs
    in a ``finally`` block so it still happens when the measured loop raises.
    """
    re.purge()
    old_thresh = gc.threshold()
    gc.threshold(4096)
    try:
        # Collect before sampling so Python-heap garbage does not skew RSS.
        gc.collect()
        rss_before = _get_rss_pages()
        for i in range(500):
            re.search("evict_free_%d" % i, "evict_free_%d" % i)
        gc.collect()
        rss_after = _get_rss_pages()
    finally:
        gc.threshold(old_thresh)
        re.purge()
    growth = rss_after - rss_before
    assert growth < 50, "RSS grew by %d pages during cache eviction" % growth
| 184 | + |
| 185 | + |
def test_sub_repeated_no_leak():
    """sub() calls search() in a loop; match_data must be freed each time.

    Runs ``re.sub`` 5000 times over a subject with several matches and
    asserts that RSS grew by fewer than 50 pages.

    Cleanup (restoring the GC threshold and purging the pattern cache) runs
    in a ``finally`` block so it still happens when the measured loop raises.
    """
    re.purge()
    old_thresh = gc.threshold()
    gc.threshold(4096)
    try:
        # Collect before sampling so Python-heap garbage does not skew RSS.
        gc.collect()
        rss_before = _get_rss_pages()
        for _ in range(5000):
            re.sub(r"\s+", "-", "one two three four five")
        gc.collect()
        rss_after = _get_rss_pages()
    finally:
        gc.threshold(old_thresh)
        re.purge()
    growth = rss_after - rss_before
    assert growth < 50, "RSS grew by %d pages during repeated sub()" % growth
| 201 | + |
| 202 | + |
def test_json_like_workload_no_leak():
    """Simulate JSON parsing regex workload (STRINGCHUNK, WHITESPACE, NUMBER).

    Matches three compiled patterns against five sample strings, 10000 times
    over, and asserts that RSS grew by fewer than 50 pages.

    The GC threshold is restored in a ``finally`` block so that an exception
    raised inside the measured loop cannot leave the modified threshold in
    effect for whatever runs next.
    """
    STRINGCHUNK = re.compile(r'(.*?)(["\\\x00-\x1f])', re.VERBOSE | re.MULTILINE | re.DOTALL)
    WHITESPACE = re.compile(r"[ \t\n\r]*", re.VERBOSE | re.MULTILINE | re.DOTALL)
    NUMBER_RE = re.compile(
        r"(-?(?:0|[1-9]\d*))(\.\d+)?([eE][-+]?\d+)?",
        re.VERBOSE | re.MULTILINE | re.DOTALL,
    )
    test_strings = ['"hello"', '"world"', ' \n\t ', "12345", "-3.14e10"]

    old_thresh = gc.threshold()
    gc.threshold(4096)
    try:
        # Collect before sampling so Python-heap garbage does not skew RSS.
        gc.collect()
        rss_before = _get_rss_pages()
        for _ in range(10000):
            for s in test_strings:
                STRINGCHUNK.match(s)
                WHITESPACE.match(s)
                NUMBER_RE.match(s)
        gc.collect()
        rss_after = _get_rss_pages()
    finally:
        gc.threshold(old_thresh)
    growth = rss_after - rss_before
    assert growth < 50, (
        "RSS grew by %d pages during JSON-like regex workload" % growth
    )
| 229 | + |
| 230 | + |
def test_explicit_free():
    """Calling ``_free()`` on a compiled pattern resets its ``obj`` to None.

    ``obj`` must be non-None right after compilation and None after
    ``_free()``.
    """
    compiled = re.compile(r"explicit_free_test")
    handle_before = compiled.obj
    assert handle_before is not None
    compiled._free()
    handle_after = compiled.obj
    assert handle_after is None
| 237 | + |
| 238 | + |
if __name__ == "__main__":
    print("Running re FFI memory leak tests...")
    # (label, callable) pairs, executed in the listed order.
    for label, test in (
        ("test_search_still_works", test_search_still_works),
        ("test_match_still_works", test_match_still_works),
        ("test_sub_still_works", test_sub_still_works),
        ("test_findall_still_works", test_findall_still_works),
        ("test_split_still_works", test_split_still_works),
        ("test_compiled_pattern_reuse", test_compiled_pattern_reuse),
        ("test_no_match_returns_none", test_no_match_returns_none),
        ("test_groups_and_captures", test_groups_and_captures),
        ("test_sub_with_callable", test_sub_with_callable),
        ("test_cache_reuses_pattern", test_cache_reuses_pattern),
        ("test_cache_eviction", test_cache_eviction),
        ("test_purge_clears_cache", test_purge_clears_cache),
        ("test_match_data_no_leak", test_match_data_no_leak),
        ("test_match_data_no_leak_on_no_match", test_match_data_no_leak_on_no_match),
        ("test_cached_patterns_no_leak", test_cached_patterns_no_leak),
        ("test_eviction_frees_code", test_eviction_frees_code),
        ("test_sub_repeated_no_leak", test_sub_repeated_no_leak),
        ("test_json_like_workload_no_leak", test_json_like_workload_no_leak),
        ("test_explicit_free", test_explicit_free),
    ):
        _run(label, test)
    print()
    print("%d passed, %d failed" % (PASS, FAIL))
    # Exit with status 1 when any test failed.
    if FAIL:
        raise SystemExit(1)
0 commit comments