From 461aaba524e701bb8734f99f342ab11bacb7694f Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Tue, 31 Mar 2026 16:36:14 -0400 Subject: [PATCH 1/2] add utf8proc_free --- README.md | 6 +++--- bench/bench.c | 4 ++-- test/case.c | 4 ++-- test/fuzzer.c | 44 ++++++++++++++++++++++---------------------- test/graphemetest.c | 4 ++-- test/normtest.c | 2 +- test/printproperty.c | 2 +- test/tests.c | 2 +- utf8proc.c | 4 ++++ utf8proc.h | 17 ++++++++++++++++- 10 files changed, 54 insertions(+), 35 deletions(-) diff --git a/README.md b/README.md index 72fe0cd4..e9a7ba2f 100644 --- a/README.md +++ b/README.md @@ -126,7 +126,7 @@ utf8proc_uint8_t *fold_str; utf8proc_map(str, 0, &fold_str, UTF8PROC_NULLTERM | UTF8PROC_CASEFOLD); printf("%s\n", fold_str); // ss -free(fold_str); +utf8proc_free(fold_str); ``` ### Normalization Form C/D (NFC/NFD) @@ -138,6 +138,6 @@ utf8proc_uint8_t *nfd= utf8proc_NFD(input); // = {0x61, 0xcc, 0x88, 0x6f, 0xcc, // Compose "a\u0308o\u0308u\u0308" into "\u00e4\u00f6\u00fc" (= "äöü" via precomposed characters) utf8proc_uint8_t *nfc= utf8proc_NFC(nfd); -free(nfd); -free(nfc); +utf8proc_free(nfd); +utf8proc_free(nfc); ``` diff --git a/bench/bench.c b/bench/bench.c index 4932c6d4..b0cc92a8 100644 --- a/bench/bench.c +++ b/bench/bench.c @@ -9,7 +9,7 @@ int main(int argc, char **argv) { int i, j; int options = 0; - + for (i = 1; i < argc; ++i) { if (!strcmp(argv[i], "-nfkc")) { options |= UTF8PROC_STABLE|UTF8PROC_COMPOSE|UTF8PROC_COMPAT; @@ -46,7 +46,7 @@ int main(int argc, char **argv) mytime start = gettime(); for (j = 0; j < 100; ++j) { utf8proc_map(src, len, &dest, options); - free(dest); + utf8proc_free(dest); } printf("%s: %g\n", argv[i], elapsed(gettime(), start) / 100); free(src); diff --git a/test/case.c b/test/case.c index c1630f0d..e7a92d40 100644 --- a/test/case.c +++ b/test/case.c @@ -68,8 +68,8 @@ int main(int argc, char **argv) check(!strcmp((char*)s1, "ss") && !strcmp((char*)s2, "ss"), "incorrect 0x00df/0x1e9e casefold 
normalization"); - free(s1); - free(s2); + utf8proc_free(s1); + utf8proc_free(s2); printf("More up-to-date than OS unicode tables for %d tests.\n", better); printf("utf8proc case conversion tests SUCCEEDED.\n"); return 0; diff --git a/test/fuzzer.c b/test/fuzzer.c index fad14cc9..1c216efc 100644 --- a/test/fuzzer.c +++ b/test/fuzzer.c @@ -16,13 +16,13 @@ int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) utf8proc_ssize_t ret, bytes = 0; utf8proc_uint8_t *str = NULL; size_t len = strlen((const char*)data); - + while(bytes != len) { ret = utf8proc_iterate(ptr, -1, &c); - + if(ret < 0 || ret == 0) break; - + bytes += ret; ptr += ret; @@ -35,31 +35,31 @@ int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) utf8proc_category(c); utf8proc_category_string(c); utf8proc_codepoint_valid(c); - + utf8proc_grapheme_break(c_prev, c); utf8proc_grapheme_break_stateful(c_prev, c, &state); - + c_prev = c; } - + utf8proc_int32_t *copy = size >= 4 ? NULL : malloc(size); - + if(copy) { size /= 4; - + options = UTF8PROC_STRIPCC | UTF8PROC_NLF2LS | UTF8PROC_NLF2PS; memcpy(copy, data, size); utf8proc_normalize_utf32(copy, size, options); - + options = UTF8PROC_STRIPCC | UTF8PROC_NLF2LS; memcpy(copy, data, size); utf8proc_normalize_utf32(copy, size, options); - + options = UTF8PROC_STRIPCC | UTF8PROC_NLF2PS; memcpy(copy, data, size); utf8proc_normalize_utf32(copy, size, options); - + options = UTF8PROC_STRIPCC; memcpy(copy, data, size); utf8proc_normalize_utf32(copy, size, options); @@ -71,30 +71,30 @@ int LLVMFuzzerTestOneInput(const uint8_t* data, size_t size) options = 0; memcpy(copy, data, size); utf8proc_normalize_utf32(copy, size, options); - + free(copy); } - free(utf8proc_NFD(data)); - free(utf8proc_NFC(data)); - free(utf8proc_NFKD(data)); - free(utf8proc_NFKC(data)); - free(utf8proc_NFKC_Casefold(data)); + utf8proc_free(utf8proc_NFD(data)); + utf8proc_free(utf8proc_NFC(data)); + utf8proc_free(utf8proc_NFKD(data)); + utf8proc_free(utf8proc_NFKC(data)); + 
utf8proc_free(utf8proc_NFKC_Casefold(data)); utf8proc_map(data, len, &str, UTF8PROC_CHARBOUND | UTF8PROC_STRIPNA); - free(str); + utf8proc_free(str); utf8proc_map(data, len, &str, UTF8PROC_LUMP | UTF8PROC_NLF2LS | UTF8PROC_NLF2PS); - free(str); + utf8proc_free(str); utf8proc_map(data, len, &str, UTF8PROC_COMPOSE | UTF8PROC_STRIPMARK); - free(str); + utf8proc_free(str); utf8proc_map(data, len, &str, UTF8PROC_CHARBOUND | UTF8PROC_DECOMPOSE); - free(str); + utf8proc_free(str); utf8proc_map(data, len, &str, UTF8PROC_CHARBOUND | UTF8PROC_COMPOSE); - free(str); + utf8proc_free(str); return 0; } diff --git a/test/graphemetest.c b/test/graphemetest.c index 025cd5d9..7fb16332 100644 --- a/test/graphemetest.c +++ b/test/graphemetest.c @@ -58,7 +58,7 @@ void checkline(const char *_buf, bool verbose) { check(!strcmp((char*)g, (char*)src), "grapheme mismatch: \"%s\" instead of \"%s\"", (char*)g, (char*)src); } - free(g); + utf8proc_free(g); } if (si) { /* test manual calls to utf8proc_grapheme_break_stateful */ @@ -112,7 +112,7 @@ int main(int argc, char **argv) glen = utf8proc_map(input, 6, &g, UTF8PROC_CHARBOUND); check(!strcmp((char*)g, (char*)output), "mishandled u+ffff and u+fffe grapheme breaks"); check(glen != 6, "mishandled u+ffff and u+fffe grapheme breaks"); - free(g); + utf8proc_free(g); }; /* https://github.com/JuliaLang/julia/issues/37680 */ diff --git a/test/normtest.c b/test/normtest.c index 2e0d30b8..5dfc4885 100644 --- a/test/normtest.c +++ b/test/normtest.c @@ -4,7 +4,7 @@ unsigned char *src_norm = (unsigned char*) utf8proc_ ## NRM((utf8proc_uint8_t*) src); \ check(!strcmp((char *) norm, (char *) src_norm), \ "normalization failed for %s -> %s", src, norm); \ - free(src_norm); \ + utf8proc_free(src_norm); \ } int main(int argc, char **argv) diff --git a/test/printproperty.c b/test/printproperty.c index 13f3115b..6c6ee4c4 100644 --- a/test/printproperty.c +++ b/test/printproperty.c @@ -62,7 +62,7 @@ int main(int argc, char **argv) p->boundclass, 
p->indic_conjunct_break, utf8proc_charwidth(c)); - free(map); + utf8proc_free(map); } return 0; } diff --git a/test/tests.c b/test/tests.c index 8a47b85a..9dfdc7da 100644 --- a/test/tests.c +++ b/test/tests.c @@ -95,6 +95,6 @@ void check_compare(const char *transformation, print_string_and_escaped(f, expected); } fprintf(f, "\n"); - if (free_received) free(received); + if (free_received) utf8proc_free(received); if (!passed) exit(1); } diff --git a/utf8proc.c b/utf8proc.c index e8fa207a..d2d45590 100644 --- a/utf8proc.c +++ b/utf8proc.c @@ -836,3 +836,7 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8 UTF8PROC_COMPOSE | UTF8PROC_COMPAT | UTF8PROC_CASEFOLD | UTF8PROC_IGNORE)); return retval; } + +UTF8PROC_DLLEXPORT void utf8proc_free(utf8proc_uint8_t *ptr) { + free(ptr); +} diff --git a/utf8proc.h b/utf8proc.h index 8d9a2e41..83bb5f6c 100644 --- a/utf8proc.h +++ b/utf8proc.h @@ -750,6 +750,8 @@ UTF8PROC_DLLEXPORT const char *utf8proc_category_string(utf8proc_int32_t codepoi * * @note The memory of the new UTF-8 string will have been allocated * with `malloc`, and should therefore be deallocated with `free`. + * However, it is safer to deallocate it with @ref utf8proc_free in + * case your application is linked to a different C library than utf8proc. * * @note `utf8proc_map` simply calls `utf8proc_decompose` followed by `utf8proc_reencode`, * and applications requiring greater control over memory allocation should instead call @@ -760,7 +762,7 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map( ); /** - * Like utf8proc_map(), but also takes a `custom_func` mapping function + * Like @ref utf8proc_map, but also takes a `custom_func` mapping function * that is called on each codepoint in `str` before any other transformations * (along with a `custom_data` pointer that is passed through to `custom_func`). * The `custom_func` argument is ignored if it is `NULL`. 
@@ -776,6 +778,11 @@ UTF8PROC_DLLEXPORT utf8proc_ssize_t utf8proc_map_custom( * NFKC_Casefold normalized version of the null-terminated string `str`. These * are shortcuts to calling utf8proc_map() with @ref UTF8PROC_NULLTERM * combined with @ref UTF8PROC_STABLE and flags indicating the normalization. + * + * @note The memory of the new UTF-8 string will have been allocated + * with `malloc`, and should therefore be deallocated with `free`. + * However, it is safer to deallocate it with @ref utf8proc_free in + * case your application is linked to a different C library than utf8proc. */ /** @{ */ /** NFD normalization (@ref UTF8PROC_DECOMPOSE). */ @@ -793,6 +800,14 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC(const utf8proc_uint8_t *str); UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8_t *str); /** @} */ +/** + * Deallocate memory that was allocated and returned by @ref utf8proc_map and similar + * functions. This simply calls the `free` function of the C library that utf8proc is linked to. + * It is safer to call `utf8proc_free` than to call `free` directly, in case your application + * is linked to a different C library with incompatible `malloc` and `free` functions. + */ +UTF8PROC_DLLEXPORT void utf8proc_free(utf8proc_uint8_t *ptr); + #ifdef __cplusplus } #endif From 514b7191fc113061f3fd29f52040e05a073d8f12 Mon Sep 17 00:00:00 2001 From: "Steven G. Johnson" Date: Tue, 31 Mar 2026 17:31:28 -0400 Subject: [PATCH 2/2] Apply suggestion from @stevengj --- utf8proc.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/utf8proc.c b/utf8proc.c index d2d45590..ad3cae9f 100644 --- a/utf8proc.c +++ b/utf8proc.c @@ -837,6 +837,6 @@ UTF8PROC_DLLEXPORT utf8proc_uint8_t *utf8proc_NFKC_Casefold(const utf8proc_uint8 return retval; } -UTF8PROC_DLLEXPORT void utf8proc_free(utf8proc_uint8_t *ptr) { +UTF8PROC_DLLEXPORT void utf8proc_free(utf8proc_uint8_t *ptr) { free(ptr); }