Skip to content

Commit 6d22ebc

Browse files
committed
Add function to check string for invalid UTF-8 characters
1 parent e1e29de commit 6d22ebc

2 files changed

Lines changed: 69 additions & 0 deletions

File tree

include/osmium/io/detail/string_util.hpp

Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -142,6 +142,55 @@ namespace osmium {
142142
return 0;
143143
}
144144

/**
 * Check that a NUL-terminated string contains only well-formed UTF-8.
 *
 * The following are reported as invalid:
 *  - bytes that can not start a sequence (stray continuation bytes
 *    0x80-0xbf and the lead bytes 0xf5-0xff),
 *  - sequences that are cut short or contain a byte that is not a
 *    continuation byte where one is required,
 *  - overlong encodings,
 *  - UTF-16 surrogates (U+D800 to U+DFFF),
 *  - code points above U+10FFFF.
 *
 * Noncharacters such as U+FFFE and U+FFFF are accepted: they are
 * Unicode scalar values and their encodings are well-formed UTF-8.
 * (The reference implementation named below rejects these two.)
 *
 * Based on https://www.cl.cam.ac.uk/~mgk25/ucs/utf8_check.c
 * License: http://www.cl.cam.ac.uk/~mgk25/short-license.html
 *
 * @param str Pointer to a NUL-terminated byte string. Must not be
 *            nullptr, it is dereferenced unconditionally.
 * @returns nullptr if the string is valid, otherwise a pointer to the
 *          first byte of the first invalid sequence.
 */
inline char const* utf8_check(char const* str) noexcept {
    auto* s = reinterpret_cast<unsigned char const*>(str);

    // In every multi-byte branch the continuation-byte tests come first
    // and are evaluated left to right with short-circuiting. A NUL byte
    // is not a continuation byte, so the terminator inside a truncated
    // sequence ends the evaluation before any byte behind it is read.
    while (*s) {
        if (*s < 0x80) {
            /* 0xxxxxxx */
            ++s;
        } else if ((s[0] & 0xe0) == 0xc0) {
            /* 110XXXXx 10xxxxxx */
            if ((s[1] & 0xc0) != 0x80 ||
                (s[0] & 0xfe) == 0xc0) /* overlong? */ {
                return reinterpret_cast<char const*>(s);
            }
            s += 2;
        } else if ((s[0] & 0xf0) == 0xe0) {
            /* 1110XXXX 10Xxxxxx 10xxxxxx */
            if ((s[1] & 0xc0) != 0x80 || (s[2] & 0xc0) != 0x80 ||
                (s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) || /* overlong? */
                (s[0] == 0xed && (s[1] & 0xe0) == 0xa0)) /* surrogate? */ {
                return reinterpret_cast<char const*>(s);
            }
            s += 3;
        } else if ((s[0] & 0xf8) == 0xf0) {
            /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */
            if ((s[1] & 0xc0) != 0x80 || (s[2] & 0xc0) != 0x80 ||
                (s[3] & 0xc0) != 0x80 ||
                (s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) || /* overlong? */
                (s[0] == 0xf4 && s[1] > 0x8f) || s[0] > 0xf4) /* > U+10FFFF? */ {
                return reinterpret_cast<char const*>(s);
            }
            s += 4;
        } else {
            // Continuation byte in lead position or a byte >= 0xf8.
            return reinterpret_cast<char const*>(s);
        }
    }

    return nullptr;
}
193+
145194
inline uint32_t next_utf8_codepoint(char const** begin, const char* end) {
146195
const auto* it = reinterpret_cast<const uint8_t*>(*begin);
147196
uint32_t cp = 0xffU & *it;

test/t/io/test_output_utils.cpp

Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -231,3 +231,23 @@ TEST_CASE("incomplete Unicode codepoint") {
231231
}
232232
}
233233

234+
TEST_CASE("utf-8 check for valid UTF-8") {
235+
REQUIRE(osmium::io::detail::utf8_check("") == nullptr);
236+
REQUIRE(osmium::io::detail::utf8_check("a") == nullptr);
237+
REQUIRE(osmium::io::detail::utf8_check("abc") == nullptr);
238+
239+
const char* s = u8cast(u8"\n_\u01a2_\u30dc_\U0001d11e_\U0001f680");
240+
REQUIRE(osmium::io::detail::utf8_check(s) == nullptr);
241+
}
242+
243+
TEST_CASE("utf-8 check for illegal value") {
244+
const char* s = "abc\xff";
245+
REQUIRE(osmium::io::detail::utf8_check(s) != nullptr);
246+
}
247+
248+
TEST_CASE("utf-8 check for incomplete Unicode codepoint") {
249+
std::string s{u8cast(u8"\U0001f680")}; // rocket
250+
s.resize(s.size() - 1);
251+
REQUIRE(osmium::io::detail::utf8_check(s.c_str()) != nullptr);
252+
}
253+

0 commit comments

Comments
 (0)