@@ -142,6 +142,55 @@ namespace osmium {
142142 return 0 ;
143143 }
144144
/**
 * Check that a NUL-terminated string contains only well-formed UTF-8.
 *
 * A sequence is rejected when it
 *  - starts with a stray continuation byte or an invalid lead byte,
 *  - is missing one of its continuation bytes (including the case
 *    where the string ends in the middle of a sequence),
 *  - is an overlong encoding,
 *  - encodes a UTF-16 surrogate (U+D800..U+DFFF), or
 *  - encodes a value above U+10FFFF.
 *
 * Noncharacters such as U+FFFE and U+FFFF are accepted: they are
 * well-formed UTF-8 according to RFC 3629 and the Unicode Standard,
 * so treating them as errors would reject valid input.
 *
 * @param str Pointer to a NUL-terminated byte string. Must not be
 *            nullptr.
 * @returns nullptr if the string is valid, otherwise a pointer to the
 *          first byte of the first invalid sequence.
 *
 * Based on https://www.cl.cam.ac.uk/~mgk25/ucs/utf8_check.c
 * License: http://www.cl.cam.ac.uk/~mgk25/short-license.html
 */
inline char const* utf8_check(char const* str) noexcept {
    auto const* s = reinterpret_cast<unsigned char const*>(str);

    while (*s != 0) {
        // Number of bytes in the sequence starting at s and whether
        // that sequence is well-formed. In every branch below the
        // continuation bytes are tested in order, so the && chain
        // short-circuits on the terminating NUL and never reads past
        // the end of the string.
        unsigned int length = 1;
        bool valid = false;

        if (s[0] < 0x80) {
            /* 0xxxxxxx */
            valid = true;
        } else if ((s[0] & 0xe0) == 0xc0) {
            /* 110XXXXx 10xxxxxx */
            length = 2;
            valid = (s[1] & 0xc0) == 0x80 &&
                    (s[0] & 0xfe) != 0xc0; /* not overlong */
        } else if ((s[0] & 0xf0) == 0xe0) {
            /* 1110XXXX 10Xxxxxx 10xxxxxx */
            length = 3;
            valid = (s[1] & 0xc0) == 0x80 &&
                    (s[2] & 0xc0) == 0x80 &&
                    !(s[0] == 0xe0 && (s[1] & 0xe0) == 0x80) && /* not overlong */
                    !(s[0] == 0xed && (s[1] & 0xe0) == 0xa0);   /* not a surrogate */
        } else if ((s[0] & 0xf8) == 0xf0) {
            /* 11110XXX 10XXxxxx 10xxxxxx 10xxxxxx */
            length = 4;
            valid = (s[1] & 0xc0) == 0x80 &&
                    (s[2] & 0xc0) == 0x80 &&
                    (s[3] & 0xc0) == 0x80 &&
                    !(s[0] == 0xf0 && (s[1] & 0xf0) == 0x80) && /* not overlong */
                    !(s[0] == 0xf4 && s[1] > 0x8f) &&           /* not > U+10FFFF */
                    s[0] <= 0xf4;                               /* not > U+10FFFF */
        }
        // Any other lead byte (10xxxxxx or 11111xxx) leaves valid == false.

        if (!valid) {
            return reinterpret_cast<char const*>(s);
        }
        s += length;
    }

    return nullptr;
}
193+
145194 inline uint32_t next_utf8_codepoint (char const ** begin, const char * end) {
146195 const auto * it = reinterpret_cast <const uint8_t *>(*begin);
147196 uint32_t cp = 0xffU & *it;
0 commit comments