|
| 1 | +/* skip test if not UTF8 server encoding */ |
| 2 | +SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset |
| 3 | +\if :skip_test |
| 4 | +\quit |
| 5 | +\endif |
| 6 | +-- C-language helper functions used by the tests below, loaded from the oraregress library. |
| 7 | +CREATE FUNCTION test_bytea_to_text(bytea) RETURNS text |
| 8 | + AS '@libdir@/oraregress@DLSUFFIX@' LANGUAGE C STRICT; |
| 9 | +/ |
| 10 | +CREATE FUNCTION test_text_to_bytea(text) RETURNS bytea |
| 11 | + AS '@libdir@/oraregress@DLSUFFIX@' LANGUAGE C STRICT; |
| 12 | +/ |
| 13 | +CREATE FUNCTION test_mblen_func(text, text, text, int) RETURNS int |
| 14 | + AS '@libdir@/oraregress@DLSUFFIX@' LANGUAGE C STRICT; |
| 15 | +/ |
| 16 | +CREATE FUNCTION test_text_to_wchars(text, text) RETURNS int[] |
| 17 | + AS '@libdir@/oraregress@DLSUFFIX@' LANGUAGE C STRICT; |
| 18 | +/ |
| 19 | +CREATE FUNCTION test_wchars_to_text(text, int[]) RETURNS text |
| 20 | + AS '@libdir@/oraregress@DLSUFFIX@' LANGUAGE C STRICT; |
| 21 | +/ |
| 22 | +CREATE FUNCTION test_valid_server_encoding(text) RETURNS boolean |
| 23 | + AS '@libdir@/oraregress@DLSUFFIX@' LANGUAGE C STRICT; |
| 24 | +/ |
| 25 | + |
| 26 | +-- Single fixture row: one valid string plus three corrupted variants built from raw bytes. |
| 27 | +CREATE TABLE regress_encoding(good text, truncated text, with_nul text, truncated_with_nul text); |
| 28 | +INSERT INTO regress_encoding |
| 29 | +VALUES ('café', |
| 30 | + 'caf' || test_bytea_to_text('\xc3'), |
| 31 | + 'café' || test_bytea_to_text('\x00') || 'dcba', |
| 32 | + 'caf' || test_bytea_to_text('\xc300') || 'dcba'); |
| 33 | + |
| 34 | +SELECT good, truncated, with_nul FROM regress_encoding; |
| 35 | + |
| 36 | +SELECT length(good) FROM regress_encoding; |
| 37 | +SELECT substring(good, 3, 1) FROM regress_encoding; |
| 38 | +SELECT substring(good, 4, 1) FROM regress_encoding; |
| 39 | +SELECT regexp_replace(good, '^caf(.)$', '\1') FROM regress_encoding; |
| 40 | +SELECT reverse(good) FROM regress_encoding; |
| 41 | + |
| 42 | +-- invalid short mb character = error |
| 43 | +SELECT length(truncated) FROM regress_encoding; |
| 44 | +SELECT substring(truncated, 1, 3) FROM regress_encoding; |
| 45 | +SELECT substring(truncated, 1, 4) FROM regress_encoding; |
| 46 | +SELECT reverse(truncated) FROM regress_encoding; |
| 47 | +-- invalid short mb character = silently dropped |
| 48 | +SELECT regexp_replace(truncated, '^caf(.)$', '\1') FROM regress_encoding; |
| 49 | + |
| 50 | +-- PostgreSQL doesn't allow strings to contain NUL. If a corrupted string |
| 51 | +-- contains NUL at a character boundary position, some functions treat it as a |
| 52 | +-- character while others treat it as a terminator, as implementation details. |
| 53 | + |
| 54 | +-- NUL = terminator |
| 55 | +SELECT length(with_nul) FROM regress_encoding; |
| 56 | +SELECT substring(with_nul, 3, 1) FROM regress_encoding; |
| 57 | +SELECT substring(with_nul, 4, 1) FROM regress_encoding; |
| 58 | +SELECT substring(with_nul, 5, 1) FROM regress_encoding; |
| 59 | +SELECT convert_to(substring(with_nul, 5, 1), 'UTF8') FROM regress_encoding; |
| 60 | +SELECT regexp_replace(with_nul, '^caf(.)$', '\1') FROM regress_encoding; |
| 61 | +-- NUL = character |
| 62 | +SELECT with_nul, reverse(with_nul), reverse(reverse(with_nul)) FROM regress_encoding; |
| 63 | + |
| 64 | +-- If a corrupted string contains NUL in the tail bytes of a multibyte |
| 65 | +-- character (invalid in all encodings), it is considered part of the |
| 66 | +-- character for length purposes. An error will only be raised in code paths |
| 67 | +-- that convert or verify encodings. |
| 68 | + |
| 69 | +SELECT length(truncated_with_nul) FROM regress_encoding; |
| 70 | +SELECT substring(truncated_with_nul, 3, 1) FROM regress_encoding; |
| 71 | +SELECT substring(truncated_with_nul, 4, 1) FROM regress_encoding; |
| 72 | +SELECT convert_to(substring(truncated_with_nul, 4, 1), 'UTF8') FROM regress_encoding; |
| 73 | +SELECT substring(truncated_with_nul, 5, 1) FROM regress_encoding; |
| 74 | +SELECT regexp_replace(truncated_with_nul, '^caf(.)dcba$', '\1') = test_bytea_to_text('\xc300') FROM regress_encoding; |
| 75 | +SELECT reverse(truncated_with_nul) FROM regress_encoding; |
| 76 | + |
| 77 | +-- unbounded: sequence would overrun the string! |
| 78 | +SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated, 3) |
| 79 | +FROM regress_encoding; |
| 80 | + |
| 81 | +-- condition detected when using the length/range variants |
| 82 | +SELECT test_mblen_func('pg_mblen_with_len', 'UTF8', truncated, 3) |
| 83 | +FROM regress_encoding; |
| 84 | +SELECT test_mblen_func('pg_mblen_range', 'UTF8', truncated, 3) |
| 85 | +FROM regress_encoding; |
| 86 | + |
| 87 | +-- unbounded: sequence would overrun the string, if the terminator were really |
| 88 | +-- the end of it |
| 89 | +SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated_with_nul, 3) |
| 90 | +FROM regress_encoding; |
| 91 | +SELECT test_mblen_func('pg_encoding_mblen', 'GB18030', truncated_with_nul, 3) |
| 92 | +FROM regress_encoding; |
| 93 | + |
| 94 | +-- condition detected when using the cstr variants |
| 95 | +SELECT test_mblen_func('pg_mblen_cstr', 'UTF8', truncated_with_nul, 3) |
| 96 | +FROM regress_encoding; |
| 97 | + |
| 98 | +DROP TABLE regress_encoding; |
| 99 | + |
| 100 | +-- mb<->wchar conversions |
| 101 | +CREATE FUNCTION test_encoding(encoding text, description text, input bytea) |
| 102 | +RETURNS VOID LANGUAGE plpgsql AS |
| 103 | +$$ |
| 104 | +-- Round-trips input through text -> wchars -> text for the named encoding and reports |
| 105 | +-- OK / truncated / failed in a NOTICE; does nothing unless test_valid_server_encoding(encoding). |
| 106 | +DECLARE |
| 107 | + prefix text; |
| 108 | + wchars int[]; |
| 109 | + round_trip bytea; |
| 110 | + result text; |
| 111 | +BEGIN |
| 112 | + prefix := rpad(encoding || ' ' || description || ':', 28); |
| 113 | + |
| 114 | + -- XXX could also test validation, length functions and include client |
| 115 | + -- only encodings with these test cases |
| 116 | + IF test_valid_server_encoding(encoding) THEN |
| 117 | + wchars := test_text_to_wchars(encoding, test_bytea_to_text(input)); |
| 118 | + round_trip := test_text_to_bytea(test_wchars_to_text(encoding, wchars)); |
| 119 | + IF input = round_trip THEN |
| 120 | + result := 'OK'; |
| 121 | + ELSIF length(input) > length(round_trip) AND round_trip = substr(input, 1, length(round_trip)) THEN |
| 122 | + result := 'truncated'; |
| 123 | + ELSE |
| 124 | + result := 'failed'; |
| 125 | + END IF; |
| 126 | + RAISE NOTICE '% % -> % -> % = %', prefix, input, wchars, round_trip, result; |
| 127 | + END IF; |
| 128 | +END; |
| 129 | +$$; |
| 130 | +/ |
| 131 | +-- No validation is done on the encoding itself, just the length to avoid |
| 132 | +-- overruns, so some of the byte sequences below are bogus. They cover |
| 133 | +-- all code branches, server encodings only for now. |
| 134 | +CREATE TABLE encoding_tests (encoding text, description text, input bytea); |
| 135 | +INSERT INTO encoding_tests VALUES |
| 136 | + -- LATIN1, other single-byte encodings |
| 137 | + ('LATIN1', 'ASCII', 'a'), |
| 138 | + ('LATIN1', 'extended', '\xe9'), |
| 139 | + -- EUC_JP, EUC_JIS_2004, EUC_KR (for the purposes of wchar conversion): |
| 140 | + -- 2 8e (CS2, not used by EUC_KR but arbitrarily considered to have EUC_JP length) |
| 141 | + -- 3 8f (CS3, not used by EUC_KR but arbitrarily considered to have EUC_JP length) |
| 142 | + -- 2 80..ff (CS1) |
| 143 | + ('EUC_JP', 'ASCII', 'a'), |
| 144 | + ('EUC_JP', 'CS1, short', '\x80'), |
| 145 | + ('EUC_JP', 'CS1', '\x8002'), |
| 146 | + ('EUC_JP', 'CS2, short', '\x8e'), |
| 147 | + ('EUC_JP', 'CS2', '\x8e02'), |
| 148 | + ('EUC_JP', 'CS3, short', '\x8f'), |
| 149 | + ('EUC_JP', 'CS3, short', '\x8f02'), |
| 150 | + ('EUC_JP', 'CS3', '\x8f0203'), |
| 151 | + -- EUC_CN |
| 152 | + -- 3 8e (CS2, not used but arbitrarily considered to have length 3) |
| 153 | + -- 3 8f (CS3, not used but arbitrarily considered to have length 3) |
| 154 | + -- 2 80..ff (CS1) |
| 155 | + ('EUC_CN', 'ASCII', 'a'), |
| 156 | + ('EUC_CN', 'CS1, short', '\x80'), |
| 157 | + ('EUC_CN', 'CS1', '\x8002'), |
| 158 | + ('EUC_CN', 'CS2, short', '\x8e'), |
| 159 | + ('EUC_CN', 'CS2, short', '\x8e02'), |
| 160 | + ('EUC_CN', 'CS2', '\x8e0203'), |
| 161 | + ('EUC_CN', 'CS3, short', '\x8f'), |
| 162 | + ('EUC_CN', 'CS3, short', '\x8f02'), |
| 163 | + ('EUC_CN', 'CS3', '\x8f0203'), |
| 164 | + -- EUC_TW: |
| 165 | + -- 4 8e (CS2) |
| 166 | + -- 3 8f (CS3, not used but arbitrarily considered to have length 3) |
| 167 | + -- 2 80..ff (CS1) |
| 168 | + ('EUC_TW', 'ASCII', 'a'), |
| 169 | + ('EUC_TW', 'CS1, short', '\x80'), |
| 170 | + ('EUC_TW', 'CS1', '\x8002'), |
| 171 | + ('EUC_TW', 'CS2, short', '\x8e'), |
| 172 | + ('EUC_TW', 'CS2, short', '\x8e02'), |
| 173 | + ('EUC_TW', 'CS2, short', '\x8e0203'), |
| 174 | + ('EUC_TW', 'CS2', '\x8e020304'), |
| 175 | + ('EUC_TW', 'CS3, short', '\x8f'), |
| 176 | + ('EUC_TW', 'CS3, short', '\x8f02'), |
| 177 | + ('EUC_TW', 'CS3', '\x8f0203'), |
| 178 | + -- UTF8 |
| 179 | + -- 2 c0..df |
| 180 | + -- 3 e0..ef |
| 181 | + -- 4 f0..f7 (but maximum real codepoint U+10ffff has f4) |
| 182 | + -- 5 f8..fb (not supported) |
| 183 | + -- 6 fc..fd (not supported) |
| 184 | + ('UTF8', 'ASCII', 'a'), |
| 185 | + ('UTF8', '2 byte, short', '\xdf'), |
| 186 | + ('UTF8', '2 byte', '\xdf82'), |
| 187 | + ('UTF8', '3 byte, short', '\xef'), |
| 188 | + ('UTF8', '3 byte, short', '\xef82'), |
| 189 | + ('UTF8', '3 byte', '\xef8283'), |
| 190 | + ('UTF8', '4 byte, short', '\xf7'), |
| 191 | + ('UTF8', '4 byte, short', '\xf782'), |
| 192 | + ('UTF8', '4 byte, short', '\xf78283'), |
| 193 | + ('UTF8', '4 byte', '\xf7828384'), |
| 194 | + ('UTF8', '5 byte, unsupported', '\xfb'), |
| 195 | + ('UTF8', '5 byte, unsupported', '\xfb82'), |
| 196 | + ('UTF8', '5 byte, unsupported', '\xfb8283'), |
| 197 | + ('UTF8', '5 byte, unsupported', '\xfb828384'), |
| 198 | + ('UTF8', '5 byte, unsupported', '\xfb82838485'), |
| 199 | + ('UTF8', '6 byte, unsupported', '\xfd'), |
| 200 | + ('UTF8', '6 byte, unsupported', '\xfd82'), |
| 201 | + ('UTF8', '6 byte, unsupported', '\xfd8283'), |
| 202 | + ('UTF8', '6 byte, unsupported', '\xfd828384'), |
| 203 | + ('UTF8', '6 byte, unsupported', '\xfd82838485'), |
| 204 | + ('UTF8', '6 byte, unsupported', '\xfd8283848586'), |
| 205 | + -- MULE_INTERNAL |
| 206 | + -- 2 81..8d LC1 |
| 207 | + -- 3 90..99 LC2 |
| 208 | + ('MULE_INTERNAL', 'ASCII', 'a'), |
| 209 | + ('MULE_INTERNAL', 'LC1, short', '\x81'), |
| 210 | + ('MULE_INTERNAL', 'LC1', '\x8182'), |
| 211 | + ('MULE_INTERNAL', 'LC2, short', '\x90'), |
| 212 | + ('MULE_INTERNAL', 'LC2, short', '\x9082'), |
| 213 | + ('MULE_INTERNAL', 'LC2', '\x908283'); |
| 214 | +-- Run test_encoding over every row of encoding_tests; per-case results are emitted as NOTICEs. |
| 215 | +SELECT COUNT(test_encoding(encoding, description, input)) > 0 |
| 216 | +FROM encoding_tests; |
| 217 | + |
| 218 | +-- substring fetches a slice of a toasted value; unused tail of that slice is |
| 219 | +-- an incomplete char (bug #19406) |
| 220 | +CREATE TABLE toast_3b_utf8 (c text); |
| 221 | +INSERT INTO toast_3b_utf8 VALUES (repeat(U&'\2026', 4000)); |
| 222 | +SELECT SUBSTRING(c FROM 1 FOR 1) FROM toast_3b_utf8; |
| 223 | +SELECT SUBSTRING(c FROM 4001 FOR 1) FROM toast_3b_utf8; |
| 224 | +-- diagnose incomplete char iff within the substring |
| 225 | +UPDATE toast_3b_utf8 SET c = c || test_bytea_to_text('\xe280'); |
| 226 | +SELECT SUBSTRING(c FROM 4000 FOR 1) FROM toast_3b_utf8; |
| 227 | +SELECT SUBSTRING(c FROM 4001 FOR 1) FROM toast_3b_utf8; |
| 228 | +-- substring needing last byte of its slice_size |
| 229 | +ALTER TABLE toast_3b_utf8 RENAME TO toast_4b_utf8; |
| 230 | +UPDATE toast_4b_utf8 SET c = repeat(U&'\+01F680', 3000); |
| 231 | +SELECT SUBSTRING(c FROM 3000 FOR 1) FROM toast_4b_utf8; |
| 232 | +-- Clean up the tables and helper functions created by this test. |
| 233 | +DROP TABLE encoding_tests; |
| 234 | +DROP TABLE toast_4b_utf8; |
| 235 | +DROP FUNCTION test_encoding; |
| 236 | +DROP FUNCTION test_wchars_to_text; |
| 237 | +DROP FUNCTION test_text_to_wchars; |
| 238 | +DROP FUNCTION test_valid_server_encoding; |
| 239 | +DROP FUNCTION test_mblen_func; |
| 240 | +DROP FUNCTION test_bytea_to_text; |
| 241 | +DROP FUNCTION test_text_to_bytea; |
| 242 | + |
| 243 | + |
| 244 | +-- substring slow path: multi-byte escape char vs. multi-byte pattern char. |
| 245 | +SELECT SUBSTRING('a' SIMILAR U&'\00AC' ESCAPE U&'\00A7'); |
| 246 | +-- Levenshtein distance metric: exercise character length cache. |
| 247 | +SELECT U&"real\00A7_name" FROM (select 1) AS x(real_name); |
| 248 | +-- JSON errcontext: truncate long data. |
| 249 | +SELECT repeat(U&'\00A7', 30)::json; |
0 commit comments