Skip to content

Commit 381543e

Browse files
committed
Sync files with PG 14.22
from a6ff9f5ffb2(Translation updates) to 966473719c7(Stamp 14.22.)
1 parent 0379652 commit 381543e

13 files changed

Lines changed: 890 additions & 18 deletions

File tree

src/oracle_test/modules/test_regex/test_regex.c

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -425,7 +425,8 @@ parse_test_flags(test_re_flags *flags, text *opts)
425425
ereport(ERROR,
426426
(errcode(ERRCODE_INVALID_PARAMETER_VALUE),
427427
errmsg("invalid regular expression test option: \"%.*s\"",
428-
pg_mblen(opt_p + i), opt_p + i)));
428+
pg_mblen_range(opt_p + i, opt_p + opt_len),
429+
opt_p + i)));
429430
break;
430431
}
431432
}

src/oracle_test/regress/expected/.gitignore

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,8 @@
33
/create_function_0.out
44
/create_function_1.out
55
/create_function_2.out
6+
/encoding.out
7+
/encoding_1.out
68
/largeobject.out
79
/largeobject_1.out
810
/misc.out

src/oracle_test/regress/expected/arrays.out

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1536,6 +1536,11 @@ select '[0:1]={1.1,2.2}'::float8[];
15361536
(1 row)
15371537

15381538
-- all of the above should be accepted
1539+
-- some day we might allow these cases, but for now they're errors:
1540+
select array[]::oidvector;
1541+
ERROR: array is not a valid oidvector
1542+
select array[]::int2vector;
1543+
ERROR: array is not a valid int2vector
15391544
-- tests for array aggregates
15401545
CREATE TEMP TABLE arraggtest ( f1 INT[], f2 TEXT[][], f3 FLOAT[]);
15411546
INSERT INTO arraggtest (f1, f2, f3) VALUES
Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,16 @@
1+
-- This test is about EUC_KR encoding, chosen as perhaps the most prevalent
2+
-- non-UTF8, multibyte encoding as of 2026-01. Since UTF8 can represent all
3+
-- of EUC_KR, also run the test in UTF8.
4+
SELECT getdatabaseencoding() NOT IN ('EUC_KR', 'UTF8') AS skip_test \gset
5+
\if :skip_test
6+
\quit
7+
\endif
8+
-- Exercise is_multibyte_char_in_char (non-UTF8) slow path.
9+
SELECT POSITION(
10+
convert_from('\xbcf6c7d0', 'EUC_KR') IN
11+
convert_from('\xb0fac7d02c20bcf6c7d02c20b1e2bcfa2c20bbee', 'EUC_KR'));
12+
position
13+
----------
14+
5
15+
(1 row)
16+
Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,6 @@
1+
-- This test is about EUC_KR encoding, chosen as perhaps the most prevalent
2+
-- non-UTF8, multibyte encoding as of 2026-01. Since UTF8 can represent all
3+
-- of EUC_KR, also run the test in UTF8.
4+
SELECT getdatabaseencoding() NOT IN ('EUC_KR', 'UTF8') AS skip_test \gset
5+
\if :skip_test
6+
\quit
Lines changed: 249 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,249 @@
1+
/* skip test if not UTF8 server encoding */
2+
SELECT getdatabaseencoding() <> 'UTF8' AS skip_test \gset
3+
\if :skip_test
4+
\quit
5+
\endif
6+
7+
CREATE FUNCTION test_bytea_to_text(bytea) RETURNS text
8+
AS '@libdir@/oraregress@DLSUFFIX@' LANGUAGE C STRICT;
9+
/
10+
CREATE FUNCTION test_text_to_bytea(text) RETURNS bytea
11+
AS '@libdir@/oraregress@DLSUFFIX@' LANGUAGE C STRICT;
12+
/
13+
CREATE FUNCTION test_mblen_func(text, text, text, int) RETURNS int
14+
AS '@libdir@/oraregress@DLSUFFIX@' LANGUAGE C STRICT;
15+
/
16+
CREATE FUNCTION test_text_to_wchars(text, text) RETURNS int[]
17+
AS '@libdir@/oraregress@DLSUFFIX@' LANGUAGE C STRICT;
18+
/
19+
CREATE FUNCTION test_wchars_to_text(text, int[]) RETURNS text
20+
AS '@libdir@/oraregress@DLSUFFIX@' LANGUAGE C STRICT;
21+
/
22+
CREATE FUNCTION test_valid_server_encoding(text) RETURNS boolean
23+
AS '@libdir@/oraregress@DLSUFFIX@' LANGUAGE C STRICT;
24+
/
25+
26+
27+
CREATE TABLE regress_encoding(good text, truncated text, with_nul text, truncated_with_nul text);
28+
INSERT INTO regress_encoding
29+
VALUES ('café',
30+
'caf' || test_bytea_to_text('\xc3'),
31+
'café' || test_bytea_to_text('\x00') || 'dcba',
32+
'caf' || test_bytea_to_text('\xc300') || 'dcba');
33+
34+
SELECT good, truncated, with_nul FROM regress_encoding;
35+
36+
SELECT length(good) FROM regress_encoding;
37+
SELECT substring(good, 3, 1) FROM regress_encoding;
38+
SELECT substring(good, 4, 1) FROM regress_encoding;
39+
SELECT regexp_replace(good, '^caf(.)$', '\1') FROM regress_encoding;
40+
SELECT reverse(good) FROM regress_encoding;
41+
42+
-- invalid short mb character = error
43+
SELECT length(truncated) FROM regress_encoding;
44+
SELECT substring(truncated, 1, 3) FROM regress_encoding;
45+
SELECT substring(truncated, 1, 4) FROM regress_encoding;
46+
SELECT reverse(truncated) FROM regress_encoding;
47+
-- invalid short mb character = silently dropped
48+
SELECT regexp_replace(truncated, '^caf(.)$', '\1') FROM regress_encoding;
49+
50+
-- PostgreSQL doesn't allow strings to contain NUL. If a corrupted string
51+
-- contains NUL at a character boundary position, some functions treat it as a
52+
-- character while others treat it as a terminator, as implementation details.
53+
54+
-- NUL = terminator
55+
SELECT length(with_nul) FROM regress_encoding;
56+
SELECT substring(with_nul, 3, 1) FROM regress_encoding;
57+
SELECT substring(with_nul, 4, 1) FROM regress_encoding;
58+
SELECT substring(with_nul, 5, 1) FROM regress_encoding;
59+
SELECT convert_to(substring(with_nul, 5, 1), 'UTF8') FROM regress_encoding;
60+
SELECT regexp_replace(with_nul, '^caf(.)$', '\1') FROM regress_encoding;
61+
-- NUL = character
62+
SELECT with_nul, reverse(with_nul), reverse(reverse(with_nul)) FROM regress_encoding;
63+
64+
-- If a corrupted string contains NUL in the tail bytes of a multibyte
65+
-- character (invalid in all encodings), it is considered part of the
66+
-- character for length purposes. An error will only be raised in code paths
67+
-- that convert or verify encodings.
68+
69+
SELECT length(truncated_with_nul) FROM regress_encoding;
70+
SELECT substring(truncated_with_nul, 3, 1) FROM regress_encoding;
71+
SELECT substring(truncated_with_nul, 4, 1) FROM regress_encoding;
72+
SELECT convert_to(substring(truncated_with_nul, 4, 1), 'UTF8') FROM regress_encoding;
73+
SELECT substring(truncated_with_nul, 5, 1) FROM regress_encoding;
74+
SELECT regexp_replace(truncated_with_nul, '^caf(.)dcba$', '\1') = test_bytea_to_text('\xc300') FROM regress_encoding;
75+
SELECT reverse(truncated_with_nul) FROM regress_encoding;
76+
77+
-- unbounded: sequence would overrun the string!
78+
SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated, 3)
79+
FROM regress_encoding;
80+
81+
-- condition detected when using the length/range variants
82+
SELECT test_mblen_func('pg_mblen_with_len', 'UTF8', truncated, 3)
83+
FROM regress_encoding;
84+
SELECT test_mblen_func('pg_mblen_range', 'UTF8', truncated, 3)
85+
FROM regress_encoding;
86+
87+
-- unbounded: sequence would overrun the string, if the terminator were really
88+
-- the end of it
89+
SELECT test_mblen_func('pg_mblen_unbounded', 'UTF8', truncated_with_nul, 3)
90+
FROM regress_encoding;
91+
SELECT test_mblen_func('pg_encoding_mblen', 'GB18030', truncated_with_nul, 3)
92+
FROM regress_encoding;
93+
94+
-- condition detected when using the cstr variants
95+
SELECT test_mblen_func('pg_mblen_cstr', 'UTF8', truncated_with_nul, 3)
96+
FROM regress_encoding;
97+
98+
DROP TABLE regress_encoding;
99+
100+
-- mb<->wchar conversions
101+
CREATE FUNCTION test_encoding(encoding text, description text, input bytea)
102+
RETURNS VOID LANGUAGE plpgsql AS
103+
$$
104+
DECLARE
105+
prefix text;
106+
len int;
107+
wchars int[];
108+
round_trip bytea;
109+
result text;
110+
BEGIN
111+
prefix := rpad(encoding || ' ' || description || ':', 28);
112+
113+
-- XXX could also test validation, length functions and include client
114+
-- only encodings with these test cases
115+
116+
IF test_valid_server_encoding(encoding) THEN
117+
wchars := test_text_to_wchars(encoding, test_bytea_to_text(input));
118+
round_trip = test_text_to_bytea(test_wchars_to_text(encoding, wchars));
119+
if input = round_trip then
120+
result := 'OK';
121+
elsif length(input) > length(round_trip) and round_trip = substr(input, 1, length(round_trip)) then
122+
result := 'truncated';
123+
else
124+
result := 'failed';
125+
end if;
126+
RAISE NOTICE '% % -> % -> % = %', prefix, input, wchars, round_trip, result;
127+
END IF;
128+
END;
129+
$$;
130+
/
131+
-- No validation is done on the encoding itself, just the length to avoid
132+
-- overruns, so some of the byte sequences below are bogus. They cover
133+
-- all code branches, server encodings only for now.
134+
CREATE TABLE encoding_tests (encoding text, description text, input bytea);
135+
INSERT INTO encoding_tests VALUES
136+
-- LATIN1, other single-byte encodings
137+
('LATIN1', 'ASCII', 'a'),
138+
('LATIN1', 'extended', '\xe9'),
139+
-- EUC_JP, EUC_JIS_2004, EUC_KR (for the purposes of wchar conversion):
140+
-- 2 8e (CS2, not used by EUC_KR but arbitrarily considered to have EUC_JP length)
141+
-- 3 8f (CS3, not used by EUC_KR but arbitrarily considered to have EUC_JP length)
142+
-- 2 80..ff (CS1)
143+
('EUC_JP', 'ASCII', 'a'),
144+
('EUC_JP', 'CS1, short', '\x80'),
145+
('EUC_JP', 'CS1', '\x8002'),
146+
('EUC_JP', 'CS2, short', '\x8e'),
147+
('EUC_JP', 'CS2', '\x8e02'),
148+
('EUC_JP', 'CS3, short', '\x8f'),
149+
('EUC_JP', 'CS3, short', '\x8f02'),
150+
('EUC_JP', 'CS3', '\x8f0203'),
151+
-- EUC_CN
152+
-- 3 8e (CS2, not used but arbitrarily considered to have length 3)
153+
-- 3 8f (CS3, not used but arbitrarily considered to have length 3)
154+
-- 2 80..ff (CS1)
155+
('EUC_CN', 'ASCII', 'a'),
156+
('EUC_CN', 'CS1, short', '\x80'),
157+
('EUC_CN', 'CS1', '\x8002'),
158+
('EUC_CN', 'CS2, short', '\x8e'),
159+
('EUC_CN', 'CS2, short', '\x8e02'),
160+
('EUC_CN', 'CS2', '\x8e0203'),
161+
('EUC_CN', 'CS3, short', '\x8f'),
162+
('EUC_CN', 'CS3, short', '\x8f02'),
163+
('EUC_CN', 'CS3', '\x8f0203'),
164+
-- EUC_TW:
165+
-- 4 8e (CS2)
166+
-- 3 8f (CS3, not used but arbitrarily considered to have length 3)
167+
-- 2 80..ff (CS1)
168+
('EUC_TW', 'ASCII', 'a'),
169+
('EUC_TW', 'CS1, short', '\x80'),
170+
('EUC_TW', 'CS1', '\x8002'),
171+
('EUC_TW', 'CS2, short', '\x8e'),
172+
('EUC_TW', 'CS2, short', '\x8e02'),
173+
('EUC_TW', 'CS2, short', '\x8e0203'),
174+
('EUC_TW', 'CS2', '\x8e020304'),
175+
('EUC_TW', 'CS3, short', '\x8f'),
176+
('EUC_TW', 'CS3, short', '\x8f02'),
177+
('EUC_TW', 'CS3', '\x8f0203'),
178+
-- UTF8
179+
-- 2 c0..df
180+
-- 3 e0..ef
181+
-- 4 f0..f7 (but maximum real codepoint U+10ffff has f4)
182+
-- 5 f8..fb (not supported)
183+
-- 6 fc..fd (not supported)
184+
('UTF8', 'ASCII', 'a'),
185+
('UTF8', '2 byte, short', '\xdf'),
186+
('UTF8', '2 byte', '\xdf82'),
187+
('UTF8', '3 byte, short', '\xef'),
188+
('UTF8', '3 byte, short', '\xef82'),
189+
('UTF8', '3 byte', '\xef8283'),
190+
('UTF8', '4 byte, short', '\xf7'),
191+
('UTF8', '4 byte, short', '\xf782'),
192+
('UTF8', '4 byte, short', '\xf78283'),
193+
('UTF8', '4 byte', '\xf7828384'),
194+
('UTF8', '5 byte, unsupported', '\xfb'),
195+
('UTF8', '5 byte, unsupported', '\xfb82'),
196+
('UTF8', '5 byte, unsupported', '\xfb8283'),
197+
('UTF8', '5 byte, unsupported', '\xfb828384'),
198+
('UTF8', '5 byte, unsupported', '\xfb82838485'),
199+
('UTF8', '6 byte, unsupported', '\xfd'),
200+
('UTF8', '6 byte, unsupported', '\xfd82'),
201+
('UTF8', '6 byte, unsupported', '\xfd8283'),
202+
('UTF8', '6 byte, unsupported', '\xfd828384'),
203+
('UTF8', '6 byte, unsupported', '\xfd82838485'),
204+
('UTF8', '6 byte, unsupported', '\xfd8283848586'),
205+
-- MULE_INTERNAL
206+
-- 2 81..8d LC1
207+
-- 3 90..99 LC2
208+
('MULE_INTERNAL', 'ASCII', 'a'),
209+
('MULE_INTERNAL', 'LC1, short', '\x81'),
210+
('MULE_INTERNAL', 'LC1', '\x8182'),
211+
('MULE_INTERNAL', 'LC2, short', '\x90'),
212+
('MULE_INTERNAL', 'LC2, short', '\x9082'),
213+
('MULE_INTERNAL', 'LC2', '\x908283');
214+
215+
SELECT COUNT(test_encoding(encoding, description, input)) > 0
216+
FROM encoding_tests;
217+
218+
-- substring fetches a slice of a toasted value; unused tail of that slice is
219+
-- an incomplete char (bug #19406)
220+
CREATE TABLE toast_3b_utf8 (c text);
221+
INSERT INTO toast_3b_utf8 VALUES (repeat(U&'\2026', 4000));
222+
SELECT SUBSTRING(c FROM 1 FOR 1) FROM toast_3b_utf8;
223+
SELECT SUBSTRING(c FROM 4001 FOR 1) FROM toast_3b_utf8;
224+
-- diagnose incomplete char iff within the substring
225+
UPDATE toast_3b_utf8 SET c = c || test_bytea_to_text('\xe280');
226+
SELECT SUBSTRING(c FROM 4000 FOR 1) FROM toast_3b_utf8;
227+
SELECT SUBSTRING(c FROM 4001 FOR 1) FROM toast_3b_utf8;
228+
-- substring needing last byte of its slice_size
229+
ALTER TABLE toast_3b_utf8 RENAME TO toast_4b_utf8;
230+
UPDATE toast_4b_utf8 SET c = repeat(U&'\+01F680', 3000);
231+
SELECT SUBSTRING(c FROM 3000 FOR 1) FROM toast_4b_utf8;
232+
233+
DROP TABLE encoding_tests;
234+
DROP TABLE toast_4b_utf8;
235+
DROP FUNCTION test_encoding;
236+
DROP FUNCTION test_wchars_to_text;
237+
DROP FUNCTION test_text_to_wchars;
238+
DROP FUNCTION test_valid_server_encoding;
239+
DROP FUNCTION test_mblen_func;
240+
DROP FUNCTION test_bytea_to_text;
241+
DROP FUNCTION test_text_to_bytea;
242+
243+
244+
-- substring slow path: multi-byte escape char vs. multi-byte pattern char.
245+
SELECT SUBSTRING('a' SIMILAR U&'\00AC' ESCAPE U&'\00A7');
246+
-- Levenshtein distance metric: exercise character length cache.
247+
SELECT U&"real\00A7_name" FROM (select 1) AS x(real_name);
248+
-- JSON errcontext: truncate long data.
249+
SELECT repeat(U&'\00A7', 30)::json;

0 commit comments

Comments
 (0)