diff --git a/.github/workflows/build-cloudberry-rocky8.yml b/.github/workflows/build-cloudberry-rocky8.yml
index 2abf88060e3..11cf52e0710 100644
--- a/.github/workflows/build-cloudberry-rocky8.yml
+++ b/.github/workflows/build-cloudberry-rocky8.yml
@@ -318,7 +318,8 @@ jobs:
"make_configs":["gpcontrib/orafce:installcheck",
"gpcontrib/zstd:installcheck",
"gpcontrib/gp_sparse_vector:installcheck",
- "gpcontrib/gp_toolkit:installcheck"]
+ "gpcontrib/gp_toolkit:installcheck",
+ "gpcontrib/gp_url_tools:installcheck"]
},
{"test":"ic-fixme",
"make_configs":["src/test/regress:installcheck-fixme"],
diff --git a/.github/workflows/build-cloudberry.yml b/.github/workflows/build-cloudberry.yml
index ca75f7b42e7..364684a904a 100644
--- a/.github/workflows/build-cloudberry.yml
+++ b/.github/workflows/build-cloudberry.yml
@@ -312,7 +312,8 @@ jobs:
"gpcontrib/zstd:installcheck",
"gpcontrib/gp_sparse_vector:installcheck",
"gpcontrib/gp_toolkit:installcheck",
- "gpcontrib/gp_exttable_fdw:installcheck"]
+ "gpcontrib/gp_exttable_fdw:installcheck",
+ "gpcontrib/gp_url_tools:installcheck"]
},
{"test":"ic-diskquota",
"make_configs":["gpcontrib/diskquota:installcheck"],
diff --git a/gpcontrib/gp_url_tools/Makefile b/gpcontrib/gp_url_tools/Makefile
new file mode 100644
index 00000000000..c161751a88e
--- /dev/null
+++ b/gpcontrib/gp_url_tools/Makefile
@@ -0,0 +1,17 @@
+DATA = $(wildcard sql/*.sql)
+MODULES = $(patsubst %.c,%,$(wildcard src/*.c))
+EXTENSION = gp_url_tools
+# Regression tests: one test name per test/sql/*.sql file; --inputdir=test points the test driver at test/.
+TESTS = $(wildcard test/sql/*.sql)
+REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS))
+REGRESS_OPTS = --inputdir=test
+# With USE_PGXS set, build against the installation reported by pg_config; otherwise build inside the source tree.
+ifdef USE_PGXS
+PG_CONFIG = pg_config
+PGXS := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
+else
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/gpcontrib/gp_url_tools/README.md b/gpcontrib/gp_url_tools/README.md
new file mode 100644
index 00000000000..b95fa95a4ed
--- /dev/null
+++ b/gpcontrib/gp_url_tools/README.md
@@ -0,0 +1,75 @@
+
+
+# gp_url_tools: Cloudberry extension providing functionality for working with URL addresses
+
+### Features
+`gp_url_tools` is an extension for the Cloudberry database that provides
+functions for percent-encoding and decoding URLs and URIs.
+
+### Functions
+The extension creates the `url_tools_schema` schema and adds four SQL functions:
+
+- `url_tools_schema.encode_url`/`.encode_uri`
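+ Percent-encodes a text value. `encode_url` escapes every byte except ASCII letters, digits and `.-~_` (suitable for a single URL component); `encode_uri` also leaves the URI delimiters `-_.!~*'();/?:@&=+$,#` unescaped (suitable for a whole URI).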
+
+- `url_tools_schema.decode_url`/`.decode_uri`
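+ Decodes percent-encoded `%XX` sequences (and legacy `%uXXXX` UTF-16 escapes, including surrogate pairs) in a URL/URI-encoded text value back to their original, human-readable characters. Raises an error on malformed escapes and on unescaped characters that the matching encode function would have escaped.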
+
+### Usage
+```sql
+CREATE EXTENSION gp_url_tools;
+```
+```sql
+SELECT url_tools_schema.encode_url('Hello World');
+```
+```bash
+ encode_url
+───────────────
+ Hello%20World
+(1 row)
+```
+```sql
+SELECT url_tools_schema.decode_url('Hello%20World');
+```
+```bash
+ decode_url
+─────────────
+ Hello World
+(1 row)
+```
+```sql
+SELECT url_tools_schema.encode_uri('https://ru.wikipedia.org/wiki/Greenplum_(компания)');
+```
+```bash
+ encode_uri
+────────────────────────────────────────────────────────────────────────────────────────────
+ https://ru.wikipedia.org/wiki/Greenplum_(%D0%BA%D0%BE%D0%BC%D0%BF%D0%B0%D0%BD%D0%B8%D1%8F)
+```
+```sql
+SELECT url_tools_schema.decode_uri('https://ru.wikipedia.org/wiki/Greenplum_(%D0%BA%D0%BE%D0%BC%D0%BF%D0%B0%D0%BD%D0%B8%D1%8F)');
+```
+```bash
+ decode_uri
+────────────────────────────────────────────────────
+ https://ru.wikipedia.org/wiki/Greenplum_(компания)
+```
+
+### Acknowledgments
+Thanks to the authors of the PostgreSQL extension https://github.com/okbob/url_encode, whose sources were a very useful reference.
diff --git a/gpcontrib/gp_url_tools/gp_url_tools.control b/gpcontrib/gp_url_tools/gp_url_tools.control
new file mode 100644
index 00000000000..cb16430ad62
--- /dev/null
+++ b/gpcontrib/gp_url_tools/gp_url_tools.control
@@ -0,0 +1,6 @@
+# gp_url_tools extension (not relocatable: the install script creates its objects in the fixed schema url_tools_schema)
+comment = 'Functions for working with URL-s'
+default_version = '1.0'
+module_pathname = '$libdir/gp_url_tools'
+relocatable = false
+trusted = true
diff --git a/gpcontrib/gp_url_tools/sql/gp_url_tools--1.0.sql b/gpcontrib/gp_url_tools/sql/gp_url_tools--1.0.sql
new file mode 100644
index 00000000000..3b2a773719a
--- /dev/null
+++ b/gpcontrib/gp_url_tools/sql/gp_url_tools--1.0.sql
@@ -0,0 +1,27 @@
+/* gp_url_tools--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION gp_url_tools" to load this file. \quit
+
+CREATE SCHEMA url_tools_schema; -- deliberately no IF NOT EXISTS: never adopt a schema somebody else pre-created
+GRANT USAGE ON SCHEMA url_tools_schema TO public;
+
+CREATE FUNCTION url_tools_schema.encode_url(text) -- percent-encodes all bytes except ASCII letters, digits and ".-~_"
+RETURNS text
+AS 'MODULE_PATHNAME', 'encode_url'
+LANGUAGE C IMMUTABLE STRICT;
+-- decode_url: reverses encode_url; also accepts legacy %uXXXX escapes.
+CREATE FUNCTION url_tools_schema.decode_url(text)
+RETURNS text
+AS 'MODULE_PATHNAME', 'decode_url'
+LANGUAGE C IMMUTABLE STRICT;
+-- encode_uri: like encode_url, but also leaves the URI delimiters -_.!~*'();/?:@&=+$,# unescaped.
+CREATE FUNCTION url_tools_schema.encode_uri(text)
+RETURNS text
+AS 'MODULE_PATHNAME', 'encode_uri'
+LANGUAGE C IMMUTABLE STRICT;
+-- decode_uri: reverses encode_uri; also accepts legacy %uXXXX escapes.
+CREATE FUNCTION url_tools_schema.decode_uri(text)
+RETURNS text
+AS 'MODULE_PATHNAME', 'decode_uri'
+LANGUAGE C IMMUTABLE STRICT;
diff --git a/gpcontrib/gp_url_tools/src/gp_url_tools.c b/gpcontrib/gp_url_tools/src/gp_url_tools.c
new file mode 100644
index 00000000000..7c397828ded
--- /dev/null
+++ b/gpcontrib/gp_url_tools/src/gp_url_tools.c
@@ -0,0 +1,262 @@
+/*-------------------------------------------------------------------------
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements. See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership. The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License. You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied. See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ * gp_url_tools.c
+ *
+ * IDENTIFICATION
+ * gpcontrib/gp_url_tools/src/gp_url_tools.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "fmgr.h"
+#include "mb/pg_wchar.h"
+#include "utils/builtins.h"
+
+PG_MODULE_MAGIC;
+
+PG_FUNCTION_INFO_V1(encode_url);
+PG_FUNCTION_INFO_V1(decode_url);
+PG_FUNCTION_INFO_V1(encode_uri);
+PG_FUNCTION_INFO_V1(decode_uri);
+
+static const unsigned int utf16_low[2] = {0xD800, 0xDC00}; // lower bounds: [0] first (high) surrogate, [1] second (low) surrogate
+static const unsigned int utf16_high[2] = {0xDBFF, 0xDFFF}; // upper bounds, indexed the same way as utf16_low
+static const unsigned int utf16_decode = 0x03FF; // mask selecting the low 10 bits of a surrogate
+static const unsigned int utf16_decode_base = 0x10000; // offset added when combining a surrogate pair
+static const int utf8_with_percent_length = 3; // Example: '%20'
+static const int utf16_with_percent_length = 6; // Example: '%u0430'
+static const int utf16_surrogate_pair_length = 12; // Example: '%uD800%uDC00'
+static const int utf16_second_codepoint_offset = 8; // '%uD800%uDC00' => ('%uD800%u'.length == 8)
+static const int utf16_past_first_codepoint_offset = 6; // '%uD800%uDC00' => ('%uD800'.length == 6)
+
+/*
+ * Convert one hex digit (either case) to 0..15; anything else raises ERROR.
+ */
+static unsigned char hex_char_to_value(char c) {
+  if (c >= '0' && c <= '9') return c - '0';
+  if (c >= 'a' && c <= 'f') return 10 + (c - 'a');
+  if (c >= 'A' && c <= 'F') return 10 + (c - 'A');
+  ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                  errmsg("invalid hexadecimal digit: \"%c\"", c)));
+  pg_unreachable();
+}
+
+static bool allowed_character(const char c, const char *unreserved_special) {
+  bool alnum = ('0' <= c && c <= '9') || ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z');
+  return alnum || (c != '\0' && strchr(unreserved_special, c) != NULL); /* strchr(s, '\0') would match the terminator */
+}
+
+static char *write_character(char *output, const char c) {
+  output[0] = c;     /* store the byte at the current write position */
+  return output + 1; /* hand back the position for the next write */
+}
+
+static void valid_encoding_length(char *current, char *end, int length) {
+ Assert(current + length <= end); // writing `length` bytes at `current` must not run past `end`; NOTE(review): Assert() is presumably a no-op in non-assert builds - confirm
+}
+
+static text *encode(text *input, const char *unreserved_special) {
+  /* Percent-encode input: bytes accepted by allowed_character() are copied, all others become "%XX". */
+  static const char hex_digits[] = "0123456789ABCDEF";
+  Size input_length, output_length;
+  char *cinput, *coutput, *current, *cend;
+  text *output;
+
+  cinput = text_to_cstring(input);
+  input_length = strlen(cinput);
+  /* Worst case is 3 output bytes per input byte plus the terminator. Lengths are Size,
+   * not int, so 3 * input_length cannot overflow a signed int on very large inputs. */
+  output_length = 3 * input_length + 1;
+  coutput = palloc(output_length);
+  current = coutput;
+  cend = coutput + output_length;
+
+  for (Size i = 0; i < input_length; ++i) {
+    if (allowed_character(cinput[i], unreserved_special)) {
+      valid_encoding_length(current, cend, 1);
+      current = write_character(current, cinput[i]);
+    } else {
+      /* Emit '%' followed by the byte value as two uppercase hex digits. */
+      valid_encoding_length(current, cend, 3);
+      current = write_character(current, '%');
+      current = write_character(current, hex_digits[(unsigned char)cinput[i] >> 4]);
+      current = write_character(current, hex_digits[(unsigned char)cinput[i] & 0x0F]);
+    }
+  }
+  valid_encoding_length(current, cend, 1);
+  current = write_character(current, '\0');
+
+  output = cstring_to_text(coutput);
+  pfree(coutput);
+  pfree(cinput);
+  return output;
+}
+
+static bool valid_utf16(unsigned int byte, int byte_num) {
+  return byte >= utf16_low[byte_num] && utf16_high[byte_num] >= byte; /* inclusive range test for slot byte_num */
+}
+
+static unsigned int decode_utf16_pair(unsigned int bytes[2]) {
+  /* Combine a surrogate pair: each unit contributes its low 10 bits on top of the base offset. */
+  const unsigned int upper_bits = (bytes[0] & utf16_decode) << 10;
+  const unsigned int lower_bits = bytes[1] & utf16_decode;
+  Assert(valid_utf16(bytes[0], 0) && valid_utf16(bytes[1], 1));
+  return utf16_decode_base + upper_bits + lower_bits;
+}
+
+/*
+ * Check whether the sequence starts with a percent-encoded byte ('%XX').
+ *
+ * Only the shape is tested here: at least 3 characters are available, the
+ * first is '%', and the second is neither 'u' nor 'U' (which would mark a
+ * legacy UTF-16 escape such as "%uD83D"). The two characters after '%' are
+ * NOT checked for being hex digits in this function; the caller decodes
+ * them with hex_char_to_value().
+ */
+static bool is_utf8(const char *sequence, int length) {
+ return utf8_with_percent_length <= length && sequence[0] == '%' &&
+ sequence[1] != 'u' && sequence[1] != 'U';
+}
+
+/*
+ * True when the sequence begins with a legacy UTF-16 escape marker, i.e. '%'
+ * followed by 'u' or 'U', and at least 6 characters ('%u' + 4 digits) remain.
+ */
+static bool is_utf16(const char *sequence, int length) {
+  if (length < utf16_with_percent_length || sequence[0] != '%') return false;
+  return sequence[1] == 'u' || sequence[1] == 'U';
+}
+
+static void fetch_utf16(unsigned int *byte, const char *input) {
+  *byte = 0; /* parse 4 hex digits into *byte; do not rely on the caller having cleared it */
+  for (int i = 0; i < 4; ++i) *byte = ((*byte) << 4) | hex_char_to_value(input[i]);
+}
+
+static text *decode(text *input, const char *unreserved_special) {
+  /*
+   * Reverse of encode(): '%XX' becomes one raw byte and a legacy '%uXXXX' unit
+   * (or '%uXXXX%uXXXX' surrogate pair) becomes one code point written as UTF-8.
+   * Other bytes must satisfy allowed_character(). Malformed input, and escapes
+   * that would yield a zero byte or a lone low surrogate, raise ERROR.
+   */
+  int input_length;
+  text *output;
+  char *cinput, *coutput, *current;
+
+  cinput = text_to_cstring(input);
+  input_length = strlen(cinput);
+  /* No escape decodes to more bytes than it occupies, so this is enough. */
+  coutput = palloc(sizeof(*coutput) * (input_length + 1));
+  current = coutput;
+
+  for (int i = 0; i < input_length;) {
+    const int remaining = input_length - i;
+
+    if (cinput[i] != '%') {
+      if (!allowed_character(cinput[i], unreserved_special))
+        ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                        errmsg("disallowed characters in URL: \"%c\"", cinput[i])));
+      current = write_character(current, cinput[i]);
+      i += 1;
+    } else if (is_utf16(cinput + i, remaining)) {
+      unsigned int result, bytes[2] = {0, 0};
+      unsigned char buffer[10] = {0};
+      int consumed = utf16_with_percent_length, encoded_length;
+      fetch_utf16(bytes, cinput + i + 2);
+      result = bytes[0];
+      if (valid_utf16(bytes[0], 0)) {
+        if (remaining < utf16_surrogate_pair_length)
+          ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                          errmsg("invalid sequence: not enough characters "
+                                 "to decode UTF-16 symbol from %d position", i)));
+        /* Low surrogate must also be spelled '%uXXXX'; else bytes[1] stays 0 and is rejected. */
+        if (is_utf16(cinput + i + utf16_past_first_codepoint_offset,
+                     remaining - utf16_past_first_codepoint_offset))
+          fetch_utf16(bytes + 1, cinput + i + utf16_second_codepoint_offset);
+        if (!valid_utf16(bytes[1], 1))
+          ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                          errmsg("invalid UTF-16 byte: characters from %d "
+                                 "position define invalid UTF-16 symbol",
+                                 i + utf16_past_first_codepoint_offset)));
+        result = decode_utf16_pair(bytes);
+        consumed = utf16_surrogate_pair_length;
+      } else if (valid_utf16(bytes[0], 1) || result == 0) {
+        /* A lone low surrogate or U+0000 cannot be stored as text. */
+        ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                        errmsg("invalid UTF-16 byte: characters from %d "
+                               "position define invalid UTF-16 symbol", i)));
+      }
+      unicode_to_utf8((pg_wchar)result, buffer);
+      encoded_length = pg_utf_mblen(buffer);
+      memcpy(current, buffer, encoded_length);
+      current += encoded_length;
+      i += consumed;
+    } else if (is_utf8(cinput + i, remaining)) {
+      unsigned char high = hex_char_to_value(cinput[i + 1]);
+      unsigned char low = hex_char_to_value(cinput[i + 2]);
+      if (high == 0 && low == 0) /* '%00' would silently truncate the result */
+        ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                        errmsg("invalid sequence: characters from %d "
+                               "position decode to a null byte", i)));
+      current = write_character(current, (high << 4) | low);
+      i += utf8_with_percent_length;
+    } else {
+      /* '%' starts an escape, but too few characters remain to decode it. */
+      ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                      errmsg("invalid sequence: not enough characters to "
+                             "decode any UTF-typed symbol from %d position", i)));
+    }
+  }
+  current = write_character(current, '\0');
+  output = cstring_to_text(coutput);
+  pfree(coutput);
+  pfree(cinput);
+  return output;
+}
+
+static const char *url_unreserved_special = ".-~_"; // punctuation left unescaped by encode_url/decode_url, besides ASCII letters and digits
+
+Datum encode_url(PG_FUNCTION_ARGS) { // SQL-callable wrapper: encode() with the URL character set
+ if (PG_ARGISNULL(0)) // NULL argument yields NULL
+ PG_RETURN_NULL();
+ PG_RETURN_TEXT_P(encode(PG_GETARG_TEXT_PP(0), url_unreserved_special));
+}
+
+Datum decode_url(PG_FUNCTION_ARGS) { // SQL-callable wrapper: decode() with the URL character set
+ if (PG_ARGISNULL(0)) // NULL argument yields NULL
+ PG_RETURN_NULL();
+ PG_RETURN_TEXT_P(decode(PG_GETARG_TEXT_PP(0), url_unreserved_special));
+}
+
+static const char *uri_unreserved_special = "-_.!~*'();/?:@&=+$,#"; // punctuation left unescaped by encode_uri/decode_uri; NOTE(review): looks like the set JavaScript's encodeURI preserves - confirm
+
+Datum encode_uri(PG_FUNCTION_ARGS) { // SQL-callable wrapper: encode() with the URI character set
+ if (PG_ARGISNULL(0)) // NULL argument yields NULL
+ PG_RETURN_NULL();
+ PG_RETURN_TEXT_P(encode(PG_GETARG_TEXT_PP(0), uri_unreserved_special));
+}
+
+Datum decode_uri(PG_FUNCTION_ARGS) { // SQL-callable wrapper: decode() with the URI character set
+ if (PG_ARGISNULL(0)) // NULL argument yields NULL
+ PG_RETURN_NULL();
+ PG_RETURN_TEXT_P(decode(PG_GETARG_TEXT_PP(0), uri_unreserved_special));
+}
diff --git a/gpcontrib/gp_url_tools/test/expected/gp_url_tools.out b/gpcontrib/gp_url_tools/test/expected/gp_url_tools.out
new file mode 100644
index 00000000000..ba6043c0e51
--- /dev/null
+++ b/gpcontrib/gp_url_tools/test/expected/gp_url_tools.out
@@ -0,0 +1,76 @@
+-- start_ignore
+CREATE EXTENSION IF NOT EXISTS gp_url_tools;
+-- end_ignore
+SET client_encoding TO UTF8;
+-- Basic encode/decode with ASCII and %XX escaping.
+SELECT url_tools_schema.encode_url('Hello World');
+ encode_url
+---------------
+ Hello%20World
+(1 row)
+
+SELECT url_tools_schema.decode_url('Hello%20World');
+ decode_url
+-------------
+ Hello World
+(1 row)
+
+-- encode_url() should escape reserved URL characters like ':'.
+SELECT url_tools_schema.encode_url(unnest) from unnest(string_to_array('http://hu.wikipedia.org/wiki/São_Paulo','/'));
+ encode_url
+------------------
+ http%3A
+
+ hu.wikipedia.org
+ wiki
+ S%C3%A3o_Paulo
+(5 rows)
+
+-- encode_uri() keeps URI delimiters, decode_uri() reverses UTF-8 %XX escaping.
+SELECT url_tools_schema.encode_uri('http://hu.wikipedia.org/wiki/São_Paulo');
+ encode_uri
+---------------------------------------------
+ http://hu.wikipedia.org/wiki/S%C3%A3o_Paulo
+(1 row)
+
+SELECT md5(url_tools_schema.decode_uri('http://hu.wikipedia.org/wiki/S%C3%A3o_Paulo'));
+ md5
+----------------------------------
+ 147ded7d471df9cf050bc13242cbf39e
+(1 row)
+
+-- Legacy UTF-16 %uXXXX decoding for BMP characters.
+SELECT md5(url_tools_schema.decode_url('%u6D6A%u82B1%u4E00%u6735%u6735%20%u7B2C8%u96C6%20-%20%u89C6%u9891%u5728%u7EBF%u89C2%u770B%20-%20%u6D6A%u82B1%u4E00%u6735%u6735%20-%20%u8292%u679CTV'));
+ md5
+----------------------------------
+ d155b1f894fcd5540ba5881fb71753e1
+(1 row)
+
+-- Single UTF-16 surrogate pair should decode to one Unicode character.
+SELECT url_tools_schema.decode_url('%uD83D%uDE00');
+ decode_url
+------------
+ 😀
+(1 row)
+
+-- Surrogate pair should also decode correctly in the middle of a string.
+SELECT url_tools_schema.decode_url('hello%uD83D%uDE00world');
+ decode_url
+-------------
+ hello😀world
+(1 row)
+
+-- Mixed input: ASCII, UTF-8 %XX, UTF-16 BMP, and UTF-16 surrogate pair.
+SELECT url_tools_schema.decode_url('A%20%C3%A3%20%u6D6A%20%uD83D%uDE00');
+ decode_url
+------------
+ A ã 浪 😀
+(1 row)
+
+-- Truncated surrogate pair should raise an error.
+SELECT url_tools_schema.decode_url('%uD83D');
+ERROR: invalid sequence: not enough characters to decode UTF-16 symbol from 0 position
+-- High surrogate followed by a non-low-surrogate code unit should fail.
+SELECT url_tools_schema.decode_url('%uD83D%u0041');
+ERROR: invalid UTF-16 byte: characters from 6 position define invalid UTF-16 symbol
+DROP EXTENSION gp_url_tools;
diff --git a/gpcontrib/gp_url_tools/test/sql/gp_url_tools.sql b/gpcontrib/gp_url_tools/test/sql/gp_url_tools.sql
new file mode 100644
index 00000000000..33ebc6781c1
--- /dev/null
+++ b/gpcontrib/gp_url_tools/test/sql/gp_url_tools.sql
@@ -0,0 +1,35 @@
+-- start_ignore
+CREATE EXTENSION IF NOT EXISTS gp_url_tools;
+-- end_ignore
+SET client_encoding TO UTF8;
+
+-- Basic encode/decode with ASCII and %XX escaping.
+SELECT url_tools_schema.encode_url('Hello World');
+SELECT url_tools_schema.decode_url('Hello%20World');
+
+-- encode_url() should escape reserved URL characters like ':'.
+SELECT url_tools_schema.encode_url(unnest) from unnest(string_to_array('http://hu.wikipedia.org/wiki/São_Paulo','/'));
+
+-- encode_uri() keeps URI delimiters, decode_uri() reverses UTF-8 %XX escaping.
+SELECT url_tools_schema.encode_uri('http://hu.wikipedia.org/wiki/São_Paulo');
+SELECT md5(url_tools_schema.decode_uri('http://hu.wikipedia.org/wiki/S%C3%A3o_Paulo'));
+
+-- Legacy UTF-16 %uXXXX decoding for BMP characters.
+SELECT md5(url_tools_schema.decode_url('%u6D6A%u82B1%u4E00%u6735%u6735%20%u7B2C8%u96C6%20-%20%u89C6%u9891%u5728%u7EBF%u89C2%u770B%20-%20%u6D6A%u82B1%u4E00%u6735%u6735%20-%20%u8292%u679CTV'));
+
+-- Single UTF-16 surrogate pair should decode to one Unicode character.
+SELECT url_tools_schema.decode_url('%uD83D%uDE00');
+
+-- Surrogate pair should also decode correctly in the middle of a string.
+SELECT url_tools_schema.decode_url('hello%uD83D%uDE00world');
+
+-- Mixed input: ASCII, UTF-8 %XX, UTF-16 BMP, and UTF-16 surrogate pair.
+SELECT url_tools_schema.decode_url('A%20%C3%A3%20%u6D6A%20%uD83D%uDE00');
+
+-- Truncated surrogate pair should raise an error.
+SELECT url_tools_schema.decode_url('%uD83D');
+
+-- High surrogate followed by a non-low-surrogate code unit should fail.
+SELECT url_tools_schema.decode_url('%uD83D%u0041');
+
+DROP EXTENSION gp_url_tools;
diff --git a/pom.xml b/pom.xml
index 0e000093399..98e1931d8da 100644
--- a/pom.xml
+++ b/pom.xml
@@ -155,6 +155,9 @@ code or new licensing patterns.
gpcontrib/diskquota/**
+ gpcontrib/gp_url_tools/Makefile
+ gpcontrib/gp_url_tools/gp_url_tools.control
+
getversion
.git-blame-ignore-revs
.dir-locals.el