From a30369573350d68b414aaa3494767695ab172c06 Mon Sep 17 00:00:00 2001
From: Ovchinnikov Andrew <63587191+AndrewOvvv@users.noreply.github.com>
Date: Tue, 23 Jun 2026 19:47:50 +0300
Subject: [PATCH 1/2] Feat: Import gp_url_tools into gpcontrib from greenplum

---
 gpcontrib/gp_url_tools/.clang-format          | 192 ++++++++++++++
 gpcontrib/gp_url_tools/Makefile               |  11 +
 gpcontrib/gp_url_tools/README.md              |  45 ++++
 gpcontrib/gp_url_tools/gp_url_tools.control   |   6 +
 .../gp_url_tools/sql/gp_url_tools--1.0.sql    |  27 ++
 gpcontrib/gp_url_tools/src/gp_url_tools.c     | 249 ++++++++++++++++++
 .../test/expected/gp_url_tools.out            |  41 +++
 .../gp_url_tools/test/sql/gp_url_tools.sql    |  13 +
 8 files changed, 584 insertions(+)
 create mode 100644 gpcontrib/gp_url_tools/.clang-format
 create mode 100644 gpcontrib/gp_url_tools/Makefile
 create mode 100644 gpcontrib/gp_url_tools/README.md
 create mode 100644 gpcontrib/gp_url_tools/gp_url_tools.control
 create mode 100644 gpcontrib/gp_url_tools/sql/gp_url_tools--1.0.sql
 create mode 100644 gpcontrib/gp_url_tools/src/gp_url_tools.c
 create mode 100644 gpcontrib/gp_url_tools/test/expected/gp_url_tools.out
 create mode 100644 gpcontrib/gp_url_tools/test/sql/gp_url_tools.sql

diff --git a/gpcontrib/gp_url_tools/.clang-format b/gpcontrib/gp_url_tools/.clang-format
new file mode 100644
index 00000000000..6f46b9469d9
--- /dev/null
+++ b/gpcontrib/gp_url_tools/.clang-format
@@ -0,0 +1,192 @@
+---
+Language:        Cpp
+# BasedOnStyle:  LLVM
+AccessModifierOffset: -2
+AlignAfterOpenBracket: Align
+AlignArrayOfStructures: None
+AlignConsecutiveMacros: None
+AlignConsecutiveAssignments: None
+AlignConsecutiveBitFields: None
+AlignConsecutiveDeclarations: None
+AlignEscapedNewlines: Right
+AlignOperands:   Align
+AlignTrailingComments: true
+AllowAllArgumentsOnNextLine: true
+AllowAllParametersOfDeclarationOnNextLine: true
+AllowShortEnumsOnASingleLine: true
+AllowShortBlocksOnASingleLine: Never
+AllowShortCaseLabelsOnASingleLine: false
+AllowShortFunctionsOnASingleLine: All
+AllowShortLambdasOnASingleLine: All
+AllowShortIfStatementsOnASingleLine: Never
+AllowShortLoopsOnASingleLine: false
+AlwaysBreakAfterDefinitionReturnType: None
+AlwaysBreakAfterReturnType: None
+AlwaysBreakBeforeMultilineStrings: false
+AlwaysBreakTemplateDeclarations: MultiLine
+AttributeMacros:
+  - __capability
+BinPackArguments: true
+BinPackParameters: true
+BraceWrapping:
+  AfterCaseLabel:  false
+  AfterClass:      false
+  AfterControlStatement: Never
+  AfterEnum:       false
+  AfterFunction:   false
+  AfterNamespace:  false
+  AfterObjCDeclaration: false
+  AfterStruct:     false
+  AfterUnion:      false
+  AfterExternBlock: false
+  BeforeCatch:     false
+  BeforeElse:      false
+  BeforeLambdaBody: false
+  BeforeWhile:     false
+  IndentBraces:    false
+  SplitEmptyFunction: true
+  SplitEmptyRecord: true
+  SplitEmptyNamespace: true
+BreakBeforeBinaryOperators: None
+BreakBeforeConceptDeclarations: true
+BreakBeforeBraces: Attach
+BreakBeforeInheritanceComma: false
+BreakInheritanceList: BeforeColon
+BreakBeforeTernaryOperators: true
+BreakConstructorInitializersBeforeComma: false
+BreakConstructorInitializers: BeforeColon
+BreakAfterJavaFieldAnnotations: false
+BreakStringLiterals: true
+ColumnLimit:     80
+CommentPragmas:  '^ IWYU pragma:'
+QualifierAlignment: Leave
+CompactNamespaces: false
+ConstructorInitializerIndentWidth: 4
+ContinuationIndentWidth: 4
+Cpp11BracedListStyle: true
+DeriveLineEnding: true
+DerivePointerAlignment: false
+DisableFormat:   false
+EmptyLineAfterAccessModifier: Never
+EmptyLineBeforeAccessModifier: LogicalBlock
+ExperimentalAutoDetectBinPacking: false
+PackConstructorInitializers: BinPack
+BasedOnStyle:    ''
+ConstructorInitializerAllOnOneLineOrOnePerLine: false
+AllowAllConstructorInitializersOnNextLine: true
+FixNamespaceComments: true
+ForEachMacros:
+  - foreach
+  - Q_FOREACH
+  - BOOST_FOREACH
+IfMacros:
+  - KJ_IF_MAYBE
+IncludeBlocks:   Preserve
+IncludeCategories:
+  - Regex:           '^"(llvm|llvm-c|clang|clang-c)/'
+    Priority:        2
+    SortPriority:    0
+    CaseSensitive:   false
+  - Regex:           '^(<|"(gtest|gmock|isl|json)/)'
+    Priority:        3
+    SortPriority:    0
+    CaseSensitive:   false
+  - Regex:           '.*'
+    Priority:        1
+    SortPriority:    0
+    CaseSensitive:   false
+IncludeIsMainRegex: '(Test)?$'
+IncludeIsMainSourceRegex: ''
+IndentAccessModifiers: false
+IndentCaseLabels: false
+IndentCaseBlocks: false
+IndentGotoLabels: true
+IndentPPDirectives: None
+IndentExternBlock: AfterExternBlock
+IndentRequires:  false
+IndentWidth:     4
+IndentWrappedFunctionNames: false
+InsertTrailingCommas: None
+JavaScriptQuotes: Leave
+JavaScriptWrapImports: true
+KeepEmptyLinesAtTheStartOfBlocks: true
+LambdaBodyIndentation: Signature
+MacroBlockBegin: ''
+MacroBlockEnd:   ''
+MaxEmptyLinesToKeep: 1
+NamespaceIndentation: None
+ObjCBinPackProtocolList: Auto
+ObjCBlockIndentWidth: 2
+ObjCBreakBeforeNestedBlockParam: true
+ObjCSpaceAfterProperty: false
+ObjCSpaceBeforeProtocolList: true
+PenaltyBreakAssignment: 2
+PenaltyBreakBeforeFirstCallParameter: 19
+PenaltyBreakComment: 300
+PenaltyBreakFirstLessLess: 120
+PenaltyBreakOpenParenthesis: 0
+PenaltyBreakString: 1000
+PenaltyBreakTemplateDeclaration: 10
+PenaltyExcessCharacter: 1000000
+PenaltyReturnTypeOnItsOwnLine: 60
+PenaltyIndentedWhitespace: 0
+PointerAlignment: Right
+PPIndentWidth:   -1
+ReferenceAlignment: Pointer
+ReflowComments:  true
+RemoveBracesLLVM: false
+SeparateDefinitionBlocks: Leave
+ShortNamespaceLines: 1
+SortIncludes:    CaseSensitive
+SortJavaStaticImport: Before
+SortUsingDeclarations: true
+SpaceAfterCStyleCast: false
+SpaceAfterLogicalNot: false
+SpaceAfterTemplateKeyword: true
+SpaceBeforeAssignmentOperators: true
+SpaceBeforeCaseColon: false
+SpaceBeforeCpp11BracedList: false
+SpaceBeforeCtorInitializerColon: true
+SpaceBeforeInheritanceColon: true
+SpaceBeforeParens: ControlStatements
+SpaceBeforeParensOptions:
+  AfterControlStatements: true
+  AfterForeachMacros: true
+  AfterFunctionDefinitionName: false
+  AfterFunctionDeclarationName: false
+  AfterIfMacros:   true
+  AfterOverloadedOperator: false
+  BeforeNonEmptyParentheses: false
+SpaceAroundPointerQualifiers: Default
+SpaceBeforeRangeBasedForLoopColon: true
+SpaceInEmptyBlock: false
+SpaceInEmptyParentheses: false
+SpacesBeforeTrailingComments: 1
+SpacesInAngles:  Never
+SpacesInConditionalStatement: false
+SpacesInContainerLiterals: true
+SpacesInCStyleCastParentheses: false
+SpacesInLineCommentPrefix:
+  Minimum:         1
+  Maximum:         -1
+SpacesInParentheses: false
+SpacesInSquareBrackets: false
+SpaceBeforeSquareBrackets: false
+BitFieldColonSpacing: Both
+Standard:        Latest
+StatementAttributeLikeMacros:
+  - Q_EMIT
+StatementMacros:
+  - Q_UNUSED
+  - QT_REQUIRE_VERSION
+TabWidth:        8
+UseCRLF:         false
+UseTab:          Never
+WhitespaceSensitiveMacros:
+  - STRINGIZE
+  - PP_STRINGIZE
+  - BOOST_PP_STRINGIZE
+  - NS_SWIFT_NAME
+  - CF_SWIFT_NAME
+...
+
diff --git a/gpcontrib/gp_url_tools/Makefile b/gpcontrib/gp_url_tools/Makefile
new file mode 100644
index 00000000000..55c4767d637
--- /dev/null
+++ b/gpcontrib/gp_url_tools/Makefile
@@ -0,0 +1,11 @@
+DATA         = $(wildcard sql/*.sql)
+MODULES      = $(patsubst %.c,%,$(wildcard src/*.c))
+EXTENSION    = gp_url_tools
+
+TESTS        = $(wildcard test/sql/*.sql)
+REGRESS      = $(patsubst test/sql/%.sql,%,$(TESTS))
+REGRESS_OPTS = --inputdir=test
+
+PG_CONFIG    = pg_config
+PGXS        := $(shell $(PG_CONFIG) --pgxs)
+include $(PGXS)
diff --git a/gpcontrib/gp_url_tools/README.md b/gpcontrib/gp_url_tools/README.md
new file mode 100644
index 00000000000..172e6dc8155
--- /dev/null
+++ b/gpcontrib/gp_url_tools/README.md
@@ -0,0 +1,45 @@
+# gp_url_tools: Greenplum extension providing functionality for working with URL addresses
+
+### Features
+gp_url_tools is an extension for the Greenplum database that gives implementation 
+for functions that encode/decode url/uri.
+
+### Installation
+Install from source:
+```
+git clone https://github.com/open-gpdb/gp_url_tools.git
+cd gp_url_tools
+# Build it. Building would require GP installed nearby and sourcing greenplum_path.sh
+source <path_to_gp>/greenplum_path.sh
+make && make install
+```
+
+### Usage
+```
+=# create extension gp_url_tools;
+
+=# select url_tools_schema.encode_url('Hello World');
+  encode_url
+───────────────
+ Hello%20World
+(1 row)
+
+=# select url_tools_schema.decode_url('Hello%20World');
+ decode_url  
+─────────────
+ Hello World
+(1 row)
+
+=# select url_tools_schema.encode_uri('https://ru.wikipedia.org/wiki/Greenplum_(компания)');
+                                         encode_uri                  
+────────────────────────────────────────────────────────────────────────────────────────────
+ https://ru.wikipedia.org/wiki/Greenplum_(%D0%BA%D0%BE%D0%BC%D0%BF%D0%B0%D0%BD%D0%B8%D1%8F)
+
+=# select url_tools_schema.decode_uri('https://ru.wikipedia.org/wiki/Greenplum_(%D0%BA%D0%BE%D0%BC%D0%BF%D0%B0%D0%BD%D0%B8%D1%8F)');
+                     decode_uri               
+────────────────────────────────────────────────────
+ https://ru.wikipedia.org/wiki/Greenplum_(компания)
+```
+
+### Acknowledgments
+Thank you very much for the extension for postgrsql: https://github.com/okbob/url_encode, its sources were very useful.
diff --git a/gpcontrib/gp_url_tools/gp_url_tools.control b/gpcontrib/gp_url_tools/gp_url_tools.control
new file mode 100644
index 00000000000..ecbfab545b6
--- /dev/null
+++ b/gpcontrib/gp_url_tools/gp_url_tools.control
@@ -0,0 +1,6 @@
+# gp_url_tools extension
+comment = 'Greenplum extension providing functionality for working with URL addresses'
+default_version = '1.0'
+module_pathname = '$libdir/gp_url_tools'
+relocatable = true
+trusted = true
diff --git a/gpcontrib/gp_url_tools/sql/gp_url_tools--1.0.sql b/gpcontrib/gp_url_tools/sql/gp_url_tools--1.0.sql
new file mode 100644
index 00000000000..04183a97bc7
--- /dev/null
+++ b/gpcontrib/gp_url_tools/sql/gp_url_tools--1.0.sql
@@ -0,0 +1,27 @@
+/* gp_url_tools--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION gp_url_tools" to load this file. \quit
+
+CREATE SCHEMA IF NOT EXISTS url_tools_schema;
+
+CREATE FUNCTION url_tools_schema.encode_url(text)
+RETURNS text
+AS 'MODULE_PATHNAME', 'encode_url'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION url_tools_schema.decode_url(text)
+RETURNS text
+AS 'MODULE_PATHNAME', 'decode_url'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION url_tools_schema.encode_uri(text)
+RETURNS text
+AS 'MODULE_PATHNAME', 'encode_uri'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION url_tools_schema.decode_uri(text)
+RETURNS text
+AS 'MODULE_PATHNAME', 'decode_uri'
+LANGUAGE C IMMUTABLE STRICT;
+
diff --git a/gpcontrib/gp_url_tools/src/gp_url_tools.c b/gpcontrib/gp_url_tools/src/gp_url_tools.c
new file mode 100644
index 00000000000..49720a3a95f
--- /dev/null
+++ b/gpcontrib/gp_url_tools/src/gp_url_tools.c
@@ -0,0 +1,249 @@
+#include "postgres.h"
+
+#include "fmgr.h"
+#include "mb/pg_wchar.h"
+#include "utils/builtins.h"
+
+PG_MODULE_MAGIC;
+
+PG_FUNCTION_INFO_V1(encode_url);
+PG_FUNCTION_INFO_V1(decode_url);
+PG_FUNCTION_INFO_V1(encode_uri);
+PG_FUNCTION_INFO_V1(decode_uri);
+
+Datum encode_url(PG_FUNCTION_ARGS); // same names as PG_FUNCTION_INFO_V1 above
+Datum decode_url(PG_FUNCTION_ARGS);
+Datum encode_uri(PG_FUNCTION_ARGS);
+Datum decode_uri(PG_FUNCTION_ARGS);
+
+static bool allowed_character(const char c, const char *unreserved_special);
+static unsigned char char2hex(char c);
+static char *write_character(char *output, const char c);
+static void valid_encoding_length(char *current, char *end, int length);
+static text *encode(text *input, const char *unreserved_special);
+static bool valid_utf16(unsigned int byte, int byte_num);
+static unsigned int decode_utf16_pair(unsigned int bytes[2]);
+static text *decode(text *input, const char *unreserved_special);
+static bool is_utf8(const char *sequence, int length);
+static bool is_utf16(const char *sequence, int length);
+static void fetch_utf16(unsigned int *byte, const char *input);
+
+static const unsigned int utf16_low[2] = {0xD800, 0xDC00};
+static const unsigned int utf16_high[2] = {0xDBFF, 0xDFFF};
+static const unsigned int utf16_decode = 0x03FF;
+static const unsigned int utf16_decode_base = 0x10000;
+
+unsigned char char2hex(char c) {
+    if ('0' <= c && c <= '9') {
+        return c - '0';
+    } else if ('A' <= c && c <= 'Z') {
+        return c - 'A' + 10;
+    } else if ('a' <= c && c <= 'z') {
+        return c - 'a' + 10;
+    }
+    ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                    errmsg("invalid hexadecimal digit: \"%c\"", c)));
+    return -1;
+}
+
+bool allowed_character(const char c, const char *unreserved_special) {
+    return ('0' <= c && c <= '9') || ('A' <= c && c <= 'Z') ||
+           ('a' <= c && c <= 'z') || (strchr(unreserved_special, c) != NULL);
+}
+
+char *write_character(char *output, const char c) {
+    *output = c;
+    return ++output;
+}
+
+void valid_encoding_length(char *current, char *end, int length) {
+    Assert(current + length <= end);
+}
+
+// Percent-encode input: bytes that are ASCII alphanumerics or listed in
+// unreserved_special are copied as is, every other byte becomes "%XX".
+text *encode(text *input, const char *unreserved_special) {
+    int input_length, output_length;
+    text *output;
+    char *cinput, *coutput, *current, *cend;
+
+    // Convert input data for processing
+    cinput = text_to_cstring(input);
+    input_length = strlen(cinput);
+    // Worst case every byte expands to "%XX" (3 bytes), plus the terminator
+    output_length = 3 * input_length + 1;
+    coutput = palloc(sizeof(*coutput) * output_length);
+    current = coutput;
+    cend = coutput + output_length;
+
+    for (int i = 0; i < input_length; ++i) {
+        if (allowed_character(cinput[i], unreserved_special)) {
+            // unreserved character => copied unchanged
+            valid_encoding_length(current, cend, 1);
+            current = write_character(current, cinput[i]);
+        } else {
+            // any other byte => "%XX"; sprintf also writes a NUL (4 bytes)
+            valid_encoding_length(current, cend, 4);
+            current += sprintf(current, "%%%02X", (unsigned char)cinput[i]);
+        }
+    }
+    valid_encoding_length(current, cend, 1);
+    current = write_character(current, 0);
+
+    // Convert to text and return
+    output = cstring_to_text(coutput);
+    pfree(coutput);
+    return output;
+}
+
+bool valid_utf16(unsigned int byte, int byte_num) {
+    return utf16_low[byte_num] <= byte && byte <= utf16_high[byte_num];
+}
+
+unsigned int decode_utf16_pair(unsigned int bytes[2]) {
+    Assert(valid_utf16(bytes[0], 0));
+    Assert(valid_utf16(bytes[1], 1));
+
+    return (utf16_decode_base + ((bytes[0] & utf16_decode) << 10) +
+            (bytes[1] & utf16_decode));
+}
+
+// Check that sequence starts with a plain percent escape ("%XX").
+//
+// True when at least 3 characters remain, the first is '%' and the second is
+// not 'u'/'U'; the two characters after '%' are not validated as hex here.
+bool is_utf8(const char *sequence, int length) {
+    return 3 <= length && sequence[0] == '%' && sequence[1] != 'u' &&
+           sequence[1] != 'U';
+}
+
+// Check that sequence starts with a UTF-16 escape ("%uXXXX" or "%UXXXX").
+//
+// True when at least 6 characters remain and they begin with "%u" or "%U".
+// The hex digits themselves (four, or eight for a surrogate pair) are not
+// validated here.
+bool is_utf16(const char *sequence, int length) {
+    return 6 <= length && sequence[0] == '%' &&
+           (sequence[1] == 'u' || sequence[1] == 'U');
+}
+
+void fetch_utf16(unsigned int *byte, const char *input) {
+    for (int i = 0; i < 4; ++i) {
+        *byte = ((*byte) << 4) | char2hex(input[i]);
+    }
+}
+
+// Decode percent escapes in input: "%XX" yields one byte, "%uXXXX" one UTF-16
+// unit and "%uXXXXYYYY" a surrogate pair (both written as UTF-8). Any other
+// character must be alphanumeric or in unreserved_special, otherwise ERROR.
+text *decode(text *input, const char *unreserved_special) {
+    int input_length;
+    text *output;
+    char *cinput, *coutput, *current;
+
+    // Convert input data for processing
+    cinput = text_to_cstring(input);
+    input_length = strlen(cinput);
+    // Decoding never grows the string, so input_length + 1 bytes suffice
+    coutput = palloc(sizeof(*coutput) * (input_length + 1));
+    current = coutput;
+
+    for (int i = 0; i < input_length;) {
+        if (cinput[i] == '%') {
+            // escape sequence => "%uXXXX[YYYY]" is tried first, then "%XX"
+            if (is_utf16(cinput + i, input_length - i)) {
+                unsigned int result = 0;
+                unsigned int bytes[2] = {0, 0};
+                unsigned char buffer[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+
+                fetch_utf16(bytes, cinput + i + 2);
+                // a high surrogate needs 4 more hex digits right behind it
+                if (valid_utf16(bytes[0], 0)) {
+                    if (input_length - i < 10) {
+                        ereport(
+                            ERROR,
+                            (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                             errmsg("invalid sequence: not enough characters "
+                                    "to decode UTF-16 symbol from %d position",
+                                    i)));
+                    }
+
+                    fetch_utf16(bytes + 1, cinput + i + 6);
+                    if (!valid_utf16(bytes[1], 1)) {
+                        ereport(
+                            ERROR,
+                            (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                             errmsg("invalid UTF-16 byte: characters from %d "
+                                    "position define invalid UTF-16 symbol",
+                                    i + 6)));
+                    }
+
+                    result = decode_utf16_pair(bytes);
+                    i += 10;
+                } else {
+                    result = bytes[0];
+                    i += 6;
+                }
+
+                unicode_to_utf8((pg_wchar)result, buffer);
+                strncpy(current, (const char *)buffer, pg_utf_mblen(buffer));
+                current += pg_utf_mblen(buffer);
+            } else if (is_utf8(cinput + i, input_length - i)) {
+                current =
+                    write_character(current, (char2hex(cinput[i + 1]) << 4) |
+                                                 char2hex(cinput[i + 2]));
+                i += 3;
+            } else {
+                // '%' without enough characters left for any escape => error
+                ereport(ERROR,
+                        (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                         errmsg("invalid sequence: not enough characters to "
+                                "decode any UTF-typed symbol from %d position",
+                                i)));
+            }
+        } else if (allowed_character(cinput[i], unreserved_special)) {
+            // allowed and not '%' character => just copy it into result string
+            current = write_character(current, cinput[i]);
+            i += 1;
+        } else {
+            // neither '%' nor an allowed character => 'unexpected character'
+            ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                            errmsg("unalloweed characters in url code: \"%c\"",
+                                   cinput[i])));
+        }
+    }
+    current = write_character(current, 0);
+
+    // Convert to text and return
+    output = cstring_to_text(coutput);
+    pfree(coutput);
+    return output;
+}
+
+Datum encode_url(PG_FUNCTION_ARGS) {
+    if (PG_ARGISNULL(0)) {
+        PG_RETURN_NULL();
+    }
+    PG_RETURN_TEXT_P(encode(PG_GETARG_TEXT_PP(0), ".-~_"));
+}
+
+Datum decode_url(PG_FUNCTION_ARGS) {
+    if (PG_ARGISNULL(0)) {
+        PG_RETURN_NULL();
+    }
+    PG_RETURN_TEXT_P(decode(PG_GETARG_TEXT_PP(0), ".-~_"));
+}
+
+Datum encode_uri(PG_FUNCTION_ARGS) {
+    if (PG_ARGISNULL(0)) {
+        PG_RETURN_NULL();
+    }
+    PG_RETURN_TEXT_P(encode(PG_GETARG_TEXT_PP(0), "-_.!~*'();/?:@&=+$,#"));
+}
+
+Datum decode_uri(PG_FUNCTION_ARGS) {
+    if (PG_ARGISNULL(0)) {
+        PG_RETURN_NULL();
+    }
+    PG_RETURN_TEXT_P(decode(PG_GETARG_TEXT_PP(0), "-_.!~*'();/?:@&=+$,#"));
+}
diff --git a/gpcontrib/gp_url_tools/test/expected/gp_url_tools.out b/gpcontrib/gp_url_tools/test/expected/gp_url_tools.out
new file mode 100644
index 00000000000..16cf3df2ea4
--- /dev/null
+++ b/gpcontrib/gp_url_tools/test/expected/gp_url_tools.out
@@ -0,0 +1,41 @@
+CREATE EXTENSION gp_url_tools;
+SELECT url_tools_schema.encode_url('Hello World');
+  encode_url   
+---------------
+ Hello%20World
+(1 row)
+
+SELECT url_tools_schema.decode_url('Hello%20World');
+ decode_url  
+-------------
+ Hello World
+(1 row)
+
+SELECT url_tools_schema.encode_url(unnest) from unnest(string_to_array('http://hu.wikipedia.org/wiki/São_Paulo','/'));
+    encode_url    
+------------------
+ http%3A
+ 
+ hu.wikipedia.org
+ wiki
+ S%C3%A3o_Paulo
+(5 rows)
+
+SELECT url_tools_schema.encode_uri('http://hu.wikipedia.org/wiki/São_Paulo');
+                 encode_uri                  
+---------------------------------------------
+ http://hu.wikipedia.org/wiki/S%C3%A3o_Paulo
+(1 row)
+
+SELECT md5(url_tools_schema.decode_uri('http://hu.wikipedia.org/wiki/S%C3%A3o_Paulo'));
+               md5                
+----------------------------------
+ 147ded7d471df9cf050bc13242cbf39e
+(1 row)
+
+SELECT md5(url_tools_schema.decode_url('%u6D6A%u82B1%u4E00%u6735%u6735%20%u7B2C8%u96C6%20-%20%u89C6%u9891%u5728%u7EBF%u89C2%u770B%20-%20%u6D6A%u82B1%u4E00%u6735%u6735%20-%20%u8292%u679CTV'));
+               md5                
+----------------------------------
+ d155b1f894fcd5540ba5881fb71753e1
+(1 row)
+
diff --git a/gpcontrib/gp_url_tools/test/sql/gp_url_tools.sql b/gpcontrib/gp_url_tools/test/sql/gp_url_tools.sql
new file mode 100644
index 00000000000..dc9a1fe0819
--- /dev/null
+++ b/gpcontrib/gp_url_tools/test/sql/gp_url_tools.sql
@@ -0,0 +1,13 @@
+CREATE EXTENSION gp_url_tools;
+
+SELECT url_tools_schema.encode_url('Hello World');
+SELECT url_tools_schema.decode_url('Hello%20World');
+
+SELECT url_tools_schema.encode_url(unnest) from unnest(string_to_array('http://hu.wikipedia.org/wiki/São_Paulo','/'));
+
+SELECT url_tools_schema.encode_uri('http://hu.wikipedia.org/wiki/São_Paulo');
+SELECT md5(url_tools_schema.decode_uri('http://hu.wikipedia.org/wiki/S%C3%A3o_Paulo'));
+
+SELECT md5(url_tools_schema.decode_url('%u6D6A%u82B1%u4E00%u6735%u6735%20%u7B2C8%u96C6%20-%20%u89C6%u9891%u5728%u7EBF%u89C2%u770B%20-%20%u6D6A%u82B1%u4E00%u6735%u6735%20-%20%u8292%u679CTV'));
+
+

From 51ecfef21c3fe74016d93358417eaae69d84230f Mon Sep 17 00:00:00 2001
From: Vladislav Shchetinin <v.shchetinin.v@gmail.com>
Date: Thu, 25 Jun 2026 11:34:59 +0300
Subject: [PATCH 2/2] Feat: Adapt gp_url_tools for Cloudberry and fix UTF-16
 surrogate decoding

---
 .github/workflows/build-cloudberry-rocky8.yml |   3 +-
 .github/workflows/build-cloudberry.yml        |   3 +-
 gpcontrib/gp_url_tools/.clang-format          | 192 -----------------
 gpcontrib/gp_url_tools/Makefile               |   6 +
 gpcontrib/gp_url_tools/README.md              |  72 +++++--
 gpcontrib/gp_url_tools/gp_url_tools.control   |   2 +-
 .../gp_url_tools/sql/gp_url_tools--1.0.sql    |   2 +-
 gpcontrib/gp_url_tools/src/gp_url_tools.c     | 195 ++++++++++--------
 .../test/expected/gp_url_tools.out            |  37 +++-
 .../gp_url_tools/test/sql/gp_url_tools.sql    |  24 ++-
 pom.xml                                       |   3 +
 11 files changed, 229 insertions(+), 310 deletions(-)
 delete mode 100644 gpcontrib/gp_url_tools/.clang-format

diff --git a/.github/workflows/build-cloudberry-rocky8.yml b/.github/workflows/build-cloudberry-rocky8.yml
index 2abf88060e3..11cf52e0710 100644
--- a/.github/workflows/build-cloudberry-rocky8.yml
+++ b/.github/workflows/build-cloudberry-rocky8.yml
@@ -318,7 +318,8 @@ jobs:
                "make_configs":["gpcontrib/orafce:installcheck",
                                "gpcontrib/zstd:installcheck",
                                "gpcontrib/gp_sparse_vector:installcheck",
-                               "gpcontrib/gp_toolkit:installcheck"]
+                               "gpcontrib/gp_toolkit:installcheck",
+                               "gpcontrib/gp_url_tools:installcheck"]
               },
               {"test":"ic-fixme",
                "make_configs":["src/test/regress:installcheck-fixme"],
diff --git a/.github/workflows/build-cloudberry.yml b/.github/workflows/build-cloudberry.yml
index ca75f7b42e7..364684a904a 100644
--- a/.github/workflows/build-cloudberry.yml
+++ b/.github/workflows/build-cloudberry.yml
@@ -312,7 +312,8 @@ jobs:
                                "gpcontrib/zstd:installcheck",
                                "gpcontrib/gp_sparse_vector:installcheck",
                                "gpcontrib/gp_toolkit:installcheck",
-                               "gpcontrib/gp_exttable_fdw:installcheck"]
+                               "gpcontrib/gp_exttable_fdw:installcheck",
+                               "gpcontrib/gp_url_tools:installcheck"]
               },
               {"test":"ic-diskquota",
                "make_configs":["gpcontrib/diskquota:installcheck"],
diff --git a/gpcontrib/gp_url_tools/.clang-format b/gpcontrib/gp_url_tools/.clang-format
deleted file mode 100644
index 6f46b9469d9..00000000000
--- a/gpcontrib/gp_url_tools/.clang-format
+++ /dev/null
@@ -1,192 +0,0 @@
----
-Language:        Cpp
-# BasedOnStyle:  LLVM
-AccessModifierOffset: -2
-AlignAfterOpenBracket: Align
-AlignArrayOfStructures: None
-AlignConsecutiveMacros: None
-AlignConsecutiveAssignments: None
-AlignConsecutiveBitFields: None
-AlignConsecutiveDeclarations: None
-AlignEscapedNewlines: Right
-AlignOperands:   Align
-AlignTrailingComments: true
-AllowAllArgumentsOnNextLine: true
-AllowAllParametersOfDeclarationOnNextLine: true
-AllowShortEnumsOnASingleLine: true
-AllowShortBlocksOnASingleLine: Never
-AllowShortCaseLabelsOnASingleLine: false
-AllowShortFunctionsOnASingleLine: All
-AllowShortLambdasOnASingleLine: All
-AllowShortIfStatementsOnASingleLine: Never
-AllowShortLoopsOnASingleLine: false
-AlwaysBreakAfterDefinitionReturnType: None
-AlwaysBreakAfterReturnType: None
-AlwaysBreakBeforeMultilineStrings: false
-AlwaysBreakTemplateDeclarations: MultiLine
-AttributeMacros:
-  - __capability
-BinPackArguments: true
-BinPackParameters: true
-BraceWrapping:
-  AfterCaseLabel:  false
-  AfterClass:      false
-  AfterControlStatement: Never
-  AfterEnum:       false
-  AfterFunction:   false
-  AfterNamespace:  false
-  AfterObjCDeclaration: false
-  AfterStruct:     false
-  AfterUnion:      false
-  AfterExternBlock: false
-  BeforeCatch:     false
-  BeforeElse:      false
-  BeforeLambdaBody: false
-  BeforeWhile:     false
-  IndentBraces:    false
-  SplitEmptyFunction: true
-  SplitEmptyRecord: true
-  SplitEmptyNamespace: true
-BreakBeforeBinaryOperators: None
-BreakBeforeConceptDeclarations: true
-BreakBeforeBraces: Attach
-BreakBeforeInheritanceComma: false
-BreakInheritanceList: BeforeColon
-BreakBeforeTernaryOperators: true
-BreakConstructorInitializersBeforeComma: false
-BreakConstructorInitializers: BeforeColon
-BreakAfterJavaFieldAnnotations: false
-BreakStringLiterals: true
-ColumnLimit:     80
-CommentPragmas:  '^ IWYU pragma:'
-QualifierAlignment: Leave
-CompactNamespaces: false
-ConstructorInitializerIndentWidth: 4
-ContinuationIndentWidth: 4
-Cpp11BracedListStyle: true
-DeriveLineEnding: true
-DerivePointerAlignment: false
-DisableFormat:   false
-EmptyLineAfterAccessModifier: Never
-EmptyLineBeforeAccessModifier: LogicalBlock
-ExperimentalAutoDetectBinPacking: false
-PackConstructorInitializers: BinPack
-BasedOnStyle:    ''
-ConstructorInitializerAllOnOneLineOrOnePerLine: false
-AllowAllConstructorInitializersOnNextLine: true
-FixNamespaceComments: true
-ForEachMacros:
-  - foreach
-  - Q_FOREACH
-  - BOOST_FOREACH
-IfMacros:
-  - KJ_IF_MAYBE
-IncludeBlocks:   Preserve
-IncludeCategories:
-  - Regex:           '^"(llvm|llvm-c|clang|clang-c)/'
-    Priority:        2
-    SortPriority:    0
-    CaseSensitive:   false
-  - Regex:           '^(<|"(gtest|gmock|isl|json)/)'
-    Priority:        3
-    SortPriority:    0
-    CaseSensitive:   false
-  - Regex:           '.*'
-    Priority:        1
-    SortPriority:    0
-    CaseSensitive:   false
-IncludeIsMainRegex: '(Test)?$'
-IncludeIsMainSourceRegex: ''
-IndentAccessModifiers: false
-IndentCaseLabels: false
-IndentCaseBlocks: false
-IndentGotoLabels: true
-IndentPPDirectives: None
-IndentExternBlock: AfterExternBlock
-IndentRequires:  false
-IndentWidth:     4
-IndentWrappedFunctionNames: false
-InsertTrailingCommas: None
-JavaScriptQuotes: Leave
-JavaScriptWrapImports: true
-KeepEmptyLinesAtTheStartOfBlocks: true
-LambdaBodyIndentation: Signature
-MacroBlockBegin: ''
-MacroBlockEnd:   ''
-MaxEmptyLinesToKeep: 1
-NamespaceIndentation: None
-ObjCBinPackProtocolList: Auto
-ObjCBlockIndentWidth: 2
-ObjCBreakBeforeNestedBlockParam: true
-ObjCSpaceAfterProperty: false
-ObjCSpaceBeforeProtocolList: true
-PenaltyBreakAssignment: 2
-PenaltyBreakBeforeFirstCallParameter: 19
-PenaltyBreakComment: 300
-PenaltyBreakFirstLessLess: 120
-PenaltyBreakOpenParenthesis: 0
-PenaltyBreakString: 1000
-PenaltyBreakTemplateDeclaration: 10
-PenaltyExcessCharacter: 1000000
-PenaltyReturnTypeOnItsOwnLine: 60
-PenaltyIndentedWhitespace: 0
-PointerAlignment: Right
-PPIndentWidth:   -1
-ReferenceAlignment: Pointer
-ReflowComments:  true
-RemoveBracesLLVM: false
-SeparateDefinitionBlocks: Leave
-ShortNamespaceLines: 1
-SortIncludes:    CaseSensitive
-SortJavaStaticImport: Before
-SortUsingDeclarations: true
-SpaceAfterCStyleCast: false
-SpaceAfterLogicalNot: false
-SpaceAfterTemplateKeyword: true
-SpaceBeforeAssignmentOperators: true
-SpaceBeforeCaseColon: false
-SpaceBeforeCpp11BracedList: false
-SpaceBeforeCtorInitializerColon: true
-SpaceBeforeInheritanceColon: true
-SpaceBeforeParens: ControlStatements
-SpaceBeforeParensOptions:
-  AfterControlStatements: true
-  AfterForeachMacros: true
-  AfterFunctionDefinitionName: false
-  AfterFunctionDeclarationName: false
-  AfterIfMacros:   true
-  AfterOverloadedOperator: false
-  BeforeNonEmptyParentheses: false
-SpaceAroundPointerQualifiers: Default
-SpaceBeforeRangeBasedForLoopColon: true
-SpaceInEmptyBlock: false
-SpaceInEmptyParentheses: false
-SpacesBeforeTrailingComments: 1
-SpacesInAngles:  Never
-SpacesInConditionalStatement: false
-SpacesInContainerLiterals: true
-SpacesInCStyleCastParentheses: false
-SpacesInLineCommentPrefix:
-  Minimum:         1
-  Maximum:         -1
-SpacesInParentheses: false
-SpacesInSquareBrackets: false
-SpaceBeforeSquareBrackets: false
-BitFieldColonSpacing: Both
-Standard:        Latest
-StatementAttributeLikeMacros:
-  - Q_EMIT
-StatementMacros:
-  - Q_UNUSED
-  - QT_REQUIRE_VERSION
-TabWidth:        8
-UseCRLF:         false
-UseTab:          Never
-WhitespaceSensitiveMacros:
-  - STRINGIZE
-  - PP_STRINGIZE
-  - BOOST_PP_STRINGIZE
-  - NS_SWIFT_NAME
-  - CF_SWIFT_NAME
-...
-
diff --git a/gpcontrib/gp_url_tools/Makefile b/gpcontrib/gp_url_tools/Makefile
index 55c4767d637..c161751a88e 100644
--- a/gpcontrib/gp_url_tools/Makefile
+++ b/gpcontrib/gp_url_tools/Makefile
@@ -6,6 +6,14 @@ TESTS        = $(wildcard test/sql/*.sql)
 REGRESS      = $(patsubst test/sql/%.sql,%,$(TESTS))
 REGRESS_OPTS = --inputdir=test
 
+ifdef USE_PGXS
 PG_CONFIG    = pg_config
 PGXS        := $(shell $(PG_CONFIG) --pgxs)
 include $(PGXS)
+else
+# In-tree build: subdir is this directory's path relative to the source root
+subdir       = gpcontrib/gp_url_tools
+top_builddir = ../..
+include $(top_builddir)/src/Makefile.global
+include $(top_srcdir)/contrib/contrib-global.mk
+endif
diff --git a/gpcontrib/gp_url_tools/README.md b/gpcontrib/gp_url_tools/README.md
index 172e6dc8155..b95fa95a4ed 100644
--- a/gpcontrib/gp_url_tools/README.md
+++ b/gpcontrib/gp_url_tools/README.md
@@ -1,45 +1,75 @@
-# gp_url_tools: Greenplum extension providing functionality for working with URL addresses
+<!--
+  Licensed to the Apache Software Foundation (ASF) under one
+  or more contributor license agreements.  See the NOTICE file
+  distributed with this work for additional information
+  regarding copyright ownership.  The ASF licenses this file
+  to you under the Apache License, Version 2.0 (the
+  "License"); you may not use this file except in compliance
+  with the License.  You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+  Unless required by applicable law or agreed to in writing,
+  software distributed under the License is distributed on an
+  "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+  KIND, either express or implied.  See the License for the
+  specific language governing permissions and limitations
+  under the License.
+-->
+
+# gp_url_tools: Cloudberry extension providing functionality for working with URL addresses
 
 ### Features
-gp_url_tools is an extension for the Greenplum database that gives implementation 
+`gp_url_tools` is an extension for the Cloudberry database that provides an implementation
 for functions that encode/decode url/uri.
 
-### Installation
-Install from source:
-```
-git clone https://github.com/open-gpdb/gp_url_tools.git
-cd gp_url_tools
-# Build it. Building would require GP installed nearby and sourcing greenplum_path.sh
-source <path_to_gp>/greenplum_path.sh
-make && make install
-```
+### Functions
+The extension creates the `url_tools_schema` schema and adds four SQL functions:
+
+- `url_tools_schema.encode_url`/`.encode_uri`  
+  Encodes a text value for use as a URL/URI component by replacing reserved characters with percent-encoded sequences.
+
+- `url_tools_schema.decode_url`/`.decode_uri`  
+  Decodes percent-encoded sequences in a URL/URI-encoded text value back to their original characters (human-readable).
 
 ### Usage
+```sql
+CREATE EXTENSION gp_url_tools;
 ```
-=# create extension gp_url_tools;
-
-=# select url_tools_schema.encode_url('Hello World');
+```sql
+SELECT url_tools_schema.encode_url('Hello World');
+```
+```text
   encode_url
 ───────────────
  Hello%20World
 (1 row)
-
-=# select url_tools_schema.decode_url('Hello%20World');
+```
+```sql
+SELECT url_tools_schema.decode_url('Hello%20World');
+```
+```text
  decode_url  
 ─────────────
  Hello World
 (1 row)
-
-=# select url_tools_schema.encode_uri('https://ru.wikipedia.org/wiki/Greenplum_(компания)');
+```
+```sql
+SELECT url_tools_schema.encode_uri('https://ru.wikipedia.org/wiki/Greenplum_(компания)');
+```
+```text
                                          encode_uri                  
 ────────────────────────────────────────────────────────────────────────────────────────────
  https://ru.wikipedia.org/wiki/Greenplum_(%D0%BA%D0%BE%D0%BC%D0%BF%D0%B0%D0%BD%D0%B8%D1%8F)
-
-=# select url_tools_schema.decode_uri('https://ru.wikipedia.org/wiki/Greenplum_(%D0%BA%D0%BE%D0%BC%D0%BF%D0%B0%D0%BD%D0%B8%D1%8F)');
+```
+```sql
+SELECT url_tools_schema.decode_uri('https://ru.wikipedia.org/wiki/Greenplum_(%D0%BA%D0%BE%D0%BC%D0%BF%D0%B0%D0%BD%D0%B8%D1%8F)');
+```
+```text
                      decode_uri               
 ────────────────────────────────────────────────────
  https://ru.wikipedia.org/wiki/Greenplum_(компания)
 ```
 
 ### Acknowledgments
-Thank you very much for the extension for postgrsql: https://github.com/okbob/url_encode, its sources were very useful.
+Many thanks to the authors of the PostgreSQL extension https://github.com/okbob/url_encode; its sources were very useful.
diff --git a/gpcontrib/gp_url_tools/gp_url_tools.control b/gpcontrib/gp_url_tools/gp_url_tools.control
index ecbfab545b6..cb16430ad62 100644
--- a/gpcontrib/gp_url_tools/gp_url_tools.control
+++ b/gpcontrib/gp_url_tools/gp_url_tools.control
@@ -1,5 +1,5 @@
 # gp_url_tools extension
-comment = 'Greenplum extension providing functionality for working with URL addresses'
+comment = 'Functions for working with URL-s'
 default_version = '1.0'
 module_pathname = '$libdir/gp_url_tools'
 relocatable = true
diff --git a/gpcontrib/gp_url_tools/sql/gp_url_tools--1.0.sql b/gpcontrib/gp_url_tools/sql/gp_url_tools--1.0.sql
index 04183a97bc7..3b2a773719a 100644
--- a/gpcontrib/gp_url_tools/sql/gp_url_tools--1.0.sql
+++ b/gpcontrib/gp_url_tools/sql/gp_url_tools--1.0.sql
@@ -4,6 +4,7 @@
 \echo Use "CREATE EXTENSION gp_url_tools" to load this file. \quit
 
 CREATE SCHEMA IF NOT EXISTS url_tools_schema;
+GRANT USAGE ON SCHEMA url_tools_schema TO public;
 
 CREATE FUNCTION url_tools_schema.encode_url(text)
 RETURNS text
@@ -24,4 +25,3 @@ CREATE FUNCTION url_tools_schema.decode_uri(text)
 RETURNS text
 AS 'MODULE_PATHNAME', 'decode_uri'
 LANGUAGE C IMMUTABLE STRICT;
-
diff --git a/gpcontrib/gp_url_tools/src/gp_url_tools.c b/gpcontrib/gp_url_tools/src/gp_url_tools.c
index 49720a3a95f..7c397828ded 100644
--- a/gpcontrib/gp_url_tools/src/gp_url_tools.c
+++ b/gpcontrib/gp_url_tools/src/gp_url_tools.c
@@ -1,3 +1,30 @@
+/*-------------------------------------------------------------------------
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ * gp_url_tools.c
+ *
+ * IDENTIFICATION
+ *	  gpcontrib/gp_url_tools/src/gp_url_tools.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
 #include "postgres.h"
 
 #include "fmgr.h"
@@ -11,56 +38,43 @@ PG_FUNCTION_INFO_V1(decode_url);
 PG_FUNCTION_INFO_V1(encode_uri);
 PG_FUNCTION_INFO_V1(decode_uri);
 
-Datum url_encode(PG_FUNCTION_ARGS);
-Datum url_decode(PG_FUNCTION_ARGS);
-Datum uri_encode(PG_FUNCTION_ARGS);
-Datum uri_decode(PG_FUNCTION_ARGS);
-
-static bool allowed_character(const char c, const char *unreserved_special);
-static unsigned char char2hex(char c);
-static char *write_character(char *output, const char c);
-static void valid_encoding_length(char *current, char *end, int length);
-static text *encode(text *input, const char *unreserved_special);
-static bool valid_utf16(unsigned int byte, int byte_num);
-static unsigned int decode_utf16_pair(unsigned int bytes[2]);
-static text *decode(text *input, const char *unreserved_special);
-static bool is_utf8(const char *sequence, int length);
-static bool is_utf16(const char *sequence, int length);
-static void fetch_utf16(unsigned int *byte, const char *input);
-
 static const unsigned int utf16_low[2] = {0xD800, 0xDC00};
 static const unsigned int utf16_high[2] = {0xDBFF, 0xDFFF};
 static const unsigned int utf16_decode = 0x03FF;
 static const unsigned int utf16_decode_base = 0x10000;
-
-unsigned char char2hex(char c) {
-    if ('0' <= c && c <= '9') {
+static const int utf8_with_percent_length = 3;          // Example: '%20'
+static const int utf16_with_percent_length = 6;         // Example: '%u0430'
+static const int utf16_surrogate_pair_length = 12;      // Example: '%uD800%uDC00'
+static const int utf16_second_codepoint_offset = 8;     // '%uD800%uDC00' => ('%uD800%u'.length == 8)
+static const int utf16_past_first_codepoint_offset = 6; // '%uD800%uDC00' => ('%uD800'.length == 6)
+
+static unsigned char hex_char_to_value(char c) {
+    if ('0' <= c && c <= '9')
         return c - '0';
-    } else if ('A' <= c && c <= 'Z') {
+    if ('A' <= c && c <= 'F')
         return c - 'A' + 10;
-    } else if ('a' <= c && c <= 'z') {
+    if ('a' <= c && c <= 'f')
         return c - 'a' + 10;
-    }
     ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
                     errmsg("invalid hexadecimal digit: \"%c\"", c)));
-    return -1;
+    pg_unreachable();
 }
 
-bool allowed_character(const char c, const char *unreserved_special) {
+static bool allowed_character(const char c, const char *unreserved_special) {
     return ('0' <= c && c <= '9') || ('A' <= c && c <= 'Z') ||
            ('a' <= c && c <= 'z') || (strchr(unreserved_special, c) != NULL);
 }
 
-char *write_character(char *output, const char c) {
+static char *write_character(char *output, const char c) {
     *output = c;
     return ++output;
 }
 
-void valid_encoding_length(char *current, char *end, int length) {
+static void valid_encoding_length(char *current, char *end, int length) {
     Assert(current + length <= end);
 }
 
-text *encode(text *input, const char *unreserved_special) {
+static text *encode(text *input, const char *unreserved_special) {
     int input_length, output_length;
     text *output;
     char *cinput, *coutput, *current, *cend;
@@ -68,8 +82,10 @@ text *encode(text *input, const char *unreserved_special) {
     // Convert input data for processing
     cinput = text_to_cstring(input);
     input_length = strlen(cinput);
-    // Allocate memory for result url string (allocate more memory for bad
-    // cases)
+    /*
+     * Worst case: every input byte becomes '%XX' (3 output chars).
+     * The +1 accounts for the null terminator.
+     */
     output_length = 3 * input_length + 1;
     coutput = palloc(sizeof(*coutput) * output_length);
     current = coutput;
@@ -77,30 +93,29 @@ text *encode(text *input, const char *unreserved_special) {
 
     for (int i = 0; i < input_length; ++i) {
         if (allowed_character(cinput[i], unreserved_special)) {
-            // single character => does not encode it or skip it
+            // Allowed character => copy it into result string
             valid_encoding_length(current, cend, 1);
             current = write_character(current, cinput[i]);
         } else {
-            // some characters => process them all into '%XX' or '%XXXX'
-            // notation
-            valid_encoding_length(current, cend, 2);
+            // Percent-encode byte as '%XX'
+            valid_encoding_length(current, cend, 3);
             current += sprintf(current, "%%%02X", (unsigned char)cinput[i]);
         }
     }
+    // Terminate result string
     valid_encoding_length(current, cend, 1);
-    current = write_character(current, 0);
+    current = write_character(current, '\0');
 
-    // Convert to text and return
     output = cstring_to_text(coutput);
     pfree(coutput);
     return output;
 }
 
-bool valid_utf16(unsigned int byte, int byte_num) {
+static bool valid_utf16(unsigned int byte, int byte_num) {
     return utf16_low[byte_num] <= byte && byte <= utf16_high[byte_num];
 }
 
-unsigned int decode_utf16_pair(unsigned int bytes[2]) {
+static unsigned int decode_utf16_pair(unsigned int bytes[2]) {
     Assert(valid_utf16(bytes[0], 0));
     Assert(valid_utf16(bytes[1], 1));
 
@@ -108,56 +123,56 @@ unsigned int decode_utf16_pair(unsigned int bytes[2]) {
             (bytes[1] & utf16_decode));
 }
 
-// Check that sequence of bytes starts with 'symbol' in UTF-8 encoding
-//
-// UTF-16 'symbols' starts with '%' or '%', and 'XX' after it.
-// 'XX' - hex sequence that encode bytes
-bool is_utf8(const char *sequence, int length) {
-    return 3 <= length && sequence[0] == '%' && sequence[1] != 'u' &&
-           sequence[1] != 'U';
+/*
+ * Check whether the sequence starts with a percent-encoded UTF-8 byte (%XX).
+ *
+ * A UTF-8 percent-encoded byte starts with '%' followed by exactly two hex
+ * digits (e.g. "%20", "%D0").  This is distinguished from a UTF-16 sequence
+ * which starts with '%u' or '%U' (e.g. "%uD83D").
+ *
+ * Requires at least 3 characters: '%' + 2 hex digits.
+ */
+static bool is_utf8(const char *sequence, int length) {
+    return utf8_with_percent_length <= length && sequence[0] == '%' &&
+           sequence[1] != 'u' && sequence[1] != 'U';
 }
 
-// Check that sequence of bytes starts with 'symbol' in UTF-16 encoding
-//
-// UTF-16 'symbols' starts with '%u' or '%U', and 'XXXX' after it.
-// 'XXXX' - hex sequence that encode bytes (optinally sequence 'XXXX' ->
-// 'XXXXXXXX')
-bool is_utf16(const char *sequence, int length) {
-    return 6 <= length && sequence[0] == '%' &&
+/*
+ * Check whether the sequence starts with a legacy percent-encoded UTF-16 unit
+ * ('%uXXXX' or '%UXXXX'). Requires at least 6 characters: '%u' + 4 hex digits.
+ */
+static bool is_utf16(const char *sequence, int length) {
+    return utf16_with_percent_length <= length && sequence[0] == '%' &&
            (sequence[1] == 'u' || sequence[1] == 'U');
 }
 
-void fetch_utf16(unsigned int *byte, const char *input) {
-    for (int i = 0; i < 4; ++i) {
-        *byte = ((*byte) << 4) | char2hex(input[i]);
-    }
+static void fetch_utf16(unsigned int *byte, const char *input) {
+    for (int i = 0; i < 4; ++i)
+        *byte = ((*byte) << 4) | hex_char_to_value(input[i]);
 }
 
-text *decode(text *input, const char *unreserved_special) {
+static text *decode(text *input, const char *unreserved_special) {
     int input_length;
     text *output;
     char *cinput, *coutput, *current;
 
-    // Convert input data for processing
     cinput = text_to_cstring(input);
     input_length = strlen(cinput);
-    // Allocate memory for result string
     coutput = palloc(sizeof(*coutput) * (input_length + 1));
     current = coutput;
 
     for (int i = 0; i < input_length;) {
         if (cinput[i] == '%') {
-            // special character => start process '%XX' or '%XXXX' sequence of
-            // chars
+            // '%' starts an escape => decode a '%uXXXX' or '%XX' sequence
             if (is_utf16(cinput + i, input_length - i)) {
                 unsigned int result = 0;
-                unsigned int bytes[2] = {0, 0};
-                unsigned char buffer[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0};
+                unsigned int bytes[2] = {};
+                unsigned char buffer[10] = {};
 
                 fetch_utf16(bytes, cinput + i + 2);
 
                 if (valid_utf16(bytes[0], 0)) {
-                    if (10 < input_length - i) {
+                    if (input_length - i < utf16_surrogate_pair_length) {
                         ereport(
                             ERROR,
                             (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
@@ -166,34 +181,35 @@ text *decode(text *input, const char *unreserved_special) {
                                     i)));
                     }
 
-                    fetch_utf16(bytes + 1, cinput + i + 6);
+                    fetch_utf16(bytes + 1,
+                                cinput + i + utf16_second_codepoint_offset);
                     if (!valid_utf16(bytes[1], 1)) {
                         ereport(
                             ERROR,
                             (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
                              errmsg("invalid UTF-16 byte: characters from %d "
                                     "position define invalid UTF-16 symbol",
-                                    i + 6)));
+                                    i + utf16_past_first_codepoint_offset)));
                     }
 
                     result = decode_utf16_pair(bytes);
-                    i += 10;
+                    i += utf16_surrogate_pair_length;
                 } else {
                     result = bytes[0];
-                    i += 6;
+                    i += utf16_with_percent_length;
                 }
 
                 unicode_to_utf8((pg_wchar)result, buffer);
-                strncpy(current, (const char *)buffer, pg_utf_mblen(buffer));
+                memcpy(current, buffer, pg_utf_mblen(buffer));
                 current += pg_utf_mblen(buffer);
             } else if (is_utf8(cinput + i, input_length - i)) {
                 current =
-                    write_character(current, (char2hex(cinput[i + 1]) << 4) |
-                                                 char2hex(cinput[i + 2]));
+                    write_character(current, (hex_char_to_value(cinput[i + 1]) << 4) |
+                                                 hex_char_to_value(cinput[i + 2]));
                 i += 3;
             } else {
-                // common case: not enough characters in line to decode special
-                // sequence => error 'incorrect sequence of tokens'
+                // '%' starts a special sequence, but there are not enough
+                // characters left to decode it => error 'incorrect sequence of tokens'
                 ereport(ERROR,
                         (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
                          errmsg("invalid sequence: not enough characters to "
@@ -201,49 +217,46 @@ text *decode(text *input, const char *unreserved_special) {
                                 i)));
             }
         } else if (allowed_character(cinput[i], unreserved_special)) {
-            // allowed and not '%' character => just copy it into result string
+            // Copy an unescaped character that is allowed
             current = write_character(current, cinput[i]);
             i += 1;
         } else {
-            // cinput[i] - is not '%' and not allowed character => error
-            // 'unexpected character'
             ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
-                            errmsg("unalloweed characters in url code: \"%c\"",
+                            errmsg("disallowed characters in URL: \"%c\"",
                                    cinput[i])));
         }
     }
-    current = write_character(current, 0);
+    current = write_character(current, '\0');
 
-    // Convert to text and return
     output = cstring_to_text(coutput);
     pfree(coutput);
     return output;
 }
 
+static const char *url_unreserved_special = ".-~_";
+
 Datum encode_url(PG_FUNCTION_ARGS) {
-    if (PG_ARGISNULL(0)) {
+    if (PG_ARGISNULL(0))
         PG_RETURN_NULL();
-    }
-    PG_RETURN_TEXT_P(encode(PG_GETARG_TEXT_PP(0), ".-~_"));
+    PG_RETURN_TEXT_P(encode(PG_GETARG_TEXT_PP(0), url_unreserved_special));
 }
 
 Datum decode_url(PG_FUNCTION_ARGS) {
-    if (PG_ARGISNULL(0)) {
+    if (PG_ARGISNULL(0))
         PG_RETURN_NULL();
-    }
-    PG_RETURN_TEXT_P(decode(PG_GETARG_TEXT_PP(0), ".-~_"));
+    PG_RETURN_TEXT_P(decode(PG_GETARG_TEXT_PP(0), url_unreserved_special));
 }
 
+static const char *uri_unreserved_special = "-_.!~*'();/?:@&=+$,#";
+
 Datum encode_uri(PG_FUNCTION_ARGS) {
-    if (PG_ARGISNULL(0)) {
+    if (PG_ARGISNULL(0))
         PG_RETURN_NULL();
-    }
-    PG_RETURN_TEXT_P(encode(PG_GETARG_TEXT_PP(0), "-_.!~*'();/?:@&=+$,#"));
+    PG_RETURN_TEXT_P(encode(PG_GETARG_TEXT_PP(0), uri_unreserved_special));
 }
 
 Datum decode_uri(PG_FUNCTION_ARGS) {
-    if (PG_ARGISNULL(0)) {
+    if (PG_ARGISNULL(0))
         PG_RETURN_NULL();
-    }
-    PG_RETURN_TEXT_P(decode(PG_GETARG_TEXT_PP(0), "-_.!~*'();/?:@&=+$,#"));
+    PG_RETURN_TEXT_P(decode(PG_GETARG_TEXT_PP(0), uri_unreserved_special));
 }
diff --git a/gpcontrib/gp_url_tools/test/expected/gp_url_tools.out b/gpcontrib/gp_url_tools/test/expected/gp_url_tools.out
index 16cf3df2ea4..ba6043c0e51 100644
--- a/gpcontrib/gp_url_tools/test/expected/gp_url_tools.out
+++ b/gpcontrib/gp_url_tools/test/expected/gp_url_tools.out
@@ -1,4 +1,8 @@
-CREATE EXTENSION gp_url_tools;
+-- start_ignore
+CREATE EXTENSION IF NOT EXISTS gp_url_tools;
+-- end_ignore
+SET client_encoding TO UTF8;
+-- Basic encode/decode with ASCII and %XX escaping.
 SELECT url_tools_schema.encode_url('Hello World');
   encode_url   
 ---------------
@@ -11,6 +15,7 @@ SELECT url_tools_schema.decode_url('Hello%20World');
  Hello World
 (1 row)
 
+-- encode_url() should escape reserved URL characters like ':'.
 SELECT url_tools_schema.encode_url(unnest) from unnest(string_to_array('http://hu.wikipedia.org/wiki/São_Paulo','/'));
     encode_url    
 ------------------
@@ -21,6 +26,7 @@ SELECT url_tools_schema.encode_url(unnest) from unnest(string_to_array('http://h
  S%C3%A3o_Paulo
 (5 rows)
 
+-- encode_uri() keeps URI delimiters, decode_uri() reverses UTF-8 %XX escaping.
 SELECT url_tools_schema.encode_uri('http://hu.wikipedia.org/wiki/São_Paulo');
                  encode_uri                  
 ---------------------------------------------
@@ -33,9 +39,38 @@ SELECT md5(url_tools_schema.decode_uri('http://hu.wikipedia.org/wiki/S%C3%A3o_Pa
  147ded7d471df9cf050bc13242cbf39e
 (1 row)
 
+-- Legacy UTF-16 %uXXXX decoding for BMP characters.
 SELECT md5(url_tools_schema.decode_url('%u6D6A%u82B1%u4E00%u6735%u6735%20%u7B2C8%u96C6%20-%20%u89C6%u9891%u5728%u7EBF%u89C2%u770B%20-%20%u6D6A%u82B1%u4E00%u6735%u6735%20-%20%u8292%u679CTV'));
                md5                
 ----------------------------------
  d155b1f894fcd5540ba5881fb71753e1
 (1 row)
 
+-- Single UTF-16 surrogate pair should decode to one Unicode character.
+SELECT url_tools_schema.decode_url('%uD83D%uDE00');
+ decode_url 
+------------
+ 😀
+(1 row)
+
+-- Surrogate pair should also decode correctly in the middle of a string.
+SELECT url_tools_schema.decode_url('hello%uD83D%uDE00world');
+ decode_url  
+-------------
+ hello😀world
+(1 row)
+
+-- Mixed input: ASCII, UTF-8 %XX, UTF-16 BMP, and UTF-16 surrogate pair.
+SELECT url_tools_schema.decode_url('A%20%C3%A3%20%u6D6A%20%uD83D%uDE00');
+ decode_url 
+------------
+ A ã 浪 😀
+(1 row)
+
+-- Truncated surrogate pair should raise an error.
+SELECT url_tools_schema.decode_url('%uD83D');
+ERROR:  invalid sequence: not enough characters to decode UTF-16 symbol from 0 position
+-- High surrogate followed by a non-low-surrogate code unit should fail.
+SELECT url_tools_schema.decode_url('%uD83D%u0041');
+ERROR:  invalid UTF-16 byte: characters from 6 position define invalid UTF-16 symbol
+DROP EXTENSION gp_url_tools;
diff --git a/gpcontrib/gp_url_tools/test/sql/gp_url_tools.sql b/gpcontrib/gp_url_tools/test/sql/gp_url_tools.sql
index dc9a1fe0819..33ebc6781c1 100644
--- a/gpcontrib/gp_url_tools/test/sql/gp_url_tools.sql
+++ b/gpcontrib/gp_url_tools/test/sql/gp_url_tools.sql
@@ -1,13 +1,35 @@
-CREATE EXTENSION gp_url_tools;
+-- start_ignore
+CREATE EXTENSION IF NOT EXISTS gp_url_tools;
+-- end_ignore
+SET client_encoding TO UTF8;
 
+-- Basic encode/decode with ASCII and %XX escaping.
 SELECT url_tools_schema.encode_url('Hello World');
 SELECT url_tools_schema.decode_url('Hello%20World');
 
+-- encode_url() should escape reserved URL characters like ':'.
 SELECT url_tools_schema.encode_url(unnest) from unnest(string_to_array('http://hu.wikipedia.org/wiki/São_Paulo','/'));
 
+-- encode_uri() keeps URI delimiters, decode_uri() reverses UTF-8 %XX escaping.
 SELECT url_tools_schema.encode_uri('http://hu.wikipedia.org/wiki/São_Paulo');
 SELECT md5(url_tools_schema.decode_uri('http://hu.wikipedia.org/wiki/S%C3%A3o_Paulo'));
 
+-- Legacy UTF-16 %uXXXX decoding for BMP characters.
 SELECT md5(url_tools_schema.decode_url('%u6D6A%u82B1%u4E00%u6735%u6735%20%u7B2C8%u96C6%20-%20%u89C6%u9891%u5728%u7EBF%u89C2%u770B%20-%20%u6D6A%u82B1%u4E00%u6735%u6735%20-%20%u8292%u679CTV'));
 
+-- Single UTF-16 surrogate pair should decode to one Unicode character.
+SELECT url_tools_schema.decode_url('%uD83D%uDE00');
 
+-- Surrogate pair should also decode correctly in the middle of a string.
+SELECT url_tools_schema.decode_url('hello%uD83D%uDE00world');
+
+-- Mixed input: ASCII, UTF-8 %XX, UTF-16 BMP, and UTF-16 surrogate pair.
+SELECT url_tools_schema.decode_url('A%20%C3%A3%20%u6D6A%20%uD83D%uDE00');
+
+-- Truncated surrogate pair should raise an error.
+SELECT url_tools_schema.decode_url('%uD83D');
+
+-- High surrogate followed by a non-low-surrogate code unit should fail.
+SELECT url_tools_schema.decode_url('%uD83D%u0041');
+
+DROP EXTENSION gp_url_tools;
diff --git a/pom.xml b/pom.xml
index 0e000093399..98e1931d8da 100644
--- a/pom.xml
+++ b/pom.xml
@@ -155,6 +155,9 @@ code or new licensing patterns.
 
             <exclude>gpcontrib/diskquota/**</exclude>
 
+            <exclude>gpcontrib/gp_url_tools/Makefile</exclude>
+            <exclude>gpcontrib/gp_url_tools/gp_url_tools.control</exclude>
+
             <exclude>getversion</exclude>
             <exclude>.git-blame-ignore-revs</exclude>
             <exclude>.dir-locals.el</exclude>