From a30369573350d68b414aaa3494767695ab172c06 Mon Sep 17 00:00:00 2001 From: Ovchinnikov Andrew <63587191+AndrewOvvv@users.noreply.github.com> Date: Tue, 23 Jun 2026 19:47:50 +0300 Subject: [PATCH 1/2] Feat: Import gp_url_tools into gpcontrib from greenplum --- gpcontrib/gp_url_tools/.clang-format | 192 ++++++++++++++ gpcontrib/gp_url_tools/Makefile | 11 + gpcontrib/gp_url_tools/README.md | 45 ++++ gpcontrib/gp_url_tools/gp_url_tools.control | 6 + .../gp_url_tools/sql/gp_url_tools--1.0.sql | 27 ++ gpcontrib/gp_url_tools/src/gp_url_tools.c | 249 ++++++++++++++++++ .../test/expected/gp_url_tools.out | 41 +++ .../gp_url_tools/test/sql/gp_url_tools.sql | 13 + 8 files changed, 584 insertions(+) create mode 100644 gpcontrib/gp_url_tools/.clang-format create mode 100644 gpcontrib/gp_url_tools/Makefile create mode 100644 gpcontrib/gp_url_tools/README.md create mode 100644 gpcontrib/gp_url_tools/gp_url_tools.control create mode 100644 gpcontrib/gp_url_tools/sql/gp_url_tools--1.0.sql create mode 100644 gpcontrib/gp_url_tools/src/gp_url_tools.c create mode 100644 gpcontrib/gp_url_tools/test/expected/gp_url_tools.out create mode 100644 gpcontrib/gp_url_tools/test/sql/gp_url_tools.sql diff --git a/gpcontrib/gp_url_tools/.clang-format b/gpcontrib/gp_url_tools/.clang-format new file mode 100644 index 00000000000..6f46b9469d9 --- /dev/null +++ b/gpcontrib/gp_url_tools/.clang-format @@ -0,0 +1,192 @@ +--- +Language: Cpp +# BasedOnStyle: LLVM +AccessModifierOffset: -2 +AlignAfterOpenBracket: Align +AlignArrayOfStructures: None +AlignConsecutiveMacros: None +AlignConsecutiveAssignments: None +AlignConsecutiveBitFields: None +AlignConsecutiveDeclarations: None +AlignEscapedNewlines: Right +AlignOperands: Align +AlignTrailingComments: true +AllowAllArgumentsOnNextLine: true +AllowAllParametersOfDeclarationOnNextLine: true +AllowShortEnumsOnASingleLine: true +AllowShortBlocksOnASingleLine: Never +AllowShortCaseLabelsOnASingleLine: false +AllowShortFunctionsOnASingleLine: All 
+AllowShortLambdasOnASingleLine: All +AllowShortIfStatementsOnASingleLine: Never +AllowShortLoopsOnASingleLine: false +AlwaysBreakAfterDefinitionReturnType: None +AlwaysBreakAfterReturnType: None +AlwaysBreakBeforeMultilineStrings: false +AlwaysBreakTemplateDeclarations: MultiLine +AttributeMacros: + - __capability +BinPackArguments: true +BinPackParameters: true +BraceWrapping: + AfterCaseLabel: false + AfterClass: false + AfterControlStatement: Never + AfterEnum: false + AfterFunction: false + AfterNamespace: false + AfterObjCDeclaration: false + AfterStruct: false + AfterUnion: false + AfterExternBlock: false + BeforeCatch: false + BeforeElse: false + BeforeLambdaBody: false + BeforeWhile: false + IndentBraces: false + SplitEmptyFunction: true + SplitEmptyRecord: true + SplitEmptyNamespace: true +BreakBeforeBinaryOperators: None +BreakBeforeConceptDeclarations: true +BreakBeforeBraces: Attach +BreakBeforeInheritanceComma: false +BreakInheritanceList: BeforeColon +BreakBeforeTernaryOperators: true +BreakConstructorInitializersBeforeComma: false +BreakConstructorInitializers: BeforeColon +BreakAfterJavaFieldAnnotations: false +BreakStringLiterals: true +ColumnLimit: 80 +CommentPragmas: '^ IWYU pragma:' +QualifierAlignment: Leave +CompactNamespaces: false +ConstructorInitializerIndentWidth: 4 +ContinuationIndentWidth: 4 +Cpp11BracedListStyle: true +DeriveLineEnding: true +DerivePointerAlignment: false +DisableFormat: false +EmptyLineAfterAccessModifier: Never +EmptyLineBeforeAccessModifier: LogicalBlock +ExperimentalAutoDetectBinPacking: false +PackConstructorInitializers: BinPack +BasedOnStyle: '' +ConstructorInitializerAllOnOneLineOrOnePerLine: false +AllowAllConstructorInitializersOnNextLine: true +FixNamespaceComments: true +ForEachMacros: + - foreach + - Q_FOREACH + - BOOST_FOREACH +IfMacros: + - KJ_IF_MAYBE +IncludeBlocks: Preserve +IncludeCategories: + - Regex: '^"(llvm|llvm-c|clang|clang-c)/' + Priority: 2 + SortPriority: 0 + CaseSensitive: false + - Regex: 
'^(<|"(gtest|gmock|isl|json)/)' + Priority: 3 + SortPriority: 0 + CaseSensitive: false + - Regex: '.*' + Priority: 1 + SortPriority: 0 + CaseSensitive: false +IncludeIsMainRegex: '(Test)?$' +IncludeIsMainSourceRegex: '' +IndentAccessModifiers: false +IndentCaseLabels: false +IndentCaseBlocks: false +IndentGotoLabels: true +IndentPPDirectives: None +IndentExternBlock: AfterExternBlock +IndentRequires: false +IndentWidth: 4 +IndentWrappedFunctionNames: false +InsertTrailingCommas: None +JavaScriptQuotes: Leave +JavaScriptWrapImports: true +KeepEmptyLinesAtTheStartOfBlocks: true +LambdaBodyIndentation: Signature +MacroBlockBegin: '' +MacroBlockEnd: '' +MaxEmptyLinesToKeep: 1 +NamespaceIndentation: None +ObjCBinPackProtocolList: Auto +ObjCBlockIndentWidth: 2 +ObjCBreakBeforeNestedBlockParam: true +ObjCSpaceAfterProperty: false +ObjCSpaceBeforeProtocolList: true +PenaltyBreakAssignment: 2 +PenaltyBreakBeforeFirstCallParameter: 19 +PenaltyBreakComment: 300 +PenaltyBreakFirstLessLess: 120 +PenaltyBreakOpenParenthesis: 0 +PenaltyBreakString: 1000 +PenaltyBreakTemplateDeclaration: 10 +PenaltyExcessCharacter: 1000000 +PenaltyReturnTypeOnItsOwnLine: 60 +PenaltyIndentedWhitespace: 0 +PointerAlignment: Right +PPIndentWidth: -1 +ReferenceAlignment: Pointer +ReflowComments: true +RemoveBracesLLVM: false +SeparateDefinitionBlocks: Leave +ShortNamespaceLines: 1 +SortIncludes: CaseSensitive +SortJavaStaticImport: Before +SortUsingDeclarations: true +SpaceAfterCStyleCast: false +SpaceAfterLogicalNot: false +SpaceAfterTemplateKeyword: true +SpaceBeforeAssignmentOperators: true +SpaceBeforeCaseColon: false +SpaceBeforeCpp11BracedList: false +SpaceBeforeCtorInitializerColon: true +SpaceBeforeInheritanceColon: true +SpaceBeforeParens: ControlStatements +SpaceBeforeParensOptions: + AfterControlStatements: true + AfterForeachMacros: true + AfterFunctionDefinitionName: false + AfterFunctionDeclarationName: false + AfterIfMacros: true + AfterOverloadedOperator: false + 
BeforeNonEmptyParentheses: false +SpaceAroundPointerQualifiers: Default +SpaceBeforeRangeBasedForLoopColon: true +SpaceInEmptyBlock: false +SpaceInEmptyParentheses: false +SpacesBeforeTrailingComments: 1 +SpacesInAngles: Never +SpacesInConditionalStatement: false +SpacesInContainerLiterals: true +SpacesInCStyleCastParentheses: false +SpacesInLineCommentPrefix: + Minimum: 1 + Maximum: -1 +SpacesInParentheses: false +SpacesInSquareBrackets: false +SpaceBeforeSquareBrackets: false +BitFieldColonSpacing: Both +Standard: Latest +StatementAttributeLikeMacros: + - Q_EMIT +StatementMacros: + - Q_UNUSED + - QT_REQUIRE_VERSION +TabWidth: 8 +UseCRLF: false +UseTab: Never +WhitespaceSensitiveMacros: + - STRINGIZE + - PP_STRINGIZE + - BOOST_PP_STRINGIZE + - NS_SWIFT_NAME + - CF_SWIFT_NAME +... + diff --git a/gpcontrib/gp_url_tools/Makefile b/gpcontrib/gp_url_tools/Makefile new file mode 100644 index 00000000000..55c4767d637 --- /dev/null +++ b/gpcontrib/gp_url_tools/Makefile @@ -0,0 +1,11 @@ +DATA = $(wildcard sql/*.sql) +MODULES = $(patsubst %.c,%,$(wildcard src/*.c)) +EXTENSION = gp_url_tools + +TESTS = $(wildcard test/sql/*.sql) +REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS)) +REGRESS_OPTS = --inputdir=test + +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) diff --git a/gpcontrib/gp_url_tools/README.md b/gpcontrib/gp_url_tools/README.md new file mode 100644 index 00000000000..172e6dc8155 --- /dev/null +++ b/gpcontrib/gp_url_tools/README.md @@ -0,0 +1,45 @@ +# gp_url_tools: Greenplum extension providing functionality for working with URL addresses + +### Features +gp_url_tools is an extension for the Greenplum database that gives implementation +for functions that encode/decode url/uri. + +### Installation +Install from source: +``` +git clone https://github.com/open-gpdb/gp_url_tools.git +cd gp_url_tools +# Build it. 
Building would require GP installed nearby and sourcing greenplum_path.sh +source /greenplum_path.sh +make && make install +``` + +### Usage +``` +=# create extension gp_url_tools; + +=# select url_tools_schema.encode_url('Hello World'); + encode_url +─────────────── + Hello%20World +(1 row) + +=# select url_tools_schema.decode_url('Hello%20World'); + decode_url +───────────── + Hello World +(1 row) + +=# select url_tools_schema.encode_uri('https://ru.wikipedia.org/wiki/Greenplum_(компания)'); + encode_uri +──────────────────────────────────────────────────────────────────────────────────────────── + https://ru.wikipedia.org/wiki/Greenplum_(%D0%BA%D0%BE%D0%BC%D0%BF%D0%B0%D0%BD%D0%B8%D1%8F) + +=# select url_tools_schema.decode_uri('https://ru.wikipedia.org/wiki/Greenplum_(%D0%BA%D0%BE%D0%BC%D0%BF%D0%B0%D0%BD%D0%B8%D1%8F)'); + decode_uri +──────────────────────────────────────────────────── + https://ru.wikipedia.org/wiki/Greenplum_(компания) +``` + +### Acknowledgments +Thank you very much for the extension for postgrsql: https://github.com/okbob/url_encode, its sources were very useful. diff --git a/gpcontrib/gp_url_tools/gp_url_tools.control b/gpcontrib/gp_url_tools/gp_url_tools.control new file mode 100644 index 00000000000..ecbfab545b6 --- /dev/null +++ b/gpcontrib/gp_url_tools/gp_url_tools.control @@ -0,0 +1,6 @@ +# gp_url_tools extension +comment = 'Greenplum extension providing functionality for working with URL addresses' +default_version = '1.0' +module_pathname = '$libdir/gp_url_tools' +relocatable = true +trusted = true diff --git a/gpcontrib/gp_url_tools/sql/gp_url_tools--1.0.sql b/gpcontrib/gp_url_tools/sql/gp_url_tools--1.0.sql new file mode 100644 index 00000000000..04183a97bc7 --- /dev/null +++ b/gpcontrib/gp_url_tools/sql/gp_url_tools--1.0.sql @@ -0,0 +1,27 @@ +/* gp_url_tools--1.0.sql */ + +-- complain if script is sourced in psql, rather than via CREATE EXTENSION +\echo Use "CREATE EXTENSION gp_url_tools" to load this file. 
\quit + +CREATE SCHEMA IF NOT EXISTS url_tools_schema; + +CREATE FUNCTION url_tools_schema.encode_url(text) +RETURNS text +AS 'MODULE_PATHNAME', 'encode_url' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION url_tools_schema.decode_url(text) +RETURNS text +AS 'MODULE_PATHNAME', 'decode_url' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION url_tools_schema.encode_uri(text) +RETURNS text +AS 'MODULE_PATHNAME', 'encode_uri' +LANGUAGE C IMMUTABLE STRICT; + +CREATE FUNCTION url_tools_schema.decode_uri(text) +RETURNS text +AS 'MODULE_PATHNAME', 'decode_uri' +LANGUAGE C IMMUTABLE STRICT; + diff --git a/gpcontrib/gp_url_tools/src/gp_url_tools.c b/gpcontrib/gp_url_tools/src/gp_url_tools.c new file mode 100644 index 00000000000..49720a3a95f --- /dev/null +++ b/gpcontrib/gp_url_tools/src/gp_url_tools.c @@ -0,0 +1,249 @@ +#include "postgres.h" + +#include "fmgr.h" +#include "mb/pg_wchar.h" +#include "utils/builtins.h" + +PG_MODULE_MAGIC; + +PG_FUNCTION_INFO_V1(encode_url); +PG_FUNCTION_INFO_V1(decode_url); +PG_FUNCTION_INFO_V1(encode_uri); +PG_FUNCTION_INFO_V1(decode_uri); + +Datum url_encode(PG_FUNCTION_ARGS); +Datum url_decode(PG_FUNCTION_ARGS); +Datum uri_encode(PG_FUNCTION_ARGS); +Datum uri_decode(PG_FUNCTION_ARGS); + +static bool allowed_character(const char c, const char *unreserved_special); +static unsigned char char2hex(char c); +static char *write_character(char *output, const char c); +static void valid_encoding_length(char *current, char *end, int length); +static text *encode(text *input, const char *unreserved_special); +static bool valid_utf16(unsigned int byte, int byte_num); +static unsigned int decode_utf16_pair(unsigned int bytes[2]); +static text *decode(text *input, const char *unreserved_special); +static bool is_utf8(const char *sequence, int length); +static bool is_utf16(const char *sequence, int length); +static void fetch_utf16(unsigned int *byte, const char *input); + +static const unsigned int utf16_low[2] = {0xD800, 0xDC00}; +static const unsigned 
int utf16_high[2] = {0xDBFF, 0xDFFF}; +static const unsigned int utf16_decode = 0x03FF; +static const unsigned int utf16_decode_base = 0x10000; + +unsigned char char2hex(char c) { + if ('0' <= c && c <= '9') { + return c - '0'; + } else if ('A' <= c && c <= 'Z') { + return c - 'A' + 10; + } else if ('a' <= c && c <= 'z') { + return c - 'a' + 10; + } + ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), + errmsg("invalid hexadecimal digit: \"%c\"", c))); + return -1; +} + +bool allowed_character(const char c, const char *unreserved_special) { + return ('0' <= c && c <= '9') || ('A' <= c && c <= 'Z') || + ('a' <= c && c <= 'z') || (strchr(unreserved_special, c) != NULL); +} + +char *write_character(char *output, const char c) { + *output = c; + return ++output; +} + +void valid_encoding_length(char *current, char *end, int length) { + Assert(current + length <= end); +} + +text *encode(text *input, const char *unreserved_special) { + int input_length, output_length; + text *output; + char *cinput, *coutput, *current, *cend; + + // Convert input data for processing + cinput = text_to_cstring(input); + input_length = strlen(cinput); + // Allocate memory for result url string (allocate more memory for bad + // cases) + output_length = 3 * input_length + 1; + coutput = palloc(sizeof(*coutput) * output_length); + current = coutput; + cend = coutput + output_length; + + for (int i = 0; i < input_length; ++i) { + if (allowed_character(cinput[i], unreserved_special)) { + // single character => does not encode it or skip it + valid_encoding_length(current, cend, 1); + current = write_character(current, cinput[i]); + } else { + // some characters => process them all into '%XX' or '%XXXX' + // notation + valid_encoding_length(current, cend, 2); + current += sprintf(current, "%%%02X", (unsigned char)cinput[i]); + } + } + valid_encoding_length(current, cend, 1); + current = write_character(current, 0); + + // Convert to text and return + output = cstring_to_text(coutput); + 
pfree(coutput); + return output; +} + +bool valid_utf16(unsigned int byte, int byte_num) { + return utf16_low[byte_num] <= byte && byte <= utf16_high[byte_num]; +} + +unsigned int decode_utf16_pair(unsigned int bytes[2]) { + Assert(valid_utf16(bytes[0], 0)); + Assert(valid_utf16(bytes[1], 1)); + + return (utf16_decode_base + ((bytes[0] & utf16_decode) << 10) + + (bytes[1] & utf16_decode)); +} + +// Check that sequence of bytes starts with 'symbol' in UTF-8 encoding +// +// UTF-16 'symbols' starts with '%' or '%', and 'XX' after it. +// 'XX' - hex sequence that encode bytes +bool is_utf8(const char *sequence, int length) { + return 3 <= length && sequence[0] == '%' && sequence[1] != 'u' && + sequence[1] != 'U'; +} + +// Check that sequence of bytes starts with 'symbol' in UTF-16 encoding +// +// UTF-16 'symbols' starts with '%u' or '%U', and 'XXXX' after it. +// 'XXXX' - hex sequence that encode bytes (optinally sequence 'XXXX' -> +// 'XXXXXXXX') +bool is_utf16(const char *sequence, int length) { + return 6 <= length && sequence[0] == '%' && + (sequence[1] == 'u' || sequence[1] == 'U'); +} + +void fetch_utf16(unsigned int *byte, const char *input) { + for (int i = 0; i < 4; ++i) { + *byte = ((*byte) << 4) | char2hex(input[i]); + } +} + +text *decode(text *input, const char *unreserved_special) { + int input_length; + text *output; + char *cinput, *coutput, *current; + + // Convert input data for processing + cinput = text_to_cstring(input); + input_length = strlen(cinput); + // Allocate memory for result string + coutput = palloc(sizeof(*coutput) * (input_length + 1)); + current = coutput; + + for (int i = 0; i < input_length;) { + if (cinput[i] == '%') { + // special character => start process '%XX' or '%XXXX' sequence of + // chars + if (is_utf16(cinput + i, input_length - i)) { + unsigned int result = 0; + unsigned int bytes[2] = {0, 0}; + unsigned char buffer[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + + fetch_utf16(bytes, cinput + i + 2); + + if 
(valid_utf16(bytes[0], 0)) { + if (10 < input_length - i) { + ereport( + ERROR, + (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), + errmsg("invalid sequence: not enough characters " + "to decode UTF-16 symbol from %d position", + i))); + } + + fetch_utf16(bytes + 1, cinput + i + 6); + if (!valid_utf16(bytes[1], 1)) { + ereport( + ERROR, + (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), + errmsg("invalid UTF-16 byte: characters from %d " + "position define invalid UTF-16 symbol", + i + 6))); + } + + result = decode_utf16_pair(bytes); + i += 10; + } else { + result = bytes[0]; + i += 6; + } + + unicode_to_utf8((pg_wchar)result, buffer); + strncpy(current, (const char *)buffer, pg_utf_mblen(buffer)); + current += pg_utf_mblen(buffer); + } else if (is_utf8(cinput + i, input_length - i)) { + current = + write_character(current, (char2hex(cinput[i + 1]) << 4) | + char2hex(cinput[i + 2])); + i += 3; + } else { + // common case: not enough characters in line to decode special + // sequence => error 'incorrect sequence of tokens' + ereport(ERROR, + (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), + errmsg("invalid sequence: not enough characters to " + "decode any UTF-typed symbol from %d position", + i))); + } + } else if (allowed_character(cinput[i], unreserved_special)) { + // allowed and not '%' character => just copy it into result string + current = write_character(current, cinput[i]); + i += 1; + } else { + // cinput[i] - is not '%' and not allowed character => error + // 'unexpected character' + ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), + errmsg("unalloweed characters in url code: \"%c\"", + cinput[i]))); + } + } + current = write_character(current, 0); + + // Convert to text and return + output = cstring_to_text(coutput); + pfree(coutput); + return output; +} + +Datum encode_url(PG_FUNCTION_ARGS) { + if (PG_ARGISNULL(0)) { + PG_RETURN_NULL(); + } + PG_RETURN_TEXT_P(encode(PG_GETARG_TEXT_PP(0), ".-~_")); +} + +Datum decode_url(PG_FUNCTION_ARGS) { + 
if (PG_ARGISNULL(0)) { + PG_RETURN_NULL(); + } + PG_RETURN_TEXT_P(decode(PG_GETARG_TEXT_PP(0), ".-~_")); +} + +Datum encode_uri(PG_FUNCTION_ARGS) { + if (PG_ARGISNULL(0)) { + PG_RETURN_NULL(); + } + PG_RETURN_TEXT_P(encode(PG_GETARG_TEXT_PP(0), "-_.!~*'();/?:@&=+$,#")); +} + +Datum decode_uri(PG_FUNCTION_ARGS) { + if (PG_ARGISNULL(0)) { + PG_RETURN_NULL(); + } + PG_RETURN_TEXT_P(decode(PG_GETARG_TEXT_PP(0), "-_.!~*'();/?:@&=+$,#")); +} diff --git a/gpcontrib/gp_url_tools/test/expected/gp_url_tools.out b/gpcontrib/gp_url_tools/test/expected/gp_url_tools.out new file mode 100644 index 00000000000..16cf3df2ea4 --- /dev/null +++ b/gpcontrib/gp_url_tools/test/expected/gp_url_tools.out @@ -0,0 +1,41 @@ +CREATE EXTENSION gp_url_tools; +SELECT url_tools_schema.encode_url('Hello World'); + encode_url +--------------- + Hello%20World +(1 row) + +SELECT url_tools_schema.decode_url('Hello%20World'); + decode_url +------------- + Hello World +(1 row) + +SELECT url_tools_schema.encode_url(unnest) from unnest(string_to_array('http://hu.wikipedia.org/wiki/São_Paulo','/')); + encode_url +------------------ + http%3A + + hu.wikipedia.org + wiki + S%C3%A3o_Paulo +(5 rows) + +SELECT url_tools_schema.encode_uri('http://hu.wikipedia.org/wiki/São_Paulo'); + encode_uri +--------------------------------------------- + http://hu.wikipedia.org/wiki/S%C3%A3o_Paulo +(1 row) + +SELECT md5(url_tools_schema.decode_uri('http://hu.wikipedia.org/wiki/S%C3%A3o_Paulo')); + md5 +---------------------------------- + 147ded7d471df9cf050bc13242cbf39e +(1 row) + +SELECT md5(url_tools_schema.decode_url('%u6D6A%u82B1%u4E00%u6735%u6735%20%u7B2C8%u96C6%20-%20%u89C6%u9891%u5728%u7EBF%u89C2%u770B%20-%20%u6D6A%u82B1%u4E00%u6735%u6735%20-%20%u8292%u679CTV')); + md5 +---------------------------------- + d155b1f894fcd5540ba5881fb71753e1 +(1 row) + diff --git a/gpcontrib/gp_url_tools/test/sql/gp_url_tools.sql b/gpcontrib/gp_url_tools/test/sql/gp_url_tools.sql new file mode 100644 index 00000000000..dc9a1fe0819 --- 
/dev/null +++ b/gpcontrib/gp_url_tools/test/sql/gp_url_tools.sql @@ -0,0 +1,13 @@ +CREATE EXTENSION gp_url_tools; + +SELECT url_tools_schema.encode_url('Hello World'); +SELECT url_tools_schema.decode_url('Hello%20World'); + +SELECT url_tools_schema.encode_url(unnest) from unnest(string_to_array('http://hu.wikipedia.org/wiki/São_Paulo','/')); + +SELECT url_tools_schema.encode_uri('http://hu.wikipedia.org/wiki/São_Paulo'); +SELECT md5(url_tools_schema.decode_uri('http://hu.wikipedia.org/wiki/S%C3%A3o_Paulo')); + +SELECT md5(url_tools_schema.decode_url('%u6D6A%u82B1%u4E00%u6735%u6735%20%u7B2C8%u96C6%20-%20%u89C6%u9891%u5728%u7EBF%u89C2%u770B%20-%20%u6D6A%u82B1%u4E00%u6735%u6735%20-%20%u8292%u679CTV')); + + From 51ecfef21c3fe74016d93358417eaae69d84230f Mon Sep 17 00:00:00 2001 From: Vladislav Shchetinin Date: Thu, 25 Jun 2026 11:34:59 +0300 Subject: [PATCH 2/2] Feat: Adapt gp_url_tools for Cloudberry and fix UTF-16 surrogate decoding --- .github/workflows/build-cloudberry-rocky8.yml | 3 +- .github/workflows/build-cloudberry.yml | 3 +- gpcontrib/gp_url_tools/.clang-format | 192 ----------------- gpcontrib/gp_url_tools/Makefile | 6 + gpcontrib/gp_url_tools/README.md | 72 +++++-- gpcontrib/gp_url_tools/gp_url_tools.control | 2 +- .../gp_url_tools/sql/gp_url_tools--1.0.sql | 2 +- gpcontrib/gp_url_tools/src/gp_url_tools.c | 195 ++++++++++-------- .../test/expected/gp_url_tools.out | 37 +++- .../gp_url_tools/test/sql/gp_url_tools.sql | 24 ++- pom.xml | 3 + 11 files changed, 229 insertions(+), 310 deletions(-) delete mode 100644 gpcontrib/gp_url_tools/.clang-format diff --git a/.github/workflows/build-cloudberry-rocky8.yml b/.github/workflows/build-cloudberry-rocky8.yml index 2abf88060e3..11cf52e0710 100644 --- a/.github/workflows/build-cloudberry-rocky8.yml +++ b/.github/workflows/build-cloudberry-rocky8.yml @@ -318,7 +318,8 @@ jobs: "make_configs":["gpcontrib/orafce:installcheck", "gpcontrib/zstd:installcheck", "gpcontrib/gp_sparse_vector:installcheck", - 
"gpcontrib/gp_toolkit:installcheck"] + "gpcontrib/gp_toolkit:installcheck", + "gpcontrib/gp_url_tools:installcheck"] }, {"test":"ic-fixme", "make_configs":["src/test/regress:installcheck-fixme"], diff --git a/.github/workflows/build-cloudberry.yml b/.github/workflows/build-cloudberry.yml index ca75f7b42e7..364684a904a 100644 --- a/.github/workflows/build-cloudberry.yml +++ b/.github/workflows/build-cloudberry.yml @@ -312,7 +312,8 @@ jobs: "gpcontrib/zstd:installcheck", "gpcontrib/gp_sparse_vector:installcheck", "gpcontrib/gp_toolkit:installcheck", - "gpcontrib/gp_exttable_fdw:installcheck"] + "gpcontrib/gp_exttable_fdw:installcheck", + "gpcontrib/gp_url_tools:installcheck"] }, {"test":"ic-diskquota", "make_configs":["gpcontrib/diskquota:installcheck"], diff --git a/gpcontrib/gp_url_tools/.clang-format b/gpcontrib/gp_url_tools/.clang-format deleted file mode 100644 index 6f46b9469d9..00000000000 --- a/gpcontrib/gp_url_tools/.clang-format +++ /dev/null @@ -1,192 +0,0 @@ ---- -Language: Cpp -# BasedOnStyle: LLVM -AccessModifierOffset: -2 -AlignAfterOpenBracket: Align -AlignArrayOfStructures: None -AlignConsecutiveMacros: None -AlignConsecutiveAssignments: None -AlignConsecutiveBitFields: None -AlignConsecutiveDeclarations: None -AlignEscapedNewlines: Right -AlignOperands: Align -AlignTrailingComments: true -AllowAllArgumentsOnNextLine: true -AllowAllParametersOfDeclarationOnNextLine: true -AllowShortEnumsOnASingleLine: true -AllowShortBlocksOnASingleLine: Never -AllowShortCaseLabelsOnASingleLine: false -AllowShortFunctionsOnASingleLine: All -AllowShortLambdasOnASingleLine: All -AllowShortIfStatementsOnASingleLine: Never -AllowShortLoopsOnASingleLine: false -AlwaysBreakAfterDefinitionReturnType: None -AlwaysBreakAfterReturnType: None -AlwaysBreakBeforeMultilineStrings: false -AlwaysBreakTemplateDeclarations: MultiLine -AttributeMacros: - - __capability -BinPackArguments: true -BinPackParameters: true -BraceWrapping: - AfterCaseLabel: false - AfterClass: false - 
AfterControlStatement: Never - AfterEnum: false - AfterFunction: false - AfterNamespace: false - AfterObjCDeclaration: false - AfterStruct: false - AfterUnion: false - AfterExternBlock: false - BeforeCatch: false - BeforeElse: false - BeforeLambdaBody: false - BeforeWhile: false - IndentBraces: false - SplitEmptyFunction: true - SplitEmptyRecord: true - SplitEmptyNamespace: true -BreakBeforeBinaryOperators: None -BreakBeforeConceptDeclarations: true -BreakBeforeBraces: Attach -BreakBeforeInheritanceComma: false -BreakInheritanceList: BeforeColon -BreakBeforeTernaryOperators: true -BreakConstructorInitializersBeforeComma: false -BreakConstructorInitializers: BeforeColon -BreakAfterJavaFieldAnnotations: false -BreakStringLiterals: true -ColumnLimit: 80 -CommentPragmas: '^ IWYU pragma:' -QualifierAlignment: Leave -CompactNamespaces: false -ConstructorInitializerIndentWidth: 4 -ContinuationIndentWidth: 4 -Cpp11BracedListStyle: true -DeriveLineEnding: true -DerivePointerAlignment: false -DisableFormat: false -EmptyLineAfterAccessModifier: Never -EmptyLineBeforeAccessModifier: LogicalBlock -ExperimentalAutoDetectBinPacking: false -PackConstructorInitializers: BinPack -BasedOnStyle: '' -ConstructorInitializerAllOnOneLineOrOnePerLine: false -AllowAllConstructorInitializersOnNextLine: true -FixNamespaceComments: true -ForEachMacros: - - foreach - - Q_FOREACH - - BOOST_FOREACH -IfMacros: - - KJ_IF_MAYBE -IncludeBlocks: Preserve -IncludeCategories: - - Regex: '^"(llvm|llvm-c|clang|clang-c)/' - Priority: 2 - SortPriority: 0 - CaseSensitive: false - - Regex: '^(<|"(gtest|gmock|isl|json)/)' - Priority: 3 - SortPriority: 0 - CaseSensitive: false - - Regex: '.*' - Priority: 1 - SortPriority: 0 - CaseSensitive: false -IncludeIsMainRegex: '(Test)?$' -IncludeIsMainSourceRegex: '' -IndentAccessModifiers: false -IndentCaseLabels: false -IndentCaseBlocks: false -IndentGotoLabels: true -IndentPPDirectives: None -IndentExternBlock: AfterExternBlock -IndentRequires: false -IndentWidth: 4 
-IndentWrappedFunctionNames: false -InsertTrailingCommas: None -JavaScriptQuotes: Leave -JavaScriptWrapImports: true -KeepEmptyLinesAtTheStartOfBlocks: true -LambdaBodyIndentation: Signature -MacroBlockBegin: '' -MacroBlockEnd: '' -MaxEmptyLinesToKeep: 1 -NamespaceIndentation: None -ObjCBinPackProtocolList: Auto -ObjCBlockIndentWidth: 2 -ObjCBreakBeforeNestedBlockParam: true -ObjCSpaceAfterProperty: false -ObjCSpaceBeforeProtocolList: true -PenaltyBreakAssignment: 2 -PenaltyBreakBeforeFirstCallParameter: 19 -PenaltyBreakComment: 300 -PenaltyBreakFirstLessLess: 120 -PenaltyBreakOpenParenthesis: 0 -PenaltyBreakString: 1000 -PenaltyBreakTemplateDeclaration: 10 -PenaltyExcessCharacter: 1000000 -PenaltyReturnTypeOnItsOwnLine: 60 -PenaltyIndentedWhitespace: 0 -PointerAlignment: Right -PPIndentWidth: -1 -ReferenceAlignment: Pointer -ReflowComments: true -RemoveBracesLLVM: false -SeparateDefinitionBlocks: Leave -ShortNamespaceLines: 1 -SortIncludes: CaseSensitive -SortJavaStaticImport: Before -SortUsingDeclarations: true -SpaceAfterCStyleCast: false -SpaceAfterLogicalNot: false -SpaceAfterTemplateKeyword: true -SpaceBeforeAssignmentOperators: true -SpaceBeforeCaseColon: false -SpaceBeforeCpp11BracedList: false -SpaceBeforeCtorInitializerColon: true -SpaceBeforeInheritanceColon: true -SpaceBeforeParens: ControlStatements -SpaceBeforeParensOptions: - AfterControlStatements: true - AfterForeachMacros: true - AfterFunctionDefinitionName: false - AfterFunctionDeclarationName: false - AfterIfMacros: true - AfterOverloadedOperator: false - BeforeNonEmptyParentheses: false -SpaceAroundPointerQualifiers: Default -SpaceBeforeRangeBasedForLoopColon: true -SpaceInEmptyBlock: false -SpaceInEmptyParentheses: false -SpacesBeforeTrailingComments: 1 -SpacesInAngles: Never -SpacesInConditionalStatement: false -SpacesInContainerLiterals: true -SpacesInCStyleCastParentheses: false -SpacesInLineCommentPrefix: - Minimum: 1 - Maximum: -1 -SpacesInParentheses: false -SpacesInSquareBrackets: false 
-SpaceBeforeSquareBrackets: false -BitFieldColonSpacing: Both -Standard: Latest -StatementAttributeLikeMacros: - - Q_EMIT -StatementMacros: - - Q_UNUSED - - QT_REQUIRE_VERSION -TabWidth: 8 -UseCRLF: false -UseTab: Never -WhitespaceSensitiveMacros: - - STRINGIZE - - PP_STRINGIZE - - BOOST_PP_STRINGIZE - - NS_SWIFT_NAME - - CF_SWIFT_NAME -... - diff --git a/gpcontrib/gp_url_tools/Makefile b/gpcontrib/gp_url_tools/Makefile index 55c4767d637..c161751a88e 100644 --- a/gpcontrib/gp_url_tools/Makefile +++ b/gpcontrib/gp_url_tools/Makefile @@ -6,6 +6,12 @@ TESTS = $(wildcard test/sql/*.sql) REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS)) REGRESS_OPTS = --inputdir=test +ifdef USE_PGXS PG_CONFIG = pg_config PGXS := $(shell $(PG_CONFIG) --pgxs) include $(PGXS) +else +top_builddir = ../.. +include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/gpcontrib/gp_url_tools/README.md b/gpcontrib/gp_url_tools/README.md index 172e6dc8155..b95fa95a4ed 100644 --- a/gpcontrib/gp_url_tools/README.md +++ b/gpcontrib/gp_url_tools/README.md @@ -1,45 +1,75 @@ -# gp_url_tools: Greenplum extension providing functionality for working with URL addresses + + +# gp_url_tools: Cloudberry extension providing functionality for working with URL addresses ### Features -gp_url_tools is an extension for the Greenplum database that gives implementation +`gp_url_tools` is an extension for the Cloudberry database that gives implementation for functions that encode/decode url/uri. -### Installation -Install from source: -``` -git clone https://github.com/open-gpdb/gp_url_tools.git -cd gp_url_tools -# Build it. 
Building would require GP installed nearby and sourcing greenplum_path.sh -source /greenplum_path.sh -make && make install -``` +### Functions +The extension creates the `url_tools_schema` schema and adds four SQL functions: + +- `url_tools_schema.encode_url`/`.encode_uri` + Encodes a text value for use as a URL/URI component by replacing reserved characters with percent-encoded sequences. + +- `url_tools_schema.decode_url`/`.decode_uri` + Decodes percent-encoded sequences in a URL/URI-encoded text value back to their original characters (human-readable). ### Usage +```sql +CREATE EXTENSION gp_url_tools; ``` -=# create extension gp_url_tools; - -=# select url_tools_schema.encode_url('Hello World'); +```sql +SELECT url_tools_schema.encode_url('Hello World'); +``` +```bash encode_url ─────────────── Hello%20World (1 row) - -=# select url_tools_schema.decode_url('Hello%20World'); +``` +```sql +SELECT url_tools_schema.decode_url('Hello%20World'); +``` +```bash decode_url ───────────── Hello World (1 row) - -=# select url_tools_schema.encode_uri('https://ru.wikipedia.org/wiki/Greenplum_(компания)'); +``` +```sql +SELECT url_tools_schema.encode_uri('https://ru.wikipedia.org/wiki/Greenplum_(компания)'); +``` +```bash encode_uri ──────────────────────────────────────────────────────────────────────────────────────────── https://ru.wikipedia.org/wiki/Greenplum_(%D0%BA%D0%BE%D0%BC%D0%BF%D0%B0%D0%BD%D0%B8%D1%8F) - -=# select url_tools_schema.decode_uri('https://ru.wikipedia.org/wiki/Greenplum_(%D0%BA%D0%BE%D0%BC%D0%BF%D0%B0%D0%BD%D0%B8%D1%8F)'); +``` +```sql +SELECT url_tools_schema.decode_uri('https://ru.wikipedia.org/wiki/Greenplum_(%D0%BA%D0%BE%D0%BC%D0%BF%D0%B0%D0%BD%D0%B8%D1%8F)'); +``` +```bash decode_uri ──────────────────────────────────────────────────── https://ru.wikipedia.org/wiki/Greenplum_(компания) ``` ### Acknowledgments -Thank you very much for the extension for postgrsql: https://github.com/okbob/url_encode, its sources were very useful. 
+Thank you very much for the extension for PostgreSQL: https://github.com/okbob/url_encode, its sources were very useful. diff --git a/gpcontrib/gp_url_tools/gp_url_tools.control b/gpcontrib/gp_url_tools/gp_url_tools.control index ecbfab545b6..cb16430ad62 100644 --- a/gpcontrib/gp_url_tools/gp_url_tools.control +++ b/gpcontrib/gp_url_tools/gp_url_tools.control @@ -1,5 +1,5 @@ # gp_url_tools extension -comment = 'Greenplum extension providing functionality for working with URL addresses' +comment = 'Functions for working with URL-s' default_version = '1.0' module_pathname = '$libdir/gp_url_tools' relocatable = true diff --git a/gpcontrib/gp_url_tools/sql/gp_url_tools--1.0.sql b/gpcontrib/gp_url_tools/sql/gp_url_tools--1.0.sql index 04183a97bc7..3b2a773719a 100644 --- a/gpcontrib/gp_url_tools/sql/gp_url_tools--1.0.sql +++ b/gpcontrib/gp_url_tools/sql/gp_url_tools--1.0.sql @@ -4,6 +4,7 @@ \echo Use "CREATE EXTENSION gp_url_tools" to load this file. \quit CREATE SCHEMA IF NOT EXISTS url_tools_schema; +GRANT USAGE ON SCHEMA url_tools_schema TO public; CREATE FUNCTION url_tools_schema.encode_url(text) RETURNS text @@ -24,4 +25,3 @@ CREATE FUNCTION url_tools_schema.decode_uri(text) RETURNS text AS 'MODULE_PATHNAME', 'decode_uri' LANGUAGE C IMMUTABLE STRICT; - diff --git a/gpcontrib/gp_url_tools/src/gp_url_tools.c b/gpcontrib/gp_url_tools/src/gp_url_tools.c index 49720a3a95f..7c397828ded 100644 --- a/gpcontrib/gp_url_tools/src/gp_url_tools.c +++ b/gpcontrib/gp_url_tools/src/gp_url_tools.c @@ -1,3 +1,30 @@ +/*------------------------------------------------------------------------- + * + * Licensed to the Apache Software Foundation (ASF) under one + * or more contributor license agreements. See the NOTICE file + * distributed with this work for additional information + * regarding copyright ownership. The ASF licenses this file + * to you under the Apache License, Version 2.0 (the + * "License"); you may not use this file except in compliance + * with the License. 
You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, + * software distributed under the License is distributed on an + * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY + * KIND, either express or implied. See the License for the + * specific language governing permissions and limitations + * under the License. + * + * gp_url_tools.c + * + * IDENTIFICATION + * gpcontrib/gp_url_tools/src/gp_url_tools.c + * + *------------------------------------------------------------------------- + */ + #include "postgres.h" #include "fmgr.h" @@ -11,56 +38,43 @@ PG_FUNCTION_INFO_V1(decode_url); PG_FUNCTION_INFO_V1(encode_uri); PG_FUNCTION_INFO_V1(decode_uri); -Datum url_encode(PG_FUNCTION_ARGS); -Datum url_decode(PG_FUNCTION_ARGS); -Datum uri_encode(PG_FUNCTION_ARGS); -Datum uri_decode(PG_FUNCTION_ARGS); - -static bool allowed_character(const char c, const char *unreserved_special); -static unsigned char char2hex(char c); -static char *write_character(char *output, const char c); -static void valid_encoding_length(char *current, char *end, int length); -static text *encode(text *input, const char *unreserved_special); -static bool valid_utf16(unsigned int byte, int byte_num); -static unsigned int decode_utf16_pair(unsigned int bytes[2]); -static text *decode(text *input, const char *unreserved_special); -static bool is_utf8(const char *sequence, int length); -static bool is_utf16(const char *sequence, int length); -static void fetch_utf16(unsigned int *byte, const char *input); - static const unsigned int utf16_low[2] = {0xD800, 0xDC00}; static const unsigned int utf16_high[2] = {0xDBFF, 0xDFFF}; static const unsigned int utf16_decode = 0x03FF; static const unsigned int utf16_decode_base = 0x10000; - -unsigned char char2hex(char c) { - if ('0' <= c && c <= '9') { +static const int utf8_with_percent_length = 3; // Example: '%20' +static const int utf16_with_percent_length
= 6; // Example: '%u0430' +static const int utf16_surrogate_pair_length = 12; // Example: '%uD800%uDC00' +static const int utf16_second_codepoint_offset = 8; // '%uD800%uDC00' => ('%uD800%u'.length == 8) +static const int utf16_past_first_codepoint_offset = 6; // '%uD800%uDC00' => ('%uD800'.length == 6) + +static unsigned char hex_char_to_value(char c) { + if ('0' <= c && c <= '9') return c - '0'; - } else if ('A' <= c && c <= 'Z') { + if ('A' <= c && c <= 'F') return c - 'A' + 10; - } else if ('a' <= c && c <= 'z') { + if ('a' <= c && c <= 'f') return c - 'a' + 10; - } ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE), errmsg("invalid hexadecimal digit: \"%c\"", c))); - return -1; + pg_unreachable(); } -bool allowed_character(const char c, const char *unreserved_special) { +static bool allowed_character(const char c, const char *unreserved_special) { return ('0' <= c && c <= '9') || ('A' <= c && c <= 'Z') || ('a' <= c && c <= 'z') || (strchr(unreserved_special, c) != NULL); } -char *write_character(char *output, const char c) { +static char *write_character(char *output, const char c) { *output = c; return ++output; } -void valid_encoding_length(char *current, char *end, int length) { +static void valid_encoding_length(char *current, char *end, int length) { Assert(current + length <= end); } -text *encode(text *input, const char *unreserved_special) { +static text *encode(text *input, const char *unreserved_special) { int input_length, output_length; text *output; char *cinput, *coutput, *current, *cend; @@ -68,8 +82,10 @@ text *encode(text *input, const char *unreserved_special) { // Convert input data for processing cinput = text_to_cstring(input); input_length = strlen(cinput); - // Allocate memory for result url string (allocate more memory for bad - // cases) + /* + * Worst case: every input byte becomes '%XX' (3 output chars).
+ * The +1 accounts for the null terminator + */ output_length = 3 * input_length + 1; coutput = palloc(sizeof(*coutput) * output_length); current = coutput; @@ -77,30 +93,29 @@ text *encode(text *input, const char *unreserved_special) { for (int i = 0; i < input_length; ++i) { if (allowed_character(cinput[i], unreserved_special)) { - // single character => does not encode it or skip it + // Allowed character => copy it into result string valid_encoding_length(current, cend, 1); current = write_character(current, cinput[i]); } else { - // some characters => process them all into '%XX' or '%XXXX' - // notation - valid_encoding_length(current, cend, 2); + // Percent-encode byte as '%XX' + valid_encoding_length(current, cend, 3); current += sprintf(current, "%%%02X", (unsigned char)cinput[i]); } } + // Terminate result string valid_encoding_length(current, cend, 1); - current = write_character(current, 0); + current = write_character(current, '\0'); - // Convert to text and return output = cstring_to_text(coutput); pfree(coutput); return output; } -bool valid_utf16(unsigned int byte, int byte_num) { +static bool valid_utf16(unsigned int byte, int byte_num) { return utf16_low[byte_num] <= byte && byte <= utf16_high[byte_num]; } -unsigned int decode_utf16_pair(unsigned int bytes[2]) { +static unsigned int decode_utf16_pair(unsigned int bytes[2]) { Assert(valid_utf16(bytes[0], 0)); Assert(valid_utf16(bytes[1], 1)); @@ -108,56 +123,56 @@ unsigned int decode_utf16_pair(unsigned int bytes[2]) { (bytes[1] & utf16_decode)); } -// Check that sequence of bytes starts with 'symbol' in UTF-8 encoding -// -// UTF-16 'symbols' starts with '%' or '%', and 'XX' after it. -// 'XX' - hex sequence that encode bytes -bool is_utf8(const char *sequence, int length) { - return 3 <= length && sequence[0] == '%' && sequence[1] != 'u' && - sequence[1] != 'U'; +/* + * Check whether the sequence starts with a percent-encoded UTF-8 byte (%XX). 
+ * + * A UTF-8 percent-encoded byte starts with '%' followed by exactly two hex + * digits (e.g. "%20", "%D0"). This is distinguished from a UTF-16 sequence + * which starts with '%u' or '%U' (e.g. "%uD83D"). + * + * Requires at least 3 characters: '%' + 2 hex digits. + */ +static bool is_utf8(const char *sequence, int length) { + return utf8_with_percent_length <= length && sequence[0] == '%' && + sequence[1] != 'u' && sequence[1] != 'U'; } -// Check that sequence of bytes starts with 'symbol' in UTF-16 encoding -// -// UTF-16 'symbols' starts with '%u' or '%U', and 'XXXX' after it. -// 'XXXX' - hex sequence that encode bytes (optinally sequence 'XXXX' -> -// 'XXXXXXXX') -bool is_utf16(const char *sequence, int length) { - return 6 <= length && sequence[0] == '%' && +/* + * Check whether the sequence starts with a legacy percent-encoded UTF-16 unit + * ('%uXXXX' or '%UXXXX'). Requires at least 6 characters: '%u' + 4 hex digits. + */ +static bool is_utf16(const char *sequence, int length) { + return utf16_with_percent_length <= length && sequence[0] == '%' && (sequence[1] == 'u' || sequence[1] == 'U'); } -void fetch_utf16(unsigned int *byte, const char *input) { - for (int i = 0; i < 4; ++i) { - *byte = ((*byte) << 4) | char2hex(input[i]); - } +static void fetch_utf16(unsigned int *byte, const char *input) { + for (int i = 0; i < 4; ++i) + *byte = ((*byte) << 4) | hex_char_to_value(input[i]); } -text *decode(text *input, const char *unreserved_special) { +static text *decode(text *input, const char *unreserved_special) { int input_length; text *output; char *cinput, *coutput, *current; - // Convert input data for processing cinput = text_to_cstring(input); input_length = strlen(cinput); - // Allocate memory for result string coutput = palloc(sizeof(*coutput) * (input_length + 1)); current = coutput; for (int i = 0; i < input_length;) { if (cinput[i] == '%') { - // special character => start process '%XX' or '%XXXX' sequence of - // chars + // Special character => 
start process '%XX' sequence of chars if (is_utf16(cinput + i, input_length - i)) { unsigned int result = 0; - unsigned int bytes[2] = {0, 0}; - unsigned char buffer[10] = {0, 0, 0, 0, 0, 0, 0, 0, 0, 0}; + unsigned int bytes[2] = {}; + unsigned char buffer[10] = {}; fetch_utf16(bytes, cinput + i + 2); if (valid_utf16(bytes[0], 0)) { - if (10 < input_length - i) { + if (input_length - i < utf16_surrogate_pair_length) { ereport( ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), @@ -166,34 +181,35 @@ text *decode(text *input, const char *unreserved_special) { i))); } - fetch_utf16(bytes + 1, cinput + i + 6); + fetch_utf16(bytes + 1, + cinput + i + utf16_second_codepoint_offset); if (!valid_utf16(bytes[1], 1)) { ereport( ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), errmsg("invalid UTF-16 byte: characters from %d " "position define invalid UTF-16 symbol", - i + 6))); + i + utf16_past_first_codepoint_offset))); } result = decode_utf16_pair(bytes); - i += 10; + i += utf16_surrogate_pair_length; } else { result = bytes[0]; - i += 6; + i += utf16_with_percent_length; } unicode_to_utf8((pg_wchar)result, buffer); - strncpy(current, (const char *)buffer, pg_utf_mblen(buffer)); + memcpy(current, buffer, pg_utf_mblen(buffer)); current += pg_utf_mblen(buffer); } else if (is_utf8(cinput + i, input_length - i)) { current = - write_character(current, (char2hex(cinput[i + 1]) << 4) | - char2hex(cinput[i + 2])); + write_character(current, (hex_char_to_value(cinput[i + 1]) << 4) | + hex_char_to_value(cinput[i + 2])); i += 3; } else { - // common case: not enough characters in line to decode special - // sequence => error 'incorrect sequence of tokens' + // '%' starts a special sequence, but there are not enough + // characters left to decode it => error 'incorrect sequence of tokens' ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), errmsg("invalid sequence: not enough characters to " @@ -201,49 +217,46 @@ text *decode(text *input, const char 
*unreserved_special) { i))); } } else if (allowed_character(cinput[i], unreserved_special)) { - // allowed and not '%' character => just copy it into result string + // Copy an unescaped character that is allowed current = write_character(current, cinput[i]); i += 1; } else { - // cinput[i] - is not '%' and not allowed character => error - // 'unexpected character' ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE), - errmsg("unalloweed characters in url code: \"%c\"", + errmsg("disallowed characters in URL: \"%c\"", cinput[i]))); } } - current = write_character(current, 0); + current = write_character(current, '\0'); - // Convert to text and return output = cstring_to_text(coutput); pfree(coutput); return output; } +static const char *url_unreserved_special = ".-~_"; + Datum encode_url(PG_FUNCTION_ARGS) { - if (PG_ARGISNULL(0)) { + if (PG_ARGISNULL(0)) PG_RETURN_NULL(); - } - PG_RETURN_TEXT_P(encode(PG_GETARG_TEXT_PP(0), ".-~_")); + PG_RETURN_TEXT_P(encode(PG_GETARG_TEXT_PP(0), url_unreserved_special)); } Datum decode_url(PG_FUNCTION_ARGS) { - if (PG_ARGISNULL(0)) { + if (PG_ARGISNULL(0)) PG_RETURN_NULL(); - } - PG_RETURN_TEXT_P(decode(PG_GETARG_TEXT_PP(0), ".-~_")); + PG_RETURN_TEXT_P(decode(PG_GETARG_TEXT_PP(0), url_unreserved_special)); } +static const char *uri_unreserved_special = "-_.!~*'();/?:@&=+$,#"; + Datum encode_uri(PG_FUNCTION_ARGS) { - if (PG_ARGISNULL(0)) { + if (PG_ARGISNULL(0)) PG_RETURN_NULL(); - } - PG_RETURN_TEXT_P(encode(PG_GETARG_TEXT_PP(0), "-_.!~*'();/?:@&=+$,#")); + PG_RETURN_TEXT_P(encode(PG_GETARG_TEXT_PP(0), uri_unreserved_special)); } Datum decode_uri(PG_FUNCTION_ARGS) { - if (PG_ARGISNULL(0)) { + if (PG_ARGISNULL(0)) PG_RETURN_NULL(); - } - PG_RETURN_TEXT_P(decode(PG_GETARG_TEXT_PP(0), "-_.!~*'();/?:@&=+$,#")); + PG_RETURN_TEXT_P(decode(PG_GETARG_TEXT_PP(0), uri_unreserved_special)); } diff --git a/gpcontrib/gp_url_tools/test/expected/gp_url_tools.out b/gpcontrib/gp_url_tools/test/expected/gp_url_tools.out index 
16cf3df2ea4..ba6043c0e51 100644 --- a/gpcontrib/gp_url_tools/test/expected/gp_url_tools.out +++ b/gpcontrib/gp_url_tools/test/expected/gp_url_tools.out @@ -1,4 +1,8 @@ -CREATE EXTENSION gp_url_tools; +-- start_ignore +CREATE EXTENSION IF NOT EXISTS gp_url_tools; +-- end_ignore +SET client_encoding TO UTF8; +-- Basic encode/decode with ASCII and %XX escaping. SELECT url_tools_schema.encode_url('Hello World'); encode_url --------------- @@ -11,6 +15,7 @@ SELECT url_tools_schema.decode_url('Hello%20World'); Hello World (1 row) +-- encode_url() should escape reserved URL characters like ':'. SELECT url_tools_schema.encode_url(unnest) from unnest(string_to_array('http://hu.wikipedia.org/wiki/São_Paulo','/')); encode_url ------------------ @@ -21,6 +26,7 @@ SELECT url_tools_schema.encode_url(unnest) from unnest(string_to_array('http://h S%C3%A3o_Paulo (5 rows) +-- encode_uri() keeps URI delimiters, decode_uri() reverses UTF-8 %XX escaping. SELECT url_tools_schema.encode_uri('http://hu.wikipedia.org/wiki/São_Paulo'); encode_uri --------------------------------------------- @@ -33,9 +39,38 @@ SELECT md5(url_tools_schema.decode_uri('http://hu.wikipedia.org/wiki/S%C3%A3o_Pa 147ded7d471df9cf050bc13242cbf39e (1 row) +-- Legacy UTF-16 %uXXXX decoding for BMP characters. SELECT md5(url_tools_schema.decode_url('%u6D6A%u82B1%u4E00%u6735%u6735%20%u7B2C8%u96C6%20-%20%u89C6%u9891%u5728%u7EBF%u89C2%u770B%20-%20%u6D6A%u82B1%u4E00%u6735%u6735%20-%20%u8292%u679CTV')); md5 ---------------------------------- d155b1f894fcd5540ba5881fb71753e1 (1 row) +-- Single UTF-16 surrogate pair should decode to one Unicode character. +SELECT url_tools_schema.decode_url('%uD83D%uDE00'); + decode_url +------------ + 😀 +(1 row) + +-- Surrogate pair should also decode correctly in the middle of a string. +SELECT url_tools_schema.decode_url('hello%uD83D%uDE00world'); + decode_url +------------- + hello😀world +(1 row) + +-- Mixed input: ASCII, UTF-8 %XX, UTF-16 BMP, and UTF-16 surrogate pair. 
+SELECT url_tools_schema.decode_url('A%20%C3%A3%20%u6D6A%20%uD83D%uDE00'); + decode_url +------------ + A ã 浪 😀 +(1 row) + +-- Truncated surrogate pair should raise an error. +SELECT url_tools_schema.decode_url('%uD83D'); +ERROR: invalid sequence: not enough characters to decode UTF-16 symbol from 0 position +-- High surrogate followed by a non-low-surrogate code unit should fail. +SELECT url_tools_schema.decode_url('%uD83D%u0041'); +ERROR: invalid UTF-16 byte: characters from 6 position define invalid UTF-16 symbol +DROP EXTENSION gp_url_tools; diff --git a/gpcontrib/gp_url_tools/test/sql/gp_url_tools.sql b/gpcontrib/gp_url_tools/test/sql/gp_url_tools.sql index dc9a1fe0819..33ebc6781c1 100644 --- a/gpcontrib/gp_url_tools/test/sql/gp_url_tools.sql +++ b/gpcontrib/gp_url_tools/test/sql/gp_url_tools.sql @@ -1,13 +1,35 @@ -CREATE EXTENSION gp_url_tools; +-- start_ignore +CREATE EXTENSION IF NOT EXISTS gp_url_tools; +-- end_ignore +SET client_encoding TO UTF8; +-- Basic encode/decode with ASCII and %XX escaping. SELECT url_tools_schema.encode_url('Hello World'); SELECT url_tools_schema.decode_url('Hello%20World'); +-- encode_url() should escape reserved URL characters like ':'. SELECT url_tools_schema.encode_url(unnest) from unnest(string_to_array('http://hu.wikipedia.org/wiki/São_Paulo','/')); +-- encode_uri() keeps URI delimiters, decode_uri() reverses UTF-8 %XX escaping. SELECT url_tools_schema.encode_uri('http://hu.wikipedia.org/wiki/São_Paulo'); SELECT md5(url_tools_schema.decode_uri('http://hu.wikipedia.org/wiki/S%C3%A3o_Paulo')); +-- Legacy UTF-16 %uXXXX decoding for BMP characters. SELECT md5(url_tools_schema.decode_url('%u6D6A%u82B1%u4E00%u6735%u6735%20%u7B2C8%u96C6%20-%20%u89C6%u9891%u5728%u7EBF%u89C2%u770B%20-%20%u6D6A%u82B1%u4E00%u6735%u6735%20-%20%u8292%u679CTV')); +-- Single UTF-16 surrogate pair should decode to one Unicode character. 
+SELECT url_tools_schema.decode_url('%uD83D%uDE00'); +-- Surrogate pair should also decode correctly in the middle of a string. +SELECT url_tools_schema.decode_url('hello%uD83D%uDE00world'); + +-- Mixed input: ASCII, UTF-8 %XX, UTF-16 BMP, and UTF-16 surrogate pair. +SELECT url_tools_schema.decode_url('A%20%C3%A3%20%u6D6A%20%uD83D%uDE00'); + +-- Truncated surrogate pair should raise an error. +SELECT url_tools_schema.decode_url('%uD83D'); + +-- High surrogate followed by a non-low-surrogate code unit should fail. +SELECT url_tools_schema.decode_url('%uD83D%u0041'); + +DROP EXTENSION gp_url_tools; diff --git a/pom.xml b/pom.xml index 0e000093399..98e1931d8da 100644 --- a/pom.xml +++ b/pom.xml @@ -155,6 +155,9 @@ code or new licensing patterns. gpcontrib/diskquota/** + gpcontrib/gp_url_tools/Makefile + gpcontrib/gp_url_tools/gp_url_tools.control + getversion .git-blame-ignore-revs .dir-locals.el