diff --git a/.github/workflows/build-cloudberry-rocky8.yml b/.github/workflows/build-cloudberry-rocky8.yml index 2abf88060e3..11cf52e0710 100644 --- a/.github/workflows/build-cloudberry-rocky8.yml +++ b/.github/workflows/build-cloudberry-rocky8.yml @@ -318,7 +318,8 @@ jobs: "make_configs":["gpcontrib/orafce:installcheck", "gpcontrib/zstd:installcheck", "gpcontrib/gp_sparse_vector:installcheck", - "gpcontrib/gp_toolkit:installcheck"] + "gpcontrib/gp_toolkit:installcheck", + "gpcontrib/gp_url_tools:installcheck"] }, {"test":"ic-fixme", "make_configs":["src/test/regress:installcheck-fixme"], diff --git a/.github/workflows/build-cloudberry.yml b/.github/workflows/build-cloudberry.yml index ca75f7b42e7..364684a904a 100644 --- a/.github/workflows/build-cloudberry.yml +++ b/.github/workflows/build-cloudberry.yml @@ -312,7 +312,8 @@ jobs: "gpcontrib/zstd:installcheck", "gpcontrib/gp_sparse_vector:installcheck", "gpcontrib/gp_toolkit:installcheck", - "gpcontrib/gp_exttable_fdw:installcheck"] + "gpcontrib/gp_exttable_fdw:installcheck", + "gpcontrib/gp_url_tools:installcheck"] }, {"test":"ic-diskquota", "make_configs":["gpcontrib/diskquota:installcheck"], diff --git a/gpcontrib/gp_url_tools/Makefile b/gpcontrib/gp_url_tools/Makefile new file mode 100644 index 00000000000..c161751a88e --- /dev/null +++ b/gpcontrib/gp_url_tools/Makefile @@ -0,0 +1,17 @@ +DATA = $(wildcard sql/*.sql) +MODULES = $(patsubst %.c,%,$(wildcard src/*.c)) +EXTENSION = gp_url_tools + +TESTS = $(wildcard test/sql/*.sql) +REGRESS = $(patsubst test/sql/%.sql,%,$(TESTS)) +REGRESS_OPTS = --inputdir=test + +ifdef USE_PGXS +PG_CONFIG = pg_config +PGXS := $(shell $(PG_CONFIG) --pgxs) +include $(PGXS) +else +top_builddir = ../.. 
+include $(top_builddir)/src/Makefile.global +include $(top_srcdir)/contrib/contrib-global.mk +endif diff --git a/gpcontrib/gp_url_tools/README.md b/gpcontrib/gp_url_tools/README.md new file mode 100644 index 00000000000..b95fa95a4ed --- /dev/null +++ b/gpcontrib/gp_url_tools/README.md @@ -0,0 +1,75 @@ + + +# gp_url_tools: Cloudberry extension providing functionality for working with URL addresses + +### Features +`gp_url_tools` is an extension for the Cloudberry database that gives implementation +for functions that encode/decode url/uri. + +### Functions +The extension creates the `url_tools_schema` schema and adds four SQL functions: + +- `url_tools_schema.encode_url`/`.encode_uri` + Encodes a text value for use as a URL/URI component by replacing reserved characters with percent-encoded sequences. + +- `url_tools_schema.decode_url`/`.decode_uri` + Decodes percent-encoded sequences in a URL/URI-encoded text value back to their original characters (human-readable). + +### Usage +```sql +CREATE EXTENSION gp_url_tools; +``` +```sql +SELECT url_tools_schema.encode_url('Hello World'); +``` +```bash + encode_url +─────────────── + Hello%20World +(1 row) +``` +```sql +SELECT url_tools_schema.decode_url('Hello%20World'); +``` +```bash + decode_url +───────────── + Hello World +(1 row) +``` +```sql +SELECT url_tools_schema.encode_uri('https://ru.wikipedia.org/wiki/Greenplum_(компания)'); +``` +```bash + encode_uri +──────────────────────────────────────────────────────────────────────────────────────────── + https://ru.wikipedia.org/wiki/Greenplum_(%D0%BA%D0%BE%D0%BC%D0%BF%D0%B0%D0%BD%D0%B8%D1%8F) +``` +```sql +SELECT url_tools_schema.decode_uri('https://ru.wikipedia.org/wiki/Greenplum_(%D0%BA%D0%BE%D0%BC%D0%BF%D0%B0%D0%BD%D0%B8%D1%8F)'); +``` +```bash + decode_uri +──────────────────────────────────────────────────── + https://ru.wikipedia.org/wiki/Greenplum_(компания) +``` + +### Acknowledgments +Thank you very much for the extension for PostgreSQL: 
https://github.com/okbob/url_encode, its sources were very useful.
diff --git a/gpcontrib/gp_url_tools/gp_url_tools.control b/gpcontrib/gp_url_tools/gp_url_tools.control
new file mode 100644
index 00000000000..cb16430ad62
--- /dev/null
+++ b/gpcontrib/gp_url_tools/gp_url_tools.control
@@ -0,0 +1,7 @@
+# gp_url_tools extension
+comment = 'Functions for working with URL-s'
+default_version = '1.0'
+module_pathname = '$libdir/gp_url_tools'
+# Not relocatable: the install script pins every object to url_tools_schema.
+relocatable = false
+trusted = true
diff --git a/gpcontrib/gp_url_tools/sql/gp_url_tools--1.0.sql b/gpcontrib/gp_url_tools/sql/gp_url_tools--1.0.sql
new file mode 100644
index 00000000000..3b2a773719a
--- /dev/null
+++ b/gpcontrib/gp_url_tools/sql/gp_url_tools--1.0.sql
@@ -0,0 +1,29 @@
+/* gp_url_tools--1.0.sql */
+
+-- complain if script is sourced in psql, rather than via CREATE EXTENSION
+\echo Use "CREATE EXTENSION gp_url_tools" to load this file. \quit
+
+-- Plain CREATE SCHEMA (no IF NOT EXISTS): never adopt a schema that somebody
+-- else created beforehand and therefore owns.
+CREATE SCHEMA url_tools_schema;
+GRANT USAGE ON SCHEMA url_tools_schema TO public;
+
+CREATE FUNCTION url_tools_schema.encode_url(text)
+RETURNS text
+AS 'MODULE_PATHNAME', 'encode_url'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION url_tools_schema.decode_url(text)
+RETURNS text
+AS 'MODULE_PATHNAME', 'decode_url'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION url_tools_schema.encode_uri(text)
+RETURNS text
+AS 'MODULE_PATHNAME', 'encode_uri'
+LANGUAGE C IMMUTABLE STRICT;
+
+CREATE FUNCTION url_tools_schema.decode_uri(text)
+RETURNS text
+AS 'MODULE_PATHNAME', 'decode_uri'
+LANGUAGE C IMMUTABLE STRICT;
diff --git a/gpcontrib/gp_url_tools/src/gp_url_tools.c b/gpcontrib/gp_url_tools/src/gp_url_tools.c
new file mode 100644
index 00000000000..7c397828ded
--- /dev/null
+++ b/gpcontrib/gp_url_tools/src/gp_url_tools.c
@@ -0,0 +1,359 @@
+/*-------------------------------------------------------------------------
+ *
+ * Licensed to the Apache Software Foundation (ASF) under one
+ * or more contributor license agreements.  See the NOTICE file
+ * distributed with this work for additional information
+ * regarding copyright ownership.  The ASF licenses this file
+ * to you under the Apache License, Version 2.0 (the
+ * "License"); you may not use this file except in compliance
+ * with the License.  You may obtain a copy of the License at
+ *
+ *   http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing,
+ * software distributed under the License is distributed on an
+ * "AS IS" BASIS, WITHOUT WARRANTIES OR CONDITIONS OF ANY
+ * KIND, either express or implied.  See the License for the
+ * specific language governing permissions and limitations
+ * under the License.
+ *
+ * gp_url_tools.c
+ *    Percent-encoding and decoding of URL/URI text values.
+ *
+ * IDENTIFICATION
+ *    gpcontrib/gp_url_tools/src/gp_url_tools.c
+ *
+ *-------------------------------------------------------------------------
+ */
+
+#include "postgres.h"
+
+#include "fmgr.h"
+#include "mb/pg_wchar.h"
+#include "utils/builtins.h"
+
+PG_MODULE_MAGIC;
+
+PG_FUNCTION_INFO_V1(encode_url);
+PG_FUNCTION_INFO_V1(decode_url);
+PG_FUNCTION_INFO_V1(encode_uri);
+PG_FUNCTION_INFO_V1(decode_uri);
+
+/*
+ * Inclusive bounds of the UTF-16 surrogate ranges.  Index 0 describes the
+ * first unit of a pair (0xD800..0xDBFF), index 1 the second (0xDC00..0xDFFF).
+ */
+static const unsigned int utf16_low[2] = {0xD800, 0xDC00};
+static const unsigned int utf16_high[2] = {0xDBFF, 0xDFFF};
+/* Mask selecting the 10 payload bits carried by each surrogate */
+static const unsigned int utf16_decode = 0x03FF;
+/* Value added to the combined payload bits of a surrogate pair */
+static const unsigned int utf16_decode_base = 0x10000;
+/* Example: '%20' */
+static const int utf8_with_percent_length = 3;
+/* Example: '%u0430' */
+static const int utf16_with_percent_length = 6;
+/* Example: '%uD800%uDC00' */
+static const int utf16_surrogate_pair_length = 12;
+/* '%uD800%uDC00' => strlen("%uD800%u") == 8 */
+static const int utf16_second_codepoint_offset = 8;
+/* '%uD800%uDC00' => strlen("%uD800") == 6 */
+static const int utf16_past_first_codepoint_offset = 6;
+
+/*
+ * Return the numeric value (0..15) of the hexadecimal digit at *digit.
+ *
+ * Raises ERRCODE_INVALID_PARAMETER_VALUE when *digit is not a hex digit.  The
+ * message prints the whole (possibly multibyte) character, so that the error
+ * text never contains a fragment of a multibyte sequence.
+ */
+static unsigned char hex_char_to_value(const char *digit) {
+  const char c = *digit;
+
+  if ('0' <= c && c <= '9')
+    return c - '0';
+  if ('A' <= c && c <= 'F')
+    return c - 'A' + 10;
+  if ('a' <= c && c <= 'f')
+    return c - 'a' + 10;
+
+  ereport(ERROR, (errcode(ERRCODE_INVALID_PARAMETER_VALUE),
+                  errmsg("invalid hexadecimal digit: \"%.*s\"",
+                         pg_mblen(digit), digit)));
+  pg_unreachable();
+}
+
+/*
+ * Tell whether byte c may appear unescaped: ASCII letters, ASCII digits, or
+ * one of the characters listed in unreserved_special.
+ */
+static bool allowed_character(const char c, const char *unreserved_special) {
+  /* strchr() would also match the terminator of unreserved_special */
+  if (c == '\0')
+    return false;
+  return ('0' <= c && c <= '9') || ('A' <= c && c <= 'Z') ||
+         ('a' <= c && c <= 'z') || (strchr(unreserved_special, c) != NULL);
+}
+
+/* Store c at *output and return the position following it. */
+static char *write_character(char *output, const char c) {
+  *output = c;
+  return ++output;
+}
+
+/* Assert-only check that length more bytes fit between current and end. */
+static void valid_encoding_length(char *current, char *end, int length) {
+  Assert(current + length <= end);
+}
+
+/*
+ * Percent-encode input.
+ *
+ * Every byte that is not an ASCII letter, an ASCII digit or a member of
+ * unreserved_special is replaced by '%XX', XX being the byte value in
+ * upper-case hexadecimal.  Multibyte characters are therefore encoded byte
+ * by byte in the server encoding.  Returns a newly allocated text value.
+ */
+static text *encode(text *input, const char *unreserved_special) {
+  static const char hex_digits[] = "0123456789ABCDEF";
+  const char *cinput = VARDATA_ANY(input);
+  const Size input_length = VARSIZE_ANY_EXHDR(input);
+  /* Worst case: every input byte becomes '%XX' (3 output bytes) */
+  const Size output_length = 3 * input_length;
+  char *coutput, *current, *cend;
+  text *output;
+
+  /* +1 keeps the request non-zero for empty input */
+  coutput = palloc(output_length + 1);
+  current = coutput;
+  cend = coutput + output_length;
+
+  for (Size i = 0; i < input_length; ++i) {
+    const unsigned char byte = (unsigned char)cinput[i];
+
+    if (allowed_character(cinput[i], unreserved_special)) {
+      /* Allowed character => copy it into the result */
+      valid_encoding_length(current, cend, 1);
+      current = write_character(current, cinput[i]);
+    } else {
+      /* Percent-encode the byte as '%XX' */
+      valid_encoding_length(current, cend, 3);
+      current = write_character(current, '%');
+      current = write_character(current, hex_digits[byte >> 4]);
+      current = write_character(current, hex_digits[byte & 0x0F]);
+    }
+  }
+
+  output = cstring_to_text_with_len(coutput, current - coutput);
+  pfree(coutput);
+  return output;
+}
+
+/* Is the code unit in surrogate range byte_num (0 = first, 1 = second)? */
+static bool valid_utf16(unsigned int byte, int byte_num) {
+  return utf16_low[byte_num] <= byte && byte <= utf16_high[byte_num];
+}
+
+/* Combine a validated surrogate pair into the code point it represents. */
+static unsigned int decode_utf16_pair(unsigned int bytes[2]) {
+  Assert(valid_utf16(bytes[0], 0));
+  Assert(valid_utf16(bytes[1], 1));
+
+  return (utf16_decode_base + ((bytes[0] & utf16_decode) << 10) +
+          (bytes[1] & utf16_decode));
+}
+
+/*
+ * Check whether the sequence looks like a percent-encoded byte ('%XX').
+ *
+ * Only the shape is checked: a '%' that is not followed by 'u'/'U' (which
+ * would announce a '%uXXXX' sequence) and at least 3 characters in total.
+ * The two digits are validated later by hex_char_to_value().
+ */
+static bool is_utf8(const char *sequence, int length) {
+  return utf8_with_percent_length <= length && sequence[0] == '%' &&
+         sequence[1] != 'u' && sequence[1] != 'U';
+}
+
+/*
+ * Check whether the sequence looks like a legacy percent-encoded UTF-16 unit
+ * ('%uXXXX' or '%UXXXX'): the prefix is present and at least 6 characters
+ * are available.  The four digits are validated by fetch_utf16().
+ */
+static bool is_utf16(const char *sequence, int length) {
+  return utf16_with_percent_length <= length && sequence[0] == '%' &&
+         (sequence[1] == 'u' || sequence[1] == 'U');
+}
+
+/* Parse the four hexadecimal digits at input into one UTF-16 code unit. */
+static unsigned int fetch_utf16(const char *input) {
+  unsigned int unit = 0;
+
+  for (int i = 0; i < 4; ++i)
+    unit = (unit << 4) | hex_char_to_value(input + i);
+  return unit;
+}
+
+/* Raise the error for an unusable UTF-16 code unit at the given position. */
+static void invalid_utf16_unit(int position) {
+  ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                  errmsg("invalid UTF-16 byte: characters from %d "
+                         "position define invalid UTF-16 symbol",
+                         position)));
+}
+
+/* Raise the error for an escape at position that decodes to a zero byte. */
+static void null_character(int position) {
+  ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                  errmsg("invalid sequence: null character not permitted "
+                         "at %d position",
+                         position)));
+}
+
+/*
+ * Decode a percent-encoded text value.
+ *
+ * Three kinds of input are accepted:
+ *   - characters allowed by allowed_character() are copied as they are;
+ *   - '%XX' yields the single byte 0xXX;
+ *   - '%uXXXX' yields the character with that UTF-16 code unit, and a
+ *     surrogate pair '%uXXXX%uXXXX' yields one supplementary character.
+ * Any other input raises an error.  The result is verified to be valid in
+ * the server encoding and must not contain a zero byte, because neither can
+ * be stored in a text value.
+ */
+static text *decode(text *input, const char *unreserved_special) {
+  int input_length, decoded_length;
+  text *output;
+  char *cinput, *coutput, *current;
+
+  cinput = text_to_cstring(input);
+  input_length = strlen(cinput);
+  /* No escape decodes to more bytes than it occupies in the input */
+  coutput = palloc(sizeof(*coutput) * (input_length + 1));
+  current = coutput;
+
+  for (int i = 0; i < input_length;) {
+    const char *sequence = cinput + i;
+    const int remaining = input_length - i;
+
+    if (*sequence == '%') {
+      if (is_utf16(sequence, remaining)) {
+        unsigned int code_point;
+        unsigned int units[2] = {0, 0};
+        unsigned char buffer[MAX_UNICODE_EQUIVALENT_STRING + 1];
+        int consumed, written;
+
+        units[0] = fetch_utf16(sequence + 2);
+
+        if (valid_utf16(units[0], 0)) {
+          const int second = utf16_past_first_codepoint_offset;
+
+          if (remaining < utf16_surrogate_pair_length) {
+            ereport(
+                ERROR,
+                (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                 errmsg("invalid sequence: not enough characters "
+                        "to decode UTF-16 symbol from %d position",
+                        i)));
+          }
+
+          /* The second half must be a '%uXXXX' escape of its own */
+          if (!is_utf16(sequence + second, remaining - second))
+            invalid_utf16_unit(i + second);
+          units[1] = fetch_utf16(sequence + utf16_second_codepoint_offset);
+          if (!valid_utf16(units[1], 1))
+            invalid_utf16_unit(i + second);
+
+          code_point = decode_utf16_pair(units);
+          consumed = utf16_surrogate_pair_length;
+        } else {
+          /* A second-half surrogate without its first half is no character */
+          if (valid_utf16(units[0], 1))
+            invalid_utf16_unit(i);
+          code_point = units[0];
+          consumed = utf16_with_percent_length;
+        }
+
+        if (code_point == 0)
+          null_character(i);
+
+        /* Convert to the server encoding; fails if not representable */
+        pg_unicode_to_server((pg_wchar)code_point, buffer);
+        written = strlen((const char *)buffer);
+        Assert(written <= consumed);
+        memcpy(current, buffer, written);
+        current += written;
+        i += consumed;
+      } else if (is_utf8(sequence, remaining)) {
+        const unsigned char byte = (hex_char_to_value(sequence + 1) << 4) |
+                                   hex_char_to_value(sequence + 2);
+
+        if (byte == 0)
+          null_character(i);
+        current = write_character(current, (char)byte);
+        i += utf8_with_percent_length;
+      } else {
+        /*
+         * '%' starts an escape, but there are not enough characters left
+         * to decode it
+         */
+        ereport(ERROR,
+                (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                 errmsg("invalid sequence: not enough characters to "
+                        "decode any UTF-typed symbol from %d position",
+                        i)));
+      }
+    } else if (allowed_character(*sequence, unreserved_special)) {
+      /* Copy an unescaped character that is allowed */
+      current = write_character(current, *sequence);
+      i += 1;
+    } else {
+      /* Print the whole character, not just its first byte */
+      ereport(ERROR, (errcode(ERRCODE_CHARACTER_NOT_IN_REPERTOIRE),
+                      errmsg("disallowed characters in URL: \"%.*s\"",
+                             pg_mblen(sequence), sequence)));
+    }
+  }
+
+  decoded_length = current - coutput;
+  /* '%XX' escapes can spell byte sequences that are not valid text */
+  (void)pg_verifymbstr(coutput, decoded_length, false);
+
+  output = cstring_to_text_with_len(coutput, decoded_length);
+  pfree(coutput);
+  pfree(cinput);
+  return output;
+}
+
+/* Characters, besides ASCII letters and digits, the *_url functions keep */
+static const char *url_unreserved_special = ".-~_";
+
+/* SQL-callable: percent-encode a text value with the URL character set. */
+Datum encode_url(PG_FUNCTION_ARGS) {
+  if (PG_ARGISNULL(0))
+    PG_RETURN_NULL();
+  PG_RETURN_TEXT_P(encode(PG_GETARG_TEXT_PP(0), url_unreserved_special));
+}
+
+/* SQL-callable: decode a text value encoded with the URL character set. */
+Datum decode_url(PG_FUNCTION_ARGS) {
+  if (PG_ARGISNULL(0))
+    PG_RETURN_NULL();
+  PG_RETURN_TEXT_P(decode(PG_GETARG_TEXT_PP(0), url_unreserved_special));
+}
+
+/* Characters, besides ASCII letters and digits, the *_uri functions keep */
+static const char *uri_unreserved_special = "-_.!~*'();/?:@&=+$,#";
+
+Datum encode_uri(PG_FUNCTION_ARGS) { + if (PG_ARGISNULL(0)) + PG_RETURN_NULL(); + PG_RETURN_TEXT_P(encode(PG_GETARG_TEXT_PP(0), uri_unreserved_special)); +} + +Datum decode_uri(PG_FUNCTION_ARGS) { + if (PG_ARGISNULL(0)) + PG_RETURN_NULL(); + PG_RETURN_TEXT_P(decode(PG_GETARG_TEXT_PP(0), uri_unreserved_special)); +} diff --git a/gpcontrib/gp_url_tools/test/expected/gp_url_tools.out b/gpcontrib/gp_url_tools/test/expected/gp_url_tools.out new file mode 100644 index 00000000000..ba6043c0e51 --- /dev/null +++ b/gpcontrib/gp_url_tools/test/expected/gp_url_tools.out @@ -0,0 +1,76 @@ +-- start_ignore +CREATE EXTENSION IF NOT EXISTS gp_url_tools; +-- end_ignore +SET client_encoding TO UTF8; +-- Basic encode/decode with ASCII and %XX escaping. +SELECT url_tools_schema.encode_url('Hello World'); + encode_url +--------------- + Hello%20World +(1 row) + +SELECT url_tools_schema.decode_url('Hello%20World'); + decode_url +------------- + Hello World +(1 row) + +-- encode_url() should escape reserved URL characters like ':'. +SELECT url_tools_schema.encode_url(unnest) from unnest(string_to_array('http://hu.wikipedia.org/wiki/São_Paulo','/')); + encode_url +------------------ + http%3A + + hu.wikipedia.org + wiki + S%C3%A3o_Paulo +(5 rows) + +-- encode_uri() keeps URI delimiters, decode_uri() reverses UTF-8 %XX escaping. +SELECT url_tools_schema.encode_uri('http://hu.wikipedia.org/wiki/São_Paulo'); + encode_uri +--------------------------------------------- + http://hu.wikipedia.org/wiki/S%C3%A3o_Paulo +(1 row) + +SELECT md5(url_tools_schema.decode_uri('http://hu.wikipedia.org/wiki/S%C3%A3o_Paulo')); + md5 +---------------------------------- + 147ded7d471df9cf050bc13242cbf39e +(1 row) + +-- Legacy UTF-16 %uXXXX decoding for BMP characters. 
+SELECT md5(url_tools_schema.decode_url('%u6D6A%u82B1%u4E00%u6735%u6735%20%u7B2C8%u96C6%20-%20%u89C6%u9891%u5728%u7EBF%u89C2%u770B%20-%20%u6D6A%u82B1%u4E00%u6735%u6735%20-%20%u8292%u679CTV')); + md5 +---------------------------------- + d155b1f894fcd5540ba5881fb71753e1 +(1 row) + +-- Single UTF-16 surrogate pair should decode to one Unicode character. +SELECT url_tools_schema.decode_url('%uD83D%uDE00'); + decode_url +------------ + 😀 +(1 row) + +-- Surrogate pair should also decode correctly in the middle of a string. +SELECT url_tools_schema.decode_url('hello%uD83D%uDE00world'); + decode_url +------------- + hello😀world +(1 row) + +-- Mixed input: ASCII, UTF-8 %XX, UTF-16 BMP, and UTF-16 surrogate pair. +SELECT url_tools_schema.decode_url('A%20%C3%A3%20%u6D6A%20%uD83D%uDE00'); + decode_url +------------ + A ã 浪 😀 +(1 row) + +-- Truncated surrogate pair should raise an error. +SELECT url_tools_schema.decode_url('%uD83D'); +ERROR: invalid sequence: not enough characters to decode UTF-16 symbol from 0 position +-- High surrogate followed by a non-low-surrogate code unit should fail. +SELECT url_tools_schema.decode_url('%uD83D%u0041'); +ERROR: invalid UTF-16 byte: characters from 6 position define invalid UTF-16 symbol +DROP EXTENSION gp_url_tools; diff --git a/gpcontrib/gp_url_tools/test/sql/gp_url_tools.sql b/gpcontrib/gp_url_tools/test/sql/gp_url_tools.sql new file mode 100644 index 00000000000..33ebc6781c1 --- /dev/null +++ b/gpcontrib/gp_url_tools/test/sql/gp_url_tools.sql @@ -0,0 +1,35 @@ +-- start_ignore +CREATE EXTENSION IF NOT EXISTS gp_url_tools; +-- end_ignore +SET client_encoding TO UTF8; + +-- Basic encode/decode with ASCII and %XX escaping. +SELECT url_tools_schema.encode_url('Hello World'); +SELECT url_tools_schema.decode_url('Hello%20World'); + +-- encode_url() should escape reserved URL characters like ':'. 
+SELECT url_tools_schema.encode_url(unnest) from unnest(string_to_array('http://hu.wikipedia.org/wiki/São_Paulo','/')); + +-- encode_uri() keeps URI delimiters, decode_uri() reverses UTF-8 %XX escaping. +SELECT url_tools_schema.encode_uri('http://hu.wikipedia.org/wiki/São_Paulo'); +SELECT md5(url_tools_schema.decode_uri('http://hu.wikipedia.org/wiki/S%C3%A3o_Paulo')); + +-- Legacy UTF-16 %uXXXX decoding for BMP characters. +SELECT md5(url_tools_schema.decode_url('%u6D6A%u82B1%u4E00%u6735%u6735%20%u7B2C8%u96C6%20-%20%u89C6%u9891%u5728%u7EBF%u89C2%u770B%20-%20%u6D6A%u82B1%u4E00%u6735%u6735%20-%20%u8292%u679CTV')); + +-- Single UTF-16 surrogate pair should decode to one Unicode character. +SELECT url_tools_schema.decode_url('%uD83D%uDE00'); + +-- Surrogate pair should also decode correctly in the middle of a string. +SELECT url_tools_schema.decode_url('hello%uD83D%uDE00world'); + +-- Mixed input: ASCII, UTF-8 %XX, UTF-16 BMP, and UTF-16 surrogate pair. +SELECT url_tools_schema.decode_url('A%20%C3%A3%20%u6D6A%20%uD83D%uDE00'); + +-- Truncated surrogate pair should raise an error. +SELECT url_tools_schema.decode_url('%uD83D'); + +-- High surrogate followed by a non-low-surrogate code unit should fail. +SELECT url_tools_schema.decode_url('%uD83D%u0041'); + +DROP EXTENSION gp_url_tools; diff --git a/pom.xml b/pom.xml index 0e000093399..98e1931d8da 100644 --- a/pom.xml +++ b/pom.xml @@ -155,6 +155,9 @@ code or new licensing patterns. gpcontrib/diskquota/** + gpcontrib/gp_url_tools/Makefile + gpcontrib/gp_url_tools/gp_url_tools.control + getversion .git-blame-ignore-revs .dir-locals.el