From 2b87dfd005ef2ef2de8b113a84334ca2029da9cb Mon Sep 17 00:00:00 2001 From: Andrey Sokolov Date: Tue, 9 Jun 2026 11:00:38 +0300 Subject: [PATCH] feat: add REST catalog support Bump iceberg-cxx to the commit where REST catalog is supported. Add the rest_url and rest_warehouse_id properties to config. When catalog type is rest, check that these properties are not empty and create RESTCatalog object in the GetCatalog function. REST catalog support is optional. When catalog type in configuration file is set to rest, but TEA is built with USE_REST=OFF, then an error message is printed. Add the rest-tests stage to ci.yml. --- .github/workflows/ci.yml | 92 +++++++++++++++++++++++++++ CMakeLists.txt | 5 ++ tea/common/config.cpp | 13 +++- tea/common/config.h | 7 ++ tea/gpext/CMakeLists.txt | 2 +- tea/metadata/access_iceberg.cpp | 14 ++++ test/config/tea-config-rest.json | 15 +++++ test/config/tea-config-schema.json | 6 +- test/iceberg/gen/setup_lakekeeper.sql | 13 ++++ vendor/CMakeLists.txt | 3 +- 10 files changed, 165 insertions(+), 5 deletions(-) create mode 100644 test/config/tea-config-rest.json create mode 100644 test/iceberg/gen/setup_lakekeeper.sql diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b61d4b1b..4ccb6701 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -317,3 +317,95 @@ jobs: --metadata_type=${{ matrix.metadata_type }} \ --table_type=${{ matrix.table_type }} \ --profile=${{ matrix.profile }} + + rest-tests: + needs: build-tea + runs-on: ubuntu-22.04 + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install runtime dependencies + run: | + sudo apt-get update + sudo apt-get install -y \ + libevent-2.1-7 \ + libipc-run-perl \ + libxerces-c3.2 \ + libxml2 \ + python2 \ + software-properties-common + + wget -qO- https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add - + sudo add-apt-repository -y "deb http://apt.postgresql.org/pub/repos/apt jammy-pgdg main" + 
DEBIAN_FRONTEND=noninteractive sudo apt-get install -y postgresql-16 postgresql-client-16 + sudo pg_dropcluster 14 main + + sudo ln -s -f python2 /usr/bin/python + sudo locale-gen "en_US.UTF-8" + + - name: Download runtime artifacts + uses: actions/download-artifact@v4 + with: + name: tea-runtime + path: ci-artifacts + + - name: Restore runtime files + run: | + mkdir -p "$HOME/local" + tar -xzf ci-artifacts/gpdb-with-tea.tar.gz -C "$HOME/local" + + - name: Initialize Greenplum cluster + run: | + : # TODO(gmusya): consider using make create-demo-cluster + sudo locale-gen "ru_RU.CP1251" + sudo mkdir -p /gpdata + sudo chown $USER /gpdata + + source $HOME/local/gpdb/greenplum_path.sh + export MASTER_DATA_DIRECTORY=/gpdata/master/gpsne-1 + NUM_SEGS=2 bash test/start-gp.sh $HOME/local/gpdb /gpdata + + - name: Start Minio and upload test data + run: | + wget -q https://dl.min.io/server/minio/release/linux-amd64/minio -O /tmp/minio + wget -q https://dl.min.io/client/mc/release/linux-amd64/mc -O /tmp/mc + chmod +x /tmp/minio /tmp/mc + + export CI_PROJECT_DIR=$PWD + + MINIO_EXECUTABLE=/tmp/minio MC_EXECUTABLE=/tmp/mc \ + MINIO_DATA_DIR=/tmp/minio-data \ + bash test/iceberg/gen/init_minio.sh + + - name: Deploy tea config + run: | + source $HOME/local/gpdb/greenplum_path.sh + mkdir -p $GPHOME/tea + cp test/config/tea-config-rest.json $GPHOME/tea/tea-config.json + cp test/config/tea-config-schema.json $GPHOME/tea/tea-config-schema.json + + - name: Start Lakekeeper + run: | + sudo pg_createcluster -p 5433 16 main + sudo sed -i 's/scram-sha-256/trust/g' /etc/postgresql/16/main/pg_hba.conf + sudo /etc/init.d/postgresql start + + wget -q https://github.com/lakekeeper/lakekeeper/releases/download/v0.12.1/lakekeeper-x86_64-unknown-linux-gnu.tar.gz -O - | tar -xzf - + export LAKEKEEPER__PG_DATABASE_URL_READ="postgres://postgres@localhost:5433/postgres" + export LAKEKEEPER__PG_DATABASE_URL_WRITE="postgres://postgres@localhost:5433/postgres" + export LAKEKEEPER__METRICS__PORT=0 + 
./lakekeeper migrate + psql -h localhost -p 5433 -U postgres -f test/iceberg/gen/setup_lakekeeper.sql postgres + ./lakekeeper serve & + sleep 5 + + - name: Check REST catalog support + run: | + source $HOME/local/gpdb/greenplum_path.sh + export PGDATABASE=tea_ci + psql -c 'CREATE EXTENSION tea' + psql -c "CREATE FOREIGN TABLE rest_test (a bigint, b bigint) server tea_server options(location 'tea://iceberg://gperov.test')" + OUT=`psql -Atc "SELECT count(*), sum(a), sum(b) FROM rest_test"` + echo $OUT + [[ "$OUT" = "9999|49992899|99985798" ]] || exit 1 diff --git a/CMakeLists.txt b/CMakeLists.txt index 3081c95d..1aa8e539 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,6 +2,7 @@ option(TEA_BUILD_FDW "Build Foreign Data Wrapper" ON) option(HAS_ARROW_CSV "Arrow built with csv" ON) option(ICECXX_GENERATOR "" ON) option(USE_NESSIE "Enable Nessie catalog" ON) +option(USE_REST "Enable REST catalog" ON) option(TEA_USE_THREAD_SANITIZER "Enable running tests with ThreadSanitizer" ON) cmake_minimum_required(VERSION 3.25) @@ -59,6 +60,10 @@ if(USE_NESSIE) add_compile_definitions(USE_NESSIE) endif() +if(USE_REST) + add_compile_definitions(USE_REST) +endif() + enable_testing() add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND}) diff --git a/tea/common/config.cpp b/tea/common/config.cpp index 49c5593d..80dc89a6 100644 --- a/tea/common/config.cpp +++ b/tea/common/config.cpp @@ -253,7 +253,14 @@ bool Get(const rapidjson::Value* doc, std::string_view section_prefix, std::stri *out = CatalogConfig::CatalogType::kNessie; return true; } - return false; +#if USE_REST + if (str == "rest") { + *out = CatalogConfig::CatalogType::kREST; + return true; + } +#endif + + throw std::runtime_error("Catalog type '" + str + "' is not supported"); } bool Get(const rapidjson::Value* doc, std::string_view section_prefix, std::string_view section, const std::string& key, @@ -377,6 +384,10 @@ arrow::Status ReadValues(Source* src, Config* config, std::string_view section_p Get(src, 
section_prefix, "catalog", "type", &config->catalog.type); GetEndpoints(src, section_prefix, "catalog", "hms", &config->catalog.hms_endpoints); GetEndpoints(src, section_prefix, "catalog", "nessie", &config->catalog.nessie_endpoints); +#if USE_REST + Get(src, section_prefix, "catalog", "rest_url", &config->catalog.rest_url); + Get(src, section_prefix, "catalog", "rest_warehouse_id", &config->catalog.rest_warehouse_id); +#endif GetEndpoints(src, section_prefix, "hms", "hms", &config->hms_catalog.hms_endpoints); diff --git a/tea/common/config.h b/tea/common/config.h index 7ac169e8..030aa5d1 100644 --- a/tea/common/config.h +++ b/tea/common/config.h @@ -42,10 +42,17 @@ struct CatalogConfig { enum class CatalogType { kNessie, kHMS, +#if USE_REST + kREST, +#endif } type = CatalogType::kHMS; std::vector hms_endpoints; std::vector nessie_endpoints; +#if USE_REST + std::string rest_url; + std::string rest_warehouse_id; +#endif bool operator==(const CatalogConfig&) const = default; }; diff --git a/tea/gpext/CMakeLists.txt b/tea/gpext/CMakeLists.txt index 0302d775..aab61c6b 100644 --- a/tea/gpext/CMakeLists.txt +++ b/tea/gpext/CMakeLists.txt @@ -19,7 +19,7 @@ target_link_libraries( tea PRIVATE reader tea_log teapot_file_filter Arrow::arrow_static Parquet::parquet_static teapot_grpc_proto gp_filter_convert) -if (USE_NESSIE) +if (USE_NESSIE OR USE_REST) target_link_libraries(tea PRIVATE cpr) endif() diff --git a/tea/metadata/access_iceberg.cpp b/tea/metadata/access_iceberg.cpp index 1741470f..792efede 100644 --- a/tea/metadata/access_iceberg.cpp +++ b/tea/metadata/access_iceberg.cpp @@ -20,6 +20,7 @@ #include "iceberg/schema.h" #include "iceberg/tea_hive_catalog.h" #include "iceberg/tea_nessie_catalog.h" +#include "iceberg/tea_rest_catalog.h" #include "iceberg/tea_scan.h" #include "tea/common/cancel.h" @@ -73,6 +74,19 @@ std::shared_ptr GetCatalog(const Config& config #endif throw std::runtime_error("No correct Nessie endpoints for iceberg catalog were provided"); } +#if 
USE_REST + case CatalogConfig::CatalogType::kREST: { + if (config.catalog.rest_url.empty()) { + throw std::runtime_error("REST URL for iceberg catalog is not provided"); + } + + if (config.catalog.rest_warehouse_id.empty()) { + throw std::runtime_error("Warehouse id for iceberg catalog is not provided"); + } + + return std::make_shared(config.catalog.rest_url, config.catalog.rest_warehouse_id); + } +#endif } throw std::runtime_error("No any correct endpoint for iceberg catalog were provided"); } diff --git a/test/config/tea-config-rest.json b/test/config/tea-config-rest.json new file mode 100644 index 00000000..8a8c5389 --- /dev/null +++ b/test/config/tea-config-rest.json @@ -0,0 +1,15 @@ +{ + "common": { + "s3": { + "access_key": "minioadmin", + "secret_key": "minioadmin", + "endpoint_override": "127.0.0.1:9000", + "scheme": "http" + }, + "catalog": { + "type": "rest", + "rest_url": "http://127.0.0.1:8181/catalog", + "rest_warehouse_id": "b498836e-6ecd-11f1-9c23-533b70a81474" + } + } +} diff --git a/test/config/tea-config-schema.json b/test/config/tea-config-schema.json index 692a5b01..47366021 100644 --- a/test/config/tea-config-schema.json +++ b/test/config/tea-config-schema.json @@ -16,9 +16,11 @@ "catalog": { "type": "object", "properties": { - "type": { "type": "string", "enum": ["hms", "nessie"] }, + "type": { "type": "string", "enum": ["hms", "nessie", "rest"] }, "hms": { "type": "string" }, - "nessie": { "type": "string" } + "nessie": { "type": "string" }, + "rest_url": { "type": "string" }, + "rest_warehouse_id": { "type": "string" } }, "required": ["type", "hms", "nessie"] }, diff --git a/test/iceberg/gen/setup_lakekeeper.sql b/test/iceberg/gen/setup_lakekeeper.sql new file mode 100644 index 00000000..c15924f9 --- /dev/null +++ b/test/iceberg/gen/setup_lakekeeper.sql @@ -0,0 +1,13 @@ +insert into project(project_name, project_id) values ('Default Project', '00000000-0000-0000-0000-000000000000'); +insert into warehouse(warehouse_id, warehouse_name, 
storage_profile, status, tabular_delete_mode, project_id) values ('b498836e-6ecd-11f1-9c23-533b70a81474', 'wh1', '{"type": "s3", "bucket": "warehouse", "flavor": "s3-compat", "region": "local", "endpoint": "http://127.0.0.1:9000/", "key-prefix": null, "sts-enabled": false, "sts-endpoint": null, "sts-role-arn": null, "storage-layout": null, "assume-role-arn": null, "aws-kms-key-arn": null, "sts-session-tags": {}, "path-style-access": null, "legacy-md5-behavior": null, "remote-signing-enabled": true, "push-s3-delete-disabled": true, "remote-signing-url-style": "auto", "sts-token-validity-seconds": 3600, "allow-alternative-protocols": true}', 'active', 'hard', '00000000-0000-0000-0000-000000000000'); +insert into namespace(namespace_id, warehouse_id, namespace_name, namespace_properties) values ('019ef331-4f46-7c02-a050-c2c81b9b6888', 'b498836e-6ecd-11f1-9c23-533b70a81474', '{gperov}', '{"location": "s3://warehouse/019ef331-4f46-7c02-a050-c2c81b9b6888"}'); +insert into tabular(tabular_id, namespace_id, name, typ, metadata_location, fs_protocol, fs_location, warehouse_id, tabular_namespace_name) values ('4412d001-c6df-4adb-8854-d3b9e762440c', '019ef331-4f46-7c02-a050-c2c81b9b6888', 'test', 'table', 's3://warehouse/gperov/test/metadata/00003-ca406d8e-6c7b-4672-87ff-bfd76f84f949.metadata.json', 's3', 'warehouse/gperov/test', 'b498836e-6ecd-11f1-9c23-533b70a81474', '{gperov}'); +insert into "table"(table_id, table_format_version, last_column_id, last_sequence_number, last_updated_ms, last_partition_id, warehouse_id, next_row_id) values ('4412d001-c6df-4adb-8854-d3b9e762440c', '2', 2, 3, 1713951998102, 999, 'b498836e-6ecd-11f1-9c23-533b70a81474', 0); +insert into table_partition_spec(partition_spec_id, table_id, partition_spec, warehouse_id) values (0, '4412d001-c6df-4adb-8854-d3b9e762440c', '{"fields": [], "spec-id": 0}', 'b498836e-6ecd-11f1-9c23-533b70a81474'); +insert into table_schema (schema_id, table_id, schema, warehouse_id) values (0, 
'4412d001-c6df-4adb-8854-d3b9e762440c', '{"type": "struct", "fields": [{"id": 1, "name": "a", "type": "long", "required": false}, {"id": 2, "name": "b", "type": "long", "required": false}], "schema-id": 0}', 'b498836e-6ecd-11f1-9c23-533b70a81474'); +insert into table_snapshot (snapshot_id, table_id, parent_snapshot_id, sequence_number, manifest_list, summary, schema_id, timestamp_ms, warehouse_id) values (5231658854638766100, '4412d001-c6df-4adb-8854-d3b9e762440c', 1638951453256129678, 2, 's3://warehouse/gperov/test/metadata/snap-5231658854638766100-1-7e6e13cb-31fd-4de7-8811-02ce7cec44a9.avro', '{"operation": "append", "spark.app.id": "local-1713951981838", "added-records": "10000", "total-records": "10000", "added-data-files": "6", "added-files-size": "25206", "total-data-files": "6", "total-files-size": "25206", "total-delete-files": "0", "total-equality-deletes": "0", "total-position-deletes": "0", "changed-partition-count": "1"}', 0, 1713951995410, 'b498836e-6ecd-11f1-9c23-533b70a81474'), (7558608030923099867, '4412d001-c6df-4adb-8854-d3b9e762440c', 5231658854638766100, 3, 's3://warehouse/gperov/test/metadata/snap-7558608030923099867-1-41f34bc8-eedf-4573-96b0-10c04e7c84c4.avro', '{"operation": "overwrite", "spark.app.id": "local-1713951981838", "total-records": "10000", "added-files-size": "1391", "total-data-files": "6", "total-files-size": "26597", "added-delete-files": "1", "total-delete-files": "1", "added-position-deletes": "1", "total-equality-deletes": "0", "total-position-deletes": "1", "changed-partition-count": "1", "added-position-delete-files": "1"}', 0, 1713951998102, 'b498836e-6ecd-11f1-9c23-533b70a81474'), (1638951453256129678, '4412d001-c6df-4adb-8854-d3b9e762440c', NULL, 1, 's3://warehouse/gperov/test/metadata/snap-1638951453256129678-1-eea762e4-1b7a-4717-b361-eae34da54fd4.avro', '{"operation": "append", "spark.app.id": "local-1713951981838", "total-records": "0", "total-data-files": "0", "total-files-size": "0", "total-delete-files": "0", 
"total-equality-deletes": "0", "total-position-deletes": "0", "changed-partition-count": "0"}', 0, 1713951992417, 'b498836e-6ecd-11f1-9c23-533b70a81474'); +insert into table_sort_order (sort_order_id, table_id, sort_order, warehouse_id) values (0, '4412d001-c6df-4adb-8854-d3b9e762440c', '{"fields": [], "order-id": 0}', 'b498836e-6ecd-11f1-9c23-533b70a81474'); +insert into table_default_sort_order (table_id, sort_order_id, warehouse_id) values ('4412d001-c6df-4adb-8854-d3b9e762440c', 0, 'b498836e-6ecd-11f1-9c23-533b70a81474'); +insert into table_refs (table_id, table_ref_name, snapshot_id, retention, warehouse_id) values ('4412d001-c6df-4adb-8854-d3b9e762440c', 'main', '7558608030923099867', '{"type": "branch"}','b498836e-6ecd-11f1-9c23-533b70a81474'); +insert into table_default_partition_spec (table_id, partition_spec_id, warehouse_id) values ('4412d001-c6df-4adb-8854-d3b9e762440c', 0, 'b498836e-6ecd-11f1-9c23-533b70a81474'); +insert into table_current_schema (table_id, schema_id, warehouse_id) values ('4412d001-c6df-4adb-8854-d3b9e762440c', 0, 'b498836e-6ecd-11f1-9c23-533b70a81474'); diff --git a/vendor/CMakeLists.txt b/vendor/CMakeLists.txt index 6a8c191d..ef2c3839 100644 --- a/vendor/CMakeLists.txt +++ b/vendor/CMakeLists.txt @@ -30,7 +30,7 @@ FetchContent_Declare( iceberg-cxx EXCLUDE_FROM_ALL GIT_REPOSITORY ${GITHUB}/lithium-tech/iceberg-cxx.git - GIT_TAG 614d392831675cdae9569ca70a1e2e281ef60509 + GIT_TAG 3e148ac96d07e9bd3a07dbe9a6ae2efdc6559f2a ) FetchContent_MakeAvailable(googletest hiredis) @@ -40,4 +40,5 @@ set(ICECXX_BUILD_ABSEIL OFF CACHE BOOL "") set(ICECXX_BUILD_TOOLS ON CACHE BOOL "") set(ICECXX_GENERATOR ${ICECXX_GENERATOR} CACHE BOOL "") set(ICECXX_USE_NESSIE ${USE_NESSIE} CACHE BOOL "") +set(ICECXX_USE_REST ${USE_REST} CACHE BOOL "") FetchContent_MakeAvailable(iceberg-cxx)