diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml index b61d4b1b..4ccb6701 100644 --- a/.github/workflows/ci.yml +++ b/.github/workflows/ci.yml @@ -317,3 +317,95 @@ jobs: --metadata_type=${{ matrix.metadata_type }} \ --table_type=${{ matrix.table_type }} \ --profile=${{ matrix.profile }} + + rest-tests: + needs: build-tea + runs-on: ubuntu-22.04 + steps: + - name: Checkout + uses: actions/checkout@v4 + + - name: Install runtime dependencies + run: | + sudo apt-get update + sudo apt-get install -y \ + libevent-2.1-7 \ + libipc-run-perl \ + libxerces-c3.2 \ + libxml2 \ + python2 \ + software-properties-common + + wget -qO- https://www.postgresql.org/media/keys/ACCC4CF8.asc | sudo apt-key add - + sudo add-apt-repository -y "deb http://apt.postgresql.org/pub/repos/apt jammy-pgdg main" + DEBIAN_FRONTEND=noninteractive sudo apt-get install -y postgresql-16 postgresql-client-16 + sudo pg_dropcluster 14 main + + sudo ln -s -f python2 /usr/bin/python + sudo locale-gen "en_US.UTF-8" + + - name: Download runtime artifacts + uses: actions/download-artifact@v4 + with: + name: tea-runtime + path: ci-artifacts + + - name: Restore runtime files + run: | + mkdir -p "$HOME/local" + tar -xzf ci-artifacts/gpdb-with-tea.tar.gz -C "$HOME/local" + + - name: Initialize Greenplum cluster + run: | + : # TODO(gmusya): consider using make create-demo-cluster + sudo locale-gen "ru_RU.CP1251" + sudo mkdir -p /gpdata + sudo chown $USER /gpdata + + source $HOME/local/gpdb/greenplum_path.sh + export MASTER_DATA_DIRECTORY=/gpdata/master/gpsne-1 + NUM_SEGS=2 bash test/start-gp.sh $HOME/local/gpdb /gpdata + + - name: Start Minio and upload test data + run: | + wget -q https://dl.min.io/server/minio/release/linux-amd64/minio -O /tmp/minio + wget -q https://dl.min.io/client/mc/release/linux-amd64/mc -O /tmp/mc + chmod +x /tmp/minio /tmp/mc + + export CI_PROJECT_DIR=$PWD + + MINIO_EXECUTABLE=/tmp/minio MC_EXECUTABLE=/tmp/mc \ + MINIO_DATA_DIR=/tmp/minio-data \ + bash test/iceberg/gen/init_minio.sh + + - name: Deploy tea config + run: | + source $HOME/local/gpdb/greenplum_path.sh + mkdir -p $GPHOME/tea + cp test/config/tea-config-rest.json $GPHOME/tea/tea-config.json + cp test/config/tea-config-schema.json $GPHOME/tea/tea-config-schema.json + + - name: Start Lakekeeper + run: | + sudo pg_createcluster -p 5433 16 main + sudo sed -i 's/scram-sha-256/trust/g' /etc/postgresql/16/main/pg_hba.conf + sudo /etc/init.d/postgresql start + + wget -q https://github.com/lakekeeper/lakekeeper/releases/download/v0.12.1/lakekeeper-x86_64-unknown-linux-gnu.tar.gz -O - | tar -xzf - + export LAKEKEEPER__PG_DATABASE_URL_READ="postgres://postgres@localhost:5433/postgres" + export LAKEKEEPER__PG_DATABASE_URL_WRITE="postgres://postgres@localhost:5433/postgres" + export LAKEKEEPER__METRICS__PORT=0 + ./lakekeeper migrate + psql -h localhost -p 5433 -U postgres -f test/iceberg/gen/setup_lakekeeper.sql postgres + ./lakekeeper serve & + sleep 5 + + - name: Check REST catalog support + run: | + source $HOME/local/gpdb/greenplum_path.sh + export PGDATABASE=tea_ci + psql -c 'CREATE EXTENSION tea' + psql -c "CREATE FOREIGN TABLE rest_test (a bigint, b bigint) server tea_server options(location 'tea://iceberg://gperov.test')" + OUT=`psql -Atc "SELECT count(*), sum(a), sum(b) FROM rest_test"` + echo $OUT + [[ "$OUT" = "9999|49992899|99985798" ]] || exit 1 diff --git a/CMakeLists.txt b/CMakeLists.txt index 3081c95d..1aa8e539 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -2,6 +2,7 @@ option(TEA_BUILD_FDW "Build Foreign Data Wrapper" ON) option(HAS_ARROW_CSV "Arrow built with csv" ON) option(ICECXX_GENERATOR "" ON) option(USE_NESSIE "Enable Nessie catalog" ON) +option(USE_REST "Enable REST catalog" ON) option(TEA_USE_THREAD_SANITIZER "Enable running tests with ThreadSanitizer" ON) cmake_minimum_required(VERSION 3.25) @@ -59,6 +60,10 @@ if(USE_NESSIE) add_compile_definitions(USE_NESSIE) endif() +if(USE_REST) + add_compile_definitions(USE_REST) +endif() + enable_testing() add_custom_target(check COMMAND ${CMAKE_CTEST_COMMAND}) diff --git a/tea/common/config.cpp b/tea/common/config.cpp index 49c5593d..80dc89a6 100644 --- a/tea/common/config.cpp +++ b/tea/common/config.cpp @@ -253,7 +253,14 @@ bool Get(const rapidjson::Value* doc, std::string_view section_prefix, std::stri *out = CatalogConfig::CatalogType::kNessie; return true; } - return false; +#if USE_REST + if (str == "rest") { + *out = CatalogConfig::CatalogType::kREST; + return true; + } +#endif + + throw std::runtime_error("Catalog type '" + str + "' is not supported"); } bool Get(const rapidjson::Value* doc, std::string_view section_prefix, std::string_view section, const std::string& key, @@ -377,6 +384,10 @@ arrow::Status ReadValues(Source* src, Config* config, std::string_view section_p Get(src, section_prefix, "catalog", "type", &config->catalog.type); GetEndpoints(src, section_prefix, "catalog", "hms", &config->catalog.hms_endpoints); GetEndpoints(src, section_prefix, "catalog", "nessie", &config->catalog.nessie_endpoints); +#if USE_REST + Get(src, section_prefix, "catalog", "rest_url", &config->catalog.rest_url); + Get(src, section_prefix, "catalog", "rest_warehouse_id", &config->catalog.rest_warehouse_id); +#endif GetEndpoints(src, section_prefix, "hms", "hms", &config->hms_catalog.hms_endpoints); diff --git a/tea/common/config.h b/tea/common/config.h index 7ac169e8..030aa5d1 100644 --- a/tea/common/config.h +++ b/tea/common/config.h @@ -42,10 +42,17 @@ struct CatalogConfig { enum class CatalogType { kNessie, kHMS, +#if USE_REST + kREST, +#endif } type = CatalogType::kHMS; std::vector hms_endpoints; std::vector nessie_endpoints; +#if USE_REST + std::string rest_url; + std::string rest_warehouse_id; +#endif bool operator==(const CatalogConfig&) const = default; }; diff --git a/tea/gpext/CMakeLists.txt b/tea/gpext/CMakeLists.txt index 0302d775..aab61c6b 100644 --- a/tea/gpext/CMakeLists.txt +++ b/tea/gpext/CMakeLists.txt @@ -19,7 +19,7 @@ target_link_libraries( tea PRIVATE reader tea_log teapot_file_filter Arrow::arrow_static Parquet::parquet_static teapot_grpc_proto gp_filter_convert) -if (USE_NESSIE) +if (USE_NESSIE OR USE_REST) target_link_libraries(tea PRIVATE cpr) endif() diff --git a/tea/metadata/access_iceberg.cpp b/tea/metadata/access_iceberg.cpp index 1741470f..792efede 100644 --- a/tea/metadata/access_iceberg.cpp +++ b/tea/metadata/access_iceberg.cpp @@ -20,6 +20,7 @@ #include "iceberg/schema.h" #include "iceberg/tea_hive_catalog.h" #include "iceberg/tea_nessie_catalog.h" +#include "iceberg/tea_rest_catalog.h" #include "iceberg/tea_scan.h" #include "tea/common/cancel.h" @@ -73,6 +74,19 @@ std::shared_ptr GetCatalog(const Config& config #endif throw std::runtime_error("No correct Nessie endpoints for iceberg catalog were provided"); } +#if USE_REST + case CatalogConfig::CatalogType::kREST: { + if (config.catalog.rest_url.empty()) { + throw std::runtime_error("REST URL for iceberg catalog is not provided"); + } + + if (config.catalog.rest_warehouse_id.empty()) { + throw std::runtime_error("Warehouse id for iceberg catalog is not provided"); + } + + return std::make_shared(config.catalog.rest_url, config.catalog.rest_warehouse_id); + } +#endif } throw std::runtime_error("No any correct endpoint for iceberg catalog were provided"); } diff --git a/test/config/tea-config-rest.json b/test/config/tea-config-rest.json new file mode 100644 index 00000000..8a8c5389 --- /dev/null +++ b/test/config/tea-config-rest.json @@ -0,0 +1,15 @@ +{ + "common": { + "s3": { + "access_key": "minioadmin", + "secret_key": "minioadmin", + "endpoint_override": "127.0.0.1:9000", + "scheme": "http" + }, + "catalog": { + "type": "rest", + "rest_url": "http://127.0.0.1:8181/catalog", + "rest_warehouse_id": "b498836e-6ecd-11f1-9c23-533b70a81474" + } + } +} diff --git a/test/config/tea-config-schema.json b/test/config/tea-config-schema.json index 692a5b01..47366021 100644 --- a/test/config/tea-config-schema.json +++ b/test/config/tea-config-schema.json @@ -16,9 +16,11 @@ "catalog": { "type": "object", "properties": { - "type": { "type": "string", "enum": ["hms", "nessie"] }, + "type": { "type": "string", "enum": ["hms", "nessie", "rest"] }, "hms": { "type": "string" }, - "nessie": { "type": "string" } + "nessie": { "type": "string" }, + "rest_url": { "type": "string" }, + "rest_warehouse_id": { "type": "string" } }, "required": ["type", "hms", "nessie"] }, diff --git a/test/iceberg/gen/setup_lakekeeper.sql b/test/iceberg/gen/setup_lakekeeper.sql new file mode 100644 index 00000000..c15924f9 --- /dev/null +++ b/test/iceberg/gen/setup_lakekeeper.sql @@ -0,0 +1,13 @@ +insert into project(project_name, project_id) values ('Default Project', '00000000-0000-0000-0000-000000000000'); +insert into warehouse(warehouse_id, warehouse_name, storage_profile, status, tabular_delete_mode, project_id) values ('b498836e-6ecd-11f1-9c23-533b70a81474', 'wh1', '{"type": "s3", "bucket": "warehouse", "flavor": "s3-compat", "region": "local", "endpoint": "http://127.0.0.1:9000/", "key-prefix": null, "sts-enabled": false, "sts-endpoint": null, "sts-role-arn": null, "storage-layout": null, "assume-role-arn": null, "aws-kms-key-arn": null, "sts-session-tags": {}, "path-style-access": null, "legacy-md5-behavior": null, "remote-signing-enabled": true, "push-s3-delete-disabled": true, "remote-signing-url-style": "auto", "sts-token-validity-seconds": 3600, "allow-alternative-protocols": true}', 'active', 'hard', '00000000-0000-0000-0000-000000000000'); +insert into namespace(namespace_id, warehouse_id, namespace_name, namespace_properties) values ('019ef331-4f46-7c02-a050-c2c81b9b6888', 'b498836e-6ecd-11f1-9c23-533b70a81474', '{gperov}', '{"location": "s3://warehouse/019ef331-4f46-7c02-a050-c2c81b9b6888"}'); +insert into tabular(tabular_id, namespace_id, name, typ, metadata_location, fs_protocol, fs_location, warehouse_id, tabular_namespace_name) values ('4412d001-c6df-4adb-8854-d3b9e762440c', '019ef331-4f46-7c02-a050-c2c81b9b6888', 'test', 'table', 's3://warehouse/gperov/test/metadata/00003-ca406d8e-6c7b-4672-87ff-bfd76f84f949.metadata.json', 's3', 'warehouse/gperov/test', 'b498836e-6ecd-11f1-9c23-533b70a81474', '{gperov}'); +insert into "table"(table_id, table_format_version, last_column_id, last_sequence_number, last_updated_ms, last_partition_id, warehouse_id, next_row_id) values ('4412d001-c6df-4adb-8854-d3b9e762440c', '2', 2, 3, 1713951998102, 999, 'b498836e-6ecd-11f1-9c23-533b70a81474', 0); +insert into table_partition_spec(partition_spec_id, table_id, partition_spec, warehouse_id) values (0, '4412d001-c6df-4adb-8854-d3b9e762440c', '{"fields": [], "spec-id": 0}', 'b498836e-6ecd-11f1-9c23-533b70a81474'); +insert into table_schema (schema_id, table_id, schema, warehouse_id) values (0, '4412d001-c6df-4adb-8854-d3b9e762440c', '{"type": "struct", "fields": [{"id": 1, "name": "a", "type": "long", "required": false}, {"id": 2, "name": "b", "type": "long", "required": false}], "schema-id": 0}', 'b498836e-6ecd-11f1-9c23-533b70a81474'); +insert into table_snapshot (snapshot_id, table_id, parent_snapshot_id, sequence_number, manifest_list, summary, schema_id, timestamp_ms, warehouse_id) values (5231658854638766100, '4412d001-c6df-4adb-8854-d3b9e762440c', 1638951453256129678, 2, 's3://warehouse/gperov/test/metadata/snap-5231658854638766100-1-7e6e13cb-31fd-4de7-8811-02ce7cec44a9.avro', '{"operation": "append", "spark.app.id": "local-1713951981838", "added-records": "10000", "total-records": "10000", "added-data-files": "6", "added-files-size": "25206", "total-data-files": "6", "total-files-size": "25206", "total-delete-files": "0", "total-equality-deletes": "0", "total-position-deletes": "0", "changed-partition-count": "1"}', 0, 1713951995410, 'b498836e-6ecd-11f1-9c23-533b70a81474'), (7558608030923099867, '4412d001-c6df-4adb-8854-d3b9e762440c', 5231658854638766100, 3, 's3://warehouse/gperov/test/metadata/snap-7558608030923099867-1-41f34bc8-eedf-4573-96b0-10c04e7c84c4.avro', '{"operation": "overwrite", "spark.app.id": "local-1713951981838", "total-records": "10000", "added-files-size": "1391", "total-data-files": "6", "total-files-size": "26597", "added-delete-files": "1", "total-delete-files": "1", "added-position-deletes": "1", "total-equality-deletes": "0", "total-position-deletes": "1", "changed-partition-count": "1", "added-position-delete-files": "1"}', 0, 1713951998102, 'b498836e-6ecd-11f1-9c23-533b70a81474'), (1638951453256129678, '4412d001-c6df-4adb-8854-d3b9e762440c', NULL, 1, 's3://warehouse/gperov/test/metadata/snap-1638951453256129678-1-eea762e4-1b7a-4717-b361-eae34da54fd4.avro', '{"operation": "append", "spark.app.id": "local-1713951981838", "total-records": "0", "total-data-files": "0", "total-files-size": "0", "total-delete-files": "0", "total-equality-deletes": "0", "total-position-deletes": "0", "changed-partition-count": "0"}', 0, 1713951992417, 'b498836e-6ecd-11f1-9c23-533b70a81474'); +insert into table_sort_order (sort_order_id, table_id, sort_order, warehouse_id) values (0, '4412d001-c6df-4adb-8854-d3b9e762440c', '{"fields": [], "order-id": 0}', 'b498836e-6ecd-11f1-9c23-533b70a81474'); +insert into table_default_sort_order (table_id, sort_order_id, warehouse_id) values ('4412d001-c6df-4adb-8854-d3b9e762440c', 0, 'b498836e-6ecd-11f1-9c23-533b70a81474'); +insert into table_refs (table_id, table_ref_name, snapshot_id, retention, warehouse_id) values ('4412d001-c6df-4adb-8854-d3b9e762440c', 'main', '7558608030923099867', '{"type": "branch"}','b498836e-6ecd-11f1-9c23-533b70a81474'); +insert into table_default_partition_spec (table_id, partition_spec_id, warehouse_id) values ('4412d001-c6df-4adb-8854-d3b9e762440c', 0, 'b498836e-6ecd-11f1-9c23-533b70a81474'); +insert into table_current_schema (table_id, schema_id, warehouse_id) values ('4412d001-c6df-4adb-8854-d3b9e762440c', 0, 'b498836e-6ecd-11f1-9c23-533b70a81474'); diff --git a/vendor/CMakeLists.txt b/vendor/CMakeLists.txt index 6a8c191d..ef2c3839 100644 --- a/vendor/CMakeLists.txt +++ b/vendor/CMakeLists.txt @@ -30,7 +30,7 @@ FetchContent_Declare( iceberg-cxx EXCLUDE_FROM_ALL GIT_REPOSITORY ${GITHUB}/lithium-tech/iceberg-cxx.git - GIT_TAG 614d392831675cdae9569ca70a1e2e281ef60509 + GIT_TAG 3e148ac96d07e9bd3a07dbe9a6ae2efdc6559f2a ) FetchContent_MakeAvailable(googletest hiredis) @@ -40,4 +40,5 @@ set(ICECXX_BUILD_ABSEIL OFF CACHE BOOL "") set(ICECXX_BUILD_TOOLS ON CACHE BOOL "") set(ICECXX_GENERATOR ${ICECXX_GENERATOR} CACHE BOOL "") set(ICECXX_USE_NESSIE ${USE_NESSIE} CACHE BOOL "") +set(ICECXX_USE_REST ${USE_REST} CACHE BOOL "") FetchContent_MakeAvailable(iceberg-cxx)