diff --git a/duckdb/Dockerfile b/duckdb/Dockerfile index 8e64ce8..42a41b7 100644 --- a/duckdb/Dockerfile +++ b/duckdb/Dockerfile @@ -1,21 +1,126 @@ -FROM --platform=$BUILDPLATFORM yolean/builder-base +# syntax=docker/dockerfile:1.7-labs +# +# DuckDB CLI compiled from source so we can statically link extensions that are +# not published as prebuilt binaries for the 1.5.x line — notably `tributary` +# (Kafka, community) — alongside `nanoarrow` (Apache Arrow IPC) and `httpfs` (https:// +# and Google Cloud Storage gs:// access; there are no separate `https`/`gcs` extensions). +# +# The build mirrors DuckDB's own community-extension CI: the manylinux + vcpkg +# toolchain from duckdb/extension-ci-tools, then `make release` against a pinned +# duckdb checkout. Because the toolchain matches the official build environment, +# the resulting binary loads cleanly on our ubuntu-based runtime. + +ARG DUCKDB_TAG=v1.5.3 +# Query-farm/tributary commit on the branch tracking the duckdb 1.5 line, pinned for reproducibility. +ARG TRIBUTARY_REF=aaedad15bfb43b15c678c93baad41251117f0574 +# paleolimbot/duckdb-nanoarrow ref — the "arrow"/"arrow-ipc" community extension +# (Apache Arrow IPC). Tracks its community-extensions release. +ARG NANOARROW_REF=42e4199a67c4cd0789087562a025e87e7130fdc3 +# vcpkg commit used by duckdb's v1.5 extension-ci-tools. +ARG VCPKG_COMMIT=84bab45d415d22042bd0b9081aea57f362da3f35 +# httpfs is not pinned here: its version is taken from duckdb's own +# .github/config/extensions/httpfs.cmake in the checked-out DUCKDB_TAG, so it +# always matches the duckdb release. 
+ +# --- Per-arch manylinux base selection (same images extension-ci-tools builds on) --- NOTE(review): both bases are untagged, so the registry's latest is pulled; consider pinning a dated tag or digest. +FROM quay.io/pypa/manylinux_2_28_x86_64 AS toolchain-amd64 +ENV DUCKDB_PLATFORM=linux_amd64 VCPKG_TRIPLET=x64-linux-release CMAKE_ARCH=x86_64 + +FROM quay.io/pypa/manylinux_2_28_aarch64 AS toolchain-arm64 +ENV DUCKDB_PLATFORM=linux_arm64 VCPKG_TRIPLET=arm64-linux-release CMAKE_ARCH=aarch64 + +# --- Toolchain: build tools + recent cmake + vcpkg --- +FROM toolchain-${TARGETARCH} AS toolchain ARG TARGETARCH -ARG DUCKDB_TAG=v1.5.1 +ARG VCPKG_COMMIT +RUN yum groupinstall -y "Development Tools" \ + && yum install -y curl zip unzip tar autoconf libtool ninja-build \ + perl-IPC-Cmd perl-core ccache bison flex git jq \ + && yum clean all && rm -rf /var/cache/yum +# DuckDB requires a recent CMake (the manylinux-packaged one is too old). NOTE(review): the installer script is executed without checksum verification. +RUN cd /tmp \ + && curl -fLO https://github.com/Kitware/CMake/releases/download/v4.0.2/cmake-4.0.2-linux-${CMAKE_ARCH}.sh \ + && sh cmake-4.0.2-linux-${CMAKE_ARCH}.sh --skip-license --prefix=/usr/local \ + && rm cmake-4.0.2-linux-${CMAKE_ARCH}.sh +RUN git clone https://github.com/microsoft/vcpkg.git /vcpkg \ + && git -C /vcpkg checkout ${VCPKG_COMMIT} \ + && /vcpkg/bootstrap-vcpkg.sh +ENV VCPKG_ROOT=/vcpkg \ + VCPKG_TOOLCHAIN_PATH=/vcpkg/scripts/buildsystems/vcpkg.cmake \ + GEN=ninja \ + PATH=/vcpkg:$PATH +RUN git config --global --add safe.directory '*' -RUN set -ex; \ - ARCH=$TARGETARCH; \ - DUCKDB_RELEASE=https://github.com/duckdb/duckdb/releases/download/${DUCKDB_TAG}/duckdb_cli-linux-${ARCH}.gz; \ - curl -I $DUCKDB_RELEASE; \ - curl -L $DUCKDB_RELEASE | gunzip > /tmp/duckdb; \ - chmod u+x /tmp/duckdb; \ - sha256sum /tmp/duckdb +# --- Build: compile duckdb with tributary + httpfs + nanoarrow statically linked --- +FROM toolchain AS build +ARG TARGETARCH +ARG DUCKDB_TAG +ARG TRIBUTARY_REF +ARG NANOARROW_REF +WORKDIR /src +RUN git clone --recurse-submodules --shallow-submodules \ + https://github.com/Query-farm/tributary.git . 
\ + && git checkout ${TRIBUTARY_REF} && git submodule update --init --recursive \ + && git -C duckdb fetch --depth 1 origin tag ${DUCKDB_TAG} \ + && git -C duckdb checkout ${DUCKDB_TAG} -# https://github.com/duckdb/duckdb-ui/discussions/84 -RUN echo '#!/bin/sh' > /tmp/xdg-open && chmod u+x /tmp/xdg-open +# Add the extensions we want statically linked beyond tributary's own config: +# httpfs — https:// and gcs gs:// access; out-of-tree in 1.5, pinned by +# duckdb itself so it always matches DUCKDB_TAG +# nanoarrow — Apache Arrow IPC ("arrow"/"arrow-ipc") +# httpfs pulls in openssl, which we add to the vcpkg manifest. +RUN cat >> extension_config.cmake < vcpkg.json.tmp && mv vcpkg.json.tmp vcpkg.json + +# `make release` builds the whole tree; its exit status is ignored (`|| true`) +# because it is expected to fail only on `plan_serializer`, a dev tool (gated by +# BUILD_SHELL, but not a dependency of the `duckdb` binary) that hits a link-time +# ODR clash under gcc-toolset-14. We then build the `duckdb` target explicitly, which never compiles that tool and surfaces real errors. +# Caches: ccache for duckdb objects, vcpkg buildtrees/packages/downloads for deps. 
+RUN --mount=type=cache,target=/ccache,id=duckdb-ccache-${TARGETARCH} \ + --mount=type=cache,target=/vcpkg/buildtrees,id=duckdb-vcpkg-bt-${TARGETARCH} \ + --mount=type=cache,target=/vcpkg/packages,id=duckdb-vcpkg-pkg-${TARGETARCH} \ + --mount=type=cache,target=/vcpkg/downloads,id=duckdb-vcpkg-dl-${TARGETARCH} \ + set -ex; \ + export CCACHE_DIR=/ccache CCACHE_MAXSIZE=2G \ + VCPKG_BINARY_SOURCES='clear;http,https://vcpkg-cache.duckdb.org,read' \ + VCPKG_TARGET_TRIPLET="$VCPKG_TRIPLET" VCPKG_HOST_TRIPLET="$VCPKG_TRIPLET" \ + OPENSSL_ROOT_DIR="/src/build/release/vcpkg_installed/$VCPKG_TRIPLET" \ + OPENSSL_DIR="/src/build/release/vcpkg_installed/$VCPKG_TRIPLET" \ + OPENSSL_USE_STATIC_LIBS=true \ + DUCKDB_PLATFORM="$DUCKDB_PLATFORM" DUCKDB_GIT_VERSION="$DUCKDB_TAG" \ + ENABLE_EXTENSION_AUTOLOADING=1 ENABLE_EXTENSION_AUTOINSTALL=1; \ + make release || true; \ + cmake --build build/release --target duckdb; \ + test -x build/release/duckdb; \ + strip build/release/duckdb; \ + test "$(./build/release/duckdb -csv -noheader -c "SELECT count(*) FROM duckdb_extensions() WHERE extension_name IN ('tributary','httpfs','nanoarrow') AND loaded;")" = 3 + +# Stage the runtime artifacts here (as root) since the runtime image is nonroot. +RUN install -D build/release/duckdb /out/usr/local/bin/duckdb \ + && printf '#!/bin/sh\n' > /out/usr/local/bin/xdg-open \ + && chmod +x /out/usr/local/bin/xdg-open \ + && install -D /etc/pki/tls/certs/ca-bundle.crt /out/etc/ssl/certs/ca-certificates.crt + +# --- Runtime --- FROM --platform=$TARGETPLATFORM yolean/homedir -COPY --from=0 /etc/ssl/certs/ca-certificates.crt /etc/ssl/certs/ -COPY --from=0 /tmp/duckdb /tmp/xdg-open /usr/local/bin/ +# duckdb resolves history/temp from $HOME; set it explicitly so it is the same for the +# root image and for the to-nonroot variant, which runs as the nonroot user (whose home this is). +ENV HOME=/home/nonroot + +# The runtime user is nonroot, so all files are staged in the build stage (as +# root) and copied in. 
ubuntu ships without a CA bundle; httpfs/openssl +# auto-discover /etc/ssl/certs/ca-certificates.crt. The xdg-open shim is a no-op +# for duckdb-ui (https://github.com/duckdb/duckdb-ui/discussions/84). +COPY --from=build /out/ / ENTRYPOINT ["/usr/local/bin/duckdb"] diff --git a/node-kafka-duckdb/package.json b/node-kafka-duckdb/package.json index e4a1bea..9d82834 100644 --- a/node-kafka-duckdb/package.json +++ b/node-kafka-duckdb/package.json @@ -1,7 +1,7 @@ { "private": true, "dependencies": { - "@duckdb/node-bindings": "1.5.1-r.1", + "@duckdb/node-bindings": "1.5.3-r.1", "@google-cloud/pubsub": "5.3.0" } }