diff --git a/Dockerfile.android b/Dockerfile.android index dcdbcfa..16f9bdb 100644 --- a/Dockerfile.android +++ b/Dockerfile.android @@ -92,7 +92,9 @@ RUN mkdir -p /out/llama.cpp/android-arm64 /out/llama.cpp/include /out/llama.cpp/ cp llama-src/include/*.h /out/llama.cpp/include/ && \ cp llama-src/ggml/include/*.h /out/llama.cpp/ggml/include/ && \ cp llama-src/common/common.h /out/llama.cpp/common/ && \ - cp llama-src/common/sampling.h /out/llama.cpp/common/ + cp llama-src/common/sampling.h /out/llama.cpp/common/ && \ + cp llama-src/tools/mtmd/mtmd.h /out/llama.cpp/include/ && \ + cp llama-src/tools/mtmd/mtmd-helper.h /out/llama.cpp/include/ # Collect whisper.cpp artifacts and strip debug symbols RUN mkdir -p /out/whisper.cpp/android-arm64 /out/whisper.cpp/include /out/whisper.cpp/ggml/include && \ diff --git a/Dockerfile.libs b/Dockerfile.libs index ac82e85..febe06b 100644 --- a/Dockerfile.libs +++ b/Dockerfile.libs @@ -53,6 +53,8 @@ RUN WHISPER_VERSION=$(go run ./cmd/versioncmd whisper.cpp) && \ # ============================================================================ FROM golang:1.24-bookworm AS builder-cpu +ARG ARCH_SUFFIX=linux-amd64 + RUN apt-get update && apt-get install -y --no-install-recommends \ build-essential cmake && \ rm -rf /var/lib/apt/lists/* @@ -72,18 +74,20 @@ RUN cd whisper-src && \ cmake --build build --config Release -j$(nproc) # Collect llama.cpp artifacts and strip debug symbols -RUN mkdir -p /out/llama.cpp/linux-amd64 /out/llama.cpp/include /out/llama.cpp/ggml/include /out/llama.cpp/common && \ - find llama-src/build -name "*.a" -exec cp {} /out/llama.cpp/linux-amd64/ \; && \ - find /out/llama.cpp/linux-amd64 -name "*.a" -exec strip --strip-debug {} \; && \ +RUN mkdir -p /out/llama.cpp/${ARCH_SUFFIX} /out/llama.cpp/include /out/llama.cpp/ggml/include /out/llama.cpp/common && \ + find llama-src/build -name "*.a" -exec cp {} /out/llama.cpp/${ARCH_SUFFIX}/ \; && \ + find /out/llama.cpp/${ARCH_SUFFIX} -name "*.a" -exec strip --strip-debug 
{} \; && \ cp llama-src/include/*.h /out/llama.cpp/include/ && \ cp llama-src/ggml/include/*.h /out/llama.cpp/ggml/include/ && \ cp llama-src/common/common.h /out/llama.cpp/common/ && \ - cp llama-src/common/sampling.h /out/llama.cpp/common/ + cp llama-src/common/sampling.h /out/llama.cpp/common/ && \ + cp llama-src/tools/mtmd/mtmd.h /out/llama.cpp/include/ && \ + cp llama-src/tools/mtmd/mtmd-helper.h /out/llama.cpp/include/ # Collect whisper.cpp artifacts and strip debug symbols -RUN mkdir -p /out/whisper.cpp/linux-amd64 /out/whisper.cpp/include /out/whisper.cpp/ggml/include && \ - find whisper-src/build -name "*.a" -exec cp {} /out/whisper.cpp/linux-amd64/ \; && \ - find /out/whisper.cpp/linux-amd64 -name "*.a" -exec strip --strip-debug {} \; && \ +RUN mkdir -p /out/whisper.cpp/${ARCH_SUFFIX} /out/whisper.cpp/include /out/whisper.cpp/ggml/include && \ + find whisper-src/build -name "*.a" -exec cp {} /out/whisper.cpp/${ARCH_SUFFIX}/ \; && \ + find /out/whisper.cpp/${ARCH_SUFFIX} -name "*.a" -exec strip --strip-debug {} \; && \ cp whisper-src/include/*.h /out/whisper.cpp/include/ && \ cp whisper-src/ggml/include/*.h /out/whisper.cpp/ggml/include/ @@ -92,9 +96,15 @@ RUN mkdir -p /out/whisper.cpp/linux-amd64 /out/whisper.cpp/include /out/whisper. # ============================================================================ FROM nvidia/cuda:12.8.0-devel-ubuntu24.04 AS builder-cuda +# gcc-12: gcc-13 hits an internal-compiler-error on fattn-mma-f16 templates +# (cfgcleanup.cc:580 try_forward_edges) — reproducible at any -j on b9222. 
RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential cmake wget && \ - rm -rf /var/lib/apt/lists/* + build-essential cmake wget gcc-12 g++-12 && \ + rm -rf /var/lib/apt/lists/* && \ + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 100 && \ + update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-12 100 + +ENV CC=gcc-12 CXX=g++-12 WORKDIR /src COPY --from=sources /src/llama-src llama-src @@ -117,7 +127,9 @@ RUN mkdir -p /out/llama.cpp/linux-amd64-cuda /out/llama.cpp/include /out/llama.c cp llama-src/include/*.h /out/llama.cpp/include/ && \ cp llama-src/ggml/include/*.h /out/llama.cpp/ggml/include/ && \ cp llama-src/common/common.h /out/llama.cpp/common/ && \ - cp llama-src/common/sampling.h /out/llama.cpp/common/ + cp llama-src/common/sampling.h /out/llama.cpp/common/ && \ + cp llama-src/tools/mtmd/mtmd.h /out/llama.cpp/include/ && \ + cp llama-src/tools/mtmd/mtmd-helper.h /out/llama.cpp/include/ # Collect whisper.cpp artifacts (CUDA variant) and strip debug symbols RUN mkdir -p /out/whisper.cpp/linux-amd64-cuda /out/whisper.cpp/include /out/whisper.cpp/ggml/include && \ @@ -133,9 +145,16 @@ RUN mkdir -p /out/whisper.cpp/linux-amd64-cuda /out/whisper.cpp/include /out/whi # (llama.cpp b8220+ needs VK_EXT_layer_settings from Vulkan 1.3.261+) FROM ubuntu:24.04 AS builder-vulkan +# - spirv-headers: required since llama.cpp b9000+ (ggml-vulkan calls find_package(SPIRV-Headers)) +# - gcc-12: gcc-13 ICEs on common/arg.cpp.o under load RUN apt-get update && apt-get install -y --no-install-recommends \ - build-essential cmake wget ca-certificates libvulkan-dev glslang-tools glslc && \ - rm -rf /var/lib/apt/lists/* + build-essential cmake wget ca-certificates libvulkan-dev glslang-tools glslc \ + spirv-headers gcc-12 g++-12 && \ + rm -rf /var/lib/apt/lists/* && \ + update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 100 && \ + update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-12 100 + +ENV CC=gcc-12 
CXX=g++-12 WORKDIR /src COPY --from=sources /src/llama-src llama-src @@ -158,7 +177,9 @@ RUN mkdir -p /out/llama.cpp/linux-amd64-vulkan /out/llama.cpp/include /out/llama cp llama-src/include/*.h /out/llama.cpp/include/ && \ cp llama-src/ggml/include/*.h /out/llama.cpp/ggml/include/ && \ cp llama-src/common/common.h /out/llama.cpp/common/ && \ - cp llama-src/common/sampling.h /out/llama.cpp/common/ + cp llama-src/common/sampling.h /out/llama.cpp/common/ && \ + cp llama-src/tools/mtmd/mtmd.h /out/llama.cpp/include/ && \ + cp llama-src/tools/mtmd/mtmd-helper.h /out/llama.cpp/include/ # Collect whisper.cpp artifacts (Vulkan variant) and strip debug symbols RUN mkdir -p /out/whisper.cpp/linux-amd64-vulkan /out/whisper.cpp/include /out/whisper.cpp/ggml/include && \ diff --git a/Dockerfile.libs-arm64 b/Dockerfile.libs-arm64 new file mode 100644 index 0000000..770da69 --- /dev/null +++ b/Dockerfile.libs-arm64 @@ -0,0 +1,98 @@ +# Dockerfile.libs-arm64 — cross-compile linux-arm64 static libraries +# +# Uses aarch64-linux-gnu gcc/g++ toolchain on an amd64 host. No qemu/binfmt +# required, which makes it work inside unprivileged LXC where binfmt_misc +# is read-only. +# +# Usage: +# docker build -f Dockerfile.libs-arm64 -o ./out . + +# ============================================================================ +# Stage: Download sources (shared) +# ============================================================================ +FROM golang:1.24-bookworm AS sources + +RUN apt-get update && apt-get install -y --no-install-recommends \ + wget && \ + rm -rf /var/lib/apt/lists/* + +WORKDIR /src + +COPY go.mod ./ +COPY version.go ./ +COPY cmd/versioncmd/ ./cmd/versioncmd/ + +RUN LLAMA_VERSION=$(go run ./cmd/versioncmd llama.cpp) && \ + echo "Downloading llama.cpp ${LLAMA_VERSION}..." 
&& \ + wget -qO llama.cpp.tar.gz "https://github.com/ggerganov/llama.cpp/archive/refs/tags/${LLAMA_VERSION}.tar.gz" && \ + mkdir -p llama-src && \ + tar xzf llama.cpp.tar.gz --strip-components=1 -C llama-src && \ + rm llama.cpp.tar.gz + +RUN WHISPER_VERSION=$(go run ./cmd/versioncmd whisper.cpp) && \ + echo "Downloading whisper.cpp ${WHISPER_VERSION}..." && \ + wget -qO whisper.cpp.tar.gz "https://github.com/ggerganov/whisper.cpp/archive/refs/tags/${WHISPER_VERSION}.tar.gz" && \ + mkdir -p whisper-src && \ + tar xzf whisper.cpp.tar.gz --strip-components=1 -C whisper-src && \ + rm whisper.cpp.tar.gz + +# ============================================================================ +# Builder: cross-compile to aarch64-linux-gnu +# ============================================================================ +FROM debian:bookworm AS builder + +RUN apt-get update && apt-get install -y --no-install-recommends \ + build-essential cmake \ + crossbuild-essential-arm64 && \ + rm -rf /var/lib/apt/lists/* + +WORKDIR /src +COPY --from=sources /src/llama-src llama-src +COPY --from=sources /src/whisper-src whisper-src + +RUN printf '%s\n' \ + 'set(CMAKE_SYSTEM_NAME Linux)' \ + 'set(CMAKE_SYSTEM_PROCESSOR aarch64)' \ + 'set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)' \ + 'set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)' \ + 'set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)' \ + 'set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)' \ + 'set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)' \ + > /src/aarch64-toolchain.cmake + +# Build llama.cpp (aarch64) +RUN cd llama-src && \ + cmake -B build \ + -DCMAKE_TOOLCHAIN_FILE=/src/aarch64-toolchain.cmake \ + -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF \ + -DGGML_NATIVE=OFF && \ + cmake --build build --config Release -j$(nproc) + +# Build whisper.cpp (aarch64) +RUN cd whisper-src && \ + cmake -B build \ + -DCMAKE_TOOLCHAIN_FILE=/src/aarch64-toolchain.cmake \ + -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF \ + -DGGML_NATIVE=OFF && \ + cmake --build build 
--config Release -j$(nproc) + +# Collect llama.cpp artifacts +RUN mkdir -p /out/llama.cpp/linux-arm64 /out/llama.cpp/include /out/llama.cpp/ggml/include /out/llama.cpp/common && \ + find llama-src/build -name "*.a" -exec cp {} /out/llama.cpp/linux-arm64/ \; && \ + find /out/llama.cpp/linux-arm64 -name "*.a" -exec aarch64-linux-gnu-strip --strip-debug {} \; && \ + cp llama-src/include/*.h /out/llama.cpp/include/ && \ + cp llama-src/ggml/include/*.h /out/llama.cpp/ggml/include/ && \ + cp llama-src/common/common.h /out/llama.cpp/common/ && \ + cp llama-src/common/sampling.h /out/llama.cpp/common/ && \ + cp llama-src/tools/mtmd/mtmd.h /out/llama.cpp/include/ && \ + cp llama-src/tools/mtmd/mtmd-helper.h /out/llama.cpp/include/ + +# Collect whisper.cpp artifacts +RUN mkdir -p /out/whisper.cpp/linux-arm64 /out/whisper.cpp/include /out/whisper.cpp/ggml/include && \ + find whisper-src/build -name "*.a" -exec cp {} /out/whisper.cpp/linux-arm64/ \; && \ + find /out/whisper.cpp/linux-arm64 -name "*.a" -exec aarch64-linux-gnu-strip --strip-debug {} \; && \ + cp whisper-src/include/*.h /out/whisper.cpp/include/ && \ + cp whisper-src/ggml/include/*.h /out/whisper.cpp/ggml/include/ + +FROM scratch +COPY --from=builder /out/ / diff --git a/Makefile b/Makefile index 1bf8439..e61a158 100644 --- a/Makefile +++ b/Makefile @@ -13,6 +13,7 @@ # make build-libs-linux-cpu # Build linux-amd64 CPU only # make build-libs-linux-cuda # Build linux-amd64 CUDA only # make build-libs-linux-vulkan # Build linux-amd64 Vulkan only +# make build-libs-linux-arm64 # Build linux-arm64 (CPU) via cross-compilation # make build-libs-android # Build android-arm64 via NDK # make build-libs-all # Build native + all linux + android # make clean # Remove temp build dirs (keeps prebuilt .a + headers) @@ -38,7 +39,7 @@ WHISPER_PREBUILT := $(WHISPER_THIRD_PARTY)/prebuilt/$(PLATFORM) .PHONY: build-libs build-libs-llama build-libs-whisper \ build-libs-linux build-libs-linux-cpu build-libs-linux-cuda 
build-libs-linux-vulkan \ - build-libs-android build-libs-all clean verify + build-libs-linux-arm64 build-libs-android build-libs-all clean verify build-libs: build-libs-llama build-libs-whisper @@ -138,6 +139,29 @@ build-libs-linux-cuda: build-libs-linux-vulkan: $(call build-linux-variant,vulkan,-vulkan) +# ============================================================================ +# Docker build for linux-arm64 (cross-compile via aarch64-linux-gnu toolchain) +# ============================================================================ +# Uses Dockerfile.libs-arm64 which runs aarch64 gcc/g++ inside an amd64 +# container. No qemu/binfmt needed (works in unprivileged LXC). +build-libs-linux-arm64: + @echo "==> Building linux-arm64 static libraries via Docker (cross-compile)..." + docker build -f Dockerfile.libs-arm64 -o ./out . + @mkdir -p $(LLAMA_THIRD_PARTY)/prebuilt/linux-arm64 + cp out/llama.cpp/linux-arm64/*.a $(LLAMA_THIRD_PARTY)/prebuilt/linux-arm64/ + @mkdir -p $(LLAMA_THIRD_PARTY)/include $(LLAMA_THIRD_PARTY)/ggml/include $(LLAMA_THIRD_PARTY)/common + cp out/llama.cpp/include/*.h $(LLAMA_THIRD_PARTY)/include/ + cp out/llama.cpp/ggml/include/*.h $(LLAMA_THIRD_PARTY)/ggml/include/ + cp out/llama.cpp/common/common.h $(LLAMA_THIRD_PARTY)/common/ + cp out/llama.cpp/common/sampling.h $(LLAMA_THIRD_PARTY)/common/ + @mkdir -p $(WHISPER_THIRD_PARTY)/prebuilt/linux-arm64 + cp out/whisper.cpp/linux-arm64/*.a $(WHISPER_THIRD_PARTY)/prebuilt/linux-arm64/ + @mkdir -p $(WHISPER_THIRD_PARTY)/include $(WHISPER_THIRD_PARTY)/ggml/include + cp out/whisper.cpp/include/*.h $(WHISPER_THIRD_PARTY)/include/ + cp out/whisper.cpp/ggml/include/*.h $(WHISPER_THIRD_PARTY)/ggml/include/ + rm -rf out + @echo "==> linux-arm64 libraries ready" + # ============================================================================ # Docker build for android-arm64 (cross-compile via Android NDK) # ============================================================================ diff --git 
a/ggml/llamacpp/llamacpp.go b/ggml/llamacpp/llamacpp.go index ba0eb50..04c7371 100644 --- a/ggml/llamacpp/llamacpp.go +++ b/ggml/llamacpp/llamacpp.go @@ -12,7 +12,7 @@ package llamacpp #cgo CXXFLAGS: -std=c++17 -I${SRCDIR}/third_party/include -I${SRCDIR}/third_party/ggml/include -I${SRCDIR}/third_party/common #cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/darwin-arm64 #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/darwin-amd64 -#cgo darwin LDFLAGS: -lmtmd -lcommon -lllama -lggml-cpu -lggml-base -lggml -lggml-blas -lggml-metal -L/usr/local/opt/libomp/lib -L/opt/homebrew/opt/libomp/lib -lomp -framework Accelerate -framework Metal -framework Foundation -lstdc++ -lm +#cgo darwin LDFLAGS: -lmtmd -lllama-common -lllama-common-base -lllama -lggml-cpu -lggml-base -lggml -lggml-blas -lggml-metal -L/usr/local/opt/libomp/lib -L/opt/homebrew/opt/libomp/lib -lomp -framework Accelerate -framework Metal -framework Foundation -lstdc++ -lm #include #include #include "wrapper.h" diff --git a/ggml/llamacpp/llamacpp_android.go b/ggml/llamacpp/llamacpp_android.go index f1b95b2..1d835c5 100644 --- a/ggml/llamacpp/llamacpp_android.go +++ b/ggml/llamacpp/llamacpp_android.go @@ -7,6 +7,6 @@ package llamacpp /* #cgo android,arm64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/android-arm64 -#cgo android LDFLAGS: -Wl,--start-group -lmtmd -lcommon -lllama -lggml-cpu -lggml-base -lggml -Wl,--end-group -lstdc++ -lm -ldl -llog +#cgo android LDFLAGS: -Wl,--start-group -lmtmd -lllama-common -lllama-common-base -lllama -lggml-cpu -lggml-base -lggml -Wl,--end-group -lstdc++ -lm -ldl -llog */ import "C" diff --git a/ggml/llamacpp/llamacpp_linux.go b/ggml/llamacpp/llamacpp_linux.go index a35551a..d002eb6 100644 --- a/ggml/llamacpp/llamacpp_linux.go +++ b/ggml/llamacpp/llamacpp_linux.go @@ -8,6 +8,6 @@ package llamacpp /* #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/linux-amd64 #cgo linux,arm64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/linux-arm64 -#cgo linux LDFLAGS: 
-Wl,--start-group -lmtmd -lcommon -lllama -lggml-cpu -lggml-base -lggml -Wl,--end-group -lstdc++ -lm -lpthread -ldl -lrt -lgomp +#cgo linux LDFLAGS: -Wl,--start-group -lmtmd -lllama-common -lllama-common-base -lllama -lggml-cpu -lggml-base -lggml -Wl,--end-group -lstdc++ -lm -lpthread -ldl -lrt -lgomp */ import "C" diff --git a/ggml/llamacpp/llamacpp_linux_cuda.go b/ggml/llamacpp/llamacpp_linux_cuda.go index d7c8fd5..ea3147e 100644 --- a/ggml/llamacpp/llamacpp_linux_cuda.go +++ b/ggml/llamacpp/llamacpp_linux_cuda.go @@ -7,6 +7,6 @@ package llamacpp /* #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/linux-amd64-cuda -#cgo linux LDFLAGS: -Wl,--start-group -lmtmd -lcommon -lllama -lggml-cpu -lggml-base -lggml -lggml-cuda -Wl,--end-group -lcuda -lcudart -lcublas -lcublasLt -lstdc++ -lm -lpthread -ldl -lrt -lgomp +#cgo linux LDFLAGS: -Wl,--start-group -lmtmd -lllama-common -lllama-common-base -lllama -lggml-cpu -lggml-base -lggml -lggml-cuda -Wl,--end-group -lcuda -lcudart -lcublas -lcublasLt -lstdc++ -lm -lpthread -ldl -lrt -lgomp */ import "C" diff --git a/ggml/llamacpp/llamacpp_linux_vulkan.go b/ggml/llamacpp/llamacpp_linux_vulkan.go index 97fea4b..003303e 100644 --- a/ggml/llamacpp/llamacpp_linux_vulkan.go +++ b/ggml/llamacpp/llamacpp_linux_vulkan.go @@ -7,6 +7,6 @@ package llamacpp /* #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/linux-amd64-vulkan -#cgo linux LDFLAGS: -Wl,--start-group -lmtmd -lcommon -lllama -lggml-cpu -lggml-base -lggml -lggml-vulkan -Wl,--end-group -lvulkan -lstdc++ -lm -lpthread -ldl -lrt -lgomp +#cgo linux LDFLAGS: -Wl,--start-group -lmtmd -lllama-common -lllama-common-base -lllama -lggml-cpu -lggml-base -lggml -lggml-vulkan -Wl,--end-group -lvulkan -lstdc++ -lm -lpthread -ldl -lrt -lgomp */ import "C" diff --git a/ggml/llamacpp/third_party/common/common.h b/ggml/llamacpp/third_party/common/common.h index 020b6a7..1d3d788 100644 --- a/ggml/llamacpp/third_party/common/common.h +++ 
b/ggml/llamacpp/third_party/common/common.h @@ -2,17 +2,18 @@ #pragma once +#include "llama-cpp.h" + #include "ggml-opt.h" #include "ggml.h" -#include "llama-cpp.h" #include #include #include #include -#include #include #include +#include #if defined(_WIN32) && !defined(_WIN32_WINNT) #define _WIN32_WINNT 0x0A00 @@ -27,11 +28,6 @@ #define die(msg) do { fputs("error: " msg "\n", stderr); exit(1); } while (0) #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0) -#define print_build_info() do { \ - fprintf(stderr, "%s: build = %d (%s)\n", __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT); \ - fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET); \ -} while(0) - struct common_time_meas { common_time_meas(int64_t & t_acc, bool disable = false); ~common_time_meas(); @@ -53,21 +49,13 @@ struct common_adapter_lora_info { using llama_tokens = std::vector; -// build info -extern int LLAMA_BUILD_NUMBER; -extern const char * LLAMA_COMMIT; -extern const char * LLAMA_COMPILER; -extern const char * LLAMA_BUILD_TARGET; - -const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT); - struct common_control_vector_load_info; // // CPU utils // -struct cpu_params { +struct common_cpu_params { int n_threads = -1; bool cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask. 
bool mask_valid = false; // Default: any CPU @@ -76,8 +64,8 @@ struct cpu_params { uint32_t poll = 50; // Polling (busywait) level (0 - no polling, 100 - mostly polling) }; -int32_t cpu_get_num_physical_cores(); -int32_t cpu_get_num_math(); +int32_t common_cpu_get_num_physical_cores(); +int32_t common_cpu_get_num_math(); // // Common params @@ -170,9 +158,10 @@ enum common_params_sampling_config : uint64_t { enum common_speculative_type { COMMON_SPECULATIVE_TYPE_NONE, // no speculative decoding - COMMON_SPECULATIVE_TYPE_DRAFT, // draft model - COMMON_SPECULATIVE_TYPE_EAGLE3, // eagle draft model - COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, // simple self-speculative decoding + COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE, // standalone draft model speculative decoding + COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3, // Eagle3 speculative decoding + COMMON_SPECULATIVE_TYPE_DRAFT_MTP, // Multi-token prediction + COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE, // simple self-speculative decoding based on n-grams COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K, // self-speculative decoding with n-gram keys only COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values COMMON_SPECULATIVE_TYPE_NGRAM_MOD, @@ -287,6 +276,7 @@ struct common_params_sampling { std::vector reasoning_budget_start; // start tag token sequence std::vector reasoning_budget_end; // end tag token sequence std::vector reasoning_budget_forced; // forced sequence (message + end tag) + std::string reasoning_budget_message; // message injected before end tag when budget exhausted bool backend_sampling = false; @@ -307,62 +297,82 @@ struct common_params_model { std::string name = ""; // in format /[:] (tag is optional) // NOLINT }; -struct common_ngram_mod; +// draft-model-based speculative decoding parameters +struct common_params_speculative_draft { + int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding + int32_t n_min = 0; // minimum number of draft tokens to use for 
speculative decoding -struct common_params_speculative { - common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding + float p_split = 0.1f; // speculative decoding split probability + float p_min = 0.75f; // minimum speculative decoding probability (greedy) // TODO: change default to 0.0f - // general-purpose speculative decoding parameters + common_params_model mparams; - int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding - int32_t n_min = 0; // minimum number of draft tokens to use for speculative decoding - float p_split = 0.1f; // speculative decoding split probability - float p_min = 0.75f; // minimum speculative decoding probability (greedy) + llama_context * ctx_tgt = nullptr; + llama_context * ctx_dft = nullptr; - // ngram-based speculative decoding + int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default) - uint16_t ngram_size_n = 12; // ngram size for lookup - uint16_t ngram_size_m = 48; // mgram size for speculative tokens - uint16_t ngram_min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed + ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K + ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V - std::shared_ptr ngram_mod; + common_cpu_params cpuparams; + common_cpu_params cpuparams_batch; - std::string lookup_cache_static; // path of static ngram cache file for lookup decoding // NOLINT - std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding // NOLINT + std::vector devices; // devices to use for offloading - // draft-model speculative decoding + std::vector tensor_buft_overrides; +}; - struct common_params_model mparams_dft; +struct common_params_speculative_ngram_mod { + int32_t n_match = 24; - llama_model * model_dft = nullptr; // a llama_model that can be shared by multiple speculative contexts + int32_t n_max = 64; + int32_t n_min = 
48; +}; - llama_context_params cparams_dft; // these are the parameters for the draft llama_context +struct common_params_speculative_ngram_map { + uint16_t size_n = 12; // ngram size for lookup + uint16_t size_m = 48; // mgram size for speculative tokens + uint16_t min_hits = 1; // minimum hits at ngram/mgram lookup for mgram to be proposed +}; - int32_t n_ctx = 0; // draft context size - int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default) +struct common_params_speculative_ngram_cache { + std::string lookup_cache_static; // path of static ngram cache file for lookup decoding + std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding +}; - ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K - ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V +struct common_params_speculative { + std::vector types = { COMMON_SPECULATIVE_TYPE_NONE }; - struct cpu_params cpuparams; - struct cpu_params cpuparams_batch; + // used by Simple, MTP, Eagle3, etc. - all methods that require some kind of draft model + common_params_speculative_draft draft; - std::vector devices; // devices to use for offloading + common_params_speculative_ngram_mod ngram_mod; + common_params_speculative_ngram_map ngram_simple; + common_params_speculative_ngram_map ngram_map_k; + common_params_speculative_ngram_map ngram_map_k4v; - std::vector> replacements; // main to speculative model replacements - std::vector tensor_buft_overrides; + common_params_speculative_ngram_cache ngram_cache; bool has_dft() const { - return !mparams_dft.path.empty() || !mparams_dft.hf_repo.empty(); + return !draft.mparams.path.empty() || !draft.mparams.hf_repo.empty(); + } + + uint32_t need_n_rs_seq() const { + bool needs_rs_seq = std::any_of(types.begin(), types.end(), [&](auto t) { + return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP; + }); + + return needs_rs_seq ? 
draft.n_max : 0u; } }; struct common_params_vocoder { struct common_params_model model; - std::string speaker_file = ""; // speaker file path // NOLINT + std::string speaker_file; // speaker file path - bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy // NOLINT + bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy }; struct common_params_diffusion { @@ -433,19 +443,20 @@ struct common_params { // offload params std::vector devices; // devices to use for offloading - int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all - int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors - float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs - bool fit_params = true; // whether to fit unset model/context parameters to free device memory - int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use + int32_t n_gpu_layers = -1; // number of layers to store in VRAM, -1 is auto, <= -2 is all + int32_t main_gpu = 0; // the GPU that is used for scratch and small tensors + float tensor_split[128] = {0}; // how split tensors should be distributed across GPUs + bool fit_params = true; // whether to fit unset model/context parameters to free device memory + bool fit_params_print = false; // print the estimated required memory to run the model + int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use // margin per device in bytes for fitting parameters to free memory: std::vector fit_params_target = std::vector(llama_max_devices(), 1024 * 1024*1024); enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs - struct cpu_params cpuparams; - struct cpu_params cpuparams_batch; + common_cpu_params cpuparams; + common_cpu_params cpuparams_batch; ggml_backend_sched_eval_callback cb_eval = nullptr; void * cb_eval_user_data = nullptr; 
@@ -579,7 +590,7 @@ struct common_params { int32_t n_threads_http = -1; // number of threads to process HTTP requests (TODO: support threadpool) int32_t n_cache_reuse = 0; // min chunk size to reuse from the cache via KV shifting bool cache_prompt = true; // whether to enable prompt caching - bool clear_idle = true; // save and clear idle slots upon starting a new task + bool cache_idle_slots = true; // save and clear idle slots upon starting a new task int32_t n_ctx_checkpoints = 32; // max number of context checkpoints per slot int32_t checkpoint_every_nt = 8192; // make a checkpoint every n tokens during prefill int32_t cache_ram_mib = 8192; // -1 = no limit, 0 - disable, 1 = 1 MiB, etc. @@ -593,8 +604,6 @@ struct common_params { bool force_pure_content_parser = false; common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK; int enable_reasoning = -1; // -1 = auto, 0 = disable, 1 = enable - int reasoning_budget = -1; - std::string reasoning_budget_message; // message injected before end tag when budget exhausted bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response int sleep_idle_seconds = -1; // if >0, server will sleep after this many seconds of idle time @@ -605,11 +614,21 @@ struct common_params { std::map default_template_kwargs; - // webui configs - bool webui = true; + // UI configs +#ifdef LLAMA_UI_DEFAULT_ENABLED + bool ui = LLAMA_UI_DEFAULT_ENABLED != 0; +#else + bool ui = true; // default to enabled when not set +#endif + + // Deprecated: use ui, ui_mcp_proxy, ui_config_json instead + bool webui = ui; bool webui_mcp_proxy = false; std::string webui_config_json; + bool ui_mcp_proxy = false; + std::string ui_config_json; + // "advanced" endpoints are disabled by default for better security bool endpoint_slots = true; bool endpoint_props = false; // only control POST requests, not GET @@ -687,11 +706,12 @@ struct common_params { // initializes the logging system and prints info 
about the build void common_init(); +void common_params_print_info(const common_params & params, bool print_devices = true); std::string common_params_get_system_info(const common_params & params); bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]); bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]); -void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr); +void postprocess_cpu_params(common_cpu_params & cpuparams, const common_cpu_params * role_model = nullptr); bool set_process_priority(enum ggml_sched_priority prio); // @@ -759,6 +779,11 @@ inline bool string_starts_with(std::string_view str, std::string_view prefix) { str.compare(0, prefix.size(), prefix) == 0; } +// remove when moving to c++20 +inline bool string_starts_with(std::string_view str, char prefix) { + return !str.empty() && str.front() == prefix; +} + // remove when moving to c++20 inline bool string_ends_with(std::string_view str, std::string_view suffix) { return str.size() >= suffix.size() && @@ -854,12 +879,33 @@ common_init_result_ptr common_init_from_params(common_params & params); struct llama_model_params common_model_params_to_llama ( common_params & params); struct llama_context_params common_context_params_to_llama(const common_params & params); -struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params); +struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const common_cpu_params & params); // clear LoRA adapters from context, then apply new list of adapters void common_set_adapter_lora(struct llama_context * ctx, std::vector & lora); -std::string get_model_endpoint(); +// model endpoint from env +std::string common_get_model_endpoint(); + +// +// Context utils +// + +enum common_context_seq_rm_type { + COMMON_CONTEXT_SEQ_RM_TYPE_NO = 0, // seq_rm not supported (e.g. 
no memory module) + COMMON_CONTEXT_SEQ_RM_TYPE_PART = 1, // can seq_rm partial sequences + COMMON_CONTEXT_SEQ_RM_TYPE_FULL = 2, // can seq_rm full sequences only + COMMON_CONTEXT_SEQ_RM_TYPE_RS = 3, // can seq_rm partial sequences, bounded by n_rs_seq +}; + +// check if the llama_context can remove sequences +// note: clears the memory of the context +common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx); + +// aborts execution on failure +void common_context_seq_rm (llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1); +void common_context_seq_add(llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta); +void common_context_seq_cp (llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1); // // Batch utils @@ -998,3 +1044,50 @@ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std // "adamw" or "sgd" (case insensitive) enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *); + +// +// prompt utils +// + +struct common_prompt_checkpoint { + int64_t n_tokens; + + llama_pos pos_min; + llama_pos pos_max; + + std::vector data_tgt; + std::vector data_dft; + + size_t size() const; + + bool empty() const; + void clear(); + + void update_pos( + int64_t n_tokens, + llama_pos pos_min, + llama_pos pos_max); + + void update_tgt( + llama_context * ctx, + llama_seq_id seq_id, + llama_state_seq_flags flags); + + void update_dft( + llama_context * ctx, + llama_seq_id seq_id, + llama_state_seq_flags flags); + + void load_tgt( + llama_context * ctx, + llama_seq_id seq_id, + llama_state_seq_flags flags) const; + + void load_dft( + llama_context * ctx, + llama_seq_id seq_id, + llama_state_seq_flags flags) const; + + void clear_tgt(); + void clear_dft(); +}; diff --git a/ggml/llamacpp/third_party/common/sampling.h b/ggml/llamacpp/third_party/common/sampling.h index 5b57ad6..49506a0 100644 --- a/ggml/llamacpp/third_party/common/sampling.h 
+++ b/ggml/llamacpp/third_party/common/sampling.h @@ -41,8 +41,8 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st void common_sampler_free(struct common_sampler * gsmpl); -// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar -void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar); +// if is_generated is true, the token is accepted by the sampling chain, the reasoning budget sampler, and the grammar sampler +void common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool is_generated); void common_sampler_reset (struct common_sampler * gsmpl); struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl); diff --git a/ggml/llamacpp/third_party/ggml/include/ggml-backend.h b/ggml/llamacpp/third_party/ggml/include/ggml-backend.h index 3c06aea..b6f7373 100644 --- a/ggml/llamacpp/third_party/ggml/include/ggml-backend.h +++ b/ggml/llamacpp/third_party/ggml/include/ggml-backend.h @@ -169,7 +169,7 @@ extern "C" { // device type enum ggml_backend_dev_type type; // device id - // for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0") + // for PCI devices, this should be the lower-case PCI bus id formatted as "domain:bus:device.function" (e.g. 
"0000:c1:00.0") // if the id is unknown, this should be NULL const char * device_id; // device capabilities @@ -202,8 +202,11 @@ extern "C" { // Common functions that may be obtained using ggml_backend_reg_get_proc_address - // AllReduce operation for tensor parallelism (meta backend) - typedef bool (*ggml_backend_allreduce_tensor_t)(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends); + // Context management and operations for faster communication between backends, used for tensor parallelism (meta backend) + typedef void * (*ggml_backend_comm_init_t)(ggml_backend_t * backends, size_t n_backends); + typedef void (*ggml_backend_comm_free_t)(void * comm_ctx); + typedef bool (*ggml_backend_comm_allreduce_tensor_t)(void * comm_ctx, struct ggml_tensor ** tensors); + // Split buffer type for tensor parallelism (old) typedef ggml_backend_buffer_type_t (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split); // Set the number of threads for the backend @@ -348,6 +351,53 @@ extern "C" { // Set a callback to be called for each resulting node during graph compute GGML_API void ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data); + // + // Meta backend + // + +#define GGML_BACKEND_META_MAX_DEVICES 16 + + enum ggml_backend_meta_split_axis { + // tensor split by tensor dimensions: + GGML_BACKEND_SPLIT_AXIS_0 = 0, + GGML_BACKEND_SPLIT_AXIS_1 = 1, + GGML_BACKEND_SPLIT_AXIS_2 = 2, + GGML_BACKEND_SPLIT_AXIS_3 = 3, + + GGML_BACKEND_SPLIT_AXIS_MIRRORED = 10, // all values on all backends + GGML_BACKEND_SPLIT_AXIS_PARTIAL = 11, // each backend has a partial sum + + // for internal bookkeeping only: + GGML_BACKEND_SPLIT_AXIS_NONE = 98, + GGML_BACKEND_SPLIT_AXIS_UNKNOWN = 99, + }; + GGML_API const char * ggml_backend_meta_split_axis_name(enum ggml_backend_meta_split_axis split_axis); + + struct ggml_backend_meta_split_state { + enum ggml_backend_meta_split_axis 
axis; + + // for tensors with axis >= 0 && axis < GGML_MAX_DIMS: + // - each device has a slice of the tensor along the split axis + // - most tensors have n_segments == 1 and a contiguous slice of the tensor data + // - some tensors have an inhomogenenous data layout along the split axis, + // those tensors are divided into segments which are each individually split across devices + // - ne has one entry per segment and device that add up to ggml_tensor::ne for that axis, + // the outer/inner loops are over segments/devices like [seg0_dev0, seg0_dev1, seg1_dev0, seg1_dev1], + // - for example, a transformer may have a fused QKV matrix rather than 3 matrices, those would be 3 separate segments + // that each need to be split individually across devices so that each device gets a slice of Q, K, and V + int64_t ne[16*GGML_BACKEND_META_MAX_DEVICES]; + uint32_t n_segments; + }; + + // function to assign split states for statically allocated tensors, compute tensor split states will be assigned to be compatible: + typedef struct ggml_backend_meta_split_state(*ggml_backend_meta_get_split_state_t)(const struct ggml_tensor * tensor, void * userdata); + + // create a new meta device from "simple" devices, meta buffer type/buffer/backend is then derived from this: + // TODO: this looks a bit strange - a backend API creates a device. 
I think we should try + // express this as a backend registry functionality instead + GGML_API ggml_backend_dev_t ggml_backend_meta_device( + ggml_backend_dev_t * devs, size_t n_devs, ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud); + // // Utils // diff --git a/ggml/llamacpp/third_party/ggml/include/ggml-rpc.h b/ggml/llamacpp/third_party/ggml/include/ggml-rpc.h index 1c11495..6fcf5a4 100644 --- a/ggml/llamacpp/third_party/ggml/include/ggml-rpc.h +++ b/ggml/llamacpp/third_party/ggml/include/ggml-rpc.h @@ -6,9 +6,9 @@ extern "C" { #endif -#define RPC_PROTO_MAJOR_VERSION 3 -#define RPC_PROTO_MINOR_VERSION 6 -#define RPC_PROTO_PATCH_VERSION 1 +#define RPC_PROTO_MAJOR_VERSION 4 +#define RPC_PROTO_MINOR_VERSION 0 +#define RPC_PROTO_PATCH_VERSION 0 #ifdef __cplusplus static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION"); diff --git a/ggml/llamacpp/third_party/ggml/include/ggml.h b/ggml/llamacpp/third_party/ggml/include/ggml.h index 11d3e8a..41566d4 100644 --- a/ggml/llamacpp/third_party/ggml/include/ggml.h +++ b/ggml/llamacpp/third_party/ggml/include/ggml.h @@ -438,6 +438,12 @@ extern "C" { GGML_PREC_F32 = 10, }; + // op hint + enum ggml_op_hint { + GGML_HINT_NONE = 0, + GGML_HINT_SRC0_IS_HADAMARD = 1, + }; + // model file types enum ggml_ftype { GGML_FTYPE_UNKNOWN = -1, @@ -1419,6 +1425,11 @@ extern "C" { struct ggml_tensor * a, enum ggml_prec prec); + // change the hint of a matrix multiplication + GGML_API void ggml_mul_mat_set_hint( + struct ggml_tensor * a, + enum ggml_op_hint hint); + // indirect matrix multiplication GGML_API struct ggml_tensor * ggml_mul_mat_id( struct ggml_context * ctx, @@ -1773,8 +1784,32 @@ extern "C" { int n_dims, int mode); - // custom RoPE + // RoPE operations with extended options + // a is the input tensor to apply RoPE to, shape [n_embd, n_head, n_token] + // b is an int32 vector with size n_token // c is freq factors (e.g. 
phi3-128k), (optional) + // mode can be GGML_ROPE_TYPE_NORMAL or NEOX; for MROPE and VISION mode, use ggml_rope_multi + // + // pseudo-code for computing theta: + // for i in [0, n_dims/2): + // theta[i] = b[i] * powf(freq_base, -2.0 * i / n_dims); + // theta[i] = theta[i] / c[i]; # if c is provided, divide theta by c + // theta[i] = rope_yarn(theta[i], ...); # note: theta = theta * freq_scale is applied here + // + // other params are used by YaRN RoPE scaling, these default values will disable YaRN: + // freq_scale = 1.0f + // ext_factor = 0.0f + // attn_factor = 1.0f + // beta_fast = 0.0f + // beta_slow = 0.0f + // + // example: + // (marking: c = cos, s = sin, 0 = unrotated) + // given a single head with size = 8 --> [00000000] + // GGML_ROPE_TYPE_NORMAL n_dims = 4 --> [cscs0000] + // GGML_ROPE_TYPE_NORMAL n_dims = 8 --> [cscscscs] + // GGML_ROPE_TYPE_NEOX n_dims = 4 --> [ccss0000] + // GGML_ROPE_TYPE_NEOX n_dims = 8 --> [ccccssss] GGML_API struct ggml_tensor * ggml_rope_ext( struct ggml_context * ctx, struct ggml_tensor * a, @@ -1790,6 +1825,36 @@ extern "C" { float beta_fast, float beta_slow); + // multi-dimensional RoPE, for Qwen-VL and similar vision models + // mode can be either VISION, MROPE, IMROPE, cannot be combined with NORMAL or NEOX + // sections specify how many dimensions to rotate in each section: + // section length is equivalent to number of cos/sin pairs, NOT the number of dims + // (i.e. 
sum of 4 sections are expected to be n_dims/2) + // last sections can be 0, means ignored + // all other options are identical to ggml_rope_ext + // + // important note: + // - NEOX ordering is automatically applied and cannot be disabled for MROPE and VISION + // if you need normal ordering, there are 2 methods: + // (1) split the tensor manually using ggml_view + // (2) permute the weight upon conversion + // - for VISION, n_dims must be head_size/2 + // + // example M-RoPE: + // given sections = [t=4, y=2, x=2, 0] + // given a single head with size = 18 --> [000000000000000000] + // GGML_ROPE_TYPE_MROPE n_dims = 16 --> [ttttyyxxttttyyxx00] (cos/sin are applied in NEOX ordering) + // GGML_ROPE_TYPE_IMROPE n_dims = 16 --> [ttyxttyxttyxttyx00] (interleaved M-RoPE, still NEOX ordering) + // note: the theta for each dim is computed the same way as ggml_rope_ext, no matter the section + // in other words, idx used for theta: [0123456789... until n_dims/2], not reset for each section + // + // example vision RoPE: + // given sections = [y=4, x=4, 0, 0] (last 2 sections are ignored) + // given a single head with size = 8 --> [00000000] + // GGML_ROPE_TYPE_VISION n_dims = 4 --> [yyyyxxxx] + // other values of n_dims are untested and is undefined behavior + // note: unlike MROPE, the theta for each dim is computed differently for each section + // in other words, idx used for theta: [0123] for y section, then [0123] for x section GGML_API struct ggml_tensor * ggml_rope_multi( struct ggml_context * ctx, struct ggml_tensor * a, @@ -2476,6 +2541,11 @@ extern "C" { // TODO: add ggml_gated_delta_net_set_bcast() to be able to configure Q, K broadcast type: tiled vs interleaved [TAG_GGML_GDN_BCAST] // ref: https://github.com/ggml-org/llama.cpp/pull/19468#discussion_r2786394306 + // + // state is a 3D tensor of shape (S_v*S_v*H, K, n_seqs): + // K == 1: output carries the final state only. 
+ // K > 1: output carries K snapshot slots; the kernel writes the last min(n_tokens, K) + // per-token snapshots into the trailing slots GGML_API struct ggml_tensor * ggml_gated_delta_net( struct ggml_context * ctx, struct ggml_tensor * q, diff --git a/ggml/llamacpp/third_party/include/llama.h b/ggml/llamacpp/third_party/include/llama.h index ac267b5..75095b2 100644 --- a/ggml/llamacpp/third_party/include/llama.h +++ b/ggml/llamacpp/third_party/include/llama.h @@ -198,6 +198,11 @@ extern "C" { LLAMA_SPLIT_MODE_TENSOR = 3, }; + enum llama_context_type { + LLAMA_CONTEXT_TYPE_DEFAULT = 0, + LLAMA_CONTEXT_TYPE_MTP = 1, + }; + // TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979) typedef struct llama_token_data { llama_token id; // token id @@ -333,9 +338,11 @@ extern "C" { uint32_t n_batch; // logical maximum batch size that can be submitted to llama_decode uint32_t n_ubatch; // physical maximum batch size uint32_t n_seq_max; // max number of sequences (i.e. distinct states for recurrent models) + uint32_t n_rs_seq; // number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL] int32_t n_threads; // number of threads to use for generation int32_t n_threads_batch; // number of threads to use for batch processing + enum llama_context_type ctx_type; // set the context type (e.g. 
MTP) enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type` enum llama_pooling_type pooling_type; // whether to pool (sum) embedding results by sequence id enum llama_attention_type attention_type; // attention type to use for embeddings @@ -511,27 +518,6 @@ extern "C" { // Frees all allocated memory LLAMA_API void llama_free(struct llama_context * ctx); - enum llama_params_fit_status { - LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit - LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit - LLAMA_PARAMS_FIT_STATUS_ERROR = 2, // a hard error occurred, e.g. because no model could be found at the specified path - }; - - // fits mparams and cparams to free device memory (assumes system memory is unlimited) - // - returns true if the parameters could be successfully modified to fit device memory - // - this function is NOT thread safe because it modifies the global llama logger state - // - only parameters that have the same value as in llama_default_model_params are modified - // with the exception of the context size which is modified if and only if equal to 0 - LLAMA_API enum llama_params_fit_status llama_params_fit( - const char * path_model, - struct llama_model_params * mparams, - struct llama_context_params * cparams, - float * tensor_split, // writable buffer for tensor split, needs at least llama_max_devices elements - struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements - size_t * margins, // margins of memory to leave per device in bytes - uint32_t n_ctx_min, // minimum context size to set when trying to reduce memory use - enum ggml_log_level log_level); // minimum log level to print during fitting, lower levels go to debug log - LLAMA_API int64_t llama_time_us(void); LLAMA_API size_t llama_max_devices(void); @@ -551,6 +537,7 @@ 
extern "C" { LLAMA_API uint32_t llama_n_batch (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_ubatch (const struct llama_context * ctx); LLAMA_API uint32_t llama_n_seq_max (const struct llama_context * ctx); + LLAMA_API uint32_t llama_n_rs_seq (const struct llama_context * ctx); DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead"); DEPRECATED(LLAMA_API int32_t llama_n_embd (const struct llama_model * model), "use llama_model_n_embd instead"); @@ -879,12 +866,17 @@ extern "C" { size_t n_token_capacity, size_t * n_token_count_out); +#define LLAMA_STATE_SEQ_FLAGS_NONE 0 + // for backwards-compat #define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1 // work only with partial states, such as SWA KV cache or recurrent cache (e.g. Mamba) #define LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY 1 +// keeps the tensor data on device buffers (i.e. not accessible in host memory, but faster save/load) +#define LLAMA_STATE_SEQ_FLAGS_ON_DEVICE 2 + typedef uint32_t llama_state_seq_flags; LLAMA_API size_t llama_state_seq_get_size_ext( @@ -1546,9 +1538,6 @@ extern "C" { LLAMA_API void llama_perf_sampler_print(const struct llama_sampler * chain); LLAMA_API void llama_perf_sampler_reset( struct llama_sampler * chain); - // print a breakdown of per-device memory use via LLAMA_LOG: - LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx); - // // training // diff --git a/ggml/llamacpp/third_party/include/mtmd-helper.h b/ggml/llamacpp/third_party/include/mtmd-helper.h index 5036b92..57da78a 100644 --- a/ggml/llamacpp/third_party/include/mtmd-helper.h +++ b/ggml/llamacpp/third_party/include/mtmd-helper.h @@ -47,6 +47,10 @@ MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks); // normally, n_pos is equal to n_tokens, but for M-RoPE it is different MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks); +// helper to get the list of relative positions corresponding to 
the embedding tokens, to be used by M-RoPE +// out_pos must have length == mtmd_helper_get_n_tokens(image) +MTMD_API void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * image, llama_pos pos_0, struct mtmd_decoder_pos * out_pos); + // helper function that automatically: // 1. run llama_decode() on text chunks // 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode() diff --git a/ggml/llamacpp/third_party/include/mtmd.h b/ggml/llamacpp/third_party/include/mtmd.h index ebb4a18..54b9515 100644 --- a/ggml/llamacpp/third_party/include/mtmd.h +++ b/ggml/llamacpp/third_party/include/mtmd.h @@ -46,9 +46,6 @@ # define MTMD_API #endif -// deprecated marker, use mtmd_default_marker() instead -#define MTMD_DEFAULT_IMAGE_MARKER "<__image__>" - #ifdef __cplusplus extern "C" { #endif @@ -114,20 +111,21 @@ MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname, MTMD_API void mtmd_free(mtmd_context * ctx); // whether we need to set non-causal mask before llama_decode -MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx); +// if chunk is nullptr, we assume the default case where chunk is an image chunk +MTMD_API bool mtmd_decode_use_non_causal(const mtmd_context * ctx, const mtmd_input_chunk * chunk); // whether the current model use M-RoPE for llama_decode -MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx); +MTMD_API bool mtmd_decode_use_mrope(const mtmd_context * ctx); // whether the current model supports vision input -MTMD_API bool mtmd_support_vision(mtmd_context * ctx); +MTMD_API bool mtmd_support_vision(const mtmd_context * ctx); // whether the current model supports audio input -MTMD_API bool mtmd_support_audio(mtmd_context * ctx); +MTMD_API bool mtmd_support_audio(const mtmd_context * ctx); // get audio sample rate in Hz, for example 16000 for Whisper // return -1 if audio is not supported -MTMD_API int mtmd_get_audio_sample_rate(mtmd_context * ctx); +MTMD_API int mtmd_get_audio_sample_rate(const 
mtmd_context * ctx); // mtmd_bitmap // @@ -185,12 +183,27 @@ MTMD_API void mtmd_input_chunk_free(mtmd_input_chunk * chunk); // the instance will be constructed via mtmd_tokenize() // it will be freed along with mtmd_input_chunk MTMD_API size_t mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); // TODO: deprecate -MTMD_API size_t mtmd_image_tokens_get_nx (const mtmd_image_tokens * image_tokens); -MTMD_API size_t mtmd_image_tokens_get_ny (const mtmd_image_tokens * image_tokens); MTMD_API const char * mtmd_image_tokens_get_id (const mtmd_image_tokens * image_tokens); // TODO: deprecate // number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise) MTMD_API llama_pos mtmd_image_tokens_get_n_pos (const mtmd_image_tokens * image_tokens); // TODO: deprecate +DEPRECATED(MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens), + "use mtmd_image_tokens_get_decoder_pos() instead"); +DEPRECATED(MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens), + "use mtmd_image_tokens_get_decoder_pos() instead"); + +struct mtmd_decoder_pos { + uint32_t t; + uint32_t x; + uint32_t y; + uint32_t z; // unused for now, reserved for future use +}; +// get position for decoder attention, to be used by M-RoPE models +// i is the index of the embedding token, ranging from 0 to mtmd_image_tokens_get_n_tokens() - 1 +// pos_0 is the absolute position of the first token +// return relative position (for example, embedding 0 will have position (0, 0, 0); remember to adjust it to the current absolute position) +MTMD_API struct mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, llama_pos pos_0, size_t i); + // tokenize an input text prompt and a list of bitmaps (images/audio) // the prompt must have the input image marker (default: "<__media__>") in it // the default marker is defined by mtmd_default_marker() @@ -231,6 +244,14 @@ MTMD_API float * 
mtmd_get_output_embd(mtmd_context * ctx); // If this is not called, or NULL is supplied, everything is output on stderr. MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data); +// EXPERIMENTAL API to get mmproj's capabilities without initializing the full context +// This is only intended to be used by llama-server, breaking changes is expected +struct mtmd_caps { + bool inp_vision; + bool inp_audio; +}; +MTMD_API struct mtmd_caps mtmd_get_cap_from_file(const char * mmproj_fname); + ///////////////////////////////////////// // test function, to be used in test-mtmd-c-api.c diff --git a/ggml/llamacpp/third_party/prebuilt/android-arm64/libcpp-httplib.a b/ggml/llamacpp/third_party/prebuilt/android-arm64/libcpp-httplib.a index 7170f50..7a911d7 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/android-arm64/libcpp-httplib.a and b/ggml/llamacpp/third_party/prebuilt/android-arm64/libcpp-httplib.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/android-arm64/libggml-base.a b/ggml/llamacpp/third_party/prebuilt/android-arm64/libggml-base.a index 2e703f0..9f4e570 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/android-arm64/libggml-base.a and b/ggml/llamacpp/third_party/prebuilt/android-arm64/libggml-base.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/android-arm64/libggml-cpu.a b/ggml/llamacpp/third_party/prebuilt/android-arm64/libggml-cpu.a index 017b54f..2af6a19 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/android-arm64/libggml-cpu.a and b/ggml/llamacpp/third_party/prebuilt/android-arm64/libggml-cpu.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/android-arm64/libggml.a b/ggml/llamacpp/third_party/prebuilt/android-arm64/libggml.a index f9ad770..206a9ae 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/android-arm64/libggml.a and b/ggml/llamacpp/third_party/prebuilt/android-arm64/libggml.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/android-arm64/libllama-common-base.a 
b/ggml/llamacpp/third_party/prebuilt/android-arm64/libllama-common-base.a new file mode 100644 index 0000000..91b6cf1 Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/android-arm64/libllama-common-base.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/android-arm64/libcommon.a b/ggml/llamacpp/third_party/prebuilt/android-arm64/libllama-common.a similarity index 66% rename from ggml/llamacpp/third_party/prebuilt/android-arm64/libcommon.a rename to ggml/llamacpp/third_party/prebuilt/android-arm64/libllama-common.a index 6658711..2b2dfd8 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/android-arm64/libcommon.a and b/ggml/llamacpp/third_party/prebuilt/android-arm64/libllama-common.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/android-arm64/libllama-diffusion.a b/ggml/llamacpp/third_party/prebuilt/android-arm64/libllama-diffusion.a new file mode 100644 index 0000000..6ee355d Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/android-arm64/libllama-diffusion.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/android-arm64/libllama-ui.a b/ggml/llamacpp/third_party/prebuilt/android-arm64/libllama-ui.a new file mode 100644 index 0000000..02eb28e Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/android-arm64/libllama-ui.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/android-arm64/libllama.a b/ggml/llamacpp/third_party/prebuilt/android-arm64/libllama.a index 5ab86cb..c6fe540 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/android-arm64/libllama.a and b/ggml/llamacpp/third_party/prebuilt/android-arm64/libllama.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/android-arm64/libmtmd.a b/ggml/llamacpp/third_party/prebuilt/android-arm64/libmtmd.a index b97718a..ea81ce2 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/android-arm64/libmtmd.a and b/ggml/llamacpp/third_party/prebuilt/android-arm64/libmtmd.a differ diff --git 
a/ggml/llamacpp/third_party/prebuilt/android-arm64/libserver-context.a b/ggml/llamacpp/third_party/prebuilt/android-arm64/libserver-context.a index a8fb085..842aff5 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/android-arm64/libserver-context.a and b/ggml/llamacpp/third_party/prebuilt/android-arm64/libserver-context.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libcpp-httplib.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libcpp-httplib.a index 0c57130..2b29ab2 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libcpp-httplib.a and b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libcpp-httplib.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-base.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-base.a index 6c6fddc..c8915dc 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-base.a and b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-base.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-blas.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-blas.a index bbf6a65..91a2591 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-blas.a and b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-blas.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-cpu.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-cpu.a index 4fe09a0..ff452ab 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-cpu.a and b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-cpu.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-metal.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-metal.a index 4a6e268..b6ae635 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-metal.a and b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-metal.a differ diff --git 
a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml.a index f13db70..248b382 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml.a and b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libgguf-model-data.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libgguf-model-data.a index 445e276..8b267b1 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libgguf-model-data.a and b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libgguf-model-data.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama-common-base.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama-common-base.a new file mode 100644 index 0000000..57599d1 Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama-common-base.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libcommon.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama-common.a similarity index 50% rename from ggml/llamacpp/third_party/prebuilt/darwin-amd64/libcommon.a rename to ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama-common.a index c636085..be273b4 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libcommon.a and b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama-common.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama-diffusion.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama-diffusion.a new file mode 100644 index 0000000..52f8732 Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama-diffusion.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama-ui.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama-ui.a new file mode 100644 index 0000000..18b968b Binary files /dev/null and 
b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama-ui.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama.a index d6e5a29..845d176 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama.a and b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libmtmd.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libmtmd.a index eb6ad00..05784b2 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libmtmd.a and b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libmtmd.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libserver-context.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libserver-context.a index 01c4738..51f6aa8 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libserver-context.a and b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libserver-context.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libcpp-httplib.a b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libcpp-httplib.a new file mode 100644 index 0000000..7e9eba2 Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libcpp-httplib.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libggml-base.a b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libggml-base.a new file mode 100644 index 0000000..7fabd54 Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libggml-base.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libggml-blas.a b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libggml-blas.a new file mode 100644 index 0000000..9626f00 Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libggml-blas.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libggml-cpu.a 
b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libggml-cpu.a new file mode 100644 index 0000000..4f2b53f Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libggml-cpu.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libggml-metal.a b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libggml-metal.a new file mode 100644 index 0000000..5b2052a Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libggml-metal.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libggml.a b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libggml.a new file mode 100644 index 0000000..f057c51 Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libggml.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libllama-common-base.a b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libllama-common-base.a new file mode 100644 index 0000000..0eed854 Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libllama-common-base.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libllama-common.a b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libllama-common.a new file mode 100644 index 0000000..b746bfd Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libllama-common.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libllama-ui.a b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libllama-ui.a new file mode 100644 index 0000000..e1c026f Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libllama-ui.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libllama.a b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libllama.a new file mode 100644 index 0000000..3e4ff66 Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libllama.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libmtmd.a 
b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libmtmd.a new file mode 100644 index 0000000..151b516 Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libmtmd.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libserver-context.a b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libserver-context.a new file mode 100644 index 0000000..6b84e60 Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libserver-context.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libcpp-httplib.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libcpp-httplib.a index e1fefa7..9deb533 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libcpp-httplib.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libcpp-httplib.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-base.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-base.a index 94e4ce7..cf26741 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-base.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-base.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-cpu.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-cpu.a index ff49e8f..57e3b66 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-cpu.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-cpu.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a index ae671c5..65400f0 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml.a 
b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml.a index 249cdb8..ed5dea2 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama-common-base.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama-common-base.a new file mode 100644 index 0000000..fb44406 Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama-common-base.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libcommon.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama-common.a similarity index 57% rename from ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libcommon.a rename to ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama-common.a index f6f06ff..16631a5 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libcommon.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama-common.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama-diffusion.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama-diffusion.a new file mode 100644 index 0000000..9263253 Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama-diffusion.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama-ui.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama-ui.a new file mode 100644 index 0000000..ce03eaf Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama-ui.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama.a index 3379693..55718cf 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama.a and 
b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libmtmd.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libmtmd.a index 6cd24ec..57b74cd 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libmtmd.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libmtmd.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libserver-context.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libserver-context.a index dbeb59d..cf77963 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libserver-context.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libserver-context.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libcpp-httplib.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libcpp-httplib.a index 9c5a1ac..9deb533 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libcpp-httplib.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libcpp-httplib.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml-base.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml-base.a index ba99cb7..cf26741 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml-base.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml-base.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml-cpu.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml-cpu.a index da2df04..57e3b66 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml-cpu.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml-cpu.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml-vulkan.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml-vulkan.a index 88a9ac4..6fddb02 100644 Binary 
files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml-vulkan.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml-vulkan.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml.a index 72fc96e..6dc3907 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama-common-base.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama-common-base.a new file mode 100644 index 0000000..fb44406 Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama-common-base.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libcommon.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama-common.a similarity index 57% rename from ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libcommon.a rename to ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama-common.a index a8468fe..16631a5 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libcommon.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama-common.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama-diffusion.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama-diffusion.a new file mode 100644 index 0000000..9263253 Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama-diffusion.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama-ui.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama-ui.a new file mode 100644 index 0000000..ce03eaf Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama-ui.a differ diff --git 
a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama.a index 9da3eae..55718cf 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libmtmd.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libmtmd.a index 6ecf783..57b74cd 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libmtmd.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libmtmd.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libserver-context.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libserver-context.a index 213ce33..cf77963 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libserver-context.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libserver-context.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libcpp-httplib.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libcpp-httplib.a index f8f5d13..4d0a0d6 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libcpp-httplib.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libcpp-httplib.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libggml-base.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libggml-base.a index bd61e10..2b01df0 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libggml-base.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libggml-base.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libggml-cpu.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libggml-cpu.a index e690af0..947396e 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libggml-cpu.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libggml-cpu.a differ diff --git 
a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libggml.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libggml.a index ff7fa77..9d4e8f0 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libggml.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libggml.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama-common-base.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama-common-base.a new file mode 100644 index 0000000..fb7b545 Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama-common-base.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libcommon.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama-common.a similarity index 63% rename from ggml/llamacpp/third_party/prebuilt/linux-amd64/libcommon.a rename to ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama-common.a index aeceb6b..e366b95 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libcommon.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama-common.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama-diffusion.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama-diffusion.a new file mode 100644 index 0000000..d94553f Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama-diffusion.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama-ui.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama-ui.a new file mode 100644 index 0000000..869028f Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama-ui.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama.a index 1ed1847..e0cac12 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama.a differ diff --git 
a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libmtmd.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libmtmd.a index 0792c2c..dbe3eae 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libmtmd.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libmtmd.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libserver-context.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libserver-context.a index 8b56512..17e7aaf 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libserver-context.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libserver-context.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libcpp-httplib.a b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libcpp-httplib.a index 0f0a2e3..1db6a89 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libcpp-httplib.a and b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libcpp-httplib.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libggml-base.a b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libggml-base.a index d7498bd..c70452b 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libggml-base.a and b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libggml-base.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libggml-cpu.a b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libggml-cpu.a index 53465bf..129bff7 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libggml-cpu.a and b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libggml-cpu.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libggml.a b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libggml.a index a3f95e8..bc80da1 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libggml.a and b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libggml.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama-common-base.a 
b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama-common-base.a new file mode 100644 index 0000000..54d45f2 Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama-common-base.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libcommon.a b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama-common.a similarity index 51% rename from ggml/llamacpp/third_party/prebuilt/linux-arm64/libcommon.a rename to ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama-common.a index e8a6ab9..18957c5 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libcommon.a and b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama-common.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama-diffusion.a b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama-diffusion.a new file mode 100644 index 0000000..4e342b1 Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama-diffusion.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama-ui.a b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama-ui.a new file mode 100644 index 0000000..e86fb73 Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama-ui.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama.a b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama.a index 945138f..ee22667 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama.a and b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libmtmd.a b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libmtmd.a index b23259b..0d25011 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libmtmd.a and b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libmtmd.a differ diff --git a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libserver-context.a 
b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libserver-context.a index 0cb0b6e..65897a4 100644 Binary files a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libserver-context.a and b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libserver-context.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libcommon.a b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libcommon.a index c58565f..9992462 100644 Binary files a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libcommon.a and b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libcommon.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-base.a b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-base.a index 3a51b19..c0bdda3 100644 Binary files a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-base.a and b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-base.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-blas.a b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-blas.a index a1a690e..26e4777 100644 Binary files a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-blas.a and b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-blas.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-cpu.a b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-cpu.a index b78e00f..1674f5c 100644 Binary files a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-cpu.a and b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-cpu.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-metal.a b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-metal.a index f2434b6..b2d6730 100644 Binary files a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-metal.a and b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-metal.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml.a 
b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml.a index ed939f4..fef498c 100644 Binary files a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml.a and b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libwhisper.a b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libwhisper.a index 4ff1a6b..a1afac6 100644 Binary files a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libwhisper.a and b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libwhisper.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libcommon.a b/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libcommon.a new file mode 100644 index 0000000..a438119 Binary files /dev/null and b/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libcommon.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libggml-base.a b/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libggml-base.a new file mode 100644 index 0000000..7c89ec0 Binary files /dev/null and b/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libggml-base.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libggml-blas.a b/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libggml-blas.a new file mode 100644 index 0000000..fdf718a Binary files /dev/null and b/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libggml-blas.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libggml-cpu.a b/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libggml-cpu.a new file mode 100644 index 0000000..03dcc94 Binary files /dev/null and b/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libggml-cpu.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libggml-metal.a b/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libggml-metal.a new file mode 100644 index 0000000..ec7fb4a Binary files /dev/null and b/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libggml-metal.a 
differ diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libggml.a b/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libggml.a new file mode 100644 index 0000000..b04cacd Binary files /dev/null and b/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libggml.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libwhisper.a b/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libwhisper.a new file mode 100644 index 0000000..cdb2d26 Binary files /dev/null and b/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libwhisper.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libcommon.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libcommon.a index 61aa619..0f941af 100644 Binary files a/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libcommon.a and b/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libcommon.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-base.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-base.a index 4207df2..d343ff2 100644 Binary files a/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-base.a and b/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-base.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-cpu.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-cpu.a index dbaf36f..568975d 100644 Binary files a/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-cpu.a and b/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-cpu.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a index c4e6da5..01d0914 100644 Binary files a/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a and b/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a differ diff --git 
a/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml.a index 8db5376..50b3c7c 100644 Binary files a/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml.a and b/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libwhisper.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libwhisper.a index a0faa0f..ba57e1c 100644 Binary files a/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libwhisper.a and b/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libwhisper.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libcommon.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libcommon.a index 79bf1cb..0f941af 100644 Binary files a/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libcommon.a and b/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libcommon.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml-base.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml-base.a index 6312e47..d343ff2 100644 Binary files a/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml-base.a and b/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml-base.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml-cpu.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml-cpu.a index a6adb24..568975d 100644 Binary files a/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml-cpu.a and b/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml-cpu.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml-vulkan.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml-vulkan.a index c93f533..4ddf15c 100644 Binary files a/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml-vulkan.a and 
b/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml-vulkan.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml.a index 00e15d4..a3918c7 100644 Binary files a/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml.a and b/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libwhisper.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libwhisper.a index babf534..2e5cad8 100644 Binary files a/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libwhisper.a and b/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libwhisper.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64/libggml-cpu.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64/libggml-cpu.a index d6395a7..23844ab 100644 Binary files a/ggml/whispercpp/third_party/prebuilt/linux-amd64/libggml-cpu.a and b/ggml/whispercpp/third_party/prebuilt/linux-amd64/libggml-cpu.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/linux-arm64/libcommon.a b/ggml/whispercpp/third_party/prebuilt/linux-arm64/libcommon.a index 94ed338..97f3889 100644 Binary files a/ggml/whispercpp/third_party/prebuilt/linux-arm64/libcommon.a and b/ggml/whispercpp/third_party/prebuilt/linux-arm64/libcommon.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/linux-arm64/libggml-base.a b/ggml/whispercpp/third_party/prebuilt/linux-arm64/libggml-base.a index 38cf53b..6d03128 100644 Binary files a/ggml/whispercpp/third_party/prebuilt/linux-arm64/libggml-base.a and b/ggml/whispercpp/third_party/prebuilt/linux-arm64/libggml-base.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/linux-arm64/libggml-cpu.a b/ggml/whispercpp/third_party/prebuilt/linux-arm64/libggml-cpu.a index a37112a..38373aa 100644 Binary files a/ggml/whispercpp/third_party/prebuilt/linux-arm64/libggml-cpu.a and 
b/ggml/whispercpp/third_party/prebuilt/linux-arm64/libggml-cpu.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/linux-arm64/libggml.a b/ggml/whispercpp/third_party/prebuilt/linux-arm64/libggml.a index 39a9565..218ce14 100644 Binary files a/ggml/whispercpp/third_party/prebuilt/linux-arm64/libggml.a and b/ggml/whispercpp/third_party/prebuilt/linux-arm64/libggml.a differ diff --git a/ggml/whispercpp/third_party/prebuilt/linux-arm64/libwhisper.a b/ggml/whispercpp/third_party/prebuilt/linux-arm64/libwhisper.a index 453f978..c5537de 100644 Binary files a/ggml/whispercpp/third_party/prebuilt/linux-arm64/libwhisper.a and b/ggml/whispercpp/third_party/prebuilt/linux-arm64/libwhisper.a differ diff --git a/version.go b/version.go index aafe579..df49375 100644 --- a/version.go +++ b/version.go @@ -2,6 +2,6 @@ package gonativeml const ( Version = "v0.1.7" - LlamaCppVersion = "b8772" + LlamaCppVersion = "b9222" WhisperCppVersion = "v1.8.3" )