diff --git a/Dockerfile.android b/Dockerfile.android
index dcdbcfa..16f9bdb 100644
--- a/Dockerfile.android
+++ b/Dockerfile.android
@@ -92,7 +92,9 @@ RUN mkdir -p /out/llama.cpp/android-arm64 /out/llama.cpp/include /out/llama.cpp/
     cp llama-src/include/*.h /out/llama.cpp/include/ && \
     cp llama-src/ggml/include/*.h /out/llama.cpp/ggml/include/ && \
     cp llama-src/common/common.h /out/llama.cpp/common/ && \
-    cp llama-src/common/sampling.h /out/llama.cpp/common/
+    cp llama-src/common/sampling.h /out/llama.cpp/common/ && \
+    cp llama-src/tools/mtmd/mtmd.h /out/llama.cpp/include/ && \
+    cp llama-src/tools/mtmd/mtmd-helper.h /out/llama.cpp/include/
 
 # Collect whisper.cpp artifacts and strip debug symbols
 RUN mkdir -p /out/whisper.cpp/android-arm64 /out/whisper.cpp/include /out/whisper.cpp/ggml/include && \
diff --git a/Dockerfile.libs b/Dockerfile.libs
index ac82e85..febe06b 100644
--- a/Dockerfile.libs
+++ b/Dockerfile.libs
@@ -53,6 +53,8 @@ RUN WHISPER_VERSION=$(go run ./cmd/versioncmd whisper.cpp) && \
 # ============================================================================
 FROM golang:1.24-bookworm AS builder-cpu
 
+ARG ARCH_SUFFIX=linux-amd64
+
 RUN apt-get update && apt-get install -y --no-install-recommends \
     build-essential cmake && \
     rm -rf /var/lib/apt/lists/*
@@ -72,18 +74,20 @@ RUN cd whisper-src && \
     cmake --build build --config Release -j$(nproc)
 
 # Collect llama.cpp artifacts and strip debug symbols
-RUN mkdir -p /out/llama.cpp/linux-amd64 /out/llama.cpp/include /out/llama.cpp/ggml/include /out/llama.cpp/common && \
-    find llama-src/build -name "*.a" -exec cp {} /out/llama.cpp/linux-amd64/ \; && \
-    find /out/llama.cpp/linux-amd64 -name "*.a" -exec strip --strip-debug {} \; && \
+RUN mkdir -p /out/llama.cpp/${ARCH_SUFFIX} /out/llama.cpp/include /out/llama.cpp/ggml/include /out/llama.cpp/common && \
+    find llama-src/build -name "*.a" -exec cp {} /out/llama.cpp/${ARCH_SUFFIX}/ \; && \
+    find /out/llama.cpp/${ARCH_SUFFIX} -name "*.a" -exec strip --strip-debug {} \; && \
     cp llama-src/include/*.h /out/llama.cpp/include/ && \
     cp llama-src/ggml/include/*.h /out/llama.cpp/ggml/include/ && \
     cp llama-src/common/common.h /out/llama.cpp/common/ && \
-    cp llama-src/common/sampling.h /out/llama.cpp/common/
+    cp llama-src/common/sampling.h /out/llama.cpp/common/ && \
+    cp llama-src/tools/mtmd/mtmd.h /out/llama.cpp/include/ && \
+    cp llama-src/tools/mtmd/mtmd-helper.h /out/llama.cpp/include/
 
 # Collect whisper.cpp artifacts and strip debug symbols
-RUN mkdir -p /out/whisper.cpp/linux-amd64 /out/whisper.cpp/include /out/whisper.cpp/ggml/include && \
-    find whisper-src/build -name "*.a" -exec cp {} /out/whisper.cpp/linux-amd64/ \; && \
-    find /out/whisper.cpp/linux-amd64 -name "*.a" -exec strip --strip-debug {} \; && \
+RUN mkdir -p /out/whisper.cpp/${ARCH_SUFFIX} /out/whisper.cpp/include /out/whisper.cpp/ggml/include && \
+    find whisper-src/build -name "*.a" -exec cp {} /out/whisper.cpp/${ARCH_SUFFIX}/ \; && \
+    find /out/whisper.cpp/${ARCH_SUFFIX} -name "*.a" -exec strip --strip-debug {} \; && \
     cp whisper-src/include/*.h /out/whisper.cpp/include/ && \
     cp whisper-src/ggml/include/*.h /out/whisper.cpp/ggml/include/
 
@@ -92,9 +96,15 @@ RUN mkdir -p /out/whisper.cpp/linux-amd64 /out/whisper.cpp/include /out/whisper.
 # ============================================================================
 FROM nvidia/cuda:12.8.0-devel-ubuntu24.04 AS builder-cuda
 
+# gcc-12: gcc-13 hits an internal-compiler-error on fattn-mma-f16 templates
+# (cfgcleanup.cc:580 try_forward_edges) — reproducible at any -j on b9222.
 RUN apt-get update && apt-get install -y --no-install-recommends \
-    build-essential cmake wget && \
-    rm -rf /var/lib/apt/lists/*
+    build-essential cmake wget gcc-12 g++-12 && \
+    rm -rf /var/lib/apt/lists/* && \
+    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 100 && \
+    update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-12 100
+
+ENV CC=gcc-12 CXX=g++-12
 
 WORKDIR /src
 COPY --from=sources /src/llama-src llama-src
@@ -117,7 +127,9 @@ RUN mkdir -p /out/llama.cpp/linux-amd64-cuda /out/llama.cpp/include /out/llama.c
     cp llama-src/include/*.h /out/llama.cpp/include/ && \
     cp llama-src/ggml/include/*.h /out/llama.cpp/ggml/include/ && \
     cp llama-src/common/common.h /out/llama.cpp/common/ && \
-    cp llama-src/common/sampling.h /out/llama.cpp/common/
+    cp llama-src/common/sampling.h /out/llama.cpp/common/ && \
+    cp llama-src/tools/mtmd/mtmd.h /out/llama.cpp/include/ && \
+    cp llama-src/tools/mtmd/mtmd-helper.h /out/llama.cpp/include/
 
 # Collect whisper.cpp artifacts (CUDA variant) and strip debug symbols
 RUN mkdir -p /out/whisper.cpp/linux-amd64-cuda /out/whisper.cpp/include /out/whisper.cpp/ggml/include && \
@@ -133,9 +145,16 @@ RUN mkdir -p /out/whisper.cpp/linux-amd64-cuda /out/whisper.cpp/include /out/whi
 # (llama.cpp b8220+ needs VK_EXT_layer_settings from Vulkan 1.3.261+)
 FROM ubuntu:24.04 AS builder-vulkan
 
+# - spirv-headers: required since llama.cpp b9000+ (ggml-vulkan calls find_package(SPIRV-Headers))
+# - gcc-12: gcc-13 ICEs on common/arg.cpp.o under load
 RUN apt-get update && apt-get install -y --no-install-recommends \
-    build-essential cmake wget ca-certificates libvulkan-dev glslang-tools glslc && \
-    rm -rf /var/lib/apt/lists/*
+    build-essential cmake wget ca-certificates libvulkan-dev glslang-tools glslc \
+    spirv-headers gcc-12 g++-12 && \
+    rm -rf /var/lib/apt/lists/* && \
+    update-alternatives --install /usr/bin/gcc gcc /usr/bin/gcc-12 100 && \
+    update-alternatives --install /usr/bin/g++ g++ /usr/bin/g++-12 100
+
+ENV CC=gcc-12 CXX=g++-12
 
 WORKDIR /src
 COPY --from=sources /src/llama-src llama-src
@@ -158,7 +177,9 @@ RUN mkdir -p /out/llama.cpp/linux-amd64-vulkan /out/llama.cpp/include /out/llama
     cp llama-src/include/*.h /out/llama.cpp/include/ && \
     cp llama-src/ggml/include/*.h /out/llama.cpp/ggml/include/ && \
     cp llama-src/common/common.h /out/llama.cpp/common/ && \
-    cp llama-src/common/sampling.h /out/llama.cpp/common/
+    cp llama-src/common/sampling.h /out/llama.cpp/common/ && \
+    cp llama-src/tools/mtmd/mtmd.h /out/llama.cpp/include/ && \
+    cp llama-src/tools/mtmd/mtmd-helper.h /out/llama.cpp/include/
 
 # Collect whisper.cpp artifacts (Vulkan variant) and strip debug symbols
 RUN mkdir -p /out/whisper.cpp/linux-amd64-vulkan /out/whisper.cpp/include /out/whisper.cpp/ggml/include && \
diff --git a/Dockerfile.libs-arm64 b/Dockerfile.libs-arm64
new file mode 100644
index 0000000..770da69
--- /dev/null
+++ b/Dockerfile.libs-arm64
@@ -0,0 +1,98 @@
+# Dockerfile.libs-arm64 — cross-compile linux-arm64 static libraries
+#
+# Uses aarch64-linux-gnu gcc/g++ toolchain on an amd64 host. No qemu/binfmt
+# required, which makes it work inside unprivileged LXC where binfmt_misc
+# is read-only.
+#
+# Usage:
+#   docker build -f Dockerfile.libs-arm64 -o ./out .
+
+# ============================================================================
+# Stage: Download sources (shared)
+# ============================================================================
+FROM golang:1.24-bookworm AS sources
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    wget && \
+    rm -rf /var/lib/apt/lists/*
+
+WORKDIR /src
+
+COPY go.mod ./
+COPY version.go ./
+COPY cmd/versioncmd/ ./cmd/versioncmd/
+
+RUN LLAMA_VERSION=$(go run ./cmd/versioncmd llama.cpp) && \
+    echo "Downloading llama.cpp ${LLAMA_VERSION}..." && \
+    wget -qO llama.cpp.tar.gz "https://github.com/ggerganov/llama.cpp/archive/refs/tags/${LLAMA_VERSION}.tar.gz" && \
+    mkdir -p llama-src && \
+    tar xzf llama.cpp.tar.gz --strip-components=1 -C llama-src && \
+    rm llama.cpp.tar.gz
+
+RUN WHISPER_VERSION=$(go run ./cmd/versioncmd whisper.cpp) && \
+    echo "Downloading whisper.cpp ${WHISPER_VERSION}..." && \
+    wget -qO whisper.cpp.tar.gz "https://github.com/ggerganov/whisper.cpp/archive/refs/tags/${WHISPER_VERSION}.tar.gz" && \
+    mkdir -p whisper-src && \
+    tar xzf whisper.cpp.tar.gz --strip-components=1 -C whisper-src && \
+    rm whisper.cpp.tar.gz
+
+# ============================================================================
+# Builder: cross-compile to aarch64-linux-gnu
+# ============================================================================
+FROM debian:bookworm AS builder
+
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    build-essential cmake \
+    crossbuild-essential-arm64 && \
+    rm -rf /var/lib/apt/lists/*
+
+WORKDIR /src
+COPY --from=sources /src/llama-src llama-src
+COPY --from=sources /src/whisper-src whisper-src
+
+RUN printf '%s\n' \
+    'set(CMAKE_SYSTEM_NAME Linux)' \
+    'set(CMAKE_SYSTEM_PROCESSOR aarch64)' \
+    'set(CMAKE_C_COMPILER aarch64-linux-gnu-gcc)' \
+    'set(CMAKE_CXX_COMPILER aarch64-linux-gnu-g++)' \
+    'set(CMAKE_FIND_ROOT_PATH_MODE_PROGRAM NEVER)' \
+    'set(CMAKE_FIND_ROOT_PATH_MODE_LIBRARY ONLY)' \
+    'set(CMAKE_FIND_ROOT_PATH_MODE_INCLUDE ONLY)' \
+    > /src/aarch64-toolchain.cmake
+
+# Build llama.cpp (aarch64)
+RUN cd llama-src && \
+    cmake -B build \
+        -DCMAKE_TOOLCHAIN_FILE=/src/aarch64-toolchain.cmake \
+        -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF \
+        -DGGML_NATIVE=OFF && \
+    cmake --build build --config Release -j$(nproc)
+
+# Build whisper.cpp (aarch64)
+RUN cd whisper-src && \
+    cmake -B build \
+        -DCMAKE_TOOLCHAIN_FILE=/src/aarch64-toolchain.cmake \
+        -DCMAKE_BUILD_TYPE=Release -DBUILD_SHARED_LIBS=OFF \
+        -DGGML_NATIVE=OFF && \
+    cmake --build build --config Release -j$(nproc)
+
+# Collect llama.cpp artifacts and strip debug symbols
+RUN mkdir -p /out/llama.cpp/linux-arm64 /out/llama.cpp/include /out/llama.cpp/ggml/include /out/llama.cpp/common && \
+    find llama-src/build -name "*.a" -exec cp {} /out/llama.cpp/linux-arm64/ \; && \
+    find /out/llama.cpp/linux-arm64 -name "*.a" -exec aarch64-linux-gnu-strip --strip-debug {} \; && \
+    cp llama-src/include/*.h /out/llama.cpp/include/ && \
+    cp llama-src/ggml/include/*.h /out/llama.cpp/ggml/include/ && \
+    cp llama-src/common/common.h /out/llama.cpp/common/ && \
+    cp llama-src/common/sampling.h /out/llama.cpp/common/ && \
+    cp llama-src/tools/mtmd/mtmd.h /out/llama.cpp/include/ && \
+    cp llama-src/tools/mtmd/mtmd-helper.h /out/llama.cpp/include/
+
+# Collect whisper.cpp artifacts and strip debug symbols
+RUN mkdir -p /out/whisper.cpp/linux-arm64 /out/whisper.cpp/include /out/whisper.cpp/ggml/include && \
+    find whisper-src/build -name "*.a" -exec cp {} /out/whisper.cpp/linux-arm64/ \; && \
+    find /out/whisper.cpp/linux-arm64 -name "*.a" -exec aarch64-linux-gnu-strip --strip-debug {} \; && \
+    cp whisper-src/include/*.h /out/whisper.cpp/include/ && \
+    cp whisper-src/ggml/include/*.h /out/whisper.cpp/ggml/include/
+
+FROM scratch
+COPY --from=builder /out/ /
diff --git a/Makefile b/Makefile
index 1bf8439..e61a158 100644
--- a/Makefile
+++ b/Makefile
@@ -13,6 +13,7 @@
 #   make build-libs-linux-cpu         # Build linux-amd64 CPU only
 #   make build-libs-linux-cuda        # Build linux-amd64 CUDA only
 #   make build-libs-linux-vulkan      # Build linux-amd64 Vulkan only
+#   make build-libs-linux-arm64       # Build linux-arm64 (CPU) via cross-compilation
 #   make build-libs-android           # Build android-arm64 via NDK
 #   make build-libs-all               # Build native + all linux + android
 #   make clean                        # Remove temp build dirs (keeps prebuilt .a + headers)
@@ -38,7 +39,7 @@ WHISPER_PREBUILT := $(WHISPER_THIRD_PARTY)/prebuilt/$(PLATFORM)
 
 .PHONY: build-libs build-libs-llama build-libs-whisper \
        build-libs-linux build-libs-linux-cpu build-libs-linux-cuda build-libs-linux-vulkan \
-       build-libs-android build-libs-all clean verify
+       build-libs-linux-arm64 build-libs-android build-libs-all clean verify
 
 build-libs: build-libs-llama build-libs-whisper
 
@@ -138,6 +139,29 @@ build-libs-linux-cuda:
 build-libs-linux-vulkan:
 	$(call build-linux-variant,vulkan,-vulkan)
 
+# ============================================================================
+# Docker build for linux-arm64 (cross-compile via aarch64-linux-gnu toolchain)
+# ============================================================================
+# Uses Dockerfile.libs-arm64 which runs aarch64 gcc/g++ inside an amd64
+# container. No qemu/binfmt needed (works in unprivileged LXC).
+build-libs-linux-arm64:
+	@echo "==> Building linux-arm64 static libraries via Docker (cross-compile)..."
+	docker build -f Dockerfile.libs-arm64 -o ./out .
+	@mkdir -p $(LLAMA_THIRD_PARTY)/prebuilt/linux-arm64
+	cp out/llama.cpp/linux-arm64/*.a $(LLAMA_THIRD_PARTY)/prebuilt/linux-arm64/
+	@mkdir -p $(LLAMA_THIRD_PARTY)/include $(LLAMA_THIRD_PARTY)/ggml/include $(LLAMA_THIRD_PARTY)/common
+	cp out/llama.cpp/include/*.h $(LLAMA_THIRD_PARTY)/include/
+	cp out/llama.cpp/ggml/include/*.h $(LLAMA_THIRD_PARTY)/ggml/include/
+	cp out/llama.cpp/common/common.h $(LLAMA_THIRD_PARTY)/common/
+	cp out/llama.cpp/common/sampling.h $(LLAMA_THIRD_PARTY)/common/
+	@mkdir -p $(WHISPER_THIRD_PARTY)/prebuilt/linux-arm64
+	cp out/whisper.cpp/linux-arm64/*.a $(WHISPER_THIRD_PARTY)/prebuilt/linux-arm64/
+	@mkdir -p $(WHISPER_THIRD_PARTY)/include $(WHISPER_THIRD_PARTY)/ggml/include
+	cp out/whisper.cpp/include/*.h $(WHISPER_THIRD_PARTY)/include/
+	cp out/whisper.cpp/ggml/include/*.h $(WHISPER_THIRD_PARTY)/ggml/include/
+	rm -rf out
+	@echo "==> linux-arm64 libraries ready"
+
 # ============================================================================
 # Docker build for android-arm64 (cross-compile via Android NDK)
 # ============================================================================
diff --git a/ggml/llamacpp/llamacpp.go b/ggml/llamacpp/llamacpp.go
index ba0eb50..04c7371 100644
--- a/ggml/llamacpp/llamacpp.go
+++ b/ggml/llamacpp/llamacpp.go
@@ -12,7 +12,7 @@ package llamacpp
 #cgo CXXFLAGS: -std=c++17 -I${SRCDIR}/third_party/include -I${SRCDIR}/third_party/ggml/include -I${SRCDIR}/third_party/common
 #cgo darwin,arm64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/darwin-arm64
 #cgo darwin,amd64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/darwin-amd64
-#cgo darwin LDFLAGS: -lmtmd -lcommon -lllama -lggml-cpu -lggml-base -lggml -lggml-blas -lggml-metal -L/usr/local/opt/libomp/lib -L/opt/homebrew/opt/libomp/lib -lomp -framework Accelerate -framework Metal -framework Foundation -lstdc++ -lm
+#cgo darwin LDFLAGS: -lmtmd -lllama-common -lllama-common-base -lllama -lggml-cpu -lggml-base -lggml -lggml-blas -lggml-metal -L/usr/local/opt/libomp/lib -L/opt/homebrew/opt/libomp/lib -lomp -framework Accelerate -framework Metal -framework Foundation -lstdc++ -lm
 #include <stdlib.h>
 #include <stdbool.h>
 #include "wrapper.h"
diff --git a/ggml/llamacpp/llamacpp_android.go b/ggml/llamacpp/llamacpp_android.go
index f1b95b2..1d835c5 100644
--- a/ggml/llamacpp/llamacpp_android.go
+++ b/ggml/llamacpp/llamacpp_android.go
@@ -7,6 +7,6 @@ package llamacpp
 
 /*
 #cgo android,arm64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/android-arm64
-#cgo android LDFLAGS: -Wl,--start-group -lmtmd -lcommon -lllama -lggml-cpu -lggml-base -lggml -Wl,--end-group -lstdc++ -lm -ldl -llog
+#cgo android LDFLAGS: -Wl,--start-group -lmtmd -lllama-common -lllama-common-base -lllama -lggml-cpu -lggml-base -lggml -Wl,--end-group -lstdc++ -lm -ldl -llog
 */
 import "C"
diff --git a/ggml/llamacpp/llamacpp_linux.go b/ggml/llamacpp/llamacpp_linux.go
index a35551a..d002eb6 100644
--- a/ggml/llamacpp/llamacpp_linux.go
+++ b/ggml/llamacpp/llamacpp_linux.go
@@ -8,6 +8,6 @@ package llamacpp
 /*
 #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/linux-amd64
 #cgo linux,arm64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/linux-arm64
-#cgo linux LDFLAGS: -Wl,--start-group -lmtmd -lcommon -lllama -lggml-cpu -lggml-base -lggml -Wl,--end-group -lstdc++ -lm -lpthread -ldl -lrt -lgomp
+#cgo linux LDFLAGS: -Wl,--start-group -lmtmd -lllama-common -lllama-common-base -lllama -lggml-cpu -lggml-base -lggml -Wl,--end-group -lstdc++ -lm -lpthread -ldl -lrt -lgomp
 */
 import "C"
diff --git a/ggml/llamacpp/llamacpp_linux_cuda.go b/ggml/llamacpp/llamacpp_linux_cuda.go
index d7c8fd5..ea3147e 100644
--- a/ggml/llamacpp/llamacpp_linux_cuda.go
+++ b/ggml/llamacpp/llamacpp_linux_cuda.go
@@ -7,6 +7,6 @@ package llamacpp
 
 /*
 #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/linux-amd64-cuda
-#cgo linux LDFLAGS: -Wl,--start-group -lmtmd -lcommon -lllama -lggml-cpu -lggml-base -lggml -lggml-cuda -Wl,--end-group -lcuda -lcudart -lcublas -lcublasLt -lstdc++ -lm -lpthread -ldl -lrt -lgomp
+#cgo linux LDFLAGS: -Wl,--start-group -lmtmd -lllama-common -lllama-common-base -lllama -lggml-cpu -lggml-base -lggml -lggml-cuda -Wl,--end-group -lcuda -lcudart -lcublas -lcublasLt -lstdc++ -lm -lpthread -ldl -lrt -lgomp
 */
 import "C"
diff --git a/ggml/llamacpp/llamacpp_linux_vulkan.go b/ggml/llamacpp/llamacpp_linux_vulkan.go
index 97fea4b..003303e 100644
--- a/ggml/llamacpp/llamacpp_linux_vulkan.go
+++ b/ggml/llamacpp/llamacpp_linux_vulkan.go
@@ -7,6 +7,6 @@ package llamacpp
 
 /*
 #cgo linux,amd64 LDFLAGS: -L${SRCDIR}/third_party/prebuilt/linux-amd64-vulkan
-#cgo linux LDFLAGS: -Wl,--start-group -lmtmd -lcommon -lllama -lggml-cpu -lggml-base -lggml -lggml-vulkan -Wl,--end-group -lvulkan -lstdc++ -lm -lpthread -ldl -lrt -lgomp
+#cgo linux LDFLAGS: -Wl,--start-group -lmtmd -lllama-common -lllama-common-base -lllama -lggml-cpu -lggml-base -lggml -lggml-vulkan -Wl,--end-group -lvulkan -lstdc++ -lm -lpthread -ldl -lrt -lgomp
 */
 import "C"
diff --git a/ggml/llamacpp/third_party/common/common.h b/ggml/llamacpp/third_party/common/common.h
index 020b6a7..1d3d788 100644
--- a/ggml/llamacpp/third_party/common/common.h
+++ b/ggml/llamacpp/third_party/common/common.h
@@ -2,17 +2,18 @@
 
 #pragma once
 
+#include "llama-cpp.h"
+
 #include "ggml-opt.h"
 #include "ggml.h"
-#include "llama-cpp.h"
 
 #include <set>
 #include <sstream>
 #include <string>
 #include <string_view>
-#include <variant>
 #include <vector>
 #include <map>
+#include <algorithm>
 
 #if defined(_WIN32) && !defined(_WIN32_WINNT)
 #define _WIN32_WINNT 0x0A00
@@ -27,11 +28,6 @@
 #define die(msg)          do { fputs("error: " msg "\n", stderr);                exit(1); } while (0)
 #define die_fmt(fmt, ...) do { fprintf(stderr, "error: " fmt "\n", __VA_ARGS__); exit(1); } while (0)
 
-#define print_build_info() do {                                                                     \
-    fprintf(stderr, "%s: build = %d (%s)\n",      __func__, LLAMA_BUILD_NUMBER, LLAMA_COMMIT);      \
-    fprintf(stderr, "%s: built with %s for %s\n", __func__, LLAMA_COMPILER, LLAMA_BUILD_TARGET);    \
-} while(0)
-
 struct common_time_meas {
     common_time_meas(int64_t & t_acc, bool disable = false);
     ~common_time_meas();
@@ -53,21 +49,13 @@ struct common_adapter_lora_info {
 
 using llama_tokens = std::vector<llama_token>;
 
-// build info
-extern int LLAMA_BUILD_NUMBER;
-extern const char * LLAMA_COMMIT;
-extern const char * LLAMA_COMPILER;
-extern const char * LLAMA_BUILD_TARGET;
-
-const static std::string build_info("b" + std::to_string(LLAMA_BUILD_NUMBER) + "-" + LLAMA_COMMIT);
-
 struct common_control_vector_load_info;
 
 //
 // CPU utils
 //
 
-struct cpu_params {
+struct common_cpu_params {
     int      n_threads                   = -1;
     bool     cpumask[GGML_MAX_N_THREADS] = {false}; // CPU affinity mask.
     bool     mask_valid                  = false;   // Default: any CPU
@@ -76,8 +64,8 @@ struct cpu_params {
     uint32_t poll                        = 50;      // Polling (busywait) level (0 - no polling, 100 - mostly polling)
 };
 
-int32_t cpu_get_num_physical_cores();
-int32_t cpu_get_num_math();
+int32_t common_cpu_get_num_physical_cores();
+int32_t common_cpu_get_num_math();
 
 //
 // Common params
@@ -170,9 +158,10 @@ enum common_params_sampling_config : uint64_t {
 
 enum common_speculative_type {
     COMMON_SPECULATIVE_TYPE_NONE,          // no speculative decoding
-    COMMON_SPECULATIVE_TYPE_DRAFT,         // draft model
-    COMMON_SPECULATIVE_TYPE_EAGLE3,        // eagle draft model
-    COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,  // simple self-speculative decoding
+    COMMON_SPECULATIVE_TYPE_DRAFT_SIMPLE,  // standalone draft model speculative decoding
+    COMMON_SPECULATIVE_TYPE_DRAFT_EAGLE3,  // Eagle3 speculative decoding
+    COMMON_SPECULATIVE_TYPE_DRAFT_MTP,     // Multi-token prediction
+    COMMON_SPECULATIVE_TYPE_NGRAM_SIMPLE,  // simple self-speculative decoding based on n-grams
     COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K,   // self-speculative decoding with n-gram keys only
     COMMON_SPECULATIVE_TYPE_NGRAM_MAP_K4V, // self-speculative decoding with n-gram keys and 4 m-gram values
     COMMON_SPECULATIVE_TYPE_NGRAM_MOD,
@@ -287,6 +276,7 @@ struct common_params_sampling {
     std::vector<llama_token> reasoning_budget_start;           // start tag token sequence
     std::vector<llama_token> reasoning_budget_end;             // end tag token sequence
     std::vector<llama_token> reasoning_budget_forced;          // forced sequence (message + end tag)
+    std::string              reasoning_budget_message;         // message injected before end tag when budget exhausted
 
     bool backend_sampling = false;
 
@@ -307,62 +297,82 @@ struct common_params_model {
     std::string name        = ""; // in format <user>/<model>[:<tag>] (tag is optional)     // NOLINT
 };
 
-struct common_ngram_mod;
+// draft-model-based speculative decoding parameters
+struct common_params_speculative_draft {
+    int32_t n_max = 16; // maximum number of tokens to draft during speculative decoding
+    int32_t n_min = 0;  // minimum number of draft tokens to use for speculative decoding
 
-struct common_params_speculative {
-    common_speculative_type type = COMMON_SPECULATIVE_TYPE_NONE; // type of speculative decoding
+    float p_split = 0.1f;  // speculative decoding split probability
+    float p_min   = 0.75f; // minimum speculative decoding probability (greedy) // TODO: change default to 0.0f
 
-    // general-purpose speculative decoding parameters
+    common_params_model mparams;
 
-    int32_t n_max   = 16; // maximum number of tokens to draft during speculative decoding
-    int32_t n_min   = 0; // minimum number of draft tokens to use for speculative decoding
-    float   p_split = 0.1f; // speculative decoding split probability
-    float   p_min   = 0.75f; // minimum speculative decoding probability (greedy)
+    llama_context * ctx_tgt = nullptr;
+    llama_context * ctx_dft = nullptr;
 
-    // ngram-based speculative decoding
+    int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
 
-    uint16_t ngram_size_n     = 12; // ngram size for lookup
-    uint16_t ngram_size_m     = 48; // mgram size for speculative tokens
-    uint16_t ngram_min_hits   =  1; // minimum hits at ngram/mgram lookup for mgram to be proposed
+    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
+    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
 
-    std::shared_ptr<common_ngram_mod> ngram_mod;
+    common_cpu_params cpuparams;
+    common_cpu_params cpuparams_batch;
 
-    std::string lookup_cache_static;  // path of static ngram cache file for lookup decoding           // NOLINT
-    std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding          // NOLINT
+    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
 
-    // draft-model speculative decoding
+    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
+};
 
-    struct common_params_model mparams_dft;
+struct common_params_speculative_ngram_mod {
+    int32_t n_match = 24;
 
-    llama_model * model_dft = nullptr; // a llama_model that can be shared by multiple speculative contexts
+    int32_t n_max = 64;
+    int32_t n_min = 48;
+};
 
-    llama_context_params cparams_dft; // these are the parameters for the draft llama_context
+struct common_params_speculative_ngram_map {
+    uint16_t size_n   = 12; // ngram size for lookup
+    uint16_t size_m   = 48; // mgram size for speculative tokens
+    uint16_t min_hits = 1;  // minimum hits at ngram/mgram lookup for mgram to be proposed
+};
 
-    int32_t n_ctx        = 0;  // draft context size
-    int32_t n_gpu_layers = -1; // number of layers to store in VRAM for the draft model (-1 - use default)
+struct common_params_speculative_ngram_cache {
+    std::string lookup_cache_static;  // path of static ngram cache file for lookup decoding
+    std::string lookup_cache_dynamic; // path of dynamic ngram cache file for lookup decoding
+};
 
-    ggml_type cache_type_k = GGML_TYPE_F16; // KV cache data type for the K
-    ggml_type cache_type_v = GGML_TYPE_F16; // KV cache data type for the V
+struct common_params_speculative {
+    std::vector<enum common_speculative_type> types = { COMMON_SPECULATIVE_TYPE_NONE };
 
-    struct cpu_params cpuparams;
-    struct cpu_params cpuparams_batch;
+    // used by Simple, MTP, Eagle3, etc. - all methods that require some kind of draft model
+    common_params_speculative_draft draft;
 
-    std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
+    common_params_speculative_ngram_mod ngram_mod;
+    common_params_speculative_ngram_map ngram_simple;
+    common_params_speculative_ngram_map ngram_map_k;
+    common_params_speculative_ngram_map ngram_map_k4v;
 
-    std::vector<std::pair<std::string, std::string>> replacements; // main to speculative model replacements
-    std::vector<llama_model_tensor_buft_override> tensor_buft_overrides;
+    common_params_speculative_ngram_cache ngram_cache;
 
     bool has_dft() const {
-        return !mparams_dft.path.empty() || !mparams_dft.hf_repo.empty();
+        return !draft.mparams.path.empty() || !draft.mparams.hf_repo.empty();
+    }
+
+    uint32_t need_n_rs_seq() const {
+        bool needs_rs_seq = std::any_of(types.begin(), types.end(), [&](auto t) {
+            return t == COMMON_SPECULATIVE_TYPE_DRAFT_MTP;
+        });
+
+        return needs_rs_seq ? draft.n_max : 0u;
     }
 };
 
 struct common_params_vocoder {
     struct common_params_model model;
 
-    std::string speaker_file = ""; // speaker file path                                      // NOLINT
+    std::string speaker_file; // speaker file path
 
-    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy            // NOLINT
+    bool use_guide_tokens = false; // enable guide tokens to improve TTS accuracy
 };
 
 struct common_params_diffusion {
@@ -433,19 +443,20 @@ struct common_params {
     // offload params
     std::vector<ggml_backend_dev_t> devices; // devices to use for offloading
 
-    int32_t n_gpu_layers       = -1;   // number of layers to store in VRAM, -1 is auto, <= -2 is all
-    int32_t main_gpu           = 0;    // the GPU that is used for scratch and small tensors
-    float   tensor_split[128]  = {0};  // how split tensors should be distributed across GPUs
-    bool    fit_params         = true; // whether to fit unset model/context parameters to free device memory
-    int32_t fit_params_min_ctx = 4096; // minimum context size to set when trying to reduce memory use
+    int32_t n_gpu_layers       = -1;    // number of layers to store in VRAM, -1 is auto, <= -2 is all
+    int32_t main_gpu           = 0;     // the GPU that is used for scratch and small tensors
+    float   tensor_split[128]  = {0};   // how split tensors should be distributed across GPUs
+    bool    fit_params         = true;  // whether to fit unset model/context parameters to free device memory
+    bool    fit_params_print   = false; // print the estimated required memory to run the model
+    int32_t fit_params_min_ctx = 4096;  // minimum context size to set when trying to reduce memory use
 
     // margin per device in bytes for fitting parameters to free memory:
     std::vector<size_t> fit_params_target = std::vector<size_t>(llama_max_devices(), 1024 * 1024*1024);
 
     enum llama_split_mode split_mode = LLAMA_SPLIT_MODE_LAYER; // how to split the model across GPUs
 
-    struct cpu_params cpuparams;
-    struct cpu_params cpuparams_batch;
+    common_cpu_params cpuparams;
+    common_cpu_params cpuparams_batch;
 
     ggml_backend_sched_eval_callback cb_eval = nullptr;
     void * cb_eval_user_data                 = nullptr;
@@ -579,7 +590,7 @@ struct common_params {
     int32_t n_threads_http      = -1;    // number of threads to process HTTP requests (TODO: support threadpool)
     int32_t n_cache_reuse       = 0;     // min chunk size to reuse from the cache via KV shifting
     bool    cache_prompt        = true;  // whether to enable prompt caching
-    bool    clear_idle          = true;  // save and clear idle slots upon starting a new task
+    bool    cache_idle_slots    = true;  // save and clear idle slots upon starting a new task
     int32_t n_ctx_checkpoints   = 32;    // max number of context checkpoints per slot
     int32_t checkpoint_every_nt = 8192;  // make a checkpoint every n tokens during prefill
     int32_t cache_ram_mib       = 8192;  // -1 = no limit, 0 - disable, 1 = 1 MiB, etc.
@@ -593,8 +604,6 @@ struct common_params {
     bool force_pure_content_parser = false;
     common_reasoning_format reasoning_format = COMMON_REASONING_FORMAT_DEEPSEEK;
     int enable_reasoning = -1; // -1 = auto, 0 = disable, 1 = enable
-    int reasoning_budget = -1;
-    std::string reasoning_budget_message; // message injected before end tag when budget exhausted
     bool prefill_assistant = true; // if true, any trailing assistant message will be prefilled into the response
     int sleep_idle_seconds = -1;   // if >0, server will sleep after this many seconds of idle time
 
@@ -605,11 +614,21 @@ struct common_params {
 
     std::map<std::string, std::string> default_template_kwargs;
 
-    // webui configs
-    bool webui = true;
+    // UI configs
+#ifdef LLAMA_UI_DEFAULT_ENABLED
+    bool ui = LLAMA_UI_DEFAULT_ENABLED != 0;
+#else
+    bool ui = true; // default to enabled when not set
+#endif
+
+    // Deprecated: use ui, ui_mcp_proxy, ui_config_json instead
+    bool webui = ui;
     bool webui_mcp_proxy = false;
     std::string webui_config_json;
 
+    bool ui_mcp_proxy = false;
+    std::string ui_config_json;
+
     // "advanced" endpoints are disabled by default for better security
     bool endpoint_slots   = true;
     bool endpoint_props   = false; // only control POST requests, not GET
@@ -687,11 +706,12 @@ struct common_params {
 // initializes the logging system and prints info about the build
 void common_init();
 
+void common_params_print_info(const common_params & params, bool print_devices = true);
 std::string common_params_get_system_info(const common_params & params);
 
 bool parse_cpu_range(const std::string & range, bool(&boolmask)[GGML_MAX_N_THREADS]);
 bool parse_cpu_mask(const std::string & mask, bool(&boolmask)[GGML_MAX_N_THREADS]);
-void postprocess_cpu_params(cpu_params & cpuparams, const cpu_params * role_model = nullptr);
+void postprocess_cpu_params(common_cpu_params & cpuparams, const common_cpu_params * role_model = nullptr);
 bool set_process_priority(enum ggml_sched_priority prio);
 
 //
@@ -759,6 +779,11 @@ inline bool string_starts_with(std::string_view str, std::string_view prefix) {
            str.compare(0, prefix.size(), prefix) == 0;
 }
 
+// remove when moving to c++20
+inline bool string_starts_with(std::string_view str, char prefix) {
+    return !str.empty() && str.front() == prefix;
+}
+
 // remove when moving to c++20
 inline bool string_ends_with(std::string_view str, std::string_view suffix) {
     return str.size() >= suffix.size() &&
@@ -854,12 +879,33 @@ common_init_result_ptr common_init_from_params(common_params & params);
 
 struct llama_model_params     common_model_params_to_llama  (      common_params & params);
 struct llama_context_params   common_context_params_to_llama(const common_params & params);
-struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const cpu_params & params);
+struct ggml_threadpool_params ggml_threadpool_params_from_cpu_params(const common_cpu_params & params);
 
 // clear LoRA adapters from context, then apply new list of adapters
 void common_set_adapter_lora(struct llama_context * ctx, std::vector<common_adapter_lora_info> & lora);
 
-std::string                   get_model_endpoint();
+// model endpoint from env
+std::string common_get_model_endpoint();
+
+//
+// Context utils
+//
+
+enum common_context_seq_rm_type {
+    COMMON_CONTEXT_SEQ_RM_TYPE_NO           = 0, // seq_rm not supported (e.g. no memory module)
+    COMMON_CONTEXT_SEQ_RM_TYPE_PART         = 1, // can seq_rm partial sequences
+    COMMON_CONTEXT_SEQ_RM_TYPE_FULL         = 2, // can seq_rm full sequences only
+    COMMON_CONTEXT_SEQ_RM_TYPE_RS = 3, // can seq_rm partial sequences, bounded by n_rs_seq
+};
+
+// check if the llama_context can remove sequences
+// note: clears the memory of the context
+common_context_seq_rm_type common_context_can_seq_rm(llama_context * ctx);
+
+// aborts execution on failure
+void common_context_seq_rm (llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1);
+void common_context_seq_add(llama_context * ctx, llama_seq_id seq_id, llama_pos p0, llama_pos p1, llama_pos delta);
+void common_context_seq_cp (llama_context * ctx, llama_seq_id seq_id_src, llama_seq_id seq_id_dst, llama_pos p0, llama_pos p1);
 
 //
 // Batch utils
@@ -998,3 +1044,50 @@ ggml_opt_dataset_t common_opt_dataset_init(struct llama_context * ctx, const std
 
 // "adamw" or "sgd" (case insensitive)
 enum ggml_opt_optimizer_type common_opt_get_optimizer(const char *);
+
+//
+// prompt utils
+//
+
+struct common_prompt_checkpoint {
+    int64_t n_tokens;
+
+    llama_pos pos_min;
+    llama_pos pos_max;
+
+    std::vector<uint8_t> data_tgt;
+    std::vector<uint8_t> data_dft;
+
+    size_t size() const;
+
+    bool empty() const;
+    void clear();
+
+    void update_pos(
+            int64_t n_tokens,
+            llama_pos pos_min,
+            llama_pos pos_max);
+
+    void update_tgt(
+            llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_state_seq_flags flags);
+
+    void update_dft(
+            llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_state_seq_flags flags);
+
+    void load_tgt(
+            llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_state_seq_flags flags) const;
+
+    void load_dft(
+            llama_context * ctx,
+            llama_seq_id seq_id,
+            llama_state_seq_flags flags) const;
+
+    void clear_tgt();
+    void clear_dft();
+};
diff --git a/ggml/llamacpp/third_party/common/sampling.h b/ggml/llamacpp/third_party/common/sampling.h
index 5b57ad6..49506a0 100644
--- a/ggml/llamacpp/third_party/common/sampling.h
+++ b/ggml/llamacpp/third_party/common/sampling.h
@@ -41,8 +41,8 @@ struct common_sampler * common_sampler_init(const struct llama_model * model, st
 
 void common_sampler_free(struct common_sampler * gsmpl);
 
-// if accept_grammar is true, the token is accepted both by the sampling chain and the grammar
-void                    common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool accept_grammar);
+// if is_generated is true, the token is accepted by the sampling chain, the reasoning budget sampler, and the grammar sampler
+void                    common_sampler_accept(struct common_sampler * gsmpl, llama_token token, bool is_generated);
 void                    common_sampler_reset (struct common_sampler * gsmpl);
 struct common_sampler * common_sampler_clone (struct common_sampler * gsmpl);
 
diff --git a/ggml/llamacpp/third_party/ggml/include/ggml-backend.h b/ggml/llamacpp/third_party/ggml/include/ggml-backend.h
index 3c06aea..b6f7373 100644
--- a/ggml/llamacpp/third_party/ggml/include/ggml-backend.h
+++ b/ggml/llamacpp/third_party/ggml/include/ggml-backend.h
@@ -169,7 +169,7 @@ extern "C" {
         // device type
         enum ggml_backend_dev_type type;
         // device id
-        //   for PCI devices, this should be the PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:01:00.0")
+        //   for PCI devices, this should be the lower-case PCI bus id formatted as "domain:bus:device.function" (e.g. "0000:c1:00.0")
         //   if the id is unknown, this should be NULL
         const char * device_id;
         // device capabilities
@@ -202,8 +202,11 @@ extern "C" {
 
     // Common functions that may be obtained using ggml_backend_reg_get_proc_address
 
-    // AllReduce operation for tensor parallelism (meta backend)
-    typedef bool                         (*ggml_backend_allreduce_tensor_t)(ggml_backend_t * backends, struct ggml_tensor ** tensors, size_t n_backends);
+    // Context management and operations for faster communication between backends, used for tensor parallelism (meta backend)
+    typedef void * (*ggml_backend_comm_init_t)(ggml_backend_t * backends, size_t n_backends);
+    typedef void   (*ggml_backend_comm_free_t)(void * comm_ctx);
+    typedef bool   (*ggml_backend_comm_allreduce_tensor_t)(void * comm_ctx, struct ggml_tensor ** tensors);
+
     // Split buffer type for tensor parallelism (old)
     typedef ggml_backend_buffer_type_t   (*ggml_backend_split_buffer_type_t)(int main_device, const float * tensor_split);
     // Set the number of threads for the backend
@@ -348,6 +351,53 @@ extern "C" {
     // Set a callback to be called for each resulting node during graph compute
     GGML_API void                 ggml_backend_sched_set_eval_callback(ggml_backend_sched_t sched, ggml_backend_sched_eval_callback callback, void * user_data);
 
+    //
+    // Meta backend
+    //
+
+#define GGML_BACKEND_META_MAX_DEVICES 16
+
+    enum ggml_backend_meta_split_axis {
+        // tensor split by tensor dimensions:
+        GGML_BACKEND_SPLIT_AXIS_0 = 0,
+        GGML_BACKEND_SPLIT_AXIS_1 = 1,
+        GGML_BACKEND_SPLIT_AXIS_2 = 2,
+        GGML_BACKEND_SPLIT_AXIS_3 = 3,
+
+        GGML_BACKEND_SPLIT_AXIS_MIRRORED = 10, // all values on all backends
+        GGML_BACKEND_SPLIT_AXIS_PARTIAL  = 11, // each backend has a partial sum
+
+        // for internal bookkeeping only:
+        GGML_BACKEND_SPLIT_AXIS_NONE    = 98,
+        GGML_BACKEND_SPLIT_AXIS_UNKNOWN = 99,
+    };
+    GGML_API const char * ggml_backend_meta_split_axis_name(enum ggml_backend_meta_split_axis split_axis);
+
+    struct ggml_backend_meta_split_state {
+        enum ggml_backend_meta_split_axis axis;
+
+        // for tensors with axis >= 0 && axis < GGML_MAX_DIMS:
+        //   - each device has a slice of the tensor along the split axis
+        //   - most tensors have n_segments == 1 and a contiguous slice of the tensor data
+        //   - some tensors have an inhomogeneous data layout along the split axis,
+        //     those tensors are divided into segments which are each individually split across devices
+        //   - ne has one entry per segment and device that add up to ggml_tensor::ne for that axis,
+        //     the outer/inner loops are over segments/devices like [seg0_dev0, seg0_dev1, seg1_dev0, seg1_dev1],
+        //   - for example, a transformer may have a fused QKV matrix rather than 3 matrices, those would be 3 separate segments
+        //     that each need to be split individually across devices so that each device gets a slice of Q, K, and V
+        int64_t  ne[16*GGML_BACKEND_META_MAX_DEVICES];
+        uint32_t n_segments;
+    };
+
+    // function to assign split states for statically allocated tensors, compute tensor split states will be assigned to be compatible:
+    typedef struct ggml_backend_meta_split_state(*ggml_backend_meta_get_split_state_t)(const struct ggml_tensor * tensor, void * userdata);
+
+    // create a new meta device from "simple" devices, meta buffer type/buffer/backend is then derived from this:
+    // TODO: this looks a bit strange - a backend API creates a device. I think we should try
+    //       to express this as a backend registry functionality instead
+    GGML_API ggml_backend_dev_t ggml_backend_meta_device(
+        ggml_backend_dev_t * devs, size_t n_devs, ggml_backend_meta_get_split_state_t get_split_state, void * get_split_state_ud);
+
     //
     // Utils
     //
diff --git a/ggml/llamacpp/third_party/ggml/include/ggml-rpc.h b/ggml/llamacpp/third_party/ggml/include/ggml-rpc.h
index 1c11495..6fcf5a4 100644
--- a/ggml/llamacpp/third_party/ggml/include/ggml-rpc.h
+++ b/ggml/llamacpp/third_party/ggml/include/ggml-rpc.h
@@ -6,9 +6,9 @@
 extern "C" {
 #endif
 
-#define RPC_PROTO_MAJOR_VERSION    3
-#define RPC_PROTO_MINOR_VERSION    6
-#define RPC_PROTO_PATCH_VERSION    1
+#define RPC_PROTO_MAJOR_VERSION    4
+#define RPC_PROTO_MINOR_VERSION    0
+#define RPC_PROTO_PATCH_VERSION    0
 
 #ifdef  __cplusplus
 static_assert(GGML_OP_COUNT == 96, "GGML_OP_COUNT has changed - update RPC_PROTO_PATCH_VERSION");
diff --git a/ggml/llamacpp/third_party/ggml/include/ggml.h b/ggml/llamacpp/third_party/ggml/include/ggml.h
index 11d3e8a..41566d4 100644
--- a/ggml/llamacpp/third_party/ggml/include/ggml.h
+++ b/ggml/llamacpp/third_party/ggml/include/ggml.h
@@ -438,6 +438,12 @@ extern "C" {
         GGML_PREC_F32     = 10,
     };
 
+    // op hint
+    enum ggml_op_hint {
+        GGML_HINT_NONE             = 0,
+        GGML_HINT_SRC0_IS_HADAMARD = 1,
+    };
+
     // model file types
     enum ggml_ftype {
         GGML_FTYPE_UNKNOWN        = -1,
@@ -1419,6 +1425,11 @@ extern "C" {
             struct ggml_tensor * a,
             enum ggml_prec       prec);
 
+    // change the hint of a matrix multiplication
+    GGML_API void ggml_mul_mat_set_hint(
+            struct ggml_tensor * a,
+            enum ggml_op_hint    hint);
+
     // indirect matrix multiplication
     GGML_API struct ggml_tensor * ggml_mul_mat_id(
             struct ggml_context * ctx,
@@ -1773,8 +1784,32 @@ extern "C" {
             int                   n_dims,
             int                   mode);
 
-    // custom RoPE
+    // RoPE operations with extended options
+    // a is the input tensor to apply RoPE to, shape [n_embd, n_head, n_token]
+    // b is an int32 vector with size n_token
     // c is freq factors (e.g. phi3-128k), (optional)
+    // mode can be GGML_ROPE_TYPE_NORMAL or NEOX; for MROPE and VISION mode, use ggml_rope_multi
+    //
+    // pseudo-code for computing theta:
+    //   for i in [0, n_dims/2):
+    //     theta[i] = b[i] * powf(freq_base, -2.0 * i / n_dims);
+    //     theta[i] = theta[i] / c[i];  # if c is provided, divide theta by c
+    //     theta[i] = rope_yarn(theta[i], ...);  # note: theta = theta * freq_scale is applied here
+    //
+    // other params are used by YaRN RoPE scaling, these default values will disable YaRN:
+    //   freq_scale  = 1.0f
+    //   ext_factor  = 0.0f
+    //   attn_factor = 1.0f
+    //   beta_fast   = 0.0f
+    //   beta_slow   = 0.0f
+    //
+    // example:
+    //   (marking: c = cos, s = sin, 0 = unrotated)
+    //   given a single head with size = 8 --> [00000000]
+    //   GGML_ROPE_TYPE_NORMAL  n_dims = 4 --> [cscs0000]
+    //   GGML_ROPE_TYPE_NORMAL  n_dims = 8 --> [cscscscs]
+    //   GGML_ROPE_TYPE_NEOX    n_dims = 4 --> [ccss0000]
+    //   GGML_ROPE_TYPE_NEOX    n_dims = 8 --> [ccccssss]
     GGML_API struct ggml_tensor * ggml_rope_ext(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -1790,6 +1825,36 @@ extern "C" {
             float                 beta_fast,
             float                 beta_slow);
 
+    // multi-dimensional RoPE, for Qwen-VL and similar vision models
+    // mode can be either VISION, MROPE, IMROPE, cannot be combined with NORMAL or NEOX
+    // sections specify how many dimensions to rotate in each section:
+    //   section length is equivalent to number of cos/sin pairs, NOT the number of dims
+    //   (i.e. the sum of the 4 sections is expected to be n_dims/2)
+    //   trailing sections can be 0, meaning they are ignored
+    // all other options are identical to ggml_rope_ext
+    //
+    // important note:
+    //   - NEOX ordering is automatically applied and cannot be disabled for MROPE and VISION
+    //     if you need normal ordering, there are 2 methods:
+    //     (1) split the tensor manually using ggml_view
+    //     (2) permute the weight upon conversion
+    //   - for VISION, n_dims must be head_size/2
+    //
+    // example M-RoPE:
+    //  given sections = [t=4, y=2, x=2, 0]
+    //  given a single head with size = 18 --> [000000000000000000]
+    //  GGML_ROPE_TYPE_MROPE   n_dims = 16 --> [ttttyyxxttttyyxx00] (cos/sin are applied in NEOX ordering)
+    //  GGML_ROPE_TYPE_IMROPE  n_dims = 16 --> [ttyxttyxttyxttyx00] (interleaved M-RoPE, still NEOX ordering)
+    //  note: the theta for each dim is computed the same way as ggml_rope_ext, no matter the section
+    //        in other words, idx used for theta: [0123456789... until n_dims/2], not reset for each section
+    //
+    // example vision RoPE:
+    //  given sections = [y=4, x=4, 0, 0] (last 2 sections are ignored)
+    //  given a single head with size = 8 --> [00000000]
+    //  GGML_ROPE_TYPE_VISION  n_dims = 4 --> [yyyyxxxx]
+    //  other values of n_dims are untested and result in undefined behavior
+    //  note: unlike MROPE, the theta for each dim is computed differently for each section
+    //        in other words, idx used for theta: [0123] for y section, then [0123] for x section
     GGML_API struct ggml_tensor * ggml_rope_multi(
             struct ggml_context * ctx,
             struct ggml_tensor  * a,
@@ -2476,6 +2541,11 @@ extern "C" {
 
     // TODO: add ggml_gated_delta_net_set_bcast() to be able to configure Q, K broadcast type: tiled vs interleaved [TAG_GGML_GDN_BCAST]
     // ref: https://github.com/ggml-org/llama.cpp/pull/19468#discussion_r2786394306
+    //
+    // state is a 3D tensor of shape (S_v*S_v*H, K, n_seqs):
+    //   K == 1: output carries the final state only.
+    //   K  > 1: output carries K snapshot slots; the kernel writes the last min(n_tokens, K)
+    //   per-token snapshots into the trailing slots
     GGML_API struct ggml_tensor * ggml_gated_delta_net(
             struct ggml_context * ctx,
             struct ggml_tensor  * q,
diff --git a/ggml/llamacpp/third_party/include/llama.h b/ggml/llamacpp/third_party/include/llama.h
index ac267b5..75095b2 100644
--- a/ggml/llamacpp/third_party/include/llama.h
+++ b/ggml/llamacpp/third_party/include/llama.h
@@ -198,6 +198,11 @@ extern "C" {
         LLAMA_SPLIT_MODE_TENSOR = 3,
     };
 
+    enum llama_context_type {
+        LLAMA_CONTEXT_TYPE_DEFAULT = 0,
+        LLAMA_CONTEXT_TYPE_MTP     = 1,
+    };
+
     // TODO: simplify (https://github.com/ggml-org/llama.cpp/pull/9294#pullrequestreview-2286561979)
     typedef struct llama_token_data {
         llama_token id; // token id
@@ -333,9 +338,11 @@ extern "C" {
         uint32_t n_batch;           // logical maximum batch size that can be submitted to llama_decode
         uint32_t n_ubatch;          // physical maximum batch size
         uint32_t n_seq_max;         // max number of sequences (i.e. distinct states for recurrent models)
+        uint32_t n_rs_seq;          // number of recurrent-state snapshots per seq for rollback (0 = no rollback) [EXPERIMENTAL]
         int32_t  n_threads;         // number of threads to use for generation
         int32_t  n_threads_batch;   // number of threads to use for batch processing
 
+        enum llama_context_type      ctx_type;          // set the context type (e.g. MTP)
         enum llama_rope_scaling_type rope_scaling_type; // RoPE scaling type, from `enum llama_rope_scaling_type`
         enum llama_pooling_type      pooling_type;      // whether to pool (sum) embedding results by sequence id
         enum llama_attention_type    attention_type;    // attention type to use for embeddings
@@ -511,27 +518,6 @@ extern "C" {
     // Frees all allocated memory
     LLAMA_API void llama_free(struct llama_context * ctx);
 
-    enum llama_params_fit_status {
-        LLAMA_PARAMS_FIT_STATUS_SUCCESS = 0, // found allocations that are projected to fit
-        LLAMA_PARAMS_FIT_STATUS_FAILURE = 1, // could not find allocations that are projected to fit
-        LLAMA_PARAMS_FIT_STATUS_ERROR   = 2, // a hard error occurred, e.g. because no model could be found at the specified path
-    };
-
-    // fits mparams and cparams to free device memory (assumes system memory is unlimited)
-    //   - returns true if the parameters could be successfully modified to fit device memory
-    //   - this function is NOT thread safe because it modifies the global llama logger state
-    //   - only parameters that have the same value as in llama_default_model_params are modified
-    //     with the exception of the context size which is modified if and only if equal to 0
-    LLAMA_API enum llama_params_fit_status llama_params_fit(
-                                   const char   * path_model,
-                    struct llama_model_params   * mparams,
-                    struct llama_context_params * cparams,
-                                          float * tensor_split,          // writable buffer for tensor split, needs at least llama_max_devices elements
-        struct llama_model_tensor_buft_override * tensor_buft_overrides, // writable buffer for overrides, needs at least llama_max_tensor_buft_overrides elements
-                                         size_t * margins,               // margins of memory to leave per device in bytes
-                                       uint32_t   n_ctx_min,             // minimum context size to set when trying to reduce memory use
-                            enum ggml_log_level   log_level);            // minimum log level to print during fitting, lower levels go to debug log
-
     LLAMA_API int64_t llama_time_us(void);
 
     LLAMA_API size_t llama_max_devices(void);
@@ -551,6 +537,7 @@ extern "C" {
     LLAMA_API uint32_t llama_n_batch    (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_ubatch   (const struct llama_context * ctx);
     LLAMA_API uint32_t llama_n_seq_max  (const struct llama_context * ctx);
+    LLAMA_API uint32_t llama_n_rs_seq   (const struct llama_context * ctx);
 
     DEPRECATED(LLAMA_API int32_t llama_n_ctx_train(const struct llama_model * model), "use llama_model_n_ctx_train instead");
     DEPRECATED(LLAMA_API int32_t llama_n_embd     (const struct llama_model * model), "use llama_model_n_embd instead");
@@ -879,12 +866,17 @@ extern "C" {
                           size_t   n_token_capacity,
                           size_t * n_token_count_out);
 
+#define LLAMA_STATE_SEQ_FLAGS_NONE 0
+
 // for backwards-compat
 #define LLAMA_STATE_SEQ_FLAGS_SWA_ONLY 1
 
 // work only with partial states, such as SWA KV cache or recurrent cache (e.g. Mamba)
 #define LLAMA_STATE_SEQ_FLAGS_PARTIAL_ONLY 1
 
+// keeps the tensor data on device buffers (i.e. not accessible in host memory, but faster save/load)
+#define LLAMA_STATE_SEQ_FLAGS_ON_DEVICE 2
+
     typedef uint32_t llama_state_seq_flags;
 
     LLAMA_API size_t llama_state_seq_get_size_ext(
@@ -1546,9 +1538,6 @@ extern "C" {
     LLAMA_API void                           llama_perf_sampler_print(const struct llama_sampler * chain);
     LLAMA_API void                           llama_perf_sampler_reset(      struct llama_sampler * chain);
 
-    // print a breakdown of per-device memory use via LLAMA_LOG:
-    LLAMA_API void llama_memory_breakdown_print(const struct llama_context * ctx);
-
     //
     // training
     //
diff --git a/ggml/llamacpp/third_party/include/mtmd-helper.h b/ggml/llamacpp/third_party/include/mtmd-helper.h
index 5036b92..57da78a 100644
--- a/ggml/llamacpp/third_party/include/mtmd-helper.h
+++ b/ggml/llamacpp/third_party/include/mtmd-helper.h
@@ -47,6 +47,10 @@ MTMD_API size_t mtmd_helper_get_n_tokens(const mtmd_input_chunks * chunks);
 // normally, n_pos is equal to n_tokens, but for M-RoPE it is different
 MTMD_API llama_pos mtmd_helper_get_n_pos(const mtmd_input_chunks * chunks);
 
+// helper to get the list of relative positions corresponding to the embedding tokens, to be used by M-RoPE
+// out_pos must have length == mtmd_image_tokens_get_n_tokens(image)
+MTMD_API void mtmd_helper_image_get_decoder_pos(const mtmd_image_tokens * image, llama_pos pos_0, struct mtmd_decoder_pos * out_pos);
+
 // helper function that automatically:
 // 1. run llama_decode() on text chunks
 // 2. run mtmd_encode() on image chunks, then mtmd_get_output_embd() and then llama_decode()
diff --git a/ggml/llamacpp/third_party/include/mtmd.h b/ggml/llamacpp/third_party/include/mtmd.h
index ebb4a18..54b9515 100644
--- a/ggml/llamacpp/third_party/include/mtmd.h
+++ b/ggml/llamacpp/third_party/include/mtmd.h
@@ -46,9 +46,6 @@
 #    define MTMD_API
 #endif
 
-// deprecated marker, use mtmd_default_marker() instead
-#define MTMD_DEFAULT_IMAGE_MARKER "<__image__>"
-
 #ifdef __cplusplus
 extern "C" {
 #endif
@@ -114,20 +111,21 @@ MTMD_API mtmd_context * mtmd_init_from_file(const char * mmproj_fname,
 MTMD_API void mtmd_free(mtmd_context * ctx);
 
 // whether we need to set non-causal mask before llama_decode
-MTMD_API bool mtmd_decode_use_non_causal(mtmd_context * ctx);
+// if chunk is nullptr, we assume the default case where chunk is an image chunk
+MTMD_API bool mtmd_decode_use_non_causal(const mtmd_context * ctx, const mtmd_input_chunk * chunk);
 
 // whether the current model use M-RoPE for llama_decode
-MTMD_API bool mtmd_decode_use_mrope(mtmd_context * ctx);
+MTMD_API bool mtmd_decode_use_mrope(const mtmd_context * ctx);
 
 // whether the current model supports vision input
-MTMD_API bool mtmd_support_vision(mtmd_context * ctx);
+MTMD_API bool mtmd_support_vision(const mtmd_context * ctx);
 
 // whether the current model supports audio input
-MTMD_API bool mtmd_support_audio(mtmd_context * ctx);
+MTMD_API bool mtmd_support_audio(const mtmd_context * ctx);
 
 // get audio sample rate in Hz, for example 16000 for Whisper
 // return -1 if audio is not supported
-MTMD_API int mtmd_get_audio_sample_rate(mtmd_context * ctx);
+MTMD_API int mtmd_get_audio_sample_rate(const mtmd_context * ctx);
 
 // mtmd_bitmap
 //
@@ -185,12 +183,27 @@ MTMD_API void               mtmd_input_chunk_free(mtmd_input_chunk * chunk);
 // the instance will be constructed via mtmd_tokenize()
 // it will be freed along with mtmd_input_chunk
 MTMD_API size_t       mtmd_image_tokens_get_n_tokens(const mtmd_image_tokens * image_tokens); // TODO: deprecate
-MTMD_API size_t       mtmd_image_tokens_get_nx      (const mtmd_image_tokens * image_tokens);
-MTMD_API size_t       mtmd_image_tokens_get_ny      (const mtmd_image_tokens * image_tokens);
 MTMD_API const char * mtmd_image_tokens_get_id      (const mtmd_image_tokens * image_tokens); // TODO: deprecate
 // number of temporal positions (equals to max(t,h,w) for M-RoPE; equals to n_tokens otherwise)
 MTMD_API llama_pos    mtmd_image_tokens_get_n_pos   (const mtmd_image_tokens * image_tokens); // TODO: deprecate
 
+DEPRECATED(MTMD_API size_t mtmd_image_tokens_get_nx(const mtmd_image_tokens * image_tokens),
+           "use mtmd_image_tokens_get_decoder_pos() instead");
+DEPRECATED(MTMD_API size_t mtmd_image_tokens_get_ny(const mtmd_image_tokens * image_tokens),
+           "use mtmd_image_tokens_get_decoder_pos() instead");
+
+struct mtmd_decoder_pos {
+    uint32_t t;
+    uint32_t x;
+    uint32_t y;
+    uint32_t z; // unused for now, reserved for future use
+};
+// get position for decoder attention, to be used by M-RoPE models
+// i is the index of the embedding token, ranging from 0 to mtmd_image_tokens_get_n_tokens() - 1
+// pos_0 is the absolute position of the first token
+// return relative position (for example, embedding 0 will have position (0, 0, 0); remember to adjust it to the current absolute position)
+MTMD_API struct mtmd_decoder_pos mtmd_image_tokens_get_decoder_pos(const mtmd_image_tokens * image_tokens, llama_pos pos_0, size_t i);
+
 // tokenize an input text prompt and a list of bitmaps (images/audio)
 // the prompt must have the input image marker (default: "<__media__>") in it
 // the default marker is defined by mtmd_default_marker()
@@ -231,6 +244,14 @@ MTMD_API float * mtmd_get_output_embd(mtmd_context * ctx);
 // If this is not called, or NULL is supplied, everything is output on stderr.
 MTMD_API void mtmd_log_set(ggml_log_callback log_callback, void * user_data);
 
+// EXPERIMENTAL API to get mmproj's capabilities without initializing the full context
+// This is only intended to be used by llama-server; breaking changes are expected
+struct mtmd_caps {
+    bool inp_vision; // NOTE(review): presumably set when the mmproj accepts vision input - confirm
+    bool inp_audio;  // NOTE(review): presumably set when the mmproj accepts audio input - confirm
+};
+MTMD_API struct mtmd_caps mtmd_get_cap_from_file(const char * mmproj_fname);
+
 /////////////////////////////////////////
 
 // test function, to be used in test-mtmd-c-api.c
diff --git a/ggml/llamacpp/third_party/prebuilt/android-arm64/libcpp-httplib.a b/ggml/llamacpp/third_party/prebuilt/android-arm64/libcpp-httplib.a
index 7170f50..7a911d7 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/android-arm64/libcpp-httplib.a and b/ggml/llamacpp/third_party/prebuilt/android-arm64/libcpp-httplib.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/android-arm64/libggml-base.a b/ggml/llamacpp/third_party/prebuilt/android-arm64/libggml-base.a
index 2e703f0..9f4e570 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/android-arm64/libggml-base.a and b/ggml/llamacpp/third_party/prebuilt/android-arm64/libggml-base.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/android-arm64/libggml-cpu.a b/ggml/llamacpp/third_party/prebuilt/android-arm64/libggml-cpu.a
index 017b54f..2af6a19 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/android-arm64/libggml-cpu.a and b/ggml/llamacpp/third_party/prebuilt/android-arm64/libggml-cpu.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/android-arm64/libggml.a b/ggml/llamacpp/third_party/prebuilt/android-arm64/libggml.a
index f9ad770..206a9ae 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/android-arm64/libggml.a and b/ggml/llamacpp/third_party/prebuilt/android-arm64/libggml.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/android-arm64/libllama-common-base.a b/ggml/llamacpp/third_party/prebuilt/android-arm64/libllama-common-base.a
new file mode 100644
index 0000000..91b6cf1
Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/android-arm64/libllama-common-base.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/android-arm64/libcommon.a b/ggml/llamacpp/third_party/prebuilt/android-arm64/libllama-common.a
similarity index 66%
rename from ggml/llamacpp/third_party/prebuilt/android-arm64/libcommon.a
rename to ggml/llamacpp/third_party/prebuilt/android-arm64/libllama-common.a
index 6658711..2b2dfd8 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/android-arm64/libcommon.a and b/ggml/llamacpp/third_party/prebuilt/android-arm64/libllama-common.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/android-arm64/libllama-diffusion.a b/ggml/llamacpp/third_party/prebuilt/android-arm64/libllama-diffusion.a
new file mode 100644
index 0000000..6ee355d
Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/android-arm64/libllama-diffusion.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/android-arm64/libllama-ui.a b/ggml/llamacpp/third_party/prebuilt/android-arm64/libllama-ui.a
new file mode 100644
index 0000000..02eb28e
Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/android-arm64/libllama-ui.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/android-arm64/libllama.a b/ggml/llamacpp/third_party/prebuilt/android-arm64/libllama.a
index 5ab86cb..c6fe540 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/android-arm64/libllama.a and b/ggml/llamacpp/third_party/prebuilt/android-arm64/libllama.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/android-arm64/libmtmd.a b/ggml/llamacpp/third_party/prebuilt/android-arm64/libmtmd.a
index b97718a..ea81ce2 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/android-arm64/libmtmd.a and b/ggml/llamacpp/third_party/prebuilt/android-arm64/libmtmd.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/android-arm64/libserver-context.a b/ggml/llamacpp/third_party/prebuilt/android-arm64/libserver-context.a
index a8fb085..842aff5 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/android-arm64/libserver-context.a and b/ggml/llamacpp/third_party/prebuilt/android-arm64/libserver-context.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libcpp-httplib.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libcpp-httplib.a
index 0c57130..2b29ab2 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libcpp-httplib.a and b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libcpp-httplib.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-base.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-base.a
index 6c6fddc..c8915dc 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-base.a and b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-base.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-blas.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-blas.a
index bbf6a65..91a2591 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-blas.a and b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-blas.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-cpu.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-cpu.a
index 4fe09a0..ff452ab 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-cpu.a and b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-cpu.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-metal.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-metal.a
index 4a6e268..b6ae635 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-metal.a and b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml-metal.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml.a
index f13db70..248b382 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml.a and b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libggml.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libgguf-model-data.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libgguf-model-data.a
index 445e276..8b267b1 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libgguf-model-data.a and b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libgguf-model-data.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama-common-base.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama-common-base.a
new file mode 100644
index 0000000..57599d1
Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama-common-base.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libcommon.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama-common.a
similarity index 50%
rename from ggml/llamacpp/third_party/prebuilt/darwin-amd64/libcommon.a
rename to ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama-common.a
index c636085..be273b4 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libcommon.a and b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama-common.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama-diffusion.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama-diffusion.a
new file mode 100644
index 0000000..52f8732
Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama-diffusion.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama-ui.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama-ui.a
new file mode 100644
index 0000000..18b968b
Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama-ui.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama.a
index d6e5a29..845d176 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama.a and b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libllama.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libmtmd.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libmtmd.a
index eb6ad00..05784b2 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libmtmd.a and b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libmtmd.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libserver-context.a b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libserver-context.a
index 01c4738..51f6aa8 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libserver-context.a and b/ggml/llamacpp/third_party/prebuilt/darwin-amd64/libserver-context.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libcpp-httplib.a b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libcpp-httplib.a
new file mode 100644
index 0000000..7e9eba2
Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libcpp-httplib.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libggml-base.a b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libggml-base.a
new file mode 100644
index 0000000..7fabd54
Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libggml-base.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libggml-blas.a b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libggml-blas.a
new file mode 100644
index 0000000..9626f00
Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libggml-blas.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libggml-cpu.a b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libggml-cpu.a
new file mode 100644
index 0000000..4f2b53f
Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libggml-cpu.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libggml-metal.a b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libggml-metal.a
new file mode 100644
index 0000000..5b2052a
Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libggml-metal.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libggml.a b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libggml.a
new file mode 100644
index 0000000..f057c51
Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libggml.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libllama-common-base.a b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libllama-common-base.a
new file mode 100644
index 0000000..0eed854
Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libllama-common-base.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libllama-common.a b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libllama-common.a
new file mode 100644
index 0000000..b746bfd
Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libllama-common.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libllama-ui.a b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libllama-ui.a
new file mode 100644
index 0000000..e1c026f
Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libllama-ui.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libllama.a b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libllama.a
new file mode 100644
index 0000000..3e4ff66
Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libllama.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libmtmd.a b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libmtmd.a
new file mode 100644
index 0000000..151b516
Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libmtmd.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libserver-context.a b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libserver-context.a
new file mode 100644
index 0000000..6b84e60
Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/darwin-arm64/libserver-context.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libcpp-httplib.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libcpp-httplib.a
index e1fefa7..9deb533 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libcpp-httplib.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libcpp-httplib.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-base.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-base.a
index 94e4ce7..cf26741 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-base.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-base.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-cpu.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-cpu.a
index ff49e8f..57e3b66 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-cpu.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-cpu.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a
index ae671c5..65400f0 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml.a
index 249cdb8..ed5dea2 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libggml.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama-common-base.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama-common-base.a
new file mode 100644
index 0000000..fb44406
Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama-common-base.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libcommon.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama-common.a
similarity index 57%
rename from ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libcommon.a
rename to ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama-common.a
index f6f06ff..16631a5 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libcommon.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama-common.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama-diffusion.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama-diffusion.a
new file mode 100644
index 0000000..9263253
Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama-diffusion.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama-ui.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama-ui.a
new file mode 100644
index 0000000..ce03eaf
Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama-ui.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama.a
index 3379693..55718cf 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libllama.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libmtmd.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libmtmd.a
index 6cd24ec..57b74cd 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libmtmd.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libmtmd.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libserver-context.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libserver-context.a
index dbeb59d..cf77963 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libserver-context.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-cuda/libserver-context.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libcpp-httplib.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libcpp-httplib.a
index 9c5a1ac..9deb533 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libcpp-httplib.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libcpp-httplib.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml-base.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml-base.a
index ba99cb7..cf26741 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml-base.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml-base.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml-cpu.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml-cpu.a
index da2df04..57e3b66 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml-cpu.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml-cpu.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml-vulkan.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml-vulkan.a
index 88a9ac4..6fddb02 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml-vulkan.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml-vulkan.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml.a
index 72fc96e..6dc3907 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libggml.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama-common-base.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama-common-base.a
new file mode 100644
index 0000000..fb44406
Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama-common-base.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libcommon.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama-common.a
similarity index 57%
rename from ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libcommon.a
rename to ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama-common.a
index a8468fe..16631a5 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libcommon.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama-common.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama-diffusion.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama-diffusion.a
new file mode 100644
index 0000000..9263253
Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama-diffusion.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama-ui.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama-ui.a
new file mode 100644
index 0000000..ce03eaf
Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama-ui.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama.a
index 9da3eae..55718cf 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libllama.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libmtmd.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libmtmd.a
index 6ecf783..57b74cd 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libmtmd.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libmtmd.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libserver-context.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libserver-context.a
index 213ce33..cf77963 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libserver-context.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64-vulkan/libserver-context.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libcpp-httplib.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libcpp-httplib.a
index f8f5d13..4d0a0d6 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libcpp-httplib.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libcpp-httplib.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libggml-base.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libggml-base.a
index bd61e10..2b01df0 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libggml-base.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libggml-base.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libggml-cpu.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libggml-cpu.a
index e690af0..947396e 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libggml-cpu.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libggml-cpu.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libggml.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libggml.a
index ff7fa77..9d4e8f0 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libggml.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libggml.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama-common-base.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama-common-base.a
new file mode 100644
index 0000000..fb7b545
Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama-common-base.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libcommon.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama-common.a
similarity index 63%
rename from ggml/llamacpp/third_party/prebuilt/linux-amd64/libcommon.a
rename to ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama-common.a
index aeceb6b..e366b95 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libcommon.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama-common.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama-diffusion.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama-diffusion.a
new file mode 100644
index 0000000..d94553f
Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama-diffusion.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama-ui.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama-ui.a
new file mode 100644
index 0000000..869028f
Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama-ui.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama.a
index 1ed1847..e0cac12 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libllama.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libmtmd.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libmtmd.a
index 0792c2c..dbe3eae 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libmtmd.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libmtmd.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libserver-context.a b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libserver-context.a
index 8b56512..17e7aaf 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-amd64/libserver-context.a and b/ggml/llamacpp/third_party/prebuilt/linux-amd64/libserver-context.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libcpp-httplib.a b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libcpp-httplib.a
index 0f0a2e3..1db6a89 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libcpp-httplib.a and b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libcpp-httplib.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libggml-base.a b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libggml-base.a
index d7498bd..c70452b 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libggml-base.a and b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libggml-base.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libggml-cpu.a b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libggml-cpu.a
index 53465bf..129bff7 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libggml-cpu.a and b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libggml-cpu.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libggml.a b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libggml.a
index a3f95e8..bc80da1 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libggml.a and b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libggml.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama-common-base.a b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama-common-base.a
new file mode 100644
index 0000000..54d45f2
Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama-common-base.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libcommon.a b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama-common.a
similarity index 51%
rename from ggml/llamacpp/third_party/prebuilt/linux-arm64/libcommon.a
rename to ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama-common.a
index e8a6ab9..18957c5 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libcommon.a and b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama-common.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama-diffusion.a b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama-diffusion.a
new file mode 100644
index 0000000..4e342b1
Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama-diffusion.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama-ui.a b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama-ui.a
new file mode 100644
index 0000000..e86fb73
Binary files /dev/null and b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama-ui.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama.a b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama.a
index 945138f..ee22667 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama.a and b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libllama.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libmtmd.a b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libmtmd.a
index b23259b..0d25011 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libmtmd.a and b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libmtmd.a differ
diff --git a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libserver-context.a b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libserver-context.a
index 0cb0b6e..65897a4 100644
Binary files a/ggml/llamacpp/third_party/prebuilt/linux-arm64/libserver-context.a and b/ggml/llamacpp/third_party/prebuilt/linux-arm64/libserver-context.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libcommon.a b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libcommon.a
index c58565f..9992462 100644
Binary files a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libcommon.a and b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libcommon.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-base.a b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-base.a
index 3a51b19..c0bdda3 100644
Binary files a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-base.a and b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-base.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-blas.a b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-blas.a
index a1a690e..26e4777 100644
Binary files a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-blas.a and b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-blas.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-cpu.a b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-cpu.a
index b78e00f..1674f5c 100644
Binary files a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-cpu.a and b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-cpu.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-metal.a b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-metal.a
index f2434b6..b2d6730 100644
Binary files a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-metal.a and b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml-metal.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml.a b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml.a
index ed939f4..fef498c 100644
Binary files a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml.a and b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libggml.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libwhisper.a b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libwhisper.a
index 4ff1a6b..a1afac6 100644
Binary files a/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libwhisper.a and b/ggml/whispercpp/third_party/prebuilt/darwin-amd64/libwhisper.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libcommon.a b/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libcommon.a
new file mode 100644
index 0000000..a438119
Binary files /dev/null and b/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libcommon.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libggml-base.a b/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libggml-base.a
new file mode 100644
index 0000000..7c89ec0
Binary files /dev/null and b/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libggml-base.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libggml-blas.a b/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libggml-blas.a
new file mode 100644
index 0000000..fdf718a
Binary files /dev/null and b/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libggml-blas.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libggml-cpu.a b/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libggml-cpu.a
new file mode 100644
index 0000000..03dcc94
Binary files /dev/null and b/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libggml-cpu.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libggml-metal.a b/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libggml-metal.a
new file mode 100644
index 0000000..ec7fb4a
Binary files /dev/null and b/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libggml-metal.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libggml.a b/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libggml.a
new file mode 100644
index 0000000..b04cacd
Binary files /dev/null and b/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libggml.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libwhisper.a b/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libwhisper.a
new file mode 100644
index 0000000..cdb2d26
Binary files /dev/null and b/ggml/whispercpp/third_party/prebuilt/darwin-arm64/libwhisper.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libcommon.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libcommon.a
index 61aa619..0f941af 100644
Binary files a/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libcommon.a and b/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libcommon.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-base.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-base.a
index 4207df2..d343ff2 100644
Binary files a/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-base.a and b/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-base.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-cpu.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-cpu.a
index dbaf36f..568975d 100644
Binary files a/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-cpu.a and b/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-cpu.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a
index c4e6da5..01d0914 100644
Binary files a/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a and b/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml-cuda.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml.a
index 8db5376..50b3c7c 100644
Binary files a/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml.a and b/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libggml.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libwhisper.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libwhisper.a
index a0faa0f..ba57e1c 100644
Binary files a/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libwhisper.a and b/ggml/whispercpp/third_party/prebuilt/linux-amd64-cuda/libwhisper.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libcommon.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libcommon.a
index 79bf1cb..0f941af 100644
Binary files a/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libcommon.a and b/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libcommon.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml-base.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml-base.a
index 6312e47..d343ff2 100644
Binary files a/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml-base.a and b/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml-base.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml-cpu.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml-cpu.a
index a6adb24..568975d 100644
Binary files a/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml-cpu.a and b/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml-cpu.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml-vulkan.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml-vulkan.a
index c93f533..4ddf15c 100644
Binary files a/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml-vulkan.a and b/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml-vulkan.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml.a
index 00e15d4..a3918c7 100644
Binary files a/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml.a and b/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libggml.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libwhisper.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libwhisper.a
index babf534..2e5cad8 100644
Binary files a/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libwhisper.a and b/ggml/whispercpp/third_party/prebuilt/linux-amd64-vulkan/libwhisper.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/linux-amd64/libggml-cpu.a b/ggml/whispercpp/third_party/prebuilt/linux-amd64/libggml-cpu.a
index d6395a7..23844ab 100644
Binary files a/ggml/whispercpp/third_party/prebuilt/linux-amd64/libggml-cpu.a and b/ggml/whispercpp/third_party/prebuilt/linux-amd64/libggml-cpu.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/linux-arm64/libcommon.a b/ggml/whispercpp/third_party/prebuilt/linux-arm64/libcommon.a
index 94ed338..97f3889 100644
Binary files a/ggml/whispercpp/third_party/prebuilt/linux-arm64/libcommon.a and b/ggml/whispercpp/third_party/prebuilt/linux-arm64/libcommon.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/linux-arm64/libggml-base.a b/ggml/whispercpp/third_party/prebuilt/linux-arm64/libggml-base.a
index 38cf53b..6d03128 100644
Binary files a/ggml/whispercpp/third_party/prebuilt/linux-arm64/libggml-base.a and b/ggml/whispercpp/third_party/prebuilt/linux-arm64/libggml-base.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/linux-arm64/libggml-cpu.a b/ggml/whispercpp/third_party/prebuilt/linux-arm64/libggml-cpu.a
index a37112a..38373aa 100644
Binary files a/ggml/whispercpp/third_party/prebuilt/linux-arm64/libggml-cpu.a and b/ggml/whispercpp/third_party/prebuilt/linux-arm64/libggml-cpu.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/linux-arm64/libggml.a b/ggml/whispercpp/third_party/prebuilt/linux-arm64/libggml.a
index 39a9565..218ce14 100644
Binary files a/ggml/whispercpp/third_party/prebuilt/linux-arm64/libggml.a and b/ggml/whispercpp/third_party/prebuilt/linux-arm64/libggml.a differ
diff --git a/ggml/whispercpp/third_party/prebuilt/linux-arm64/libwhisper.a b/ggml/whispercpp/third_party/prebuilt/linux-arm64/libwhisper.a
index 453f978..c5537de 100644
Binary files a/ggml/whispercpp/third_party/prebuilt/linux-arm64/libwhisper.a and b/ggml/whispercpp/third_party/prebuilt/linux-arm64/libwhisper.a differ
diff --git a/version.go b/version.go
index aafe579..df49375 100644
--- a/version.go
+++ b/version.go
@@ -2,6 +2,6 @@ package gonativeml
 
 const (
 	Version           = "v0.1.7"
-	LlamaCppVersion   = "b8772"
+	LlamaCppVersion   = "b9222"
 	WhisperCppVersion = "v1.8.3"
 )