defilantech · Defilan · May 15, 2026 · May 15, 2026
diff --git a/.github/workflows/ci.yml b/.github/workflows/ci.yml
@@ -11,20 +11,38 @@ jobs:
     steps:
       - uses: actions/checkout@v4
 
-      - name: Select Xcode 16
+      - name: Select Xcode
         uses: maxim-lobanov/setup-xcode@v1
         with:
           xcode-version: latest-stable
 
       - name: Show Swift version
         run: swift --version
 
-      - name: Resolve dependencies
-        run: swift package resolve
+      # mlx-swift's Metal shaders and MLX C++ sources are slow to compile, so cache
+      # build products. The key also hashes Package.resolved (the pinned versions).
+      - name: Cache build
+        uses: actions/cache@v4
+        with:
+          path: |
+            .build
+            ~/Library/Caches/org.swift.swiftpm
+          key: ${{ runner.os }}-build-${{ hashFiles('Package.swift', 'Package.resolved') }}
+          restore-keys: ${{ runner.os }}-build-
 
-      - name: Build
-        run: swift build -v
+      # xcodebuild (not `swift build`) compiles the Metal shaders, so this
+      # verifies the artifact users actually run. -skipMacroValidation is
+      # required for the MLXHuggingFace macro plugin in non-interactive runs.
+      - name: Build (xcodebuild — compiles Metal shaders)
+        run: |
+          xcodebuild -scheme mlx-server \
+            -destination 'platform=macOS' \
+            -configuration Debug \
+            -derivedDataPath .build/xcode \
+            -skipMacroValidation \
+            build
 
-      # TODO(phase-1): re-enable once a test target exists.
-      # - name: Test
-      #   run: swift test
+      # The test suite is model-free (no GPU / no weights), so SwiftPM runs
+      # it directly and fast.
+      - name: Test
+        run: swift test
diff --git a/.gitignore b/.gitignore
@@ -8,3 +8,4 @@ DerivedData/
 .idea/
 .vscode/
 *.swp
+*.profraw
diff --git a/Package.swift b/Package.swift
@@ -18,26 +18,45 @@ let package = Package(
         .package(url: "https://github.com/apple/swift-log.git", from: "1.6.0"),
         // Metrics API (Prometheus backend wired up in Phase 2).
         .package(url: "https://github.com/apple/swift-metrics.git", from: "2.5.0"),
-
-        // TODO(phase-1): add mlx-swift-lm once the upstream Package.swift either
-        // exposes mlx-swift as a remote URL dependency or we adopt a workspace
-        // / submodule strategy. mlx-swift-lm currently uses .package(path: "../mlx-swift")
-        // which blocks remote consumption.
-        //   https://github.com/ekryski/mlx-swift-lm/blob/alpha/Package.swift
+        // MLX inference for Apple Silicon: LLMs/VLMs plus the chat-template
+        // tool-call parsers. Consumed remotely from the v3.32.1-alpha tag.
+        .package(url: "https://github.com/ekryski/mlx-swift-lm", exact: "3.32.1-alpha"),
+        // HuggingFace hub client + tokenizers. Required by the MLXHuggingFace
+        // macros that generate the model Downloader / TokenizerLoader.
+        .package(url: "https://github.com/huggingface/swift-transformers", from: "1.3.0"),
+        .package(url: "https://github.com/huggingface/swift-huggingface", from: "0.9.0"),
     ],
     targets: [
+        // Thin executable: CLI parsing only. All logic lives in MLXServerKit.
         .executableTarget(
             name: "MLXServer",
             dependencies: [
-                .product(name: "Hummingbird", package: "hummingbird"),
+                "MLXServerKit",
                 .product(name: "ArgumentParser", package: "swift-argument-parser"),
+            ]
+        ),
+        // Library target: server, routing, inference engine, OpenAI types.
+        // Separated from the executable so it is unit-testable.
+        .target(
+            name: "MLXServerKit",
+            dependencies: [
+                .product(name: "Hummingbird", package: "hummingbird"),
                 .product(name: "Logging", package: "swift-log"),
                 .product(name: "Metrics", package: "swift-metrics"),
+                .product(name: "MLXLLM", package: "mlx-swift-lm"),
+                .product(name: "MLXLMCommon", package: "mlx-swift-lm"),
+                .product(name: "MLXHuggingFace", package: "mlx-swift-lm"),
+                .product(name: "Tokenizers", package: "swift-transformers"),
+                .product(name: "HuggingFace", package: "swift-huggingface"),
+            ]
+        ),
+        // swift-testing (ships with the Swift 6 toolchain; needs a full Xcode).
+        .testTarget(
+            name: "MLXServerTests",
+            dependencies: [
+                "MLXServerKit",
+                .product(name: "HummingbirdTesting", package: "hummingbird"),
             ]
         ),
-        // TODO(phase-1): re-add test target once we have real handlers to test.
-        // Will use swift-testing (built into Swift 6+) when targeting a full
-        // Xcode toolchain. Command Line Tools-only installs do not ship the
-        // Testing module.
     ]
 )
diff --git a/README.md b/README.md
@@ -4,7 +4,10 @@ OpenAI-compatible HTTP server for [mlx-swift-lm](https://github.com/ekryski/mlx-
 
 ## Status
 
-**Phase 0: scaffolding only.** No model loading, no inference yet. The repository exists so that the design conversation can proceed against real code. See the [roadmap](#roadmap) for what is planned.
+**Phase 1 + tool calling: working.** Loads an MLX model and serves
+`/v1/chat/completions` (streaming and non-streaming), `/v1/models`, and
+`/health`, with OpenAI-compatible tool calling. Validated end-to-end against
+Qwen3-4B and Qwen3.6-35B-A3B (MoE). See the [roadmap](#roadmap) for what is next.
 
 ## Why this exists
 
@@ -31,25 +34,37 @@ The end goal is to be a drop-in replacement for `llama-server` in [LLMKube](http
 
 [TheTom's MLXServer](https://github.com/ekryski/mlx-swift-lm/tree/ek/tom-eric-moe-tuning/Sources/MLXServer) (abandoned in favor of vllm-swift) was the proof-of-concept that an MLX-swift HTTP server is feasible. Several design decisions here, particularly around the slot manager and longest-prefix KV cache, are informed by his approach. The decision to rebuild rather than fork is mainly because his original used hand-rolled socket code; this repo uses [Hummingbird](https://github.com/hummingbird-project/hummingbird) for the HTTP layer.
 
-## Build
+## Build and run
 
 Requires:
 - macOS 14 (Sonoma) or later, Apple Silicon
 - Swift 6.0 or later (Xcode 16+)
 
+`swift build` compiles the Swift sources, but **SwiftPM cannot
+compile mlx-swift's Metal shaders** — a binary built that way fails at runtime
+with `Failed to load the default metallib`. To run the server, build with
+`xcodebuild` (as CI does), which compiles and bundles the Metal library next to the binary:
+
 ```bash
-swift build
-.build/debug/mlx-server --help
+xcodebuild -scheme mlx-server -destination 'platform=macOS,arch=arm64' \
+  -configuration Debug -derivedDataPath .build/xcode -skipMacroValidation build
+
+.build/xcode/Build/Products/Debug/mlx-server \
+  --model /path/to/mlx-model-dir --port 8080
 ```
 
+`--model` takes a local MLX model directory or a HuggingFace id. Other flags:
+`--host`, `--port`, `--max-slots`, `--tool-call-format` (e.g. `xml_function`
+for Qwen3.5 / Qwen3-Coder; auto-inferred when unset).
+
 ## Roadmap
 
 | Phase | Scope | Status |
 |-------|-------|--------|
-| 0 | Scaffolding, CI, `/health` endpoint, dependency wiring | In progress |
-| 1 | `/v1/chat/completions` (streaming + non-streaming), `/v1/models`, single-slot model loading | Pending mlx-swift-lm Tier 1 release tag |
+| 0 | Scaffolding, CI, `/health` endpoint, dependency wiring | Done |
+| 1 | `/v1/chat/completions` (streaming + non-streaming), `/v1/models`, single-slot model loading | Done |
 | 2 | Multi-slot `SlotManager`, longest-prefix prompt cache, Prometheus `/metrics`, structured logging, graceful shutdown | |
-| 3 | Tool calling, thinking-model support, vision-language models, speculative decoding knobs, `/v1/embeddings` | |
+| 3 | Tool calling, thinking-model support, vision-language models, speculative decoding knobs, `/v1/embeddings` | Tool calling done |
 | 4 | LLMKube `runtime: mlx-server` integration | |
 
 ## License

diff --git a/Sources/MLXServer/MLXServerCommand.swift b/Sources/MLXServer/MLXServerCommand.swift
@@ -0,0 +1,72 @@
+import ArgumentParser
+import MLXServerKit
+
+/// Command-line entry point for the `mlx-server` executable.
+///
+/// Declares the CLI flags, validates them, and forwards them as a
+/// `ServerConfig` to `MLXServerKit.run(config:)`.
+@main
+struct MLXServerCommand: AsyncParsableCommand {
+    static let configuration = CommandConfiguration(
+        commandName: "mlx-server",
+        abstract: "OpenAI-compatible HTTP server for mlx-swift-lm on Apple Silicon."
+    )
+
+    /// Shared by `validate()` and `run()` so both report the same text.
+    private static let missingModelMessage =
+        "--model is required (HuggingFace ID or local directory path)."
+
+    @Option(name: .long, help: "Model identifier (HuggingFace ID or local directory path).")
+    var model: String?
+
+    @Option(name: .long, help: "Bind address.")
+    var host: String = "127.0.0.1"
+
+    @Option(name: .long, help: "Bind port.")
+    var port: Int = 8080
+
+    @Option(name: .long, help: "Maximum concurrent inference slots.")
+    var maxSlots: Int = 4
+
+    @Option(name: .long, help: "Tool-call format override (e.g. xml_function, json). Auto-inferred when unset.")
+    var toolCallFormat: String?
+
+    /// Rejects unusable flag values before `run()` starts the server.
+    ///
+    /// - Throws: `ValidationError` if `--model` is missing or blank, `--port`
+    ///   is outside 0...65535, or `--max-slots` is below 1.
+    func validate() throws {
+        guard let model, !model.allSatisfy(\.isWhitespace) else {
+            throw ValidationError(Self.missingModelMessage)
+        }
+        // 0 stays valid: it is the conventional "any free port" request.
+        guard (0...65535).contains(port) else {
+            throw ValidationError("--port must be between 0 and 65535 (got \(port)).")
+        }
+        guard maxSlots >= 1 else {
+            throw ValidationError("--max-slots must be at least 1 (got \(maxSlots)).")
+        }
+    }
+
+    /// Builds a `ServerConfig` from the parsed flags and runs the server.
+    ///
+    /// - Throws: `ValidationError` if `--model` was not supplied, plus any
+    ///   error thrown by `MLXServerKit.run(config:)`.
+    func run() async throws {
+        // Re-checked here so `model` is unwrapped even if `run()` is
+        // reached without `validate()` having been called.
+        guard let model else {
+            throw ValidationError(Self.missingModelMessage)
+        }
+
+        let config = ServerConfig(
+            model: model,
+            host: host,
+            port: port,
+            maxSlots: maxSlots,
+            toolCallFormat: toolCallFormat
+        )
+
+        try await MLXServerKit.run(config: config)
+    }
+}
diff --git a/Sources/MLXServer/main.swift b/Sources/MLXServer/main.swift
-Original file line number
+Diff line change
@@ Expand Up / @@ -8,3 +8,4 @@ DerivedData/ @@
     .idea/
     .vscode/
     *.swp
+    *.profraw