Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
10 changes: 8 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -54,8 +54,14 @@ xcodebuild -scheme mlx-server -destination 'platform=macOS,arch=arm64' \
```

`--model` takes a local MLX model directory or a HuggingFace id. Other flags:
`--host`, `--port`, `--max-slots`, `--tool-call-format` (e.g. `xml_function`
for Qwen3.5 / Qwen3-Coder; auto-inferred when unset).

- `--host`, `--port`, `--max-slots`
- `--tool-call-format` — e.g. `xml_function` for Qwen3.5 / Qwen3-Coder;
auto-inferred when unset
- `--reasoning` — how thinking output is split into `reasoning_content` vs
`content`: `auto` (default; splits on a literal `<think>`/`</think>`),
`prefilled` (output starts mid-thought — use for Qwen3.5 / Qwen3.6), or
`off`

## Roadmap

Expand Down
9 changes: 8 additions & 1 deletion Sources/MLXServer/MLXServerCommand.swift
Original file line number Diff line number Diff line change
Expand Up @@ -23,17 +23,24 @@ struct MLXServerCommand: AsyncParsableCommand {
@Option(name: .long, help: "Tool-call format override (e.g. xml_function, json). Auto-inferred when unset.")
var toolCallFormat: String?

@Option(name: .long, help: "Reasoning split mode: auto, prefilled, or off. Use 'prefilled' for Qwen3.5 / Qwen3.6.")
var reasoning: String = "auto"

func run() async throws {
guard let model else {
throw ValidationError("--model is required (HuggingFace ID or local directory path).")
}
guard let reasoningMode = ReasoningMode(rawValue: reasoning) else {
throw ValidationError("--reasoning must be one of: auto, prefilled, off")
}

let config = ServerConfig(
model: model,
host: host,
port: port,
maxSlots: maxSlots,
toolCallFormat: toolCallFormat
toolCallFormat: toolCallFormat,
reasoningMode: reasoningMode
)

try await MLXServerKit.run(config: config)
Expand Down
28 changes: 24 additions & 4 deletions Sources/MLXServerKit/ChatCompletion.swift
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,9 @@ extension InferenceEngine {
let input = try await prepareInput(for: request)
let parameters = ChatMapping.resolveGenerateParameters(request)

var text = ""
var splitter = ReasoningSplitter(mode: reasoningMode)
var content = ""
var reasoning = ""
var toolCalls: [ToolCallObject] = []
var info: GenerateCompletionInfo?

Expand All @@ -16,7 +18,9 @@ extension InferenceEngine {
for await generation in stream {
switch generation {
case .chunk(let chunk):
text += chunk
let split = splitter.push(chunk)
content += split.content
reasoning += split.reasoning
case .toolCall(let call):
toolCalls.append(Self.toolCallObject(call, index: toolCalls.count))
case .info(let completionInfo):
Expand All @@ -26,10 +30,14 @@ extension InferenceEngine {
} catch {
throw ServerError.inferenceFailed(String(describing: error))
}
let tail = splitter.flush()
content += tail.content
reasoning += tail.reasoning

let hasToolCalls = !toolCalls.isEmpty
let message = ChatCompletionResponse.ResponseMessage(
content: hasToolCalls ? (text.isEmpty ? nil : text) : text,
content: hasToolCalls && content.isEmpty ? nil : content,
reasoningContent: reasoning.isEmpty ? nil : reasoning,
toolCalls: hasToolCalls ? toolCalls : nil
)
return ChatCompletionResponse(
Expand Down Expand Up @@ -86,6 +94,7 @@ extension InferenceEngine {
/// Transport-agnostic events produced while a completion streams, prior to
/// being wrapped in OpenAI-style chunks.
enum StreamEvent: Sendable {
    /// A piece of answer text.
    case textDelta(String)

    /// A piece of reasoning ("thinking") text.
    case reasoningDelta(String)

    /// A tool call emitted by the model.
    case toolCall(ToolCallObject)

    /// Terminal event carrying the finish reason and the usage figures.
    case finished(reason: String, usage: Usage)
}
Expand Down Expand Up @@ -117,14 +126,21 @@ extension InferenceEngine {
let input = try await prepareInput(for: request)
let parameters = ChatMapping.resolveGenerateParameters(request)

var splitter = ReasoningSplitter(mode: reasoningMode)
var toolCallCount = 0
var info: GenerateCompletionInfo?
do {
let generations = try await container.generate(input: input, parameters: parameters)
for await generation in generations {
switch generation {
case .chunk(let chunk):
continuation.yield(.textDelta(chunk))
let split = splitter.push(chunk)
if !split.reasoning.isEmpty {
continuation.yield(.reasoningDelta(split.reasoning))
}
if !split.content.isEmpty {
continuation.yield(.textDelta(split.content))
}
case .toolCall(let call):
continuation.yield(.toolCall(Self.toolCallObject(call, index: toolCallCount)))
toolCallCount += 1
Expand All @@ -136,6 +152,10 @@ extension InferenceEngine {
throw ServerError.inferenceFailed(String(describing: error))
}

let tail = splitter.flush()
if !tail.reasoning.isEmpty { continuation.yield(.reasoningDelta(tail.reasoning)) }
if !tail.content.isEmpty { continuation.yield(.textDelta(tail.content)) }

continuation.yield(
.finished(
reason: toolCallCount > 0 ? "tool_calls" : "stop",
Expand Down
7 changes: 7 additions & 0 deletions Sources/MLXServerKit/ChatCompletionsHandler.swift
Original file line number Diff line number Diff line change
Expand Up @@ -64,6 +64,13 @@ enum ChatCompletionsHandler {
roleSent = true
try await writer.write(
SSE.event(chunk(id, created, model, delta, finishReason: nil)))
case .reasoningDelta(let text):
let delta = ChatCompletionChunk.Delta(
role: roleSent ? nil : "assistant", content: nil,
reasoningContent: text, toolCalls: nil)
roleSent = true
try await writer.write(
SSE.event(chunk(id, created, model, delta, finishReason: nil)))
case .toolCall(let call):
let delta = ChatCompletionChunk.Delta(
role: roleSent ? nil : "assistant", content: nil, toolCalls: [call])
Expand Down
5 changes: 5 additions & 0 deletions Sources/MLXServerKit/InferenceEngine.swift
Original file line number Diff line number Diff line change
Expand Up @@ -17,17 +17,21 @@ public actor InferenceEngine {
public let modelID: String
/// Resolved tool-call format, or `nil` to let the model config decide.
let toolCallFormat: ToolCallFormat?
/// How thinking output is split into reasoning vs answer text.
let reasoningMode: ReasoningMode
let logger: Logger

/// Memberwise-style initializer; stores each argument in the property of
/// the same name without further processing.
///
/// - Parameters:
///   - container: The model container stored on the engine.
///   - modelID: Identifier stored as `modelID`.
///   - toolCallFormat: Resolved tool-call format, or `nil`.
///   - reasoningMode: How thinking output is split into reasoning vs answer text.
///   - logger: Logger stored on the engine.
private init(
    container: ModelContainer,
    modelID: String,
    toolCallFormat: ToolCallFormat?,
    reasoningMode: ReasoningMode,
    logger: Logger
) {
    // Identity of the loaded model.
    self.modelID = modelID
    self.container = container

    // Output-shaping options.
    self.reasoningMode = reasoningMode
    self.toolCallFormat = toolCallFormat

    self.logger = logger
}

Expand Down Expand Up @@ -63,6 +67,7 @@ public actor InferenceEngine {
container: container,
modelID: config.model,
toolCallFormat: format,
reasoningMode: config.reasoningMode,
logger: logger)
}

Expand Down
4 changes: 4 additions & 0 deletions Sources/MLXServerKit/OpenAITypes.swift
Original file line number Diff line number Diff line change
Expand Up @@ -212,10 +212,12 @@ public struct ChatCompletionResponse: Encodable, Sendable {
/// The assistant message of a non-streaming chat completion response.
public struct ResponseMessage: Encodable, Sendable {
    /// Message role; defaults to `"assistant"`.
    public var role: String = "assistant"
    /// Answer text, if any.
    public var content: String?
    /// Reasoning text, encoded under the `reasoning_content` key.
    public var reasoningContent: String?
    /// Tool calls, encoded under the `tool_calls` key.
    public var toolCalls: [ToolCallObject]?

    /// Maps the camelCase properties onto their snake_case JSON keys.
    enum CodingKeys: String, CodingKey {
        case role
        case content
        case reasoningContent = "reasoning_content"
        case toolCalls = "tool_calls"
    }
}
Expand Down Expand Up @@ -262,10 +264,12 @@ public struct ChatCompletionChunk: Encodable, Sendable {
/// The incremental payload of one streaming chat completion chunk.
public struct Delta: Encodable, Sendable {
    /// Message role, when present on this chunk.
    public var role: String?
    /// Answer text carried by this chunk, if any.
    public var content: String?
    /// Reasoning text carried by this chunk, encoded as `reasoning_content`.
    public var reasoningContent: String?
    /// Tool calls carried by this chunk, encoded as `tool_calls`.
    public var toolCalls: [ToolCallObject]?

    /// Maps the camelCase properties onto their snake_case JSON keys.
    enum CodingKeys: String, CodingKey {
        case role
        case content
        case reasoningContent = "reasoning_content"
        case toolCalls = "tool_calls"
    }
}
Expand Down
125 changes: 125 additions & 0 deletions Sources/MLXServerKit/ReasoningSplitter.swift
Original file line number Diff line number Diff line change
@@ -0,0 +1,125 @@
import Foundation

/// Strategy used to divide generated text between reasoning and the answer.
public enum ReasoningMode: String, Sendable {
    /// Output begins as answer text. The first literal `<think>` switches to
    /// reasoning and the following `</think>` switches back. Output that
    /// never contains a marker stays entirely in the answer.
    case auto
    /// Output begins as reasoning, for chat templates that already emitted
    /// the opening `<think>`. The first `</think>` switches to the answer.
    /// Use for Qwen3.5 / Qwen3.6.
    case prefilled
    /// Splitting disabled: every character is answer text.
    case off
}

/// Incremental classifier that sorts streamed model output into reasoning
/// and answer text based on `<think>` / `</think>` markers.
///
/// Markers may be cut in two by chunk boundaries. Whenever the end of the
/// buffered text could be the beginning of the marker being looked for, that
/// tail is withheld and re-examined together with the next chunk. Call
/// `flush()` after the last chunk to release anything still withheld.
struct ReasoningSplitter {
    /// The portion of text attributed to each side by one `push` or `flush`.
    struct Split: Equatable {
        var reasoning = ""
        var content = ""
    }

    /// Literal marker strings.
    private enum Marker {
        static let open = "<think>"
        static let close = "</think>"
    }

    /// Which side newly emitted text belongs to.
    private enum Phase { case reasoning, content }

    private let mode: ReasoningMode
    private var phase: Phase
    /// `true` until an opening marker has been consumed (only ever set in
    /// `.auto` mode).
    private var awaitingOpener: Bool
    /// Withheld tail of the previous chunk that may be a partial marker.
    private var carry = ""

    /// Creates a splitter whose starting phase is determined by `mode`.
    init(mode: ReasoningMode) {
        self.mode = mode

        let start: (phase: Phase, awaitingOpener: Bool)
        switch mode {
        case .auto:
            start = (.content, true)
        case .prefilled:
            start = (.reasoning, false)
        case .off:
            start = (.content, false)
        }
        phase = start.phase
        awaitingOpener = start.awaitingOpener
    }

    /// The marker the scanner is looking for right now, if any.
    private var soughtMarker: String? {
        switch (awaitingOpener, phase) {
        case (true, _):
            return Marker.open
        case (false, .reasoning):
            return Marker.close
        case (false, .content):
            return nil
        }
    }

    /// Consumes one chunk of model output.
    ///
    /// - Parameter text: The newly generated chunk.
    /// - Returns: The text that can already be attributed, separated into
    ///   reasoning and content. Markers themselves are never included.
    mutating func push(_ text: String) -> Split {
        if mode == .off {
            return Split(content: text)
        }

        var result = Split()
        var buffer = carry + text
        carry = ""

        while true {
            guard let marker = soughtMarker else {
                // Nothing left to look for: the remainder belongs to the
                // current phase as-is.
                route(buffer, into: &result)
                return result
            }

            guard let hit = buffer.range(of: marker) else {
                // Marker not (yet) complete. Release everything except a tail
                // that might turn into the marker once more text arrives.
                let held = heldBackCount(in: buffer, for: marker)
                route(String(buffer.dropLast(held)), into: &result)
                carry = String(buffer.suffix(held))
                return result
            }

            // Text before the marker belongs to the phase we are leaving.
            route(String(buffer[..<hit.lowerBound]), into: &result)
            buffer = String(buffer[hit.upperBound...])

            // Cross the marker: an opener enters reasoning, a closer leaves it.
            phase = (marker == Marker.open) ? .reasoning : .content
            awaitingOpener = false
        }
    }

    /// Releases any withheld text after generation has ended, attributing it
    /// to the phase that is current at that point.
    mutating func flush() -> Split {
        defer { carry = "" }
        var result = Split()
        route(carry, into: &result)
        return result
    }

    /// Appends `text` to the side of `split` matching the current phase.
    private func route(_ text: String, into split: inout Split) {
        if text.isEmpty { return }
        if phase == .reasoning {
            split.reasoning += text
        } else {
            split.content += text
        }
    }

    /// Number of trailing characters of `text` that form the longest proper
    /// prefix of `marker`; `0` when no such tail exists.
    private func heldBackCount(in text: String, for marker: String) -> Int {
        let longest = min(text.count, marker.count - 1)
        // An empty stride (longest == 0) simply falls through to `return 0`.
        for candidate in stride(from: longest, through: 1, by: -1)
        where marker.hasPrefix(text.suffix(candidate)) {
            return candidate
        }
        return 0
    }
}
6 changes: 5 additions & 1 deletion Sources/MLXServerKit/ServerConfig.swift
Original file line number Diff line number Diff line change
Expand Up @@ -15,18 +15,22 @@ public struct ServerConfig: Sendable {
/// Optional tool-call format override (e.g. `xml_function`, `json`).
/// When `nil` the format is inferred from the model's `config.json`.
public var toolCallFormat: String?
/// How thinking-model output is split into `reasoning_content` vs `content`.
public var reasoningMode: ReasoningMode

public init(
model: String,
host: String,
port: Int,
maxSlots: Int,
toolCallFormat: String? = nil
toolCallFormat: String? = nil,
reasoningMode: ReasoningMode = .auto
) {
self.model = model
self.host = host
self.port = port
self.maxSlots = maxSlots
self.toolCallFormat = toolCallFormat
self.reasoningMode = reasoningMode
}
}
Loading
Loading