Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
6 changes: 6 additions & 0 deletions cmd/metal-agent/main.go
Original file line number Diff line number Diff line change
Expand Up @@ -52,6 +52,7 @@ type AgentConfig struct {
Namespace string
ModelStorePath string
LlamaServerBin string
LlamaServerPort int
Runtime string
OMLXBin string
OMLXPort int
Expand Down Expand Up @@ -209,6 +210,10 @@ func main() {
flag.StringVar(&cfg.Namespace, "namespace", "default", "Kubernetes namespace to watch")
flag.StringVar(&cfg.ModelStorePath, "model-store", "/tmp/llmkube-models", "Path to store downloaded models")
flag.StringVar(&llamaServerFlag, "llama-server", "", "Path to llama-server binary (auto-detected if not set)")
flag.IntVar(&cfg.LlamaServerPort, "llama-server-port", 0,
"Fixed port for the llama-server runtime. 0 (default) allocates an "+
"ephemeral port per process; set a fixed port for stable native "+
"clients (e.g. an OpenAI-compatible tool pointed at localhost).")
flag.StringVar(&cfg.Runtime, "runtime", "llama-server",
"Inference runtime: llama-server, omlx, ollama, vllm-swift, or mlx-server")
flag.StringVar(&cfg.OMLXBin, "omlx-bin", "", "Path to omlx binary (auto-detected if not set)")
Expand Down Expand Up @@ -433,6 +438,7 @@ func main() {
Namespace: cfg.Namespace,
ModelStorePath: cfg.ModelStorePath,
LlamaServerBin: cfg.LlamaServerBin,
LlamaServerPort: cfg.LlamaServerPort,
Runtime: cfg.Runtime,
OMLXBin: cfg.OMLXBin,
OMLXPort: cfg.OMLXPort,
Expand Down
5 changes: 5 additions & 0 deletions pkg/agent/agent.go
Original file line number Diff line number Diff line change
Expand Up @@ -90,6 +90,10 @@ type MetalAgentConfig struct {
// MLXServerPort is the fixed port the mlx-server process binds.
// Only used when Runtime is "mlx-server"; zero defaults to 8080.
MLXServerPort int
// LlamaServerPort is a fixed port for the llama-server runtime. Only used
// when Runtime is "llama-server"; zero allocates an ephemeral port per
// process (the historical behavior).
LlamaServerPort int

// MemoryProvider supplies system memory info. Nil defaults to DarwinMemoryProvider.
MemoryProvider MemoryProvider
Expand Down Expand Up @@ -346,6 +350,7 @@ func (a *MetalAgent) Start(ctx context.Context) error {
if a.config.LlamaServerStartupTimeout > 0 {
metalExec.SetStartupTimeout(a.config.LlamaServerStartupTimeout)
}
metalExec.SetPort(a.config.LlamaServerPort)
a.executor = metalExec
}

Expand Down
25 changes: 22 additions & 3 deletions pkg/agent/executor.go
Original file line number Diff line number Diff line change
Expand Up @@ -127,6 +127,10 @@ type MetalExecutor struct {
modelStorePath string
logger *zap.SugaredLogger
startupTimeout time.Duration
// fixedPort, when non-zero, is the port every spawned llama-server binds
// instead of an ephemeral one. Set via SetPort. A fixed port gives native
// OpenAI-compatible clients a stable endpoint across process respawns.
fixedPort int
}

func NewMetalExecutor(llamaServerBin, modelStorePath string, logger *zap.SugaredLogger) *MetalExecutor {
Expand All @@ -147,15 +151,30 @@ func (e *MetalExecutor) SetStartupTimeout(d time.Duration) {
e.startupTimeout = d
}

// SetPort fixes the TCP port every spawned llama-server binds.
//
// Valid fixed ports are 1-65535. Any other value (zero, negative, or above
// 65535) is stored as 0, which keeps the historical behavior of allocating an
// ephemeral port per process. Only one llama-server can use a given fixed
// port, which matches the one-process-per-agent expectation of the Metal path.
func (e *MetalExecutor) SetPort(port int) {
	// A port outside the 16-bit TCP range can never be bound, so treat it
	// like the other invalid inputs rather than passing it to llama-server.
	if port < 0 || port > 65535 {
		port = 0
	}
	e.fixedPort = port
}

func (e *MetalExecutor) StartProcess(ctx context.Context, config ExecutorConfig) (*ManagedProcess, error) {
modelPath, err := e.ensureModel(ctx, config.ModelSource, config.ModelName)
if err != nil {
return nil, fmt.Errorf("failed to ensure model: %w", err)
}

port, err := e.allocatePort()
if err != nil {
return nil, fmt.Errorf("failed to allocate port: %w", err)
port := e.fixedPort
if port == 0 {
var err error
port, err = e.allocatePort()
if err != nil {
return nil, fmt.Errorf("failed to allocate port: %w", err)
}
}

args := buildLlamaServerArgs(modelPath, port, config)
Expand Down
20 changes: 20 additions & 0 deletions pkg/agent/executor_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -36,6 +36,26 @@ func TestNewMetalExecutor(t *testing.T) {
}
}

// TestMetalExecutorSetPort verifies how SetPort updates the executor's
// fixedPort field: unset by default, stored as given for a positive port,
// and reset to zero for a negative one.
func TestMetalExecutorSetPort(t *testing.T) {
	metalExec := NewMetalExecutor("/bin/llama-server", "/models", newNopLogger())

	// A fresh executor carries no fixed port, so StartProcess would take the
	// ephemeral-allocation branch.
	if got := metalExec.fixedPort; got != 0 {
		t.Errorf("fixedPort default = %d, want 0", got)
	}

	// A positive port is recorded as-is.
	metalExec.SetPort(8080)
	if got := metalExec.fixedPort; got != 8080 {
		t.Errorf("fixedPort after SetPort(8080) = %d, want 8080", got)
	}

	// A negative port is normalized to 0, i.e. back to ephemeral allocation.
	metalExec.SetPort(-1)
	if got := metalExec.fixedPort; got != 0 {
		t.Errorf("fixedPort after SetPort(-1) = %d, want 0", got)
	}
}

func TestAllocatePort(t *testing.T) {
executor := NewMetalExecutor("/bin/llama-server", "/models", newNopLogger())

Expand Down