diff --git a/cmd/metal-agent/main.go b/cmd/metal-agent/main.go index 2806b0c..44af8e7 100644 --- a/cmd/metal-agent/main.go +++ b/cmd/metal-agent/main.go @@ -52,6 +52,7 @@ type AgentConfig struct { Namespace string ModelStorePath string LlamaServerBin string + LlamaServerPort int Runtime string OMLXBin string OMLXPort int @@ -209,6 +210,10 @@ func main() { flag.StringVar(&cfg.Namespace, "namespace", "default", "Kubernetes namespace to watch") flag.StringVar(&cfg.ModelStorePath, "model-store", "/tmp/llmkube-models", "Path to store downloaded models") flag.StringVar(&llamaServerFlag, "llama-server", "", "Path to llama-server binary (auto-detected if not set)") + flag.IntVar(&cfg.LlamaServerPort, "llama-server-port", 0, + "Fixed port for the llama-server runtime. 0 (default) allocates an "+ + "ephemeral port per process; set a fixed port for stable native "+ + "clients (e.g. an OpenAI-compatible tool pointed at localhost).") flag.StringVar(&cfg.Runtime, "runtime", "llama-server", "Inference runtime: llama-server, omlx, ollama, vllm-swift, or mlx-server") flag.StringVar(&cfg.OMLXBin, "omlx-bin", "", "Path to omlx binary (auto-detected if not set)") @@ -433,6 +438,7 @@ func main() { Namespace: cfg.Namespace, ModelStorePath: cfg.ModelStorePath, LlamaServerBin: cfg.LlamaServerBin, + LlamaServerPort: cfg.LlamaServerPort, Runtime: cfg.Runtime, OMLXBin: cfg.OMLXBin, OMLXPort: cfg.OMLXPort, diff --git a/pkg/agent/agent.go b/pkg/agent/agent.go index 7d3bd28..19a59df 100644 --- a/pkg/agent/agent.go +++ b/pkg/agent/agent.go @@ -90,6 +90,10 @@ type MetalAgentConfig struct { // MLXServerPort is the fixed port the mlx-server process binds. // Only used when Runtime is "mlx-server"; zero defaults to 8080. MLXServerPort int + // LlamaServerPort is a fixed port for the llama-server runtime. Only used + // when Runtime is "llama-server"; zero allocates an ephemeral port per + // process (the historical behavior). + LlamaServerPort int // MemoryProvider supplies system memory info. Nil defaults to DarwinMemoryProvider. MemoryProvider MemoryProvider @@ -346,6 +350,7 @@ func (a *MetalAgent) Start(ctx context.Context) error { if a.config.LlamaServerStartupTimeout > 0 { metalExec.SetStartupTimeout(a.config.LlamaServerStartupTimeout) } + metalExec.SetPort(a.config.LlamaServerPort) a.executor = metalExec } diff --git a/pkg/agent/executor.go b/pkg/agent/executor.go index 413f286..d9ed60e 100644 --- a/pkg/agent/executor.go +++ b/pkg/agent/executor.go @@ -127,6 +127,10 @@ type MetalExecutor struct { modelStorePath string logger *zap.SugaredLogger startupTimeout time.Duration + // fixedPort, when non-zero, is the port every spawned llama-server binds + // instead of an ephemeral one. Set via SetPort. A fixed port gives native + // OpenAI-compatible clients a stable endpoint across process respawns. + fixedPort int } func NewMetalExecutor(llamaServerBin, modelStorePath string, logger *zap.SugaredLogger) *MetalExecutor { @@ -147,15 +151,30 @@ func (e *MetalExecutor) SetStartupTimeout(d time.Duration) { e.startupTimeout = d } +// SetPort fixes the port every spawned llama-server binds. A value <= 0 +// (the default) keeps the historical behavior of allocating an ephemeral +// port per process. Only one llama-server can use a given fixed port, which +// matches the one-process-per-agent expectation of the Metal path. +func (e *MetalExecutor) SetPort(port int) { + if port < 0 { + port = 0 + } + e.fixedPort = port +} + func (e *MetalExecutor) StartProcess(ctx context.Context, config ExecutorConfig) (*ManagedProcess, error) { modelPath, err := e.ensureModel(ctx, config.ModelSource, config.ModelName) if err != nil { return nil, fmt.Errorf("failed to ensure model: %w", err) } - port, err := e.allocatePort() - if err != nil { - return nil, fmt.Errorf("failed to allocate port: %w", err) + port := e.fixedPort + if port == 0 { + var err error + port, err = e.allocatePort() + if err != nil { + return nil, fmt.Errorf("failed to allocate port: %w", err) + } } args := buildLlamaServerArgs(modelPath, port, config) diff --git a/pkg/agent/executor_test.go b/pkg/agent/executor_test.go index da9f037..ed6c4f8 100644 --- a/pkg/agent/executor_test.go +++ b/pkg/agent/executor_test.go @@ -36,6 +36,26 @@ func TestNewMetalExecutor(t *testing.T) { } } +func TestMetalExecutorSetPort(t *testing.T) { + executor := NewMetalExecutor("/bin/llama-server", "/models", newNopLogger()) + + // Default: no fixed port, so StartProcess falls back to an ephemeral one. + if executor.fixedPort != 0 { + t.Errorf("fixedPort default = %d, want 0", executor.fixedPort) + } + + executor.SetPort(8080) + if executor.fixedPort != 8080 { + t.Errorf("fixedPort after SetPort(8080) = %d, want 8080", executor.fixedPort) + } + + // Negative values are coerced back to 0 (ephemeral). + executor.SetPort(-1) + if executor.fixedPort != 0 { + t.Errorf("fixedPort after SetPort(-1) = %d, want 0", executor.fixedPort) + } +} + func TestAllocatePort(t *testing.T) { executor := NewMetalExecutor("/bin/llama-server", "/models", newNopLogger())