defilantech · Defilan · May 20, 2026 · May 19, 2026
@@ -52,6 +52,7 @@ type AgentConfig struct {
 	Namespace                 string
 	ModelStorePath            string
 	LlamaServerBin            string
+	LlamaServerPort           int
 	Runtime                   string
 	OMLXBin                   string
 	OMLXPort                  int
@@ -209,6 +210,10 @@ func main() {
 	flag.StringVar(&cfg.Namespace, "namespace", "default", "Kubernetes namespace to watch")
 	flag.StringVar(&cfg.ModelStorePath, "model-store", "/tmp/llmkube-models", "Path to store downloaded models")
 	flag.StringVar(&llamaServerFlag, "llama-server", "", "Path to llama-server binary (auto-detected if not set)")
+	flag.IntVar(&cfg.LlamaServerPort, "llama-server-port", 0,
+		"Fixed port for the llama-server runtime. 0 (default) allocates an "+
+			"ephemeral port per process; set a fixed port for stable native "+
+			"clients (e.g. an OpenAI-compatible tool pointed at localhost).")
 	flag.StringVar(&cfg.Runtime, "runtime", "llama-server",
 		"Inference runtime: llama-server, omlx, ollama, vllm-swift, or mlx-server")
 	flag.StringVar(&cfg.OMLXBin, "omlx-bin", "", "Path to omlx binary (auto-detected if not set)")
@@ -433,6 +438,7 @@ func main() {
 		Namespace:                 cfg.Namespace,
 		ModelStorePath:            cfg.ModelStorePath,
 		LlamaServerBin:            cfg.LlamaServerBin,
+		LlamaServerPort:           cfg.LlamaServerPort,
 		Runtime:                   cfg.Runtime,
 		OMLXBin:                   cfg.OMLXBin,
 		OMLXPort:                  cfg.OMLXPort,

@@ -90,6 +90,10 @@ type MetalAgentConfig struct {
 	// MLXServerPort is the fixed port the mlx-server process binds.
 	// Only used when Runtime is "mlx-server"; zero defaults to 8080.
 	MLXServerPort int
+	// LlamaServerPort is a fixed port for the llama-server runtime. Only used
+	// when Runtime is "llama-server"; zero allocates an ephemeral port per
+	// process (the historical behavior).
+	LlamaServerPort int
 
 	// MemoryProvider supplies system memory info. Nil defaults to DarwinMemoryProvider.
 	MemoryProvider MemoryProvider
@@ -346,6 +350,7 @@ func (a *MetalAgent) Start(ctx context.Context) error {
 		if a.config.LlamaServerStartupTimeout > 0 {
 			metalExec.SetStartupTimeout(a.config.LlamaServerStartupTimeout)
 		}
+		metalExec.SetPort(a.config.LlamaServerPort)
 		a.executor = metalExec
 	}
 

@@ -127,6 +127,10 @@ type MetalExecutor struct {
 	modelStorePath string
 	logger         *zap.SugaredLogger
 	startupTimeout time.Duration
+	// fixedPort, when non-zero, is the port every spawned llama-server binds
+	// instead of an ephemeral one. Set via SetPort. A fixed port gives native
+	// OpenAI-compatible clients a stable endpoint across process respawns.
+	fixedPort int
 }
 
 func NewMetalExecutor(llamaServerBin, modelStorePath string, logger *zap.SugaredLogger) *MetalExecutor {
@@ -147,15 +151,30 @@ func (e *MetalExecutor) SetStartupTimeout(d time.Duration) {
 	e.startupTimeout = d
 }
 
+// SetPort pins the port that every spawned llama-server binds. Passing 0
+// (the default) or a negative value keeps the historical behavior of
+// allocating an ephemeral port per process. A fixed port can be held by only
+// one llama-server at a time, which matches the one-process-per-agent
+// expectation of the Metal path.
+func (e *MetalExecutor) SetPort(port int) {
+	if port > 0 {
+		e.fixedPort = port
+		return
+	}
+	// Zero and negative inputs both mean "no fixed port".
+	e.fixedPort = 0
+}
+
 func (e *MetalExecutor) StartProcess(ctx context.Context, config ExecutorConfig) (*ManagedProcess, error) {
 	modelPath, err := e.ensureModel(ctx, config.ModelSource, config.ModelName)
 	if err != nil {
 		return nil, fmt.Errorf("failed to ensure model: %w", err)
 	}
 
-	port, err := e.allocatePort()
-	if err != nil {
-		return nil, fmt.Errorf("failed to allocate port: %w", err)
+	port := e.fixedPort
+	if port == 0 {
+		var err error
+		port, err = e.allocatePort()
+		if err != nil {
+			return nil, fmt.Errorf("failed to allocate port: %w", err)
+		}
 	}
 
 	args := buildLlamaServerArgs(modelPath, port, config)

@@ -36,6 +36,26 @@ func TestNewMetalExecutor(t *testing.T) {
 	}
 }
 
+// TestMetalExecutorSetPort checks the fixedPort field before any SetPort
+// call, after setting a positive port, and after setting a negative one.
+func TestMetalExecutorSetPort(t *testing.T) {
+	me := NewMetalExecutor("/bin/llama-server", "/models", newNopLogger())
+
+	// A freshly constructed executor has no fixed port, which is the state
+	// in which StartProcess allocates an ephemeral one.
+	if got := me.fixedPort; got != 0 {
+		t.Errorf("fixedPort default = %d, want 0", got)
+	}
+
+	// A positive port is stored unchanged.
+	me.SetPort(8080)
+	if got := me.fixedPort; got != 8080 {
+		t.Errorf("fixedPort after SetPort(8080) = %d, want 8080", got)
+	}
+
+	// A negative port is coerced back to 0 (ephemeral).
+	me.SetPort(-1)
+	if got := me.fixedPort; got != 0 {
+		t.Errorf("fixedPort after SetPort(-1) = %d, want 0", got)
+	}
+}
+
 func TestAllocatePort(t *testing.T) {
 	executor := NewMetalExecutor("/bin/llama-server", "/models", newNopLogger())