From 51caa3cf3e23ec95b2525b0f52b0e5d200e9ffc3 Mon Sep 17 00:00:00 2001 From: Christopher Maher Date: Tue, 19 May 2026 16:14:29 -0700 Subject: [PATCH 1/5] feat: scaffold Foreman API group and operator (v0.1 M0) Foreman is an opt-in add-on layered on LLMKube that schedules agentic workloads (Workload, AgenticTask) across a fleet of nodes (FleetNode). Installing LLMKube alone does not install or require it. M0 is the scaffolding milestone: types, controller stubs, operator binary, Helm chart skeleton. The reconcilers log and return for now; real scheduling lands in M2, the planner in M6. New API group foreman.llmkube.dev/v1alpha1: - Workload: the v0.1 entrypoint, a natural-language intent the planner decomposes into AgenticTasks. - AgenticTask: a dispatchable unit of work (issue-fix, verify, freeform), with RequiredCapability for capability-aware scheduling. - FleetNode: cluster-scoped registry entry the FleetAgent owns; carries the heartbeat and the capability the scheduler matches against. New paths: - api/foreman/v1alpha1/ the three CRD types + groupversion_info - internal/foreman/controller/ empty reconciler stubs (one per kind) - cmd/foreman-operator/ the new operator binary, separate from cmd/main.go; only registers the foreman group, leader-election ID is its own - charts/foreman/ new Helm chart, dependsOn llmkube Core touches are surgical and inference-flow-byte-identical: - scripts/sync-crds.sh now scopes its glob to inference.llmkube.dev_* so foreman CRDs are not pulled into the llmkube chart. - Makefile gains foreman-chart-crds (mirrors chart-crds for the foreman chart). manifests / generate / chart-crds are untouched in behavior; they still produce exactly the same inference outputs. - config/rbac/role.yaml grows by the kubebuilder:rbac markers on the three foreman reconcilers (auto-regenerated by make manifests). Verification: - make generate produces api/foreman/v1alpha1/zz_generated.deepcopy.go. - make manifests produces the three foreman CRD YAMLs. 
- make foreman-chart-crds copies them into charts/foreman/templates/crds. - make chart-crds remains inference-only (verified: charts/llmkube/templates/crds has only the three inference CRDs). - make test passes the full envtest suite; no core regressions. - make lint passes (0 issues). - go build ./cmd/foreman-operator produces a working binary. - kubectl apply of each foreman CRD against kind-llmkube-local accepts a real object; the operator's three reconcilers log the reconcile and return ctrl.Result{} as the stub design intends. Part of the Foreman v0.1 MVP plan: M0 done; M1 (FleetNode heartbeat) next. Signed-off-by: Christopher Maher --- Makefile | 11 + api/foreman/v1alpha1/agentictask_types.go | 254 +++++++++++ api/foreman/v1alpha1/fleetnode_types.go | 168 +++++++ api/foreman/v1alpha1/groupversion_info.go | 41 ++ api/foreman/v1alpha1/workload_types.go | 135 ++++++ api/foreman/v1alpha1/zz_generated.deepcopy.go | 411 ++++++++++++++++++ charts/foreman/Chart.yaml | 24 + .../foreman/templates/crds/agentictasks.yaml | 296 +++++++++++++ charts/foreman/templates/crds/fleetnodes.yaml | 229 ++++++++++ charts/foreman/templates/crds/workloads.yaml | 235 ++++++++++ cmd/foreman-operator/main.go | 121 ++++++ .../foreman.llmkube.dev_agentictasks.yaml | 296 +++++++++++++ .../bases/foreman.llmkube.dev_fleetnodes.yaml | 229 ++++++++++ .../bases/foreman.llmkube.dev_workloads.yaml | 235 ++++++++++ config/rbac/role.yaml | 32 ++ .../controller/agentictask_controller.go | 72 +++ .../controller/fleetnode_controller.go | 71 +++ .../foreman/controller/workload_controller.go | 70 +++ scripts/sync-crds.sh | 4 +- 19 files changed, 2933 insertions(+), 1 deletion(-) create mode 100644 api/foreman/v1alpha1/agentictask_types.go create mode 100644 api/foreman/v1alpha1/fleetnode_types.go create mode 100644 api/foreman/v1alpha1/groupversion_info.go create mode 100644 api/foreman/v1alpha1/workload_types.go create mode 100644 api/foreman/v1alpha1/zz_generated.deepcopy.go create mode 100644 
charts/foreman/Chart.yaml create mode 100644 charts/foreman/templates/crds/agentictasks.yaml create mode 100644 charts/foreman/templates/crds/fleetnodes.yaml create mode 100644 charts/foreman/templates/crds/workloads.yaml create mode 100644 cmd/foreman-operator/main.go create mode 100644 config/crd/bases/foreman.llmkube.dev_agentictasks.yaml create mode 100644 config/crd/bases/foreman.llmkube.dev_fleetnodes.yaml create mode 100644 config/crd/bases/foreman.llmkube.dev_workloads.yaml create mode 100644 internal/foreman/controller/agentictask_controller.go create mode 100644 internal/foreman/controller/fleetnode_controller.go create mode 100644 internal/foreman/controller/workload_controller.go diff --git a/Makefile b/Makefile index befb7302..6c834899 100644 --- a/Makefile +++ b/Makefile @@ -50,6 +50,17 @@ manifests: controller-gen ## Generate WebhookConfiguration, ClusterRole and Cust chart-crds: manifests ## Generate CRDs and sync to Helm chart templates @./scripts/sync-crds.sh +.PHONY: foreman-chart-crds +foreman-chart-crds: manifests ## Sync foreman.llmkube.dev CRDs to the foreman chart. + @mkdir -p charts/foreman/templates/crds + @synced=0; for src in config/crd/bases/foreman.llmkube.dev_*.yaml; do \ + [ -e "$$src" ] || { echo "no foreman CRDs in config/crd/bases (did make manifests run?)"; exit 1; }; \ + base=$$(basename $$src); short=$${base#foreman.llmkube.dev_}; \ + echo "Syncing $$base -> $$short"; \ + cp $$src charts/foreman/templates/crds/$$short; \ + synced=$$((synced+1)); \ + done; echo "Synced $$synced foreman CRD(s)" + .PHONY: generate generate: controller-gen ## Generate code containing DeepCopy, DeepCopyInto, and DeepCopyObject method implementations. $(CONTROLLER_GEN) object:headerFile="hack/boilerplate.go.txt" paths="./..." 
diff --git a/api/foreman/v1alpha1/agentictask_types.go b/api/foreman/v1alpha1/agentictask_types.go new file mode 100644 index 00000000..2e8872ad --- /dev/null +++ b/api/foreman/v1alpha1/agentictask_types.go @@ -0,0 +1,254 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" +) + +// AgenticTaskKind is the unit of work the task performs. Each kind has a +// payload shape, scheduler routing, and lifecycle. +// +kubebuilder:validation:Enum=issue-fix;verify;freeform +type AgenticTaskKind string + +const ( + // AgenticTaskKindIssueFix runs an agent against a GitHub issue: read the + // issue, edit the repo, run the verification, commit (DCO), push a branch. + AgenticTaskKindIssueFix AgenticTaskKind = "issue-fix" + // AgenticTaskKindVerify runs the project's gate (fmt/vet/lint/test + + // codegen sync) against a pushed branch. Typically scheduled by the + // controller as a child of a Succeeded issue-fix task. + AgenticTaskKindVerify AgenticTaskKind = "verify" + // AgenticTaskKindFreeform passes an arbitrary prompt to a named agent. + AgenticTaskKindFreeform AgenticTaskKind = "freeform" +) + +// AgenticTaskAccelerator pins which accelerator family a task needs from the +// node that runs it. "any" lets the scheduler pick from any Ready FleetNode. 
+// +kubebuilder:validation:Enum=metal;cuda;any +type AgenticTaskAccelerator string + +// RequiredCapability tells the scheduler which FleetNodes can serve this task. +// The scheduler matches each field against FleetNode.status.capability; +// unset fields are unconstrained. +type RequiredCapability struct { + // MinRAMGB is the minimum available RAM the node must advertise. + // +kubebuilder:validation:Minimum=0 + // +optional + MinRAMGB int32 `json:"minRAMGB,omitempty"` + + // MinContextTokens is the minimum context window the node's installed + // model must support. Set to 0 to leave unconstrained. + // +kubebuilder:validation:Minimum=0 + // +optional + MinContextTokens int32 `json:"minContextTokens,omitempty"` + + // Accelerator selects an accelerator family. "any" matches any node. + // +kubebuilder:default=any + // +optional + Accelerator AgenticTaskAccelerator `json:"accelerator,omitempty"` + + // NodeSelector is a hard pin: only FleetNodes whose labels match every + // key are eligible. Used for tasks that must run on a specific node + // (e.g. verify tasks targeting the gate runner). + // +optional + NodeSelector map[string]string `json:"nodeSelector,omitempty"` +} + +// AgenticTaskPayload is the kind-discriminated work spec. Each field is only +// meaningful for the kinds named in its description. +type AgenticTaskPayload struct { + // Repo is the "owner/name" GitHub repo. Required for issue-fix and verify. + // +optional + Repo string `json:"repo,omitempty"` + + // Issue is the GitHub issue number. Required for issue-fix. + // +kubebuilder:validation:Minimum=1 + // +optional + Issue int32 `json:"issue,omitempty"` + + // Branch is the existing branch to gate. Required for verify. + // +optional + Branch string `json:"branch,omitempty"` + + // BranchPrefix overrides the branch name prefix on issue-fix tasks + // (default derived from the issue's labels via conventional commit + // prefixes: fix/, feat/, chore/, etc.). 
+ // +optional + BranchPrefix string `json:"branchPrefix,omitempty"` + + // Prompt is the agent input. Required for freeform. + // +optional + Prompt string `json:"prompt,omitempty"` + + // Agent is the named agent to invoke. Required for freeform; defaults + // to "issue-fixer" for issue-fix and "verify" for verify. + // +optional + Agent string `json:"agent,omitempty"` +} + +// AgenticTaskSpec defines the desired state of an AgenticTask. +type AgenticTaskSpec struct { + // Kind selects the work type and payload shape. + // +kubebuilder:validation:Required + Kind AgenticTaskKind `json:"kind"` + + // ModelRef names the Model the agent should use. Optional; the + // scheduler can pick a default based on RequiredCapability. + // +optional + ModelRef string `json:"modelRef,omitempty"` + + // RequiredCapability filters which FleetNodes can serve this task. + // +optional + RequiredCapability RequiredCapability `json:"requiredCapability,omitempty"` + + // Payload is the kind-discriminated work spec. + // +kubebuilder:validation:Required + Payload AgenticTaskPayload `json:"payload"` + + // TimeoutSeconds bounds the agent's run time. Zero uses the operator's + // default (2700, matching the autofix pipeline's value). + // +kubebuilder:validation:Minimum=0 + // +optional + TimeoutSeconds int32 `json:"timeoutSeconds,omitempty"` + + // DependsOn lists AgenticTasks (by name in the same namespace) that + // must reach Succeeded before this task is dispatched. v0.1 uses this + // only to chain verify tasks behind their parent issue-fix. + // +optional + DependsOn []string `json:"dependsOn,omitempty"` + + // Priority is a hint for the scheduler when many tasks are Pending. + // Higher values dispatch first. v0.1 is FIFO and ignores priority. + // +optional + Priority int32 `json:"priority,omitempty"` +} + +// AgenticTaskPhase is the lifecycle state of a task. 
+// +kubebuilder:validation:Enum=Pending;Scheduled;Running;Verifying;Succeeded;Failed +type AgenticTaskPhase string + +const ( + AgenticTaskPhasePending AgenticTaskPhase = "Pending" + AgenticTaskPhaseScheduled AgenticTaskPhase = "Scheduled" + AgenticTaskPhaseRunning AgenticTaskPhase = "Running" + AgenticTaskPhaseVerifying AgenticTaskPhase = "Verifying" + AgenticTaskPhaseSucceeded AgenticTaskPhase = "Succeeded" + AgenticTaskPhaseFailed AgenticTaskPhase = "Failed" +) + +// AgenticTaskVerdict is the final outcome category, distinct from Phase. +// A task can be Succeeded with a NO-GO verdict (the agent legitimately +// declined to fix the issue) or Failed with no verdict at all (the run +// timed out before producing a verdict). +// +kubebuilder:validation:Enum=GO;NO-GO;INCOMPLETE;GATE-PASS;GATE-FAIL;GATE-ERROR +type AgenticTaskVerdict string + +// AgenticTaskStatus defines the observed state of an AgenticTask. +type AgenticTaskStatus struct { + // Phase is the current lifecycle phase. + // +optional + Phase AgenticTaskPhase `json:"phase,omitempty"` + + // AssignedNode is the FleetNode.metadata.name the scheduler routed to. + // +optional + AssignedNode string `json:"assignedNode,omitempty"` + + // ClaimedAt is when the FleetAgent on AssignedNode claimed the task. + // +optional + ClaimedAt *metav1.Time `json:"claimedAt,omitempty"` + + // StartedAt is when the executor began work. + // +optional + StartedAt *metav1.Time `json:"startedAt,omitempty"` + + // FinishedAt is when the executor produced a verdict (success or fail). + // +optional + FinishedAt *metav1.Time `json:"finishedAt,omitempty"` + + // Verdict is the final outcome category. + // +optional + Verdict AgenticTaskVerdict `json:"verdict,omitempty"` + + // Result is the structured JSON the agent emitted, validated against the + // foreman.v1 schema. Opaque to the API server. 
+ // +optional + Result *runtime.RawExtension `json:"result,omitempty"` + + // Branch is the pushed branch, set on a successful issue-fix. + // +optional + Branch string `json:"branch,omitempty"` + + // CommitSHA is the head commit of Branch. + // +optional + CommitSHA string `json:"commitSHA,omitempty"` + + // TranscriptRef points to where the agent's full transcript was stored + // (typically a ConfigMap in the operator's namespace). + // +optional + TranscriptRef string `json:"transcriptRef,omitempty"` + + // Conditions represent the current state of the task. Standard types: + // Scheduled, Running, Completed, Failed. + // +listType=map + // +listMapKey=type + // +optional + Conditions []metav1.Condition `json:"conditions,omitempty"` +} + +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:resource:shortName=at +// +kubebuilder:printcolumn:name="Phase",type=string,JSONPath=`.status.phase` +// +kubebuilder:printcolumn:name="Kind",type=string,JSONPath=`.spec.kind` +// +kubebuilder:printcolumn:name="Node",type=string,JSONPath=`.status.assignedNode` +// +kubebuilder:printcolumn:name="Verdict",type=string,JSONPath=`.status.verdict` +// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp` + +// AgenticTask is the unit of dispatchable agentic work. The Foreman scheduler +// matches each Pending task to a FleetNode whose advertised capability +// satisfies the task's RequiredCapability, then a FleetAgent on that node +// picks up the task and runs the matching executor. +type AgenticTask struct { + metav1.TypeMeta `json:",inline"` + + // metadata is the standard object metadata. + // +optional + metav1.ObjectMeta `json:"metadata,omitempty,omitzero"` + + // spec is the desired task definition. + Spec AgenticTaskSpec `json:"spec"` + + // status reflects the observed state, updated by the scheduler and the + // assigned FleetAgent. 
+ // +optional + Status AgenticTaskStatus `json:"status,omitempty"` +} + +// +kubebuilder:object:root=true + +// AgenticTaskList is a list of AgenticTasks. +type AgenticTaskList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []AgenticTask `json:"items"` +} + +func init() { + SchemeBuilder.Register(&AgenticTask{}, &AgenticTaskList{}) +} diff --git a/api/foreman/v1alpha1/fleetnode_types.go b/api/foreman/v1alpha1/fleetnode_types.go new file mode 100644 index 00000000..85d11c27 --- /dev/null +++ b/api/foreman/v1alpha1/fleetnode_types.go @@ -0,0 +1,168 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +import ( + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// FleetNodePhase is the heartbeat-driven health state of a fleet worker. +// +kubebuilder:validation:Enum=Ready;Draining;NotReady;Unknown +type FleetNodePhase string + +const ( + FleetNodePhaseReady FleetNodePhase = "Ready" + FleetNodePhaseDraining FleetNodePhase = "Draining" + FleetNodePhaseNotReady FleetNodePhase = "NotReady" + FleetNodePhaseUnknown FleetNodePhase = "Unknown" +) + +// FleetNodeAccelerator names the accelerator family the node hosts. +// +kubebuilder:validation:Enum=metal;cuda;none +type FleetNodeAccelerator string + +// FleetNodeCapability is what the FleetAgent advertises about its host so the +// scheduler can match incoming AgenticTasks to nodes that can serve them. 
+type FleetNodeCapability struct { + // Accelerator names the accelerator family available on this node. + // +optional + Accelerator FleetNodeAccelerator `json:"accelerator,omitempty"` + + // TotalRAMGB is the physical RAM in GiB. + // +kubebuilder:validation:Minimum=0 + // +optional + TotalRAMGB int32 `json:"totalRAMGB,omitempty"` + + // AvailableRAMGB is the FleetAgent's estimate of RAM not currently + // committed to a running model or task. Refreshed on heartbeat. + // +kubebuilder:validation:Minimum=0 + // +optional + AvailableRAMGB int32 `json:"availableRAMGB,omitempty"` + + // InstalledModels is the list of Model CR names this node has locally + // available (the model files are present and the runtime can load them). + // +optional + InstalledModels []string `json:"installedModels,omitempty"` + + // MaxContextTokens is the largest context window the loaded model + // supports. Used by the scheduler to filter tasks with high + // MinContextTokens requirements. + // +kubebuilder:validation:Minimum=0 + // +optional + MaxContextTokens int32 `json:"maxContextTokens,omitempty"` + + // TokensPerSecond is a coarse decode throughput estimate. v0.1 takes + // this from configuration; v0.2 will benchmark on heartbeat. + // +kubebuilder:validation:Minimum=0 + // +optional + TokensPerSecond int32 `json:"tokensPerSecond,omitempty"` +} + +// FleetNodeSpec is the small bit of identity the FleetAgent owns on its own +// FleetNode object. Most of the resource's interesting fields live in Status, +// which the FleetAgent updates on every heartbeat. +type FleetNodeSpec struct { + // NodeName is the human-readable identity of the worker. Conventionally + // matches metadata.name; required for the scheduler to address it. + // +kubebuilder:validation:Required + NodeName string `json:"nodeName"` + + // TailscaleAddr is the Tailscale address (IP or MagicDNS name) the + // FleetAgent listens on. 
Optional; the operator does not connect to the + // agent directly in v0.1 (dispatch is via the agent's CRD watch). + // +optional + TailscaleAddr string `json:"tailscaleAddr,omitempty"` + + // Roles label the node for capability-aware scheduling beyond raw + // accelerator type. Conventionally one or more of: "worker", "verifier". + // +optional + Roles []string `json:"roles,omitempty"` +} + +// FleetNodeStatus is the FleetAgent's live view of its host. Updated on +// every heartbeat (every 30s); the FleetNodeReconciler marks the phase +// NotReady when the heartbeat goes stale. +type FleetNodeStatus struct { + // Phase is the heartbeat-driven health state. The scheduler treats + // only Ready nodes as eligible. + // +optional + Phase FleetNodePhase `json:"phase,omitempty"` + + // LastHeartbeatTime is the most recent heartbeat the FleetAgent + // successfully patched. The reconciler marks the phase NotReady if + // this stalls (default threshold: 90 seconds). + // +optional + LastHeartbeatTime *metav1.Time `json:"lastHeartbeatTime,omitempty"` + + // Capability is what this node advertises to the scheduler. + // +optional + Capability FleetNodeCapability `json:"capability,omitempty"` + + // CurrentTask is the namespaced name of the AgenticTask the agent is + // running, or empty if idle. The scheduler skips nodes with a non-empty + // CurrentTask (v0.1 concurrency is one task per node). + // +optional + CurrentTask string `json:"currentTask,omitempty"` + + // Conditions track standard health signals: Ready, Draining, etc. 
+ // +listType=map + // +listMapKey=type + // +optional + Conditions []metav1.Condition `json:"conditions,omitempty"` +} + +// +kubebuilder:object:root=true +// +kubebuilder:resource:scope=Cluster,shortName=fn +// +kubebuilder:subresource:status +// +kubebuilder:printcolumn:name="Phase",type=string,JSONPath=`.status.phase` +// +kubebuilder:printcolumn:name="Accelerator",type=string,JSONPath=`.status.capability.accelerator` +// +kubebuilder:printcolumn:name="RAM",type=integer,JSONPath=`.status.capability.availableRAMGB` +// +kubebuilder:printcolumn:name="Current Task",type=string,JSONPath=`.status.currentTask` +// +kubebuilder:printcolumn:name="Heartbeat",type=date,JSONPath=`.status.lastHeartbeatTime` +// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp` + +// FleetNode is a worker the Foreman scheduler can dispatch tasks to. It is +// cluster-scoped because nodes are global to the fleet; the resource is +// owned and updated by the FleetAgent running on that node. +type FleetNode struct { + metav1.TypeMeta `json:",inline"` + + // metadata is the standard object metadata. + // +optional + metav1.ObjectMeta `json:"metadata,omitempty,omitzero"` + + // spec is the small bit of identity the agent owns. + Spec FleetNodeSpec `json:"spec"` + + // status is the agent's heartbeat-driven view of its host. The + // scheduler reads it; the agent writes it. + // +optional + Status FleetNodeStatus `json:"status,omitempty"` +} + +// +kubebuilder:object:root=true + +// FleetNodeList is a list of FleetNodes. 
+type FleetNodeList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []FleetNode `json:"items"` +} + +func init() { + SchemeBuilder.Register(&FleetNode{}, &FleetNodeList{}) +} diff --git a/api/foreman/v1alpha1/groupversion_info.go b/api/foreman/v1alpha1/groupversion_info.go new file mode 100644 index 00000000..67420b15 --- /dev/null +++ b/api/foreman/v1alpha1/groupversion_info.go @@ -0,0 +1,41 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Package v1alpha1 contains API Schema definitions for the foreman v1alpha1 +// API group. Foreman is an opt-in add-on layered on top of LLMKube: it +// schedules agentic workloads (Workload, AgenticTask) across a fleet of +// machines that advertise themselves as FleetNodes. Installing LLMKube alone +// does not install or require any of these types. +// +// +kubebuilder:object:generate=true +// +groupName=foreman.llmkube.dev +package v1alpha1 + +import ( + "k8s.io/apimachinery/pkg/runtime/schema" + "sigs.k8s.io/controller-runtime/pkg/scheme" +) + +var ( + // GroupVersion is the group version used to register these objects. + GroupVersion = schema.GroupVersion{Group: "foreman.llmkube.dev", Version: "v1alpha1"} + + // SchemeBuilder is used to add Go types to the GroupVersionKind scheme. + SchemeBuilder = &scheme.Builder{GroupVersion: GroupVersion} + + // AddToScheme adds the types in this group-version to the given scheme. 
+ AddToScheme = SchemeBuilder.AddToScheme +) diff --git a/api/foreman/v1alpha1/workload_types.go b/api/foreman/v1alpha1/workload_types.go new file mode 100644 index 00000000..e5b6bb30 --- /dev/null +++ b/api/foreman/v1alpha1/workload_types.go @@ -0,0 +1,135 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package v1alpha1 + +import ( + corev1 "k8s.io/api/core/v1" + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" +) + +// WorkloadPhase is the lifecycle of a planner-driven batch. +// +kubebuilder:validation:Enum=Planning;Planned;Dispatched;Completed;Failed +type WorkloadPhase string + +const ( + WorkloadPhasePlanning WorkloadPhase = "Planning" + WorkloadPhasePlanned WorkloadPhase = "Planned" + WorkloadPhaseDispatched WorkloadPhase = "Dispatched" + WorkloadPhaseCompleted WorkloadPhase = "Completed" + WorkloadPhaseFailed WorkloadPhase = "Failed" +) + +// WorkloadSpec captures a high-level intent that the WorkloadReconciler +// decomposes (via a frontier model) into a set of AgenticTask objects. +type WorkloadSpec struct { + // Intent is the natural-language description of what to do. + // Example: "fix all open bugs in defilantech/LLMKube tagged size/small". + // +kubebuilder:validation:Required + // +kubebuilder:validation:MinLength=1 + Intent string `json:"intent"` + + // Repo is the GitHub repo in "owner/name" form that Intent applies to. + // Required for issue-fix workloads; the planner reads its open issues. 
+ // +optional + Repo string `json:"repo,omitempty"` + + // MaxTasks caps how many AgenticTasks the planner may emit. Zero means + // no limit; the planner picks. Use this as a safety belt on the first + // runs against a new repo or intent. + // +kubebuilder:validation:Minimum=0 + // +optional + MaxTasks int32 `json:"maxTasks,omitempty"` + + // PlannerModel selects the frontier model the planner should call. + // Empty uses the operator's default (Anthropic Claude). The value is + // a free-form identifier the planner adapter interprets, e.g. + // "anthropic/claude-opus-4-7", "anthropic/claude-sonnet-4-6". + // +optional + PlannerModel string `json:"plannerModel,omitempty"` +} + +// WorkloadStatus reflects the observed state of the workload. +type WorkloadStatus struct { + // Phase is the lifecycle state. + // +optional + Phase WorkloadPhase `json:"phase,omitempty"` + + // Tasks lists the AgenticTask objects the planner emitted. They are + // owner-ref'd to this Workload so they cascade-delete with it. + // +optional + Tasks []corev1.ObjectReference `json:"tasks,omitempty"` + + // SucceededTasks counts child tasks in phase Succeeded. + // +optional + SucceededTasks int32 `json:"succeededTasks,omitempty"` + + // FailedTasks counts child tasks in phase Failed. + // +optional + FailedTasks int32 `json:"failedTasks,omitempty"` + + // PlannerModel records which frontier model the planner actually used + // for this workload. Set after the planner runs. + // +optional + PlannerModel string `json:"plannerModel,omitempty"` + + // Conditions track standard signals: Planned, Dispatched, Completed. 
+ // +listType=map + // +listMapKey=type + // +optional + Conditions []metav1.Condition `json:"conditions,omitempty"` +} + +// +kubebuilder:object:root=true +// +kubebuilder:subresource:status +// +kubebuilder:resource:shortName=wl +// +kubebuilder:printcolumn:name="Phase",type=string,JSONPath=`.status.phase` +// +kubebuilder:printcolumn:name="Repo",type=string,JSONPath=`.spec.repo` +// +kubebuilder:printcolumn:name="Tasks",type=integer,JSONPath=`.status.succeededTasks` +// +kubebuilder:printcolumn:name="Failed",type=integer,JSONPath=`.status.failedTasks` +// +kubebuilder:printcolumn:name="Age",type=date,JSONPath=`.metadata.creationTimestamp` + +// Workload is the v0.1 entrypoint to Foreman. A user creates a Workload with +// a high-level intent ("fix open bugs"); the WorkloadReconciler calls a +// frontier model to decompose it into a set of AgenticTask objects, which +// the scheduler then dispatches across the fleet. +type Workload struct { + metav1.TypeMeta `json:",inline"` + + // metadata is the standard object metadata. + // +optional + metav1.ObjectMeta `json:"metadata,omitempty,omitzero"` + + // spec is the user-supplied intent. + Spec WorkloadSpec `json:"spec"` + + // status is the planner's and scheduler's observed view. + // +optional + Status WorkloadStatus `json:"status,omitempty"` +} + +// +kubebuilder:object:root=true + +// WorkloadList is a list of Workloads. +type WorkloadList struct { + metav1.TypeMeta `json:",inline"` + metav1.ListMeta `json:"metadata,omitempty"` + Items []Workload `json:"items"` +} + +func init() { + SchemeBuilder.Register(&Workload{}, &WorkloadList{}) +} diff --git a/api/foreman/v1alpha1/zz_generated.deepcopy.go b/api/foreman/v1alpha1/zz_generated.deepcopy.go new file mode 100644 index 00000000..85bb03a8 --- /dev/null +++ b/api/foreman/v1alpha1/zz_generated.deepcopy.go @@ -0,0 +1,411 @@ +//go:build !ignore_autogenerated + +/* +Copyright 2025. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Code generated by controller-gen. DO NOT EDIT. + +package v1alpha1 + +import ( + corev1 "k8s.io/api/core/v1" + "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" +) + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AgenticTask) DeepCopyInto(out *AgenticTask) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgenticTask. +func (in *AgenticTask) DeepCopy() *AgenticTask { + if in == nil { + return nil + } + out := new(AgenticTask) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *AgenticTask) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *AgenticTaskList) DeepCopyInto(out *AgenticTaskList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]AgenticTask, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgenticTaskList. +func (in *AgenticTaskList) DeepCopy() *AgenticTaskList { + if in == nil { + return nil + } + out := new(AgenticTaskList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *AgenticTaskList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AgenticTaskPayload) DeepCopyInto(out *AgenticTaskPayload) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgenticTaskPayload. +func (in *AgenticTaskPayload) DeepCopy() *AgenticTaskPayload { + if in == nil { + return nil + } + out := new(AgenticTaskPayload) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AgenticTaskSpec) DeepCopyInto(out *AgenticTaskSpec) { + *out = *in + in.RequiredCapability.DeepCopyInto(&out.RequiredCapability) + out.Payload = in.Payload + if in.DependsOn != nil { + in, out := &in.DependsOn, &out.DependsOn + *out = make([]string, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgenticTaskSpec. 
+func (in *AgenticTaskSpec) DeepCopy() *AgenticTaskSpec { + if in == nil { + return nil + } + out := new(AgenticTaskSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *AgenticTaskStatus) DeepCopyInto(out *AgenticTaskStatus) { + *out = *in + if in.ClaimedAt != nil { + in, out := &in.ClaimedAt, &out.ClaimedAt + *out = (*in).DeepCopy() + } + if in.StartedAt != nil { + in, out := &in.StartedAt, &out.StartedAt + *out = (*in).DeepCopy() + } + if in.FinishedAt != nil { + in, out := &in.FinishedAt, &out.FinishedAt + *out = (*in).DeepCopy() + } + if in.Result != nil { + in, out := &in.Result, &out.Result + *out = new(runtime.RawExtension) + (*in).DeepCopyInto(*out) + } + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]v1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new AgenticTaskStatus. +func (in *AgenticTaskStatus) DeepCopy() *AgenticTaskStatus { + if in == nil { + return nil + } + out := new(AgenticTaskStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *FleetNode) DeepCopyInto(out *FleetNode) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + in.Spec.DeepCopyInto(&out.Spec) + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FleetNode. +func (in *FleetNode) DeepCopy() *FleetNode { + if in == nil { + return nil + } + out := new(FleetNode) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. 
+func (in *FleetNode) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *FleetNodeCapability) DeepCopyInto(out *FleetNodeCapability) { + *out = *in + if in.InstalledModels != nil { + in, out := &in.InstalledModels, &out.InstalledModels + *out = make([]string, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FleetNodeCapability. +func (in *FleetNodeCapability) DeepCopy() *FleetNodeCapability { + if in == nil { + return nil + } + out := new(FleetNodeCapability) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *FleetNodeList) DeepCopyInto(out *FleetNodeList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]FleetNode, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FleetNodeList. +func (in *FleetNodeList) DeepCopy() *FleetNodeList { + if in == nil { + return nil + } + out := new(FleetNodeList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *FleetNodeList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. 
+func (in *FleetNodeSpec) DeepCopyInto(out *FleetNodeSpec) { + *out = *in + if in.Roles != nil { + in, out := &in.Roles, &out.Roles + *out = make([]string, len(*in)) + copy(*out, *in) + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FleetNodeSpec. +func (in *FleetNodeSpec) DeepCopy() *FleetNodeSpec { + if in == nil { + return nil + } + out := new(FleetNodeSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *FleetNodeStatus) DeepCopyInto(out *FleetNodeStatus) { + *out = *in + if in.LastHeartbeatTime != nil { + in, out := &in.LastHeartbeatTime, &out.LastHeartbeatTime + *out = (*in).DeepCopy() + } + in.Capability.DeepCopyInto(&out.Capability) + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]v1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new FleetNodeStatus. +func (in *FleetNodeStatus) DeepCopy() *FleetNodeStatus { + if in == nil { + return nil + } + out := new(FleetNodeStatus) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *RequiredCapability) DeepCopyInto(out *RequiredCapability) { + *out = *in + if in.NodeSelector != nil { + in, out := &in.NodeSelector, &out.NodeSelector + *out = make(map[string]string, len(*in)) + for key, val := range *in { + (*out)[key] = val + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new RequiredCapability. 
+func (in *RequiredCapability) DeepCopy() *RequiredCapability { + if in == nil { + return nil + } + out := new(RequiredCapability) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *Workload) DeepCopyInto(out *Workload) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ObjectMeta.DeepCopyInto(&out.ObjectMeta) + out.Spec = in.Spec + in.Status.DeepCopyInto(&out.Status) +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new Workload. +func (in *Workload) DeepCopy() *Workload { + if in == nil { + return nil + } + out := new(Workload) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *Workload) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *WorkloadList) DeepCopyInto(out *WorkloadList) { + *out = *in + out.TypeMeta = in.TypeMeta + in.ListMeta.DeepCopyInto(&out.ListMeta) + if in.Items != nil { + in, out := &in.Items, &out.Items + *out = make([]Workload, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkloadList. +func (in *WorkloadList) DeepCopy() *WorkloadList { + if in == nil { + return nil + } + out := new(WorkloadList) + in.DeepCopyInto(out) + return out +} + +// DeepCopyObject is an autogenerated deepcopy function, copying the receiver, creating a new runtime.Object. +func (in *WorkloadList) DeepCopyObject() runtime.Object { + if c := in.DeepCopy(); c != nil { + return c + } + return nil +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. 
in must be non-nil. +func (in *WorkloadSpec) DeepCopyInto(out *WorkloadSpec) { + *out = *in +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkloadSpec. +func (in *WorkloadSpec) DeepCopy() *WorkloadSpec { + if in == nil { + return nil + } + out := new(WorkloadSpec) + in.DeepCopyInto(out) + return out +} + +// DeepCopyInto is an autogenerated deepcopy function, copying the receiver, writing into out. in must be non-nil. +func (in *WorkloadStatus) DeepCopyInto(out *WorkloadStatus) { + *out = *in + if in.Tasks != nil { + in, out := &in.Tasks, &out.Tasks + *out = make([]corev1.ObjectReference, len(*in)) + copy(*out, *in) + } + if in.Conditions != nil { + in, out := &in.Conditions, &out.Conditions + *out = make([]v1.Condition, len(*in)) + for i := range *in { + (*in)[i].DeepCopyInto(&(*out)[i]) + } + } +} + +// DeepCopy is an autogenerated deepcopy function, copying the receiver, creating a new WorkloadStatus. +func (in *WorkloadStatus) DeepCopy() *WorkloadStatus { + if in == nil { + return nil + } + out := new(WorkloadStatus) + in.DeepCopyInto(out) + return out +} diff --git a/charts/foreman/Chart.yaml b/charts/foreman/Chart.yaml new file mode 100644 index 00000000..486d9205 --- /dev/null +++ b/charts/foreman/Chart.yaml @@ -0,0 +1,24 @@ +apiVersion: v2 +name: foreman +description: | + Foreman is an opt-in add-on for LLMKube that schedules agentic workloads + (Workload, AgenticTask) across a fleet of nodes (FleetNode). Installing + LLMKube alone does not install or require Foreman. Install Foreman after + llmkube; foreman-operator runs alongside llmkube-operator. 
+type: application +version: 0.0.1 +appVersion: "0.0.1" +keywords: + - llmkube + - foreman + - agentic + - workload + - orchestration +home: https://github.com/defilantech/LLMKube +sources: + - https://github.com/defilantech/LLMKube +dependencies: + - name: llmkube + version: ">=0.7.9" + repository: "https://defilantech.github.io/llmkube" + condition: llmkube.enabled diff --git a/charts/foreman/templates/crds/agentictasks.yaml b/charts/foreman/templates/crds/agentictasks.yaml new file mode 100644 index 00000000..1519dcc5 --- /dev/null +++ b/charts/foreman/templates/crds/agentictasks.yaml @@ -0,0 +1,296 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: agentictasks.foreman.llmkube.dev +spec: + group: foreman.llmkube.dev + names: + kind: AgenticTask + listKind: AgenticTaskList + plural: agentictasks + shortNames: + - at + singular: agentictask + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .status.phase + name: Phase + type: string + - jsonPath: .spec.kind + name: Kind + type: string + - jsonPath: .status.assignedNode + name: Node + type: string + - jsonPath: .status.verdict + name: Verdict + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + AgenticTask is the unit of dispatchable agentic work. The Foreman scheduler + matches each Pending task to a FleetNode whose advertised capability + satisfies the task's RequiredCapability, then a FleetAgent on that node + picks up the task and runs the matching executor. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: spec is the desired task definition. + properties: + dependsOn: + description: |- + DependsOn lists AgenticTasks (by name in the same namespace) that + must reach Succeeded before this task is dispatched. v0.1 uses this + only to chain verify tasks behind their parent issue-fix. + items: + type: string + type: array + kind: + description: Kind selects the work type and payload shape. + enum: + - issue-fix + - verify + - freeform + type: string + modelRef: + description: |- + ModelRef names the Model the agent should use. Optional; the + scheduler can pick a default based on RequiredCapability. + type: string + payload: + description: Payload is the kind-discriminated work spec. + properties: + agent: + description: |- + Agent is the named agent to invoke. Required for freeform; defaults + to "issue-fixer" for issue-fix and "verify" for verify. + type: string + branch: + description: Branch is the existing branch to gate. Required for + verify. + type: string + branchPrefix: + description: |- + BranchPrefix overrides the branch name prefix on issue-fix tasks + (default derived from the issue's labels via conventional commit + prefixes: fix/, feat/, chore/, etc.). + type: string + issue: + description: Issue is the GitHub issue number. Required for issue-fix. + format: int32 + minimum: 1 + type: integer + prompt: + description: Prompt is the agent input. Required for freeform. + type: string + repo: + description: Repo is the "owner/name" GitHub repo. 
Required for + issue-fix and verify. + type: string + type: object + priority: + description: |- + Priority is a hint for the scheduler when many tasks are Pending. + Higher values dispatch first. v0.1 is FIFO and ignores priority. + format: int32 + type: integer + requiredCapability: + description: RequiredCapability filters which FleetNodes can serve + this task. + properties: + accelerator: + default: any + description: Accelerator selects an accelerator family. "any" + matches any node. + enum: + - metal + - cuda + - any + type: string + minContextTokens: + description: |- + MinContextTokens is the minimum context window the node's installed + model must support. Set to 0 to leave unconstrained. + format: int32 + minimum: 0 + type: integer + minRAMGB: + description: MinRAMGB is the minimum available RAM the node must + advertise. + format: int32 + minimum: 0 + type: integer + nodeSelector: + additionalProperties: + type: string + description: |- + NodeSelector is a hard pin: only FleetNodes whose labels match every + key are eligible. Used for tasks that must run on a specific node + (e.g. verify tasks targeting the gate runner). + type: object + type: object + timeoutSeconds: + description: |- + TimeoutSeconds bounds the agent's run time. Zero uses the operator's + default (2700, matching the autofix pipeline's value). + format: int32 + minimum: 0 + type: integer + required: + - kind + - payload + type: object + status: + description: |- + status reflects the observed state, updated by the scheduler and the + assigned FleetAgent. + properties: + assignedNode: + description: AssignedNode is the FleetNode.metadata.name the scheduler + routed to. + type: string + branch: + description: Branch is the pushed branch, set on a successful issue-fix. + type: string + claimedAt: + description: ClaimedAt is when the FleetAgent on AssignedNode claimed + the task. + format: date-time + type: string + commitSHA: + description: CommitSHA is the head commit of Branch. 
+ type: string + conditions: + description: |- + Conditions represent the current state of the task. Standard types: + Scheduled, Running, Completed, Failed. + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. 
+ maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + finishedAt: + description: FinishedAt is when the executor produced a verdict (success + or fail). + format: date-time + type: string + phase: + description: Phase is the current lifecycle phase. + enum: + - Pending + - Scheduled + - Running + - Verifying + - Succeeded + - Failed + type: string + result: + description: |- + Result is the structured JSON the agent emitted, validated against the + foreman.v1 schema. Opaque to the API server. + type: object + x-kubernetes-preserve-unknown-fields: true + startedAt: + description: StartedAt is when the executor began work. + format: date-time + type: string + transcriptRef: + description: |- + TranscriptRef points to where the agent's full transcript was stored + (typically a ConfigMap in the operator's namespace). + type: string + verdict: + description: Verdict is the final outcome category. 
+ enum: + - GO + - NO-GO + - INCOMPLETE + - GATE-PASS + - GATE-FAIL + - GATE-ERROR + type: string + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/charts/foreman/templates/crds/fleetnodes.yaml b/charts/foreman/templates/crds/fleetnodes.yaml new file mode 100644 index 00000000..78f3b287 --- /dev/null +++ b/charts/foreman/templates/crds/fleetnodes.yaml @@ -0,0 +1,229 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: fleetnodes.foreman.llmkube.dev +spec: + group: foreman.llmkube.dev + names: + kind: FleetNode + listKind: FleetNodeList + plural: fleetnodes + shortNames: + - fn + singular: fleetnode + scope: Cluster + versions: + - additionalPrinterColumns: + - jsonPath: .status.phase + name: Phase + type: string + - jsonPath: .status.capability.accelerator + name: Accelerator + type: string + - jsonPath: .status.capability.availableRAMGB + name: RAM + type: integer + - jsonPath: .status.currentTask + name: Current Task + type: string + - jsonPath: .status.lastHeartbeatTime + name: Heartbeat + type: date + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + FleetNode is a worker the Foreman scheduler can dispatch tasks to. It is + cluster-scoped because nodes are global to the fleet; the resource is + owned and updated by the FleetAgent running on that node. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. 
+ More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: spec is the small bit of identity the agent owns. + properties: + nodeName: + description: |- + NodeName is the human-readable identity of the worker. Conventionally + matches metadata.name; required for the scheduler to address it. + type: string + roles: + description: |- + Roles label the node for capability-aware scheduling beyond raw + accelerator type. Conventionally one or more of: "worker", "verifier". + items: + type: string + type: array + tailscaleAddr: + description: |- + TailscaleAddr is the Tailscale address (IP or MagicDNS name) the + FleetAgent listens on. Optional; the operator does not connect to the + agent directly in v0.1 (dispatch is via the agent's CRD watch). + type: string + required: + - nodeName + type: object + status: + description: |- + status is the agent's heartbeat-driven view of its host. The + scheduler reads it; the agent writes it. + properties: + capability: + description: Capability is what this node advertises to the scheduler. + properties: + accelerator: + description: Accelerator names the accelerator family available + on this node. + enum: + - metal + - cuda + - none + type: string + availableRAMGB: + description: |- + AvailableRAMGB is the FleetAgent's estimate of RAM not currently + committed to a running model or task. Refreshed on heartbeat. 
+ format: int32 + minimum: 0 + type: integer + installedModels: + description: |- + InstalledModels is the list of Model CR names this node has locally + available (the model files are present and the runtime can load them). + items: + type: string + type: array + maxContextTokens: + description: |- + MaxContextTokens is the largest context window the loaded model + supports. Used by the scheduler to filter tasks with high + MinContextTokens requirements. + format: int32 + minimum: 0 + type: integer + tokensPerSecond: + description: |- + TokensPerSecond is a coarse decode throughput estimate. v0.1 takes + this from configuration; v0.2 will benchmark on heartbeat. + format: int32 + minimum: 0 + type: integer + totalRAMGB: + description: TotalRAMGB is the physical RAM in GiB. + format: int32 + minimum: 0 + type: integer + type: object + conditions: + description: 'Conditions track standard health signals: Ready, Draining, + etc.' + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. 
+ format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + currentTask: + description: |- + CurrentTask is the namespaced name of the AgenticTask the agent is + running, or empty if idle. The scheduler skips nodes with a non-empty + CurrentTask (v0.1 concurrency is one task per node). + type: string + lastHeartbeatTime: + description: |- + LastHeartbeatTime is the most recent heartbeat the FleetAgent + successfully patched. The reconciler marks the phase NotReady if + this stalls (default threshold: 90 seconds). + format: date-time + type: string + phase: + description: |- + Phase is the heartbeat-driven health state. The scheduler treats + only Ready nodes as eligible. 
+ enum: + - Ready + - Draining + - NotReady + - Unknown + type: string + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/charts/foreman/templates/crds/workloads.yaml b/charts/foreman/templates/crds/workloads.yaml new file mode 100644 index 00000000..147588fe --- /dev/null +++ b/charts/foreman/templates/crds/workloads.yaml @@ -0,0 +1,235 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: workloads.foreman.llmkube.dev +spec: + group: foreman.llmkube.dev + names: + kind: Workload + listKind: WorkloadList + plural: workloads + shortNames: + - wl + singular: workload + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .status.phase + name: Phase + type: string + - jsonPath: .spec.repo + name: Repo + type: string + - jsonPath: .status.succeededTasks + name: Tasks + type: integer + - jsonPath: .status.failedTasks + name: Failed + type: integer + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + Workload is the v0.1 entrypoint to Foreman. A user creates a Workload with + a high-level intent ("fix open bugs"); the WorkloadReconciler calls a + frontier model to decompose it into a set of AgenticTask objects, which + the scheduler then dispatches across the fleet. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. 
+ Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: spec is the user-supplied intent. + properties: + intent: + description: |- + Intent is the natural-language description of what to do. + Example: "fix all open bugs in defilantech/LLMKube tagged size/small". + minLength: 1 + type: string + maxTasks: + description: |- + MaxTasks caps how many AgenticTasks the planner may emit. Zero means + no limit; the planner picks. Use this as a safety belt on the first + runs against a new repo or intent. + format: int32 + minimum: 0 + type: integer + plannerModel: + description: |- + PlannerModel selects the frontier model the planner should call. + Empty uses the operator's default (Anthropic Claude). The value is + a free-form identifier the planner adapter interprets, e.g. + "anthropic/claude-opus-4-7", "anthropic/claude-sonnet-4-6". + type: string + repo: + description: |- + Repo is the GitHub repo in "owner/name" form that Intent applies to. + Required for issue-fix workloads; the planner reads its open issues. + type: string + required: + - intent + type: object + status: + description: status is the planner's and scheduler's observed view. + properties: + conditions: + description: 'Conditions track standard signals: Planned, Dispatched, + Completed.' + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. 
+ maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + failedTasks: + description: FailedTasks counts child tasks in phase Failed. + format: int32 + type: integer + phase: + description: Phase is the lifecycle state. + enum: + - Planning + - Planned + - Dispatched + - Completed + - Failed + type: string + plannerModel: + description: |- + PlannerModel records which frontier model the planner actually used + for this workload. Set after the planner runs. + type: string + succeededTasks: + description: SucceededTasks counts child tasks in phase Succeeded. 
+ format: int32 + type: integer + tasks: + description: |- + Tasks lists the AgenticTask objects the planner emitted. They are + owner-ref'd to this Workload so they cascade-delete with it. + items: + description: ObjectReference contains enough information to let + you inspect or modify the referred object. + properties: + apiVersion: + description: API version of the referent. + type: string + fieldPath: + description: |- + If referring to a piece of an object instead of an entire object, this string + should contain a valid JSON/Go field access statement, such as desiredState.manifest.containers[2]. + For example, if the object reference is to a container within a pod, this would take on a value like: + "spec.containers{name}" (where "name" refers to the name of the container that triggered + the event) or if no container name is specified "spec.containers[2]" (container with + index 2 in this pod). This syntax is chosen only to have some well-defined way of + referencing a part of an object. + type: string + kind: + description: |- + Kind of the referent. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + name: + description: |- + Name of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + namespace: + description: |- + Namespace of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/ + type: string + resourceVersion: + description: |- + Specific resourceVersion to which this reference is made, if any. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#concurrency-control-and-consistency + type: string + uid: + description: |- + UID of the referent. 
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids + type: string + type: object + x-kubernetes-map-type: atomic + type: array + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/cmd/foreman-operator/main.go b/cmd/foreman-operator/main.go new file mode 100644 index 00000000..e198f8be --- /dev/null +++ b/cmd/foreman-operator/main.go @@ -0,0 +1,121 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// Command foreman-operator is the control plane for the Foreman agentic +// workload subsystem. It runs alongside (not inside) the LLMKube core +// operator and reconciles the foreman.llmkube.dev API group: Workload, +// AgenticTask, FleetNode. Installing LLMKube alone does not install or +// require this binary. +package main + +import ( + "flag" + "os" + + // Import all Kubernetes client auth plugins (e.g. Azure, GCP, OIDC) so + // exec-entrypoint and run can make use of them. 
+	_ "k8s.io/client-go/plugin/pkg/client/auth"
+
+	"k8s.io/apimachinery/pkg/runtime"
+	utilruntime "k8s.io/apimachinery/pkg/util/runtime"
+	clientgoscheme "k8s.io/client-go/kubernetes/scheme"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/healthz"
+	"sigs.k8s.io/controller-runtime/pkg/log/zap"
+	metricsserver "sigs.k8s.io/controller-runtime/pkg/metrics/server"
+
+	foremanv1alpha1 "github.com/defilantech/llmkube/api/foreman/v1alpha1"
+	foremancontroller "github.com/defilantech/llmkube/internal/foreman/controller"
+)
+
+// Package-level state: the scheme given to the manager and the setup logger.
+var (
+	scheme   = runtime.NewScheme()
+	setupLog = ctrl.Log.WithName("setup")
+)
+
+// init registers the built-in Kubernetes types and the Foreman types in scheme.
+func init() {
+	utilruntime.Must(clientgoscheme.AddToScheme(scheme))
+	utilruntime.Must(foremanv1alpha1.AddToScheme(scheme))
+}
+
+// main parses the flags, builds the manager, registers the three Foreman
+// reconcilers and the probes, then starts the manager; any failure exits 1.
+func main() {
+	var (
+		metricsAddr          string
+		probeAddr            string
+		enableLeaderElection bool
+	)
+	flag.StringVar(&metricsAddr, "metrics-bind-address", ":8081",
+		"The address the metrics endpoint binds to.")
+	flag.StringVar(&probeAddr, "health-probe-bind-address", ":8082",
+		"The address the probe endpoint binds to.")
+	flag.BoolVar(&enableLeaderElection, "leader-elect", false,
+		"Enable leader election for controller manager. "+
+			"Enabling this will ensure there is only one active controller manager.")
+
+	// The zap flags are bound to flag.CommandLine before it is parsed.
+	zapOpts := zap.Options{Development: true}
+	zapOpts.BindFlags(flag.CommandLine)
+	flag.Parse()
+	ctrl.SetLogger(zap.New(zap.UseFlagOptions(&zapOpts)))
+
+	mgr, err := ctrl.NewManager(ctrl.GetConfigOrDie(), ctrl.Options{
+		Scheme:                 scheme,
+		Metrics:                metricsserver.Options{BindAddress: metricsAddr},
+		HealthProbeBindAddress: probeAddr,
+		LeaderElection:         enableLeaderElection,
+		LeaderElectionID:       "foreman-operator.llmkube.dev",
+	})
+	if err != nil {
+		setupLog.Error(err, "unable to start manager")
+		os.Exit(1)
+	}
+
+	// Reconcilers are set up in slice order; the first failure aborts startup.
+	k8sClient, apiScheme := mgr.GetClient(), mgr.GetScheme()
+	controllers := []struct {
+		name       string
+		reconciler interface{ SetupWithManager(ctrl.Manager) error }
+	}{
+		{"Workload", &foremancontroller.WorkloadReconciler{Client: k8sClient, Scheme: apiScheme}},
+		{"AgenticTask", &foremancontroller.AgenticTaskReconciler{Client: k8sClient, Scheme: apiScheme}},
+		{"FleetNode", &foremancontroller.FleetNodeReconciler{Client: k8sClient, Scheme: apiScheme}},
+	}
+	for _, c := range controllers {
+		if err = c.reconciler.SetupWithManager(mgr); err != nil {
+			setupLog.Error(err, "unable to create controller", "controller", c.name)
+			os.Exit(1)
+		}
+	}
+
+	if err = mgr.AddHealthzCheck("healthz", healthz.Ping); err != nil {
+		setupLog.Error(err, "unable to set up health check")
+		os.Exit(1)
+	}
+	if err = mgr.AddReadyzCheck("readyz", healthz.Ping); err != nil {
+		setupLog.Error(err, "unable to set up ready check")
+		os.Exit(1)
+	}
+
+	setupLog.Info("starting foreman-operator")
+	if err = mgr.Start(ctrl.SetupSignalHandler()); err != nil {
+		setupLog.Error(err, "problem running manager")
+		os.Exit(1)
+	}
+}
diff --git a/config/crd/bases/foreman.llmkube.dev_agentictasks.yaml b/config/crd/bases/foreman.llmkube.dev_agentictasks.yaml
new file mode 100644
index 00000000..1519dcc5
--- /dev/null
+++ 
b/config/crd/bases/foreman.llmkube.dev_agentictasks.yaml @@ -0,0 +1,296 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: agentictasks.foreman.llmkube.dev +spec: + group: foreman.llmkube.dev + names: + kind: AgenticTask + listKind: AgenticTaskList + plural: agentictasks + shortNames: + - at + singular: agentictask + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .status.phase + name: Phase + type: string + - jsonPath: .spec.kind + name: Kind + type: string + - jsonPath: .status.assignedNode + name: Node + type: string + - jsonPath: .status.verdict + name: Verdict + type: string + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + AgenticTask is the unit of dispatchable agentic work. The Foreman scheduler + matches each Pending task to a FleetNode whose advertised capability + satisfies the task's RequiredCapability, then a FleetAgent on that node + picks up the task and runs the matching executor. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: spec is the desired task definition. 
+ properties: + dependsOn: + description: |- + DependsOn lists AgenticTasks (by name in the same namespace) that + must reach Succeeded before this task is dispatched. v0.1 uses this + only to chain verify tasks behind their parent issue-fix. + items: + type: string + type: array + kind: + description: Kind selects the work type and payload shape. + enum: + - issue-fix + - verify + - freeform + type: string + modelRef: + description: |- + ModelRef names the Model the agent should use. Optional; the + scheduler can pick a default based on RequiredCapability. + type: string + payload: + description: Payload is the kind-discriminated work spec. + properties: + agent: + description: |- + Agent is the named agent to invoke. Required for freeform; defaults + to "issue-fixer" for issue-fix and "verify" for verify. + type: string + branch: + description: Branch is the existing branch to gate. Required for + verify. + type: string + branchPrefix: + description: |- + BranchPrefix overrides the branch name prefix on issue-fix tasks + (default derived from the issue's labels via conventional commit + prefixes: fix/, feat/, chore/, etc.). + type: string + issue: + description: Issue is the GitHub issue number. Required for issue-fix. + format: int32 + minimum: 1 + type: integer + prompt: + description: Prompt is the agent input. Required for freeform. + type: string + repo: + description: Repo is the "owner/name" GitHub repo. Required for + issue-fix and verify. + type: string + type: object + priority: + description: |- + Priority is a hint for the scheduler when many tasks are Pending. + Higher values dispatch first. v0.1 is FIFO and ignores priority. + format: int32 + type: integer + requiredCapability: + description: RequiredCapability filters which FleetNodes can serve + this task. + properties: + accelerator: + default: any + description: Accelerator selects an accelerator family. "any" + matches any node. 
+ enum: + - metal + - cuda + - any + type: string + minContextTokens: + description: |- + MinContextTokens is the minimum context window the node's installed + model must support. Set to 0 to leave unconstrained. + format: int32 + minimum: 0 + type: integer + minRAMGB: + description: MinRAMGB is the minimum available RAM the node must + advertise. + format: int32 + minimum: 0 + type: integer + nodeSelector: + additionalProperties: + type: string + description: |- + NodeSelector is a hard pin: only FleetNodes whose labels match every + key are eligible. Used for tasks that must run on a specific node + (e.g. verify tasks targeting the gate runner). + type: object + type: object + timeoutSeconds: + description: |- + TimeoutSeconds bounds the agent's run time. Zero uses the operator's + default (2700, matching the autofix pipeline's value). + format: int32 + minimum: 0 + type: integer + required: + - kind + - payload + type: object + status: + description: |- + status reflects the observed state, updated by the scheduler and the + assigned FleetAgent. + properties: + assignedNode: + description: AssignedNode is the FleetNode.metadata.name the scheduler + routed to. + type: string + branch: + description: Branch is the pushed branch, set on a successful issue-fix. + type: string + claimedAt: + description: ClaimedAt is when the FleetAgent on AssignedNode claimed + the task. + format: date-time + type: string + commitSHA: + description: CommitSHA is the head commit of Branch. + type: string + conditions: + description: |- + Conditions represent the current state of the task. Standard types: + Scheduled, Running, Completed, Failed. + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. 
If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + finishedAt: + description: FinishedAt is when the executor produced a verdict (success + or fail). + format: date-time + type: string + phase: + description: Phase is the current lifecycle phase. 
+ enum: + - Pending + - Scheduled + - Running + - Verifying + - Succeeded + - Failed + type: string + result: + description: |- + Result is the structured JSON the agent emitted, validated against the + foreman.v1 schema. Opaque to the API server. + type: object + x-kubernetes-preserve-unknown-fields: true + startedAt: + description: StartedAt is when the executor began work. + format: date-time + type: string + transcriptRef: + description: |- + TranscriptRef points to where the agent's full transcript was stored + (typically a ConfigMap in the operator's namespace). + type: string + verdict: + description: Verdict is the final outcome category. + enum: + - GO + - NO-GO + - INCOMPLETE + - GATE-PASS + - GATE-FAIL + - GATE-ERROR + type: string + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/crd/bases/foreman.llmkube.dev_fleetnodes.yaml b/config/crd/bases/foreman.llmkube.dev_fleetnodes.yaml new file mode 100644 index 00000000..78f3b287 --- /dev/null +++ b/config/crd/bases/foreman.llmkube.dev_fleetnodes.yaml @@ -0,0 +1,229 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: fleetnodes.foreman.llmkube.dev +spec: + group: foreman.llmkube.dev + names: + kind: FleetNode + listKind: FleetNodeList + plural: fleetnodes + shortNames: + - fn + singular: fleetnode + scope: Cluster + versions: + - additionalPrinterColumns: + - jsonPath: .status.phase + name: Phase + type: string + - jsonPath: .status.capability.accelerator + name: Accelerator + type: string + - jsonPath: .status.capability.availableRAMGB + name: RAM + type: integer + - jsonPath: .status.currentTask + name: Current Task + type: string + - jsonPath: .status.lastHeartbeatTime + name: Heartbeat + type: date + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + 
description: |- + FleetNode is a worker the Foreman scheduler can dispatch tasks to. It is + cluster-scoped because nodes are global to the fleet; the resource is + owned and updated by the FleetAgent running on that node. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. + Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: spec is the small bit of identity the agent owns. + properties: + nodeName: + description: |- + NodeName is the human-readable identity of the worker. Conventionally + matches metadata.name; required for the scheduler to address it. + type: string + roles: + description: |- + Roles label the node for capability-aware scheduling beyond raw + accelerator type. Conventionally one or more of: "worker", "verifier". + items: + type: string + type: array + tailscaleAddr: + description: |- + TailscaleAddr is the Tailscale address (IP or MagicDNS name) the + FleetAgent listens on. Optional; the operator does not connect to the + agent directly in v0.1 (dispatch is via the agent's CRD watch). + type: string + required: + - nodeName + type: object + status: + description: |- + status is the agent's heartbeat-driven view of its host. The + scheduler reads it; the agent writes it. + properties: + capability: + description: Capability is what this node advertises to the scheduler. 
+ properties: + accelerator: + description: Accelerator names the accelerator family available + on this node. + enum: + - metal + - cuda + - none + type: string + availableRAMGB: + description: |- + AvailableRAMGB is the FleetAgent's estimate of RAM not currently + committed to a running model or task. Refreshed on heartbeat. + format: int32 + minimum: 0 + type: integer + installedModels: + description: |- + InstalledModels is the list of Model CR names this node has locally + available (the model files are present and the runtime can load them). + items: + type: string + type: array + maxContextTokens: + description: |- + MaxContextTokens is the largest context window the loaded model + supports. Used by the scheduler to filter tasks with high + MinContextTokens requirements. + format: int32 + minimum: 0 + type: integer + tokensPerSecond: + description: |- + TokensPerSecond is a coarse decode throughput estimate. v0.1 takes + this from configuration; v0.2 will benchmark on heartbeat. + format: int32 + minimum: 0 + type: integer + totalRAMGB: + description: TotalRAMGB is the physical RAM in GiB. + format: int32 + minimum: 0 + type: integer + type: object + conditions: + description: 'Conditions track standard health signals: Ready, Draining, + etc.' + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. + format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. 
+ maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + currentTask: + description: |- + CurrentTask is the namespaced name of the AgenticTask the agent is + running, or empty if idle. The scheduler skips nodes with a non-empty + CurrentTask (v0.1 concurrency is one task per node). + type: string + lastHeartbeatTime: + description: |- + LastHeartbeatTime is the most recent heartbeat the FleetAgent + successfully patched. The reconciler marks the phase NotReady if + this stalls (default threshold: 90 seconds). + format: date-time + type: string + phase: + description: |- + Phase is the heartbeat-driven health state. 
The scheduler treats + only Ready nodes as eligible. + enum: + - Ready + - Draining + - NotReady + - Unknown + type: string + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/crd/bases/foreman.llmkube.dev_workloads.yaml b/config/crd/bases/foreman.llmkube.dev_workloads.yaml new file mode 100644 index 00000000..147588fe --- /dev/null +++ b/config/crd/bases/foreman.llmkube.dev_workloads.yaml @@ -0,0 +1,235 @@ +--- +apiVersion: apiextensions.k8s.io/v1 +kind: CustomResourceDefinition +metadata: + annotations: + controller-gen.kubebuilder.io/version: v0.19.0 + name: workloads.foreman.llmkube.dev +spec: + group: foreman.llmkube.dev + names: + kind: Workload + listKind: WorkloadList + plural: workloads + shortNames: + - wl + singular: workload + scope: Namespaced + versions: + - additionalPrinterColumns: + - jsonPath: .status.phase + name: Phase + type: string + - jsonPath: .spec.repo + name: Repo + type: string + - jsonPath: .status.succeededTasks + name: Tasks + type: integer + - jsonPath: .status.failedTasks + name: Failed + type: integer + - jsonPath: .metadata.creationTimestamp + name: Age + type: date + name: v1alpha1 + schema: + openAPIV3Schema: + description: |- + Workload is the v0.1 entrypoint to Foreman. A user creates a Workload with + a high-level intent ("fix open bugs"); the WorkloadReconciler calls a + frontier model to decompose it into a set of AgenticTask objects, which + the scheduler then dispatches across the fleet. + properties: + apiVersion: + description: |- + APIVersion defines the versioned schema of this representation of an object. + Servers should convert recognized schemas to the latest internal value, and + may reject unrecognized values. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#resources + type: string + kind: + description: |- + Kind is a string value representing the REST resource this object represents. 
+ Servers may infer this from the endpoint the client submits requests to. + Cannot be updated. + In CamelCase. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + metadata: + type: object + spec: + description: spec is the user-supplied intent. + properties: + intent: + description: |- + Intent is the natural-language description of what to do. + Example: "fix all open bugs in defilantech/LLMKube tagged size/small". + minLength: 1 + type: string + maxTasks: + description: |- + MaxTasks caps how many AgenticTasks the planner may emit. Zero means + no limit; the planner picks. Use this as a safety belt on the first + runs against a new repo or intent. + format: int32 + minimum: 0 + type: integer + plannerModel: + description: |- + PlannerModel selects the frontier model the planner should call. + Empty uses the operator's default (Anthropic Claude). The value is + a free-form identifier the planner adapter interprets, e.g. + "anthropic/claude-opus-4-7", "anthropic/claude-sonnet-4-6". + type: string + repo: + description: |- + Repo is the GitHub repo in "owner/name" form that Intent applies to. + Required for issue-fix workloads; the planner reads its open issues. + type: string + required: + - intent + type: object + status: + description: status is the planner's and scheduler's observed view. + properties: + conditions: + description: 'Conditions track standard signals: Planned, Dispatched, + Completed.' + items: + description: Condition contains details for one aspect of the current + state of this API Resource. + properties: + lastTransitionTime: + description: |- + lastTransitionTime is the last time the condition transitioned from one status to another. + This should be when the underlying condition changed. If that is not known, then using the time when the API field changed is acceptable. 
+ format: date-time + type: string + message: + description: |- + message is a human readable message indicating details about the transition. + This may be an empty string. + maxLength: 32768 + type: string + observedGeneration: + description: |- + observedGeneration represents the .metadata.generation that the condition was set based upon. + For instance, if .metadata.generation is currently 12, but the .status.conditions[x].observedGeneration is 9, the condition is out of date + with respect to the current state of the instance. + format: int64 + minimum: 0 + type: integer + reason: + description: |- + reason contains a programmatic identifier indicating the reason for the condition's last transition. + Producers of specific condition types may define expected values and meanings for this field, + and whether the values are considered a guaranteed API. + The value should be a CamelCase string. + This field may not be empty. + maxLength: 1024 + minLength: 1 + pattern: ^[A-Za-z]([A-Za-z0-9_,:]*[A-Za-z0-9_])?$ + type: string + status: + description: status of the condition, one of True, False, Unknown. + enum: + - "True" + - "False" + - Unknown + type: string + type: + description: type of condition in CamelCase or in foo.example.com/CamelCase. + maxLength: 316 + pattern: ^([a-z0-9]([-a-z0-9]*[a-z0-9])?(\.[a-z0-9]([-a-z0-9]*[a-z0-9])?)*/)?(([A-Za-z0-9][-A-Za-z0-9_.]*)?[A-Za-z0-9])$ + type: string + required: + - lastTransitionTime + - message + - reason + - status + - type + type: object + type: array + x-kubernetes-list-map-keys: + - type + x-kubernetes-list-type: map + failedTasks: + description: FailedTasks counts child tasks in phase Failed. + format: int32 + type: integer + phase: + description: Phase is the lifecycle state. + enum: + - Planning + - Planned + - Dispatched + - Completed + - Failed + type: string + plannerModel: + description: |- + PlannerModel records which frontier model the planner actually used + for this workload. 
Set after the planner runs. + type: string + succeededTasks: + description: SucceededTasks counts child tasks in phase Succeeded. + format: int32 + type: integer + tasks: + description: |- + Tasks lists the AgenticTask objects the planner emitted. They are + owner-ref'd to this Workload so they cascade-delete with it. + items: + description: ObjectReference contains enough information to let + you inspect or modify the referred object. + properties: + apiVersion: + description: API version of the referent. + type: string + fieldPath: + description: |- + If referring to a piece of an object instead of an entire object, this string + should contain a valid JSON/Go field access statement, such as desiredState.manifest.containers[2]. + For example, if the object reference is to a container within a pod, this would take on a value like: + "spec.containers{name}" (where "name" refers to the name of the container that triggered + the event) or if no container name is specified "spec.containers[2]" (container with + index 2 in this pod). This syntax is chosen only to have some well-defined way of + referencing a part of an object. + type: string + kind: + description: |- + Kind of the referent. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#types-kinds + type: string + name: + description: |- + Name of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#names + type: string + namespace: + description: |- + Namespace of the referent. + More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/namespaces/ + type: string + resourceVersion: + description: |- + Specific resourceVersion to which this reference is made, if any. + More info: https://git.k8s.io/community/contributors/devel/sig-architecture/api-conventions.md#concurrency-control-and-consistency + type: string + uid: + description: |- + UID of the referent. 
+ More info: https://kubernetes.io/docs/concepts/overview/working-with-objects/names/#uids + type: string + type: object + x-kubernetes-map-type: atomic + type: array + type: object + required: + - spec + type: object + served: true + storage: true + subresources: + status: {} diff --git a/config/rbac/role.yaml b/config/rbac/role.yaml index 12e019ae..24efdf0d 100644 --- a/config/rbac/role.yaml +++ b/config/rbac/role.yaml @@ -69,6 +69,38 @@ rules: - patch - update - watch +- apiGroups: + - foreman.llmkube.dev + resources: + - agentictasks + - fleetnodes + - workloads + verbs: + - create + - delete + - get + - list + - patch + - update + - watch +- apiGroups: + - foreman.llmkube.dev + resources: + - agentictasks/finalizers + - fleetnodes/finalizers + - workloads/finalizers + verbs: + - update +- apiGroups: + - foreman.llmkube.dev + resources: + - agentictasks/status + - fleetnodes/status + - workloads/status + verbs: + - get + - patch + - update - apiGroups: - inference.llmkube.dev resources: diff --git a/internal/foreman/controller/agentictask_controller.go b/internal/foreman/controller/agentictask_controller.go new file mode 100644 index 00000000..d864ba5f --- /dev/null +++ b/internal/foreman/controller/agentictask_controller.go @@ -0,0 +1,72 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/
+
+package controller
+
+import (
+	"context"
+
+	"k8s.io/apimachinery/pkg/runtime"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	logf "sigs.k8s.io/controller-runtime/pkg/log"
+
+	foremanv1alpha1 "github.com/defilantech/llmkube/api/foreman/v1alpha1"
+)
+
+// AgenticTaskReconciler reconciles AgenticTask objects and is the home of the
+// Foreman scheduler. Once implemented it will match Pending tasks to Ready
+// FleetNodes by capability, record the assignment on the task, and (later)
+// chain a verify child task after a successful issue-fix.
+//
+// v0.1 / M0: stub only. Reconcile reads the task and logs it; there is no
+// scheduling and no node matching until M2.
+type AgenticTaskReconciler struct {
+	client.Client
+	Scheme *runtime.Scheme
+}
+
+// +kubebuilder:rbac:groups=foreman.llmkube.dev,resources=agentictasks,verbs=get;list;watch;create;update;patch;delete
+// +kubebuilder:rbac:groups=foreman.llmkube.dev,resources=agentictasks/status,verbs=get;update;patch
+// +kubebuilder:rbac:groups=foreman.llmkube.dev,resources=agentictasks/finalizers,verbs=update
+// +kubebuilder:rbac:groups=foreman.llmkube.dev,resources=fleetnodes,verbs=get;list;watch
+
+// Reconcile fetches the AgenticTask named by req and logs its kind, phase and
+// assigned node. A NotFound error from Get is ignored (nil is returned); any
+// other Get error is returned to the caller.
+func (r *AgenticTaskReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) {
+	task := &foremanv1alpha1.AgenticTask{}
+	if err := r.Get(ctx, req.NamespacedName, task); err != nil {
+		return ctrl.Result{}, client.IgnoreNotFound(err)
+	}
+
+	logf.FromContext(ctx).Info("reconciling AgenticTask",
+		"kind", task.Spec.Kind,
+		"phase", task.Status.Phase,
+		"assignedNode", task.Status.AssignedNode,
+	)
+
+	// M0 stub: nothing else happens yet. Scheduling logic lands in M2.
+	return ctrl.Result{}, nil
+}
+
+// SetupWithManager registers r with mgr as the "agentictask" controller.
+func (r *AgenticTaskReconciler) SetupWithManager(mgr ctrl.Manager) error {
+	builder := ctrl.NewControllerManagedBy(mgr).
+		For(&foremanv1alpha1.AgenticTask{}).
+		Named("agentictask")
+	return builder.Complete(r)
+}
diff --git a/internal/foreman/controller/fleetnode_controller.go b/internal/foreman/controller/fleetnode_controller.go
new file mode 100644
index 00000000..36ca7686
--- /dev/null
+++ b/internal/foreman/controller/fleetnode_controller.go
@@ -0,0 +1,71 @@
+/*
+Copyright 2025.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License.
+*/
+
+package controller
+
+import (
+	"context"
+
+	"k8s.io/apimachinery/pkg/runtime"
+	ctrl "sigs.k8s.io/controller-runtime"
+	"sigs.k8s.io/controller-runtime/pkg/client"
+	logf "sigs.k8s.io/controller-runtime/pkg/log"
+
+	foremanv1alpha1 "github.com/defilantech/llmkube/api/foreman/v1alpha1"
+)
+
+// FleetNodeReconciler watches FleetNode objects and marks them NotReady when
+// their heartbeat goes stale. On a phase transition to NotReady it triggers
+// re-queue of any AgenticTask whose status.assignedNode points at the
+// stale node and whose phase is still Scheduled or Running.
+//
+// v0.1 / M0: stub. Heartbeat-staleness sweep lands in M1 (alongside the
+// FleetAgent that writes the heartbeats in the first place).
+type FleetNodeReconciler struct { + client.Client + Scheme *runtime.Scheme +} + +// +kubebuilder:rbac:groups=foreman.llmkube.dev,resources=fleetnodes,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=foreman.llmkube.dev,resources=fleetnodes/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=foreman.llmkube.dev,resources=fleetnodes/finalizers,verbs=update + +// Reconcile is the entry point for FleetNode events. +func (r *FleetNodeReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + log := logf.FromContext(ctx) + + var node foremanv1alpha1.FleetNode + if err := r.Get(ctx, req.NamespacedName, &node); err != nil { + return ctrl.Result{}, client.IgnoreNotFound(err) + } + + log.Info("reconciling FleetNode", + "nodeName", node.Spec.NodeName, + "phase", node.Status.Phase, + "currentTask", node.Status.CurrentTask, + ) + + // M0 stub: no-op. Heartbeat staleness check lands in M1. + return ctrl.Result{}, nil +} + +// SetupWithManager wires the reconciler into the controller-runtime manager. +func (r *FleetNodeReconciler) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&foremanv1alpha1.FleetNode{}). + Named("fleetnode"). + Complete(r) +} diff --git a/internal/foreman/controller/workload_controller.go b/internal/foreman/controller/workload_controller.go new file mode 100644 index 00000000..d00cf3bb --- /dev/null +++ b/internal/foreman/controller/workload_controller.go @@ -0,0 +1,70 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + "context" + + "k8s.io/apimachinery/pkg/runtime" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + logf "sigs.k8s.io/controller-runtime/pkg/log" + + foremanv1alpha1 "github.com/defilantech/llmkube/api/foreman/v1alpha1" +) + +// WorkloadReconciler turns a high-level Workload (a natural-language intent) +// into a set of AgenticTask objects by calling a frontier model planner. +// +// v0.1 / M0: stub. The planner client + prompt land in M6. For now the +// reconciler just reads the workload and logs. +type WorkloadReconciler struct { + client.Client + Scheme *runtime.Scheme +} + +// +kubebuilder:rbac:groups=foreman.llmkube.dev,resources=workloads,verbs=get;list;watch;create;update;patch;delete +// +kubebuilder:rbac:groups=foreman.llmkube.dev,resources=workloads/status,verbs=get;update;patch +// +kubebuilder:rbac:groups=foreman.llmkube.dev,resources=workloads/finalizers,verbs=update +// +kubebuilder:rbac:groups=foreman.llmkube.dev,resources=agentictasks,verbs=create;get;list;watch + +// Reconcile is the entry point for Workload events. +func (r *WorkloadReconciler) Reconcile(ctx context.Context, req ctrl.Request) (ctrl.Result, error) { + log := logf.FromContext(ctx) + + var workload foremanv1alpha1.Workload + if err := r.Get(ctx, req.NamespacedName, &workload); err != nil { + return ctrl.Result{}, client.IgnoreNotFound(err) + } + + log.Info("reconciling Workload", + "intent", workload.Spec.Intent, + "repo", workload.Spec.Repo, + "phase", workload.Status.Phase, + ) + + // M0 stub: no-op. Planner integration lands in M6. + return ctrl.Result{}, nil +} + +// SetupWithManager wires the reconciler into the controller-runtime manager. +func (r *WorkloadReconciler) SetupWithManager(mgr ctrl.Manager) error { + return ctrl.NewControllerManagedBy(mgr). + For(&foremanv1alpha1.Workload{}). 
+ Named("workload"). + Complete(r) +} diff --git a/scripts/sync-crds.sh b/scripts/sync-crds.sh index bc126023..d6f0bd6d 100755 --- a/scripts/sync-crds.sh +++ b/scripts/sync-crds.sh @@ -20,7 +20,9 @@ fi mkdir -p "$CRD_TARGET_DIR" synced=0 -for src in "$CRD_SOURCE_DIR"/*.yaml; do +# Scope to the inference.llmkube.dev group; other groups (e.g. foreman) have +# their own chart-sync target (`make foreman-chart-crds`). +for src in "$CRD_SOURCE_DIR"/inference.llmkube.dev_*.yaml; do # Strip kubebuilder group prefix: inference.llmkube.dev_inferenceservices.yaml → inferenceservices.yaml base="$(basename "$src")" short="${base%.*}" # strip .yaml extension From 587d9fa0876e30306656bc927ce0f5d69bc18ee3 Mon Sep 17 00:00:00 2001 From: Christopher Maher Date: Tue, 19 May 2026 17:49:15 -0700 Subject: [PATCH 2/5] feat: add foreman-agent with FleetNode self-registration and heartbeat (v0.1 M1) The Foreman node-side daemon. One foreman-agent runs per fleet host. In M1 it owns a single responsibility: keep this host's FleetNode CR present and current so the scheduler (lands in M2) can target it. Lifecycle: - on startup: upsert the FleetNode (create if missing, update spec if flag-supplied identity changed since last run); - every --heartbeat-interval (default 30s): patch FleetNode.status with phase=Ready, fresh lastHeartbeatTime, current capability snapshot; - on SIGTERM/SIGINT: best-effort drain patch (phase=Draining) so the scheduler stops dispatching to this node before the process exits. Cross-platform: - capability_darwin.go uses the metal-agent's existing DarwinMemoryProvider (sysctl hw.memsize + vm_stat) so available RAM is live, not flag-supplied. Defaults accelerator=metal. - capability_other.go is a stub for linux/amd64 so the binary builds cross-arch from day one. Live probing on Linux + NVIDIA lands at M4 when ShadowStack joins the fleet. Reuse, not modification: pkg/foreman/agent imports pkg/agent.DarwinMemoryProvider but does not touch it. 
The LLMKube metal-agent's behavior is unchanged. Flags: --fleet-node-name, --tailscale-addr, --roles, --accelerator, --installed-models, --max-context-tokens, --tokens-per-second, --total-ram-gb, --heartbeat-interval, --kube-context, --workspace-dir, --opencode-bin (last two are placeholders the M3 executor will require). --kubeconfig is auto-registered by controller-runtime's config init. New paths: - pkg/foreman/agent/fleetnode.go Registrar (Upsert/Run/PatchHeartbeat) - pkg/foreman/agent/capability.go CapabilityOptions - pkg/foreman/agent/capability_darwin.go DarwinMemoryProvider backed - pkg/foreman/agent/capability_other.go !darwin stub - cmd/foreman-agent/main.go the binary Verification on kind-llmkube-local, --heartbeat-interval=3s, 10s run: - kubectl get fleetnodes NAME PHASE ACCELERATOR RAM CURRENT TASK HEARTBEAT AGE m5-max Ready metal 22 1s 10s - status.capability.totalRAMGB=128 (live sysctl), availableRAMGB=22 (live vm_stat), installedModels=[minimax-m2-7], maxContextTokens=131072, tokensPerSecond=47. - 3 heartbeat patches over 10s, all successful. - SIGTERM produced phase=Draining; agent exited cleanly. - make test (full envtest), make lint (0 issues), go vet all clean. Signed-off-by: Christopher Maher --- cmd/foreman-agent/main.go | 244 +++++++++++++++++++++++++ pkg/foreman/agent/capability.go | 49 +++++ pkg/foreman/agent/capability_darwin.go | 78 ++++++++ pkg/foreman/agent/capability_other.go | 57 ++++++ pkg/foreman/agent/fleetnode.go | 173 ++++++++++++++++++ 5 files changed, 601 insertions(+) create mode 100644 cmd/foreman-agent/main.go create mode 100644 pkg/foreman/agent/capability.go create mode 100644 pkg/foreman/agent/capability_darwin.go create mode 100644 pkg/foreman/agent/capability_other.go create mode 100644 pkg/foreman/agent/fleetnode.go diff --git a/cmd/foreman-agent/main.go b/cmd/foreman-agent/main.go new file mode 100644 index 00000000..51b64827 --- /dev/null +++ b/cmd/foreman-agent/main.go @@ -0,0 +1,244 @@ +/* +Copyright 2025. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +// foreman-agent is the Foreman node-side daemon. One instance runs on each +// host in the fleet. In M1 it owns a single responsibility: keep the +// FleetNode CR for this host present and current (initial upsert + 30s +// heartbeat). M2+ adds the AgenticTaskWatcher and executors. +// +// Cross-platform: builds on darwin (real Metal capability) and linux/amd64 +// (stub capability for now; M4 fills it in). +package main + +import ( + "context" + "flag" + "fmt" + "math" + "os" + "os/signal" + "regexp" + "strings" + "syscall" + "time" + + "k8s.io/apimachinery/pkg/runtime" + utilruntime "k8s.io/apimachinery/pkg/util/runtime" + clientgoscheme "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + ctrl "sigs.k8s.io/controller-runtime" + "sigs.k8s.io/controller-runtime/pkg/client" + clientconfig "sigs.k8s.io/controller-runtime/pkg/client/config" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + + foremanv1alpha1 "github.com/defilantech/llmkube/api/foreman/v1alpha1" + foremanagent "github.com/defilantech/llmkube/pkg/foreman/agent" +) + +var ( + scheme = runtime.NewScheme() + setupLog = ctrl.Log.WithName("setup") +) + +func init() { + utilruntime.Must(clientgoscheme.AddToScheme(scheme)) + utilruntime.Must(foremanv1alpha1.AddToScheme(scheme)) +} + +func main() { + // Note: --kubeconfig is auto-registered by sigs.k8s.io/controller-runtime/pkg/client/config + // at import time; loadKubeconfig honors it via GetConfigWithContext. 
We + // only add --kube-context on top. + var ( + fleetNodeName string + tailscaleAddr string + kubeContext string + workspaceDir string + opencodeBin string + rolesFlag string + acceleratorFlag string + installedModels string + heartbeat time.Duration + maxCtx int + tokensPerSec int + staticTotalRAMGB int + ) + + flag.StringVar(&fleetNodeName, "fleet-node-name", "", + "Identity of this node in Foreman. Defaults to a sanitized OS hostname.") + flag.StringVar(&tailscaleAddr, "tailscale-addr", "", + "Tailscale IP or MagicDNS name this node listens on (advertised on FleetNode.spec).") + flag.StringVar(&kubeContext, "kube-context", "", + "kubeconfig context override.") + flag.StringVar(&workspaceDir, "workspace-dir", "", + "Working directory for executor scratch (clones, transcripts). Required for M3+; unused in M1.") + flag.StringVar(&opencodeBin, "opencode-bin", "", + "Path to the opencode binary. Required for M3+; unused in M1.") + flag.StringVar(&rolesFlag, "roles", "worker", + "Comma-separated roles this node serves (worker, verifier).") + flag.StringVar(&acceleratorFlag, "accelerator", "", + "Accelerator label override. 
Defaults to metal on darwin; required on linux in v0.1.") + flag.StringVar(&installedModels, "installed-models", "", + "Comma-separated Model CR names this node has cached locally.") + flag.DurationVar(&heartbeat, "heartbeat-interval", foremanagent.DefaultHeartbeatInterval, + "How often to patch FleetNode.status with a fresh heartbeat.") + flag.IntVar(&maxCtx, "max-context-tokens", 0, + "Advertised max context window in tokens (0 = unset).") + flag.IntVar(&tokensPerSec, "tokens-per-second", 0, + "Advertised decode throughput in tok/s (0 = unset).") + flag.IntVar(&staticTotalRAMGB, "total-ram-gb", 0, + "Advertised total RAM on platforms without live memory probing (non-darwin only).") + + opts := zap.Options{Development: true} + opts.BindFlags(flag.CommandLine) + flag.Parse() + ctrl.SetLogger(zap.New(zap.UseFlagOptions(&opts))) + + if fleetNodeName == "" { + host, err := os.Hostname() + if err != nil || host == "" { + setupLog.Error(err, "--fleet-node-name is required; OS hostname unavailable") + os.Exit(1) + } + fleetNodeName = sanitizeName(host) + } else { + // User-supplied name still needs to be a valid DNS-1123 label. 
+ clean := sanitizeName(fleetNodeName) + if clean != fleetNodeName { + setupLog.Info("fleet-node-name sanitized for DNS-1123 compliance", + "input", fleetNodeName, "result", clean) + fleetNodeName = clean + } + } + + if workspaceDir == "" && opencodeBin == "" { + setupLog.Info("running in M1 mode: no executor wired yet; --workspace-dir / --opencode-bin become required at M3") + } + + cfg, err := loadKubeconfig(kubeContext) + if err != nil { + setupLog.Error(err, "failed to load kubeconfig") + os.Exit(1) + } + + kc, err := client.New(cfg, client.Options{Scheme: scheme}) + if err != nil { + setupLog.Error(err, "failed to construct kubernetes client") + os.Exit(1) + } + + spec := foremanv1alpha1.FleetNodeSpec{ + NodeName: fleetNodeName, + TailscaleAddr: tailscaleAddr, + Roles: splitCSV(rolesFlag), + } + + provider := foremanagent.NewCapability(foremanagent.CapabilityOptions{ + Accelerator: foremanv1alpha1.FleetNodeAccelerator(acceleratorFlag), + InstalledModels: splitCSV(installedModels), + MaxContextTokens: clampInt32(maxCtx), + TokensPerSecond: clampInt32(tokensPerSec), + StaticTotalRAMGB: clampInt32(staticTotalRAMGB), + }) + + reg := &foremanagent.Registrar{ + Client: kc, + NodeName: fleetNodeName, + Spec: spec, + Provider: provider, + Interval: heartbeat, + } + + ctx, cancel := signal.NotifyContext(context.Background(), syscall.SIGINT, syscall.SIGTERM) + defer cancel() + + if err := reg.Upsert(ctx); err != nil { + setupLog.Error(err, "failed to upsert FleetNode") + os.Exit(1) + } + + cap := provider.Capability() + setupLog.Info("foreman-agent started", + "fleetNode", fleetNodeName, + "tailscaleAddr", tailscaleAddr, + "roles", spec.Roles, + "accelerator", cap.Accelerator, + "totalRAMGB", cap.TotalRAMGB, + "heartbeat", heartbeat.String(), + ) + + if err := reg.Run(ctx); err != nil { + setupLog.Error(err, "registrar exited with error") + os.Exit(1) + } + setupLog.Info("foreman-agent stopped cleanly") +} + +// loadKubeconfig defers to controller-runtime's standard 
// splitCSV splits a comma-separated flag value into its entries, trimming
// surrounding whitespace and dropping empty entries. It returns nil when no
// entry remains, whether the input was empty or held only separators and
// whitespace, so callers see a single "absent" representation.
func splitCSV(s string) []string {
	if s == "" {
		return nil
	}
	parts := strings.Split(s, ",")
	// out stays nil until the first real entry is appended.
	var out []string
	for _, p := range parts {
		p = strings.TrimSpace(p)
		if p == "" {
			continue
		}
		out = append(out, p)
	}
	return out
}

// clampInt32 narrows a user-supplied int flag to int32, treating negatives
// as 0 and saturating at math.MaxInt32 so the CRD's int32 field is always
// in range.
func clampInt32(n int) int32 {
	if n < 0 {
		return 0
	}
	if n > math.MaxInt32 {
		return math.MaxInt32
	}
	return int32(n) //nolint:gosec // bounded above
}

// dns1123Bad matches every run of characters other than lowercase ASCII
// letters, digits and '-'.
var dns1123Bad = regexp.MustCompile(`[^a-z0-9-]+`)

// sanitizeName lowercases s, replaces each run of characters outside
// [a-z0-9-] with a single '-', and trims leading and trailing '-'. An empty
// result becomes "fleetnode". Results longer than 63 bytes are cut to 63 and
// any trailing '-' left by the cut is removed.
func sanitizeName(s string) string {
	s = strings.ToLower(s)
	s = dns1123Bad.ReplaceAllString(s, "-")
	s = strings.Trim(s, "-")
	if s == "" {
		return "fleetnode"
	}
	if len(s) > 63 {
		// Byte slicing is safe: only ASCII survives the replacement above.
		s = strings.TrimRight(s[:63], "-")
	}
	return s
}
// CapabilityOptions is the cross-platform constructor input for
// NewCapability. Fields the OS can probe (Total / Available RAM on
// darwin) are read at heartbeat time and not duplicated here.
type CapabilityOptions struct {
	// Accelerator labels the host's accelerator family. The darwin
	// constructor substitutes "metal" when this is empty; the non-darwin
	// constructor passes the value through unchanged, so it must be set
	// explicitly there in v0.1 (M4 will probe NVIDIA + similar).
	Accelerator foremanv1alpha1.FleetNodeAccelerator

	// InstalledModels are the Model CR names this node can load.
	InstalledModels []string

	// MaxContextTokens is the largest context window the loaded model
	// supports. Advertised verbatim; the scheduler is intended to use it
	// to filter AgenticTasks whose RequiredCapability.MinContextTokens
	// exceeds it.
	MaxContextTokens int32

	// TokensPerSecond is the operator's coarse decode-throughput estimate
	// for the loaded model. v0.1 takes this as a flag; v0.2 will
	// benchmark on heartbeat.
	TokensPerSecond int32

	// StaticTotalRAMGB is used only on platforms where memory probing
	// is not yet implemented (non-darwin in v0.1), where it is advertised
	// as both total and available RAM. On darwin it is ignored in favor
	// of the live sysctl value.
	StaticTotalRAMGB int32
}
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package agent + +import ( + "math" + + foremanv1alpha1 "github.com/defilantech/llmkube/api/foreman/v1alpha1" + llmkubeagent "github.com/defilantech/llmkube/pkg/agent" +) + +// darwinCapability advertises the host as a Metal worker. Total / available +// RAM come from sysctl + vm_stat via the LLMKube metal-agent's existing +// DarwinMemoryProvider; the rest are flag-supplied for v0.1 (M5 will +// derive MaxContextTokens / TokensPerSecond from loaded model metadata). +type darwinCapability struct { + mem *llmkubeagent.DarwinMemoryProvider + models []string + maxContext int32 + tokensPerSec int32 + acceleratorLabel foremanv1alpha1.FleetNodeAccelerator +} + +// NewCapability is the cross-platform constructor. On darwin it returns a +// Metal capability provider backed by the existing metal-agent memory +// probes; on other platforms the build-tagged sibling returns a stub. 
+func NewCapability(opts CapabilityOptions) CapabilityProvider { + acc := opts.Accelerator + if acc == "" { + acc = "metal" + } + return &darwinCapability{ + mem: &llmkubeagent.DarwinMemoryProvider{}, + models: opts.InstalledModels, + maxContext: opts.MaxContextTokens, + tokensPerSec: opts.TokensPerSecond, + acceleratorLabel: acc, + } +} + +func (d *darwinCapability) Capability() foremanv1alpha1.FleetNodeCapability { + totalB, _ := d.mem.TotalMemory() + availB, _ := d.mem.AvailableMemory() + return foremanv1alpha1.FleetNodeCapability{ + Accelerator: d.acceleratorLabel, + TotalRAMGB: bytesToGB(totalB), + AvailableRAMGB: bytesToGB(availB), + InstalledModels: d.models, + MaxContextTokens: d.maxContext, + TokensPerSecond: d.tokensPerSec, + } +} + +// bytesToGB safely narrows a byte count to int32 gigabytes. A 2.1 EB host +// would saturate the field; real machines clear int32 by orders of magnitude. +func bytesToGB(b uint64) int32 { + gb := b / (1024 * 1024 * 1024) + if gb > math.MaxInt32 { + return math.MaxInt32 + } + return int32(gb) //nolint:gosec // bounded above +} diff --git a/pkg/foreman/agent/capability_other.go b/pkg/foreman/agent/capability_other.go new file mode 100644 index 00000000..9f265d6b --- /dev/null +++ b/pkg/foreman/agent/capability_other.go @@ -0,0 +1,57 @@ +//go:build !darwin + +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package agent + +import ( + foremanv1alpha1 "github.com/defilantech/llmkube/api/foreman/v1alpha1" +) + +// stubCapability is the v0.1 non-darwin fallback so the foreman-agent +// builds cross-platform. Dynamic memory probing on Linux / NVIDIA lands +// in M4 when ShadowStack joins the fleet; for now this advertises only +// the static fields the operator passes via flags. +type stubCapability struct { + models []string + maxContext int32 + tokensPerSec int32 + totalRAMGB int32 + acceleratorLabel foremanv1alpha1.FleetNodeAccelerator +} + +// NewCapability returns the build-tagged provider. +func NewCapability(opts CapabilityOptions) CapabilityProvider { + return &stubCapability{ + models: opts.InstalledModels, + maxContext: opts.MaxContextTokens, + tokensPerSec: opts.TokensPerSecond, + totalRAMGB: opts.StaticTotalRAMGB, + acceleratorLabel: opts.Accelerator, + } +} + +func (s *stubCapability) Capability() foremanv1alpha1.FleetNodeCapability { + return foremanv1alpha1.FleetNodeCapability{ + Accelerator: s.acceleratorLabel, + TotalRAMGB: s.totalRAMGB, + AvailableRAMGB: s.totalRAMGB, + InstalledModels: s.models, + MaxContextTokens: s.maxContext, + TokensPerSecond: s.tokensPerSec, + } +} diff --git a/pkg/foreman/agent/fleetnode.go b/pkg/foreman/agent/fleetnode.go new file mode 100644 index 00000000..d6d4f03d --- /dev/null +++ b/pkg/foreman/agent/fleetnode.go @@ -0,0 +1,173 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
// DefaultHeartbeatInterval is the v0.1 cadence. The scheduler treats a
// FleetNode as stale (NotReady) when LastHeartbeatTime is older than
// roughly 3x this interval, so 30s gives a ~90s detection window.
const DefaultHeartbeatInterval = 30 * time.Second

// CapabilityProvider returns the current capability profile of this node.
// Implementations must be cheap (called on every heartbeat) and must not
// panic; on transient failure, return a zero or last-known value.
type CapabilityProvider interface {
	// Capability returns the snapshot written to FleetNode.status.capability.
	Capability() foremanv1alpha1.FleetNodeCapability
}

// Registrar owns the FleetNode CR for this host: upserts it on startup,
// patches its status every heartbeat, and patches phase=Draining on
// clean shutdown so the scheduler stops routing tasks to us promptly.
type Registrar struct {
	// Client performs the Get / Create / Update / status Patch calls.
	Client   client.Client
	// NodeName is the FleetNode object name; lookups use it with no namespace.
	NodeName string
	// Spec is the desired FleetNode spec, written on create and whenever
	// the stored spec differs.
	Spec     foremanv1alpha1.FleetNodeSpec
	// Provider is called on every heartbeat and is not nil-checked, so it
	// must be set.
	Provider CapabilityProvider
	Interval time.Duration // zero or negative falls back to DefaultHeartbeatInterval
}

// Upsert creates the FleetNode if missing, otherwise updates its Spec so
// flag changes between agent restarts take effect immediately.
+func (r *Registrar) Upsert(ctx context.Context) error { + log := logf.FromContext(ctx) + key := types.NamespacedName{Name: r.NodeName} + + var existing foremanv1alpha1.FleetNode + err := r.Client.Get(ctx, key, &existing) + switch { + case apierrors.IsNotFound(err): + node := &foremanv1alpha1.FleetNode{ + ObjectMeta: metav1.ObjectMeta{Name: r.NodeName}, + Spec: r.Spec, + } + if err := r.Client.Create(ctx, node); err != nil { + return fmt.Errorf("create FleetNode %q: %w", r.NodeName, err) + } + log.Info("created FleetNode", "name", r.NodeName) + return nil + case err != nil: + return fmt.Errorf("get FleetNode %q: %w", r.NodeName, err) + } + + // Update spec only if it actually changed; avoids touch noise on + // every restart with identical flags. + if specEqual(existing.Spec, r.Spec) { + log.Info("FleetNode spec unchanged", "name", r.NodeName) + return nil + } + existing.Spec = r.Spec + if err := r.Client.Update(ctx, &existing); err != nil { + return fmt.Errorf("update FleetNode %q spec: %w", r.NodeName, err) + } + log.Info("updated FleetNode spec", "name", r.NodeName) + return nil +} + +// PatchHeartbeat patches the FleetNode's status with a fresh heartbeat +// time, the current phase, and the latest capability snapshot. Uses a +// merge patch so concurrent edits to other status fields by future +// reconcilers (M2+) do not conflict. 
+func (r *Registrar) PatchHeartbeat(ctx context.Context, phase foremanv1alpha1.FleetNodePhase) error { + var node foremanv1alpha1.FleetNode + if err := r.Client.Get(ctx, types.NamespacedName{Name: r.NodeName}, &node); err != nil { + return fmt.Errorf("get FleetNode for heartbeat: %w", err) + } + patch := client.MergeFrom(node.DeepCopy()) + now := metav1.Now() + node.Status.Phase = phase + node.Status.LastHeartbeatTime = &now + node.Status.Capability = r.Provider.Capability() + if err := r.Client.Status().Patch(ctx, &node, patch); err != nil { + return fmt.Errorf("patch FleetNode status: %w", err) + } + return nil +} + +// Run blocks, heartbeating every Interval until ctx is cancelled. On +// cancellation it makes a best-effort drain patch (phase=Draining) so +// the scheduler stops dispatching to us before the process exits. +func (r *Registrar) Run(ctx context.Context) error { + log := logf.FromContext(ctx) + if err := r.PatchHeartbeat(ctx, foremanv1alpha1.FleetNodePhaseReady); err != nil { + return fmt.Errorf("initial heartbeat: %w", err) + } + log.Info("FleetNode Ready", "name", r.NodeName) + + interval := r.Interval + if interval <= 0 { + interval = DefaultHeartbeatInterval + } + ticker := time.NewTicker(interval) + defer ticker.Stop() + + for { + select { + case <-ctx.Done(): + // Best-effort drain. A short fresh timeout keeps us from + // hanging on a dead apiserver during shutdown. + drainCtx, cancel := context.WithTimeout(context.Background(), 5*time.Second) + err := r.PatchHeartbeat(drainCtx, foremanv1alpha1.FleetNodePhaseDraining) + cancel() + if err != nil { + log.Error(err, "drain heartbeat failed") + } else { + log.Info("FleetNode Draining", "name", r.NodeName) + } + return nil + case <-ticker.C: + if err := r.PatchHeartbeat(ctx, foremanv1alpha1.FleetNodePhaseReady); err != nil { + // Don't return on transient errors; the next tick can + // recover. 
A persistent failure is visible via stale + // LastHeartbeatTime, which is exactly the staleness + // signal the scheduler uses anyway. + log.Error(err, "heartbeat patch failed; will retry") + } + } + } +} + +func specEqual(a, b foremanv1alpha1.FleetNodeSpec) bool { + if a.NodeName != b.NodeName || a.TailscaleAddr != b.TailscaleAddr { + return false + } + if len(a.Roles) != len(b.Roles) { + return false + } + for i := range a.Roles { + if a.Roles[i] != b.Roles[i] { + return false + } + } + return true +} From d1201350290d5ab7ed81d6d146aa3a801044849f Mon Sep 17 00:00:00 2001 From: Christopher Maher Date: Wed, 20 May 2026 00:17:31 -0700 Subject: [PATCH 3/5] test(foreman): add unit + envtest coverage for M0+M1 PR #501 shipped scaffolding (CRDs, foreman-operator, foreman-agent Registrar, capability providers) without test coverage, on the plan that 'unit + envtest coverage lands with M2'. This commit fronts that work onto #501 so M0+M1 ship with proper tests instead of a deferred promise. Refs #500. Coverage delta on the new packages: pkg/foreman/agent 0.0% -> 87.0% internal/foreman/controller 0.0% -> 85.7% What's covered: cmd/foreman-agent/main_test.go (stdlib testing.T): - clampInt32: negative/zero/MaxInt32-bound/overflow paths. - sanitizeName: DNS-1123 cleanup (lowercase, invalid-char collapse, leading/trailing hyphen trim, empty-and-all-invalid fallback, 63-char truncation, macOS '.local' hostname case). - splitCSV: empty / single / multi / whitespace / empty-entries / separator-only cases. Found a real inconsistency along the way: empty input returned nil but separator-only returned []string{}; splitCSV now collapses both to nil so the FleetNodeSpec.Roles and CapabilityOptions.InstalledModels fields see one 'absent' representation. No external callers depended on the distinction. pkg/foreman/agent/capability_darwin_test.go (//go:build darwin): - bytesToGB: zero, sub-1GB rounding, 36GB/128GB sanity, MaxInt32 edge, and uint64-max saturation. 
- NewCapability: default-metal accelerator, explicit override honored, flag-supplied InstalledModels/MaxContextTokens/ TokensPerSecond propagation. - Live memory probe sanity: TotalRAMGB > 0 and AvailableRAMGB <= TotalRAMGB on a real Darwin host; skip if sysctl unavailable (CI sandbox). pkg/foreman/agent/capability_other_test.go (//go:build !darwin): - Stub provider propagates all flag-supplied fields. - AvailableRAMGB == StaticTotalRAMGB in v0.1 until M4 wires up live Linux probing. - Empty Accelerator is preserved (no silent default on non-darwin). pkg/foreman/agent/fleetnode_test.go (stdlib + fake client): - specEqual: 7 table-driven cases including role-ordering sensitivity. - Registrar.Upsert: creates if missing; updates if spec changed; no-ops (no resourceVersion bump) if spec identical. - Registrar.PatchHeartbeat: writes phase, fresh LastHeartbeatTime, full Capability snapshot. - Registrar.Run: heartbeats while running; drains (phase=Draining) on ctx cancel; exits cleanly within 2s. internal/foreman/controller/suite_test.go (Ginkgo + envtest): - Mirrors internal/controller/suite_test.go: BeforeSuite starts envtest, loads config/crd/bases/, registers foremanv1alpha1 into scheme. AfterSuite tears down. - Same getFirstFoundEnvTestBinaryDir helper for IDE-run support. internal/foreman/controller/{agentictask,workload,fleetnode}_controller_test.go: - Stub-smoke contracts: each M0/M1 reconciler is exercised against a real apiserver and must (1) return no error for missing resources, (2) reconcile an existing resource without erroring, (3) leave .status unmutated. M2 deliberately breaks the agentictask contract with a corresponding test update. CI: no .github/workflows/*.yml changes needed. The existing test.yml (.github/workflows/test.yml) runs make test, which globs the foreman packages automatically. 
Signed-off-by: Christopher Maher --- cmd/foreman-agent/main.go | 7 + cmd/foreman-agent/main_test.go | 104 ++++++ .../controller/agentictask_controller_test.go | 81 +++++ .../controller/fleetnode_controller_test.go | 73 +++++ internal/foreman/controller/suite_test.go | 111 +++++++ .../controller/workload_controller_test.go | 73 +++++ pkg/foreman/agent/capability_darwin_test.go | 103 ++++++ pkg/foreman/agent/capability_other_test.go | 69 ++++ pkg/foreman/agent/fleetnode_test.go | 295 ++++++++++++++++++ 9 files changed, 916 insertions(+) create mode 100644 cmd/foreman-agent/main_test.go create mode 100644 internal/foreman/controller/agentictask_controller_test.go create mode 100644 internal/foreman/controller/fleetnode_controller_test.go create mode 100644 internal/foreman/controller/suite_test.go create mode 100644 internal/foreman/controller/workload_controller_test.go create mode 100644 pkg/foreman/agent/capability_darwin_test.go create mode 100644 pkg/foreman/agent/capability_other_test.go create mode 100644 pkg/foreman/agent/fleetnode_test.go diff --git a/cmd/foreman-agent/main.go b/cmd/foreman-agent/main.go index 51b64827..a34ff581 100644 --- a/cmd/foreman-agent/main.go +++ b/cmd/foreman-agent/main.go @@ -212,6 +212,13 @@ func splitCSV(s string) []string { } out = append(out, p) } + if len(out) == 0 { + // Separator-only or whitespace-only input is functionally + // the same as empty input; collapse to nil so callers + // (FleetNodeSpec.Roles, CapabilityOptions.InstalledModels) see + // a single "absent" representation rather than nil vs []string{}. + return nil + } return out } diff --git a/cmd/foreman-agent/main_test.go b/cmd/foreman-agent/main_test.go new file mode 100644 index 00000000..2f8e2a08 --- /dev/null +++ b/cmd/foreman-agent/main_test.go @@ -0,0 +1,104 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package main + +import ( + "math" + "reflect" + "strings" + "testing" +) + +func TestClampInt32(t *testing.T) { + cases := []struct { + name string + in int + want int32 + }{ + {"negative_clamps_to_zero", -1, 0}, + {"min_int_clamps_to_zero", math.MinInt32, 0}, + {"zero_stays_zero", 0, 0}, + {"small_positive_passes_through", 47, 47}, + {"context_size_passes_through", 131072, 131072}, + {"exactly_max_int32_passes_through", math.MaxInt32, math.MaxInt32}, + {"over_max_int32_saturates", int(math.MaxInt32) + 1, math.MaxInt32}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got := clampInt32(tc.in) + if got != tc.want { + t.Errorf("clampInt32(%d) = %d, want %d", tc.in, got, tc.want) + } + }) + } +} + +func TestSanitizeName(t *testing.T) { + cases := []struct { + name string + in string + want string + }{ + {"already_clean", "m5-max", "m5-max"}, + {"uppercase_lowercased", "M5-MAX", "m5-max"}, + {"dots_become_hyphens", "m5.max.local", "m5-max-local"}, + {"underscores_become_hyphens", "m5_max", "m5-max"}, + {"runs_of_invalid_collapse", "m5...max", "m5-max"}, + {"leading_hyphen_trimmed", "-m5-max", "m5-max"}, + {"trailing_hyphen_trimmed", "m5-max-", "m5-max"}, + {"both_ends_trimmed", "---m5-max---", "m5-max"}, + {"empty_falls_back_to_fleetnode", "", "fleetnode"}, + {"only_invalid_falls_back", "...", "fleetnode"}, + {"too_long_truncated_to_63", strings.Repeat("a", 100), strings.Repeat("a", 63)}, + {"hostname_with_macos_local_suffix", "Christophers-MacBook-Pro.local", "christophers-macbook-pro-local"}, + } + for _, tc := range cases { + t.Run(tc.name, 
func(t *testing.T) { + got := sanitizeName(tc.in) + if got != tc.want { + t.Errorf("sanitizeName(%q) = %q, want %q", tc.in, got, tc.want) + } + if len(got) > 63 { + t.Errorf("sanitizeName returned %d chars; DNS-1123 cap is 63", len(got)) + } + }) + } +} + +func TestSplitCSV(t *testing.T) { + cases := []struct { + name string + in string + want []string + }{ + {"empty_returns_nil", "", nil}, + {"single_value", "worker", []string{"worker"}}, + {"two_values", "worker,verifier", []string{"worker", "verifier"}}, + {"three_values_with_whitespace", "worker, verifier , gate", []string{"worker", "verifier", "gate"}}, + {"empty_entries_skipped", "worker,,verifier,", []string{"worker", "verifier"}}, + {"only_separators_returns_nil", ",,,", nil}, + {"whitespace_only_entries_skipped", "worker, ,verifier", []string{"worker", "verifier"}}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got := splitCSV(tc.in) + if !reflect.DeepEqual(got, tc.want) { + t.Errorf("splitCSV(%q) = %v, want %v", tc.in, got, tc.want) + } + }) + } +} diff --git a/internal/foreman/controller/agentictask_controller_test.go b/internal/foreman/controller/agentictask_controller_test.go new file mode 100644 index 00000000..d4aba429 --- /dev/null +++ b/internal/foreman/controller/agentictask_controller_test.go @@ -0,0 +1,81 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + . "github.com/onsi/ginkgo/v2" + . 
"github.com/onsi/gomega" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + + foremanv1alpha1 "github.com/defilantech/llmkube/api/foreman/v1alpha1" +) + +// M0/M1 ship the AgenticTaskReconciler as a logging stub: it reads the +// object and returns without touching status. These smoke tests pin +// that contract so future M2 evolution either preserves it (until M2 +// merges) or breaks it intentionally with a deliberate test update. + +var _ = Describe("AgenticTaskReconciler (M0 stub)", func() { + var reconciler *AgenticTaskReconciler + + BeforeEach(func() { + reconciler = &AgenticTaskReconciler{ + Client: k8sClient, + Scheme: k8sClient.Scheme(), + } + }) + + It("returns no error and no requeue when the task is not found", func() { + res, err := reconciler.Reconcile(ctx, ctrl.Request{ + NamespacedName: types.NamespacedName{Namespace: "default", Name: "no-such-task"}, + }) + Expect(err).NotTo(HaveOccurred()) + Expect(res.Requeue).To(BeFalse()) + Expect(res.RequeueAfter).To(BeZero()) + }) + + It("reconciles an existing task without erroring and without mutating status (M0 stub)", func() { + task := &foremanv1alpha1.AgenticTask{ + ObjectMeta: metav1.ObjectMeta{Name: "stub-smoke", Namespace: "default"}, + Spec: foremanv1alpha1.AgenticTaskSpec{ + Kind: foremanv1alpha1.AgenticTaskKindFreeform, + Payload: foremanv1alpha1.AgenticTaskPayload{Prompt: "stub-smoke"}, + }, + } + Expect(k8sClient.Create(ctx, task)).To(Succeed()) + DeferCleanup(func() { + _ = k8sClient.Delete(ctx, task) + }) + + _, err := reconciler.Reconcile(ctx, ctrl.Request{ + NamespacedName: types.NamespacedName{Namespace: "default", Name: "stub-smoke"}, + }) + Expect(err).NotTo(HaveOccurred()) + + var fresh foremanv1alpha1.AgenticTask + Expect(k8sClient.Get(ctx, types.NamespacedName{Namespace: "default", Name: "stub-smoke"}, &fresh)).To(Succeed()) + + // M0 stub leaves status untouched. M2 replaces this contract. 
+ Expect(string(fresh.Status.Phase)).To(BeEmpty()) + Expect(fresh.Status.AssignedNode).To(BeEmpty()) + Expect(fresh.Status.Verdict).To(BeEquivalentTo("")) + Expect(fresh.Status.Result).To(BeNil()) + }) +}) diff --git a/internal/foreman/controller/fleetnode_controller_test.go b/internal/foreman/controller/fleetnode_controller_test.go new file mode 100644 index 00000000..d2ee4241 --- /dev/null +++ b/internal/foreman/controller/fleetnode_controller_test.go @@ -0,0 +1,73 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + + foremanv1alpha1 "github.com/defilantech/llmkube/api/foreman/v1alpha1" +) + +// M0/M1 ship the FleetNodeReconciler as a logging stub. The +// stale-heartbeat -> NotReady logic lands in M2. These smoke tests pin +// the no-mutation contract today. 
+ +var _ = Describe("FleetNodeReconciler (M0 stub)", func() { + var reconciler *FleetNodeReconciler + + BeforeEach(func() { + reconciler = &FleetNodeReconciler{ + Client: k8sClient, + Scheme: k8sClient.Scheme(), + } + }) + + It("returns no error when the FleetNode is not found", func() { + _, err := reconciler.Reconcile(ctx, ctrl.Request{ + NamespacedName: types.NamespacedName{Name: "absent"}, + }) + Expect(err).NotTo(HaveOccurred()) + }) + + It("reconciles an existing FleetNode without mutating status (M0 stub)", func() { + fn := &foremanv1alpha1.FleetNode{ + ObjectMeta: metav1.ObjectMeta{Name: "stub-fleetnode"}, + Spec: foremanv1alpha1.FleetNodeSpec{ + NodeName: "stub-fleetnode", + Roles: []string{"worker"}, + }, + } + Expect(k8sClient.Create(ctx, fn)).To(Succeed()) + DeferCleanup(func() { + _ = k8sClient.Delete(ctx, fn) + }) + + _, err := reconciler.Reconcile(ctx, ctrl.Request{ + NamespacedName: types.NamespacedName{Name: "stub-fleetnode"}, + }) + Expect(err).NotTo(HaveOccurred()) + + var fresh foremanv1alpha1.FleetNode + Expect(k8sClient.Get(ctx, types.NamespacedName{Name: "stub-fleetnode"}, &fresh)).To(Succeed()) + Expect(string(fresh.Status.Phase)).To(BeEmpty()) + }) +}) diff --git a/internal/foreman/controller/suite_test.go b/internal/foreman/controller/suite_test.go new file mode 100644 index 00000000..0d1e32a4 --- /dev/null +++ b/internal/foreman/controller/suite_test.go @@ -0,0 +1,111 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +// Package controller contains the foreman API group's reconcilers. This +// file hosts their in-package envtest suite (not package controller_test). The +// shape mirrors internal/controller/suite_test.go so the project has a +// single recognizable testing pattern for both API groups. +package controller + +import ( + "context" + "os" + "path/filepath" + "testing" + + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + "k8s.io/client-go/kubernetes/scheme" + "k8s.io/client-go/rest" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/envtest" + logf "sigs.k8s.io/controller-runtime/pkg/log" + "sigs.k8s.io/controller-runtime/pkg/log/zap" + + foremanv1alpha1 "github.com/defilantech/llmkube/api/foreman/v1alpha1" +) + +// These tests use Ginkgo (BDD-style Go testing framework). Refer to +// http://onsi.github.io/ginkgo/ to learn more about Ginkgo. + +var ( + ctx context.Context + cancel context.CancelFunc + testEnv *envtest.Environment + cfg *rest.Config + k8sClient client.Client +) + +func TestForemanControllers(t *testing.T) { + RegisterFailHandler(Fail) + RunSpecs(t, "Foreman Controller Suite") +} + +var _ = BeforeSuite(func() { + logf.SetLogger(zap.New(zap.WriteTo(GinkgoWriter), zap.UseDevMode(true))) + + ctx, cancel = context.WithCancel(context.TODO()) + + err := foremanv1alpha1.AddToScheme(scheme.Scheme) + Expect(err).NotTo(HaveOccurred()) + + By("bootstrapping test environment with foreman CRDs") + testEnv = &envtest.Environment{ + CRDDirectoryPaths: []string{ + filepath.Join("..", "..", "..", "config", "crd", "bases"), + }, + ErrorIfCRDPathMissing: true, + } + + if dir := getFirstFoundEnvTestBinaryDir(); dir != "" { + testEnv.BinaryAssetsDirectory = dir + } + + cfg, err = testEnv.Start() + Expect(err).NotTo(HaveOccurred()) + Expect(cfg).NotTo(BeNil()) + + k8sClient, err = client.New(cfg, client.Options{Scheme: scheme.Scheme}) + Expect(err).NotTo(HaveOccurred()) + Expect(k8sClient).NotTo(BeNil()) +}) + +var _ =
AfterSuite(func() { + By("tearing down the test environment") + cancel() + err := testEnv.Stop() + Expect(err).NotTo(HaveOccurred()) +}) + +// getFirstFoundEnvTestBinaryDir mirrors the helper in +// internal/controller/suite_test.go: when running tests from an IDE +// without the Makefile, locate the kube-apiserver / etcd binaries +// `make setup-envtest` placed under bin/k8s/. +func getFirstFoundEnvTestBinaryDir() string { + basePath := filepath.Join("..", "..", "..", "bin", "k8s") + entries, err := os.ReadDir(basePath) + if err != nil { + logf.Log.Error(err, "Failed to read directory", "path", basePath) + return "" + } + for _, entry := range entries { + if entry.IsDir() { + return filepath.Join(basePath, entry.Name()) + } + } + return "" +} diff --git a/internal/foreman/controller/workload_controller_test.go b/internal/foreman/controller/workload_controller_test.go new file mode 100644 index 00000000..ae17dd72 --- /dev/null +++ b/internal/foreman/controller/workload_controller_test.go @@ -0,0 +1,73 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package controller + +import ( + . "github.com/onsi/ginkgo/v2" + . "github.com/onsi/gomega" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/types" + ctrl "sigs.k8s.io/controller-runtime" + + foremanv1alpha1 "github.com/defilantech/llmkube/api/foreman/v1alpha1" +) + +// M0 ships the WorkloadReconciler as a logging stub. Real planner logic +// lands in M6. 
These smoke tests pin the stub's no-mutation contract. + +var _ = Describe("WorkloadReconciler (M0 stub)", func() { + var reconciler *WorkloadReconciler + + BeforeEach(func() { + reconciler = &WorkloadReconciler{ + Client: k8sClient, + Scheme: k8sClient.Scheme(), + } + }) + + It("returns no error when the Workload is not found", func() { + _, err := reconciler.Reconcile(ctx, ctrl.Request{ + NamespacedName: types.NamespacedName{Namespace: "default", Name: "absent"}, + }) + Expect(err).NotTo(HaveOccurred()) + }) + + It("reconciles an existing Workload without mutating status", func() { + wl := &foremanv1alpha1.Workload{ + ObjectMeta: metav1.ObjectMeta{Name: "stub-workload", Namespace: "default"}, + Spec: foremanv1alpha1.WorkloadSpec{ + Intent: "smoke test", + Repo: "defilantech/LLMKube", + }, + } + Expect(k8sClient.Create(ctx, wl)).To(Succeed()) + DeferCleanup(func() { + _ = k8sClient.Delete(ctx, wl) + }) + + _, err := reconciler.Reconcile(ctx, ctrl.Request{ + NamespacedName: types.NamespacedName{Namespace: "default", Name: "stub-workload"}, + }) + Expect(err).NotTo(HaveOccurred()) + + var fresh foremanv1alpha1.Workload + Expect(k8sClient.Get(ctx, types.NamespacedName{Namespace: "default", Name: "stub-workload"}, &fresh)).To(Succeed()) + Expect(string(fresh.Status.Phase)).To(BeEmpty()) + Expect(fresh.Status.Tasks).To(BeEmpty()) + }) +}) diff --git a/pkg/foreman/agent/capability_darwin_test.go b/pkg/foreman/agent/capability_darwin_test.go new file mode 100644 index 00000000..94eb1f16 --- /dev/null +++ b/pkg/foreman/agent/capability_darwin_test.go @@ -0,0 +1,103 @@ +//go:build darwin + +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package agent + +import ( + "math" + "testing" + + foremanv1alpha1 "github.com/defilantech/llmkube/api/foreman/v1alpha1" +) + +func TestBytesToGB(t *testing.T) { + const gb = uint64(1024 * 1024 * 1024) + cases := []struct { + name string + in uint64 + want int32 + }{ + {"zero", 0, 0}, + {"sub_gb_rounds_down", gb / 2, 0}, + {"exactly_1gb", gb, 1}, + {"36gb_mac_studio", 36 * gb, 36}, + {"128gb_m5_max", 128 * gb, 128}, + {"max_int32_gb_passes_through", uint64(math.MaxInt32) * gb, math.MaxInt32}, + {"saturates_at_max_int32", math.MaxUint64, math.MaxInt32}, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got := bytesToGB(tc.in) + if got != tc.want { + t.Errorf("bytesToGB(%d) = %d, want %d", tc.in, got, tc.want) + } + }) + } +} + +func TestNewCapability_Darwin_DefaultsAcceleratorToMetal(t *testing.T) { + p := NewCapability(CapabilityOptions{}) + cap := p.Capability() + if string(cap.Accelerator) != "metal" { + t.Errorf("default accelerator on darwin = %q, want %q", cap.Accelerator, "metal") + } +} + +func TestNewCapability_Darwin_HonorsExplicitAcceleratorOverride(t *testing.T) { + p := NewCapability(CapabilityOptions{ + Accelerator: foremanv1alpha1.FleetNodeAccelerator("none"), + }) + cap := p.Capability() + if string(cap.Accelerator) != "none" { + t.Errorf("override accelerator = %q, want %q", cap.Accelerator, "none") + } +} + +func TestNewCapability_Darwin_PropagatesFlagsuppliedFields(t *testing.T) { + p := NewCapability(CapabilityOptions{ + InstalledModels: []string{"minimax-m2-7", "qwen36-35b-carnice-mtp"}, + MaxContextTokens: 131072, + 
TokensPerSecond: 47, + }) + cap := p.Capability() + if len(cap.InstalledModels) != 2 { + t.Errorf("InstalledModels len = %d, want 2", len(cap.InstalledModels)) + } + if cap.MaxContextTokens != 131072 { + t.Errorf("MaxContextTokens = %d, want 131072", cap.MaxContextTokens) + } + if cap.TokensPerSecond != 47 { + t.Errorf("TokensPerSecond = %d, want 47", cap.TokensPerSecond) + } +} + +func TestNewCapability_Darwin_LiveMemoryProbeIsSane(t *testing.T) { + // The DarwinMemoryProvider lives in pkg/agent and runs `sysctl + // hw.memsize` + `vm_stat`. We don't assert specific RAM values + // (those depend on the host), but we do assert the relationships + // that any sane macOS host satisfies: TotalRAMGB > 0 and + // AvailableRAMGB <= TotalRAMGB. + p := NewCapability(CapabilityOptions{}) + cap := p.Capability() + if cap.TotalRAMGB <= 0 { + t.Skipf("live darwin memory probe returned %d GB; assume CI sandbox without sysctl", cap.TotalRAMGB) + } + if cap.AvailableRAMGB > cap.TotalRAMGB { + t.Errorf("AvailableRAMGB (%d) > TotalRAMGB (%d); impossible", cap.AvailableRAMGB, cap.TotalRAMGB) + } +} diff --git a/pkg/foreman/agent/capability_other_test.go b/pkg/foreman/agent/capability_other_test.go new file mode 100644 index 00000000..6a238bd6 --- /dev/null +++ b/pkg/foreman/agent/capability_other_test.go @@ -0,0 +1,69 @@ +//go:build !darwin + +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+*/ + +package agent + +import ( + "testing" + + foremanv1alpha1 "github.com/defilantech/llmkube/api/foreman/v1alpha1" +) + +// The non-darwin capability provider is a stub: it advertises whatever +// CapabilityOptions hands it (flag-supplied), with AvailableRAMGB == +// StaticTotalRAMGB until M4 wires up live Linux memory probing. + +func TestNewCapability_Other_PropagatesAllFlagSuppliedFields(t *testing.T) { + p := NewCapability(CapabilityOptions{ + Accelerator: foremanv1alpha1.FleetNodeAccelerator("cuda"), + InstalledModels: []string{"qwen3-coder-30b"}, + MaxContextTokens: 32768, + TokensPerSecond: 85, + StaticTotalRAMGB: 64, + }) + cap := p.Capability() + if string(cap.Accelerator) != "cuda" { + t.Errorf("Accelerator = %q, want %q", cap.Accelerator, "cuda") + } + if cap.TotalRAMGB != 64 { + t.Errorf("TotalRAMGB = %d, want 64", cap.TotalRAMGB) + } + if cap.AvailableRAMGB != 64 { + t.Errorf("AvailableRAMGB = %d, want 64 (stub: equal to TotalRAMGB until M4)", cap.AvailableRAMGB) + } + if cap.MaxContextTokens != 32768 { + t.Errorf("MaxContextTokens = %d, want 32768", cap.MaxContextTokens) + } + if cap.TokensPerSecond != 85 { + t.Errorf("TokensPerSecond = %d, want 85", cap.TokensPerSecond) + } + if len(cap.InstalledModels) != 1 || cap.InstalledModels[0] != "qwen3-coder-30b" { + t.Errorf("InstalledModels = %v", cap.InstalledModels) + } +} + +func TestNewCapability_Other_HonorsEmptyAccelerator(t *testing.T) { + // On non-darwin, no default accelerator is filled in; v0.1 expects + // the operator to set --accelerator explicitly. Confirm we don't + // silently default to anything. 
+ p := NewCapability(CapabilityOptions{}) + cap := p.Capability() + if string(cap.Accelerator) != "" { + t.Errorf("Accelerator = %q on non-darwin with empty options; want empty (operator must set explicitly)", cap.Accelerator) + } +} diff --git a/pkg/foreman/agent/fleetnode_test.go b/pkg/foreman/agent/fleetnode_test.go new file mode 100644 index 00000000..399274bc --- /dev/null +++ b/pkg/foreman/agent/fleetnode_test.go @@ -0,0 +1,295 @@ +/* +Copyright 2025. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package agent + +import ( + "context" + "testing" + "time" + + metav1 "k8s.io/apimachinery/pkg/apis/meta/v1" + "k8s.io/apimachinery/pkg/runtime" + "k8s.io/apimachinery/pkg/types" + "k8s.io/client-go/kubernetes/scheme" + "sigs.k8s.io/controller-runtime/pkg/client" + "sigs.k8s.io/controller-runtime/pkg/client/fake" + + foremanv1alpha1 "github.com/defilantech/llmkube/api/foreman/v1alpha1" +) + +func newTestScheme(t *testing.T) *runtime.Scheme { + t.Helper() + s := runtime.NewScheme() + if err := scheme.AddToScheme(s); err != nil { + t.Fatalf("clientgoscheme.AddToScheme: %v", err) + } + if err := foremanv1alpha1.AddToScheme(s); err != nil { + t.Fatalf("foreman scheme: %v", err) + } + return s +} + +func newFakeClient(t *testing.T, objs ...client.Object) client.Client { + t.Helper() + return fake.NewClientBuilder(). + WithScheme(newTestScheme(t)). + WithObjects(objs...). + WithStatusSubresource(&foremanv1alpha1.FleetNode{}). 
+ Build() +} + +// fixedCapability is a deterministic CapabilityProvider so heartbeat- +// patch assertions don't depend on the host's live sysctl / vm_stat. +type fixedCapability struct { + cap foremanv1alpha1.FleetNodeCapability +} + +func (f *fixedCapability) Capability() foremanv1alpha1.FleetNodeCapability { return f.cap } + +func TestSpecEqual(t *testing.T) { + cases := []struct { + name string + a, b foremanv1alpha1.FleetNodeSpec + want bool + }{ + {"both_zero", foremanv1alpha1.FleetNodeSpec{}, foremanv1alpha1.FleetNodeSpec{}, true}, + { + "identical_fully_populated", + foremanv1alpha1.FleetNodeSpec{NodeName: "m5", TailscaleAddr: "ts", Roles: []string{"worker", "verifier"}}, + foremanv1alpha1.FleetNodeSpec{NodeName: "m5", TailscaleAddr: "ts", Roles: []string{"worker", "verifier"}}, + true, + }, + { + "different_node_name", + foremanv1alpha1.FleetNodeSpec{NodeName: "m5"}, + foremanv1alpha1.FleetNodeSpec{NodeName: "m6"}, + false, + }, + { + "different_tailscale_addr", + foremanv1alpha1.FleetNodeSpec{NodeName: "m5", TailscaleAddr: "a"}, + foremanv1alpha1.FleetNodeSpec{NodeName: "m5", TailscaleAddr: "b"}, + false, + }, + { + "different_roles_length", + foremanv1alpha1.FleetNodeSpec{Roles: []string{"worker"}}, + foremanv1alpha1.FleetNodeSpec{Roles: []string{"worker", "verifier"}}, + false, + }, + { + "role_value_mismatch", + foremanv1alpha1.FleetNodeSpec{Roles: []string{"worker"}}, + foremanv1alpha1.FleetNodeSpec{Roles: []string{"verifier"}}, + false, + }, + { + "role_order_matters", + foremanv1alpha1.FleetNodeSpec{Roles: []string{"worker", "verifier"}}, + foremanv1alpha1.FleetNodeSpec{Roles: []string{"verifier", "worker"}}, + false, + }, + } + for _, tc := range cases { + t.Run(tc.name, func(t *testing.T) { + got := specEqual(tc.a, tc.b) + if got != tc.want { + t.Errorf("specEqual(%v, %v) = %v, want %v", tc.a, tc.b, got, tc.want) + } + }) + } +} + +func TestRegistrar_Upsert_CreatesIfMissing(t *testing.T) { + kc := newFakeClient(t) + r := &Registrar{ + Client: kc, 
+ NodeName: "m5-max", + Spec: foremanv1alpha1.FleetNodeSpec{ + NodeName: "m5-max", + Roles: []string{"worker"}, + }, + Provider: &fixedCapability{}, + } + if err := r.Upsert(context.Background()); err != nil { + t.Fatalf("Upsert: %v", err) + } + var got foremanv1alpha1.FleetNode + if err := kc.Get(context.Background(), types.NamespacedName{Name: "m5-max"}, &got); err != nil { + t.Fatalf("Get after create: %v", err) + } + if got.Spec.NodeName != "m5-max" { + t.Errorf("Spec.NodeName = %q, want %q", got.Spec.NodeName, "m5-max") + } + if len(got.Spec.Roles) != 1 || got.Spec.Roles[0] != "worker" { + t.Errorf("Spec.Roles = %v, want [worker]", got.Spec.Roles) + } +} + +func TestRegistrar_Upsert_UpdatesIfSpecChanged(t *testing.T) { + existing := &foremanv1alpha1.FleetNode{ + ObjectMeta: metav1.ObjectMeta{Name: "m5-max"}, + Spec: foremanv1alpha1.FleetNodeSpec{ + NodeName: "m5-max", + Roles: []string{"worker"}, + }, + } + kc := newFakeClient(t, existing) + r := &Registrar{ + Client: kc, + NodeName: "m5-max", + Spec: foremanv1alpha1.FleetNodeSpec{ + NodeName: "m5-max", + TailscaleAddr: "m5-max.tail-scale.ts.net", + Roles: []string{"worker", "verifier"}, + }, + Provider: &fixedCapability{}, + } + if err := r.Upsert(context.Background()); err != nil { + t.Fatalf("Upsert: %v", err) + } + var got foremanv1alpha1.FleetNode + if err := kc.Get(context.Background(), types.NamespacedName{Name: "m5-max"}, &got); err != nil { + t.Fatalf("Get: %v", err) + } + if got.Spec.TailscaleAddr != "m5-max.tail-scale.ts.net" { + t.Errorf("TailscaleAddr not updated: got %q", got.Spec.TailscaleAddr) + } + if len(got.Spec.Roles) != 2 { + t.Errorf("Roles not updated: got %v", got.Spec.Roles) + } +} + +func TestRegistrar_Upsert_NoopIfSpecUnchanged(t *testing.T) { + existing := &foremanv1alpha1.FleetNode{ + ObjectMeta: metav1.ObjectMeta{Name: "m5-max", ResourceVersion: "1"}, + Spec: foremanv1alpha1.FleetNodeSpec{ + NodeName: "m5-max", + Roles: []string{"worker"}, + }, + } + kc := newFakeClient(t, 
existing) + r := &Registrar{ + Client: kc, + NodeName: "m5-max", + Spec: foremanv1alpha1.FleetNodeSpec{ + NodeName: "m5-max", + Roles: []string{"worker"}, + }, + Provider: &fixedCapability{}, + } + if err := r.Upsert(context.Background()); err != nil { + t.Fatalf("Upsert: %v", err) + } + var got foremanv1alpha1.FleetNode + if err := kc.Get(context.Background(), types.NamespacedName{Name: "m5-max"}, &got); err != nil { + t.Fatalf("Get: %v", err) + } + // The fake client bumps resourceVersion on every Update. A noop + // Upsert leaves it where it was. + if got.ResourceVersion != "1" { + t.Errorf("ResourceVersion changed from %q to %q (expected noop on identical spec)", + "1", got.ResourceVersion) + } +} + +func TestRegistrar_PatchHeartbeat_WritesPhaseAndCapability(t *testing.T) { + existing := &foremanv1alpha1.FleetNode{ + ObjectMeta: metav1.ObjectMeta{Name: "m5-max"}, + Spec: foremanv1alpha1.FleetNodeSpec{NodeName: "m5-max"}, + } + kc := newFakeClient(t, existing) + cap := foremanv1alpha1.FleetNodeCapability{ + Accelerator: foremanv1alpha1.FleetNodeAccelerator("metal"), + TotalRAMGB: 128, + AvailableRAMGB: 64, + MaxContextTokens: 131072, + TokensPerSecond: 47, + } + r := &Registrar{ + Client: kc, + NodeName: "m5-max", + Provider: &fixedCapability{cap: cap}, + } + before := time.Now().Add(-time.Second) + if err := r.PatchHeartbeat(context.Background(), foremanv1alpha1.FleetNodePhaseReady); err != nil { + t.Fatalf("PatchHeartbeat: %v", err) + } + var got foremanv1alpha1.FleetNode + if err := kc.Get(context.Background(), types.NamespacedName{Name: "m5-max"}, &got); err != nil { + t.Fatalf("Get: %v", err) + } + if got.Status.Phase != foremanv1alpha1.FleetNodePhaseReady { + t.Errorf("Phase = %q, want Ready", got.Status.Phase) + } + if got.Status.LastHeartbeatTime == nil { + t.Fatal("LastHeartbeatTime is nil") + } + if got.Status.LastHeartbeatTime.Time.Before(before) { + t.Errorf("LastHeartbeatTime %v is before %v", got.Status.LastHeartbeatTime.Time, before) + } + if 
got.Status.Capability.TotalRAMGB != 128 { + t.Errorf("Capability.TotalRAMGB = %d, want 128", got.Status.Capability.TotalRAMGB) + } + if got.Status.Capability.AvailableRAMGB != 64 { + t.Errorf("Capability.AvailableRAMGB = %d, want 64", got.Status.Capability.AvailableRAMGB) + } + if got.Status.Capability.MaxContextTokens != 131072 { + t.Errorf("Capability.MaxContextTokens = %d, want 131072", got.Status.Capability.MaxContextTokens) + } + if got.Status.Capability.TokensPerSecond != 47 { + t.Errorf("Capability.TokensPerSecond = %d, want 47", got.Status.Capability.TokensPerSecond) + } +} + +func TestRegistrar_Run_DrainsAndExitsOnCancel(t *testing.T) { + existing := &foremanv1alpha1.FleetNode{ + ObjectMeta: metav1.ObjectMeta{Name: "m5-max"}, + Spec: foremanv1alpha1.FleetNodeSpec{NodeName: "m5-max"}, + } + kc := newFakeClient(t, existing) + r := &Registrar{ + Client: kc, + NodeName: "m5-max", + Provider: &fixedCapability{cap: foremanv1alpha1.FleetNodeCapability{TotalRAMGB: 128}}, + Interval: 50 * time.Millisecond, + } + ctx, cancel := context.WithCancel(context.Background()) + done := make(chan error, 1) + go func() { done <- r.Run(ctx) }() + + // Let the initial heartbeat + at least one ticker firing happen. 
+ time.Sleep(75 * time.Millisecond) + + cancel() + select { + case err := <-done: + if err != nil { + t.Fatalf("Run returned error: %v", err) + } + case <-time.After(2 * time.Second): + t.Fatal("Run did not return within 2s of cancel") + } + + var got foremanv1alpha1.FleetNode + if err := kc.Get(context.Background(), types.NamespacedName{Name: "m5-max"}, &got); err != nil { + t.Fatalf("Get: %v", err) + } + if got.Status.Phase != foremanv1alpha1.FleetNodePhaseDraining { + t.Errorf("final phase = %q, want Draining", got.Status.Phase) + } +} From e2837ccf29da1fed14f5b8f9cc74e1f18aedfd07 Mon Sep 17 00:00:00 2001 From: Christopher Maher Date: Wed, 20 May 2026 00:20:22 -0700 Subject: [PATCH 4/5] test(foreman): wrap a long line in capability_other_test.go to satisfy lll CI's golangci-lint v2.4.0 on linux caught a 123-character line in the //go:build !darwin variant of the capability test that the M5 Max local lint missed (the file does not compile on darwin, so the darwin-side lint never sees it). Wrapped the t.Errorf to keep all lines under the 120-char limit. 
Signed-off-by: Christopher Maher --- pkg/foreman/agent/capability_other_test.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pkg/foreman/agent/capability_other_test.go b/pkg/foreman/agent/capability_other_test.go index 6a238bd6..2fa0540e 100644 --- a/pkg/foreman/agent/capability_other_test.go +++ b/pkg/foreman/agent/capability_other_test.go @@ -64,6 +64,9 @@ func TestNewCapability_Other_HonorsEmptyAccelerator(t *testing.T) { p := NewCapability(CapabilityOptions{}) cap := p.Capability() if string(cap.Accelerator) != "" { - t.Errorf("Accelerator = %q on non-darwin with empty options; want empty (operator must set explicitly)", cap.Accelerator) + t.Errorf( + "Accelerator = %q on non-darwin with empty options; want empty (operator must set explicitly)", + cap.Accelerator, + ) } } From 662dadca55901abcf87b77accb186a65fa05bdb9 Mon Sep 17 00:00:00 2001 From: Christopher Maher Date: Wed, 20 May 2026 00:42:50 -0700 Subject: [PATCH 5/5] test(foreman): drop deprecated Result.Requeue check in favor of RequeueAfter controller-runtime deprecated Result.Requeue (bool) in favor of expressing 'no requeue' as RequeueAfter == 0. The neighboring Expect(res.RequeueAfter).To(BeZero()) already covers the assertion, so dropping the Result.Requeue check resolves SA1019 staticcheck without changing test semantics. Caught locally via GOOS=linux golangci-lint after the previous darwin-only run missed it; same cross-arch gotcha covered in feedback_cross_arch_lint.md. 
Signed-off-by: Christopher Maher --- internal/foreman/controller/agentictask_controller_test.go | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/internal/foreman/controller/agentictask_controller_test.go b/internal/foreman/controller/agentictask_controller_test.go index d4aba429..ae90e784 100644 --- a/internal/foreman/controller/agentictask_controller_test.go +++ b/internal/foreman/controller/agentictask_controller_test.go @@ -47,7 +47,10 @@ var _ = Describe("AgenticTaskReconciler (M0 stub)", func() { NamespacedName: types.NamespacedName{Namespace: "default", Name: "no-such-task"}, }) Expect(err).NotTo(HaveOccurred()) - Expect(res.Requeue).To(BeFalse()) + // Only RequeueAfter is asserted: controller-runtime deprecated the + // boolean Requeue field (staticcheck SA1019), so a zero + // RequeueAfter is the supported way to express "no requeue". A + // stub that still set Requeue would not be caught here. Expect(res.RequeueAfter).To(BeZero()) })