|
| 1 | +//! Hot-swappable compiled rule artifacts. |
| 2 | +//! |
| 3 | +//! Per `docs/superpowers/specs/2026-04-17-v1.1-vyngraph-gaps.md` section 3.3, |
| 4 | +//! this module lets RingKernel accept opaque compiled rule artifacts (PTX + |
| 5 | +//! metadata) and hot-swap them atomically without runtime restart. |
| 6 | +//! |
| 7 | +//! ## Design philosophy |
| 8 | +//! |
| 9 | +//! RingKernel stays **rule-format-agnostic**. Callers such as VynGraph own |
| 10 | +//! OWL 2 RL / SHACL parsing and compile rules to PTX using our existing |
| 11 | +//! `ringkernel-cuda-codegen` pipeline. RingKernel receives the compiled |
| 12 | +//! artifact via [`CompiledRule`] and manages versioning, validation, |
| 13 | +//! rollback, and the atomic swap state machine. |
| 14 | +//! |
| 15 | +//! ## Artifact lifecycle |
| 16 | +//! |
| 17 | +//! ```text |
| 18 | +//! CompiledRule ─register_rule()─► RuleStatus::Registered |
| 19 | +//! │ │ |
| 20 | +//! │ reload_rule() │ |
| 21 | +//! ▼ ▼ |
| 22 | +//! (new version) ─pre_stage/quiesce/swap─► RuleStatus::Active |
| 23 | +//! │ |
| 24 | +//! prior version: Superseded(new_ver) │ |
| 25 | +//! │ |
| 26 | +//! rollback_rule() ◄──────┤ |
| 27 | +//! current version: Rolledback │ |
| 28 | +//! prior version: Active │ |
| 29 | +//! ``` |
| 30 | +//! |
| 31 | +//! ## Guarantees |
| 32 | +//! |
| 33 | +//! - Version monotonicity (downgrades rejected unless explicit rollback) |
| 34 | +//! - Bounded history (FIFO eviction beyond `max_history`) |
| 35 | +//! - Validation-before-swap (compute cap, dependencies, signature) |
| 36 | +//! - Pluggable swap backend (`NoopSwapBackend` for tests, CUDA in production) |
| 37 | +//! |
| 38 | +//! ## Example |
| 39 | +//! |
| 40 | +//! ```ignore |
| 41 | +//! use std::sync::Arc; |
| 42 | +//! use ringkernel_core::rules::{ |
| 43 | +//! ActorConfig, CompiledRule, NoopSwapBackend, RuleMetadata, RuleRegistry, |
| 44 | +//! }; |
| 45 | +//! |
| 46 | +//! # async fn example() { |
| 47 | +//! let registry = RuleRegistry::new(5, Arc::new(NoopSwapBackend)); |
| 48 | +//! let rule = CompiledRule { |
| 49 | +//! rule_id: "gaap-consolidation".into(), |
| 50 | +//! version: 1, |
| 51 | +//! ptx: b".version 8.0\n.target sm_90\n".to_vec(), |
| 52 | +//! compute_cap: "sm_90".into(), |
| 53 | +//! depends_on: vec![], |
| 54 | +//! signature: None, |
| 55 | +//! actor_config: ActorConfig::default(), |
| 56 | +//! metadata: RuleMetadata::default(), |
| 57 | +//! }; |
| 58 | +//! let handle = registry.register_rule(rule, "sm_90").await.unwrap(); |
| 59 | +//! assert_eq!(handle.version, 1); |
| 60 | +//! # } |
| 61 | +//! ``` |
| 62 | +//! |
| 63 | +//! [`HotReloadManager::rule_registry()`] exposes the registry for use by |
| 64 | +//! existing multi-GPU hot-reload plumbing. |
| 65 | +//! |
| 66 | +//! [`HotReloadManager::rule_registry()`]: crate::multi_gpu::HotReloadManager::rule_registry |
| 67 | +
|
| 68 | +use std::time::{Duration, SystemTime}; |
| 69 | + |
| 70 | +pub mod registry; |
| 71 | + |
| 72 | +pub use registry::{ |
| 73 | + NoopSwapBackend, RuleRegistry, RuleSwapBackend, SignatureVerifier, |
| 74 | +}; |
| 75 | + |
| 76 | +/// A compiled rule artifact ready for GPU hot-swap. |
| 77 | +/// |
| 78 | +/// RingKernel does not inspect `ptx` beyond validating compute capability, |
| 79 | +/// dependencies and (optionally) signature. The caller owns semantic |
| 80 | +/// correctness of the compilation. |
| 81 | +#[derive(Debug, Clone)] |
| 82 | +pub struct CompiledRule { |
| 83 | + /// Caller-scoped rule set identifier (e.g. `"gaap-consolidation"`). |
| 84 | + pub rule_id: String, |
| 85 | + /// Monotonically increasing version; later versions must be strictly |
| 86 | + /// greater than the currently active version. |
| 87 | + pub version: u64, |
| 88 | + /// Compiled PTX bytes for the actor kernel. |
| 89 | + pub ptx: Vec<u8>, |
| 90 | + /// Required compute capability, e.g. `"sm_90"` for H100. |
| 91 | + pub compute_cap: String, |
| 92 | + /// Other `rule_id`s that must already be registered before this rule |
| 93 | + /// can be installed. Used for inference-rule dependency graphs. |
| 94 | + pub depends_on: Vec<String>, |
| 95 | + /// Optional integrity signature (format is verifier-specific). |
| 96 | + pub signature: Option<Vec<u8>>, |
| 97 | + /// Actor launch configuration. |
| 98 | + pub actor_config: ActorConfig, |
| 99 | + /// Opaque metadata passed through for audit/logging. RingKernel does |
| 100 | + /// not interpret any of these fields. |
| 101 | + pub metadata: RuleMetadata, |
| 102 | +} |
| 103 | + |
| 104 | +/// Launch configuration for the rule's actor kernel. |
| 105 | +#[derive(Debug, Clone)] |
| 106 | +pub struct ActorConfig { |
| 107 | + /// CUDA block dimensions `(x, y, z)`. |
| 108 | + pub block_dim: (u32, u32, u32), |
| 109 | + /// CUDA grid dimensions `(x, y, z)`. |
| 110 | + pub grid_dim: (u32, u32, u32), |
| 111 | + /// Dynamic shared-memory bytes to allocate per block. |
| 112 | + pub shared_mem_bytes: u32, |
| 113 | + /// Maximum number of in-flight messages this actor accepts. |
| 114 | + pub max_in_flight: u32, |
| 115 | +} |
| 116 | + |
| 117 | +impl Default for ActorConfig { |
| 118 | + fn default() -> Self { |
| 119 | + Self { |
| 120 | + block_dim: (1, 1, 1), |
| 121 | + grid_dim: (1, 1, 1), |
| 122 | + shared_mem_bytes: 0, |
| 123 | + max_in_flight: 1024, |
| 124 | + } |
| 125 | + } |
| 126 | +} |
| 127 | + |
| 128 | +/// Opaque metadata attached to a compiled rule. |
| 129 | +/// |
| 130 | +/// All fields are optional and none of them influence the swap state |
| 131 | +/// machine. They exist solely for audit trails, observability, and |
| 132 | +/// attribution. Callers are free to ignore them or fill them in as they |
| 133 | +/// see fit; RingKernel passes them through unchanged. |
| 134 | +#[derive(Debug, Clone, Default)] |
| 135 | +pub struct RuleMetadata { |
| 136 | + /// Human-readable description of the source language, e.g. |
| 137 | + /// `"OWL 2 RL"`, `"SHACL"`, `"custom DSL"`. Opaque to RingKernel. |
| 138 | + pub source_language: Option<String>, |
| 139 | + /// SHA-256 of the rule source text, for audit reproducibility. |
| 140 | + pub source_hash: Option<[u8; 32]>, |
| 141 | + /// When the rule was compiled. |
| 142 | + pub compiled_at: Option<SystemTime>, |
| 143 | + /// Version string of the compiler that produced this artifact. |
| 144 | + pub compiler_version: Option<String>, |
| 145 | + /// Principal who authored / compiled the rule. |
| 146 | + pub author: Option<String>, |
| 147 | +} |
| 148 | + |
| 149 | +/// Lightweight handle returned after a successful registry operation. |
| 150 | +#[derive(Debug, Clone)] |
| 151 | +pub struct RuleHandle { |
| 152 | + /// Rule identifier. |
| 153 | + pub rule_id: String, |
| 154 | + /// Rule version. |
| 155 | + pub version: u64, |
| 156 | + /// Lifecycle status of this specific version. |
| 157 | + pub status: RuleStatus, |
| 158 | + /// When the version was registered with the registry. |
| 159 | + pub registered_at: SystemTime, |
| 160 | +} |
| 161 | + |
| 162 | +/// Lifecycle status of a specific rule version. |
| 163 | +#[derive(Debug, Clone, Copy, PartialEq, Eq)] |
| 164 | +pub enum RuleStatus { |
| 165 | + /// Loaded and validated but not yet the active version. |
| 166 | + Registered, |
| 167 | + /// Currently executing on the device. |
| 168 | + Active, |
| 169 | + /// Being drained ahead of a swap. |
| 170 | + Quiescing, |
| 171 | + /// Replaced by the specified newer version. |
| 172 | + Superseded(u64), |
| 173 | + /// Rolled back away from (prior `Active` version the user chose to revert). |
| 174 | + Rolledback, |
| 175 | + /// Validation or swap backend failed; this version is unusable. |
| 176 | + Failed, |
| 177 | +} |
| 178 | + |
| 179 | +/// Report emitted after a successful reload (or rollback). |
| 180 | +#[derive(Debug, Clone)] |
| 181 | +pub struct ReloadReport { |
| 182 | + /// Rule identifier. |
| 183 | + pub rule_id: String, |
| 184 | + /// Version we moved away from (0 if this was the initial activation). |
| 185 | + pub from_version: u64, |
| 186 | + /// Version that is now `Active`. |
| 187 | + pub to_version: u64, |
| 188 | + /// Time spent draining the old actor. |
| 189 | + pub quiesce_duration: Duration, |
| 190 | + /// Time spent performing the atomic pointer swap. |
| 191 | + pub swap_duration: Duration, |
| 192 | + /// Messages that were in-flight during the swap window |
| 193 | + /// (as reported by the swap backend). |
| 194 | + pub messages_in_flight_during_swap: u64, |
| 195 | + /// Whether the previous version is still retained in history and can |
| 196 | + /// be the target of a subsequent rollback. |
| 197 | + pub rollback_available: bool, |
| 198 | +} |
| 199 | + |
| 200 | +/// Errors produced by the rule registry. |
| 201 | +#[derive(Debug, thiserror::Error)] |
| 202 | +pub enum RuleError { |
| 203 | + /// No such rule in the registry. |
| 204 | + #[error("rule not found: {0}")] |
| 205 | + NotFound(String), |
| 206 | + |
| 207 | + /// Incoming version is not strictly newer than the current active version. |
| 208 | + #[error("version downgrade rejected: current={current}, proposed={proposed}")] |
| 209 | + VersionDowngrade { |
| 210 | + /// Currently active version. |
| 211 | + current: u64, |
| 212 | + /// Version the caller tried to install. |
| 213 | + proposed: u64, |
| 214 | + }, |
| 215 | + |
| 216 | + /// Rule targets a compute capability the device does not meet. |
| 217 | + #[error("compute capability mismatch: rule={required}, device={available}")] |
| 218 | + ComputeCapMismatch { |
| 219 | + /// Compute cap the rule requires. |
| 220 | + required: String, |
| 221 | + /// Compute cap the device actually has. |
| 222 | + available: String, |
| 223 | + }, |
| 224 | + |
| 225 | + /// Rule depends on another rule that is not registered. |
| 226 | + #[error("dependency missing: {0}")] |
| 227 | + MissingDependency(String), |
| 228 | + |
| 229 | + /// Signature check did not succeed. |
| 230 | + #[error("signature verification failed")] |
| 231 | + InvalidSignature, |
| 232 | + |
| 233 | + /// Caller asked to roll back to a version no longer in history. |
| 234 | + #[error("rollback target not in history: version={0}")] |
| 235 | + RollbackTargetMissing(u64), |
| 236 | + |
| 237 | + /// No version is currently active — nothing to roll back from. |
| 238 | + #[error("no active version to rollback")] |
| 239 | + NoActiveVersion, |
| 240 | + |
| 241 | + /// Quiesce window elapsed before the actor finished draining. |
| 242 | + #[error("quiesce timeout after {0:?}")] |
| 243 | + QuiesceTimeout(Duration), |
| 244 | + |
| 245 | + /// Swap backend refused the operation (wraps backend-specific detail). |
| 246 | + #[error("swap backend error: {0}")] |
| 247 | + BackendError(String), |
| 248 | + |
| 249 | + /// Version was already registered and we do not allow re-register of |
| 250 | + /// the same `(rule_id, version)` tuple. |
| 251 | + #[error("duplicate version: rule={rule_id}, version={version}")] |
| 252 | + DuplicateVersion { |
| 253 | + /// Rule identifier. |
| 254 | + rule_id: String, |
| 255 | + /// Version that was already present. |
| 256 | + version: u64, |
| 257 | + }, |
| 258 | +} |
0 commit comments