Skip to content

Commit a1226a5

Browse files
mivertowski and claude committed
feat(core): hot rule reload with CompiledRule artifact API (v1.1)
Per spec section 3.3, accept compiled rule artifacts (PTX + metadata) from VynGraph OWL/SHACL compiler; hot-swap atomically with rollback. RingKernel stays rule-format-agnostic — VynGraph owns OWL parsing and compilation; we just execute artifacts.

New module crates/ringkernel-core/src/rules/:

Types (mod.rs, 258 lines):
- CompiledRule { rule_id, version, ptx, compute_cap, depends_on, signature, actor_config, metadata }
- ActorConfig { block_dim, grid_dim, shared_mem_bytes, max_in_flight }
- RuleMetadata { source_language, source_hash, compiled_at, compiler_version, author } — all opaque to us
- RuleHandle { rule_id, version, status, registered_at }
- RuleStatus { Registered, Active, Quiescing, Superseded(v), Rolledback, Failed }
- ReloadReport { rule_id, from_version, to_version, quiesce_duration, swap_duration, messages_in_flight_during_swap, rollback_available }
- RuleError with 10 variants (NotFound, VersionDowngrade, ComputeCapMismatch, MissingDependency, InvalidSignature, RollbackTargetMissing, NoActiveVersion, QuiesceTimeout, BackendError, DuplicateVersion)

RuleRegistry + state machine (registry.rs, 1134 lines):
- SignatureVerifier trait (optional)
- RuleSwapBackend trait: pre_stage → quiesce → swap → terminate_old
- NoopSwapBackend for tests
- Validation: monotonic versions, duplicate detection, compute_cap compat (sm_XX parsing with device_cap >= rule_cap), dependency checks
- FIFO history eviction at max_history (default 5)
- Rollback marks current as Rolledback (not Superseded)

HotReloadManager integration (multi_gpu.rs):
- Added rule_registry: Arc<RuleRegistry> field
- rule_registry() accessor
- with_rule_backend() constructor for CUDA backend injection
- HotReloadConfig.max_rule_history (default 5)

32 async tests (all passing). 782 lib tests pass workspace-wide.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent fe1535e commit a1226a5

4 files changed

Lines changed: 1426 additions & 0 deletions

File tree

crates/ringkernel-core/src/lib.rs

Lines changed: 5 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -76,6 +76,7 @@ pub mod queue;
7676
pub mod reduction;
7777
pub mod registry;
7878
pub mod resource;
79+
pub mod rules;
7980
pub mod runtime;
8081
pub mod runtime_context;
8182
pub mod scheduling;
@@ -192,6 +193,10 @@ pub mod prelude {
192193
ResourceError, ResourceGuard, ResourceResult, DEFAULT_MAX_MEMORY_BYTES,
193194
SYSTEM_MEMORY_MARGIN,
194195
};
196+
pub use crate::rules::{
197+
ActorConfig, CompiledRule, NoopSwapBackend, ReloadReport, RuleError, RuleHandle,
198+
RuleMetadata, RuleRegistry, RuleStatus, RuleSwapBackend, SignatureVerifier,
199+
};
195200
pub use crate::runtime::*;
196201
pub use crate::runtime_context::{
197202
AppInfo, BackgroundTaskStatus, CircuitGuard, ContextMetrics, DegradationGuard,

crates/ringkernel-core/src/multi_gpu.rs

Lines changed: 29 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -1746,6 +1746,9 @@ pub struct HotReloadConfig {
17461746
pub validate_before_swap: bool,
17471747
/// Keep old code as fallback in case of failure.
17481748
pub keep_fallback: bool,
1749+
/// Number of compiled-rule versions retained per rule for rollback
1750+
/// (FIFO eviction, see `rules::RuleRegistry`). Default: 5.
1751+
pub max_rule_history: usize,
17491752
}
17501753

17511754
impl Default for HotReloadConfig {
@@ -1758,6 +1761,7 @@ impl Default for HotReloadConfig {
17581761
retry_backoff: Duration::from_millis(500),
17591762
validate_before_swap: true,
17601763
keep_fallback: true,
1764+
max_rule_history: 5,
17611765
}
17621766
}
17631767
}
@@ -2104,21 +2108,46 @@ pub struct HotReloadManager {
21042108
version_counter: AtomicU64,
21052109
/// Statistics.
21062110
stats: HotReloadStats,
2111+
/// Compiled-rule registry (v1.1, per spec 3.3). Hot-swaps whole
2112+
/// inference-rule actors identified by opaque `CompiledRule`
2113+
/// artifacts produced by upstream compilers (e.g. VynGraph).
2114+
rule_registry: Arc<crate::rules::RuleRegistry>,
21072115
}
21082116

21092117
impl HotReloadManager {
21102118
/// Create a new hot reload manager.
21112119
pub fn new(config: HotReloadConfig) -> Arc<Self> {
2120+
Self::with_rule_backend(config, Arc::new(crate::rules::NoopSwapBackend))
2121+
}
2122+
2123+
/// Create a new hot reload manager with a custom rule-swap backend.
2124+
///
2125+
/// Production code (e.g. `ringkernel-cuda`) should use this to inject
2126+
/// a backend that actually performs the GPU-side atomic actor swap.
2127+
pub fn with_rule_backend(
2128+
config: HotReloadConfig,
2129+
rule_backend: Arc<dyn crate::rules::RuleSwapBackend>,
2130+
) -> Arc<Self> {
2131+
let rule_registry = Arc::new(crate::rules::RuleRegistry::new(
2132+
config.max_rule_history,
2133+
rule_backend,
2134+
));
21122135
Arc::new(Self {
21132136
config,
21142137
kernels: RwLock::new(HashMap::new()),
21152138
fallbacks: RwLock::new(HashMap::new()),
21162139
active_requests: RwLock::new(HashMap::new()),
21172140
version_counter: AtomicU64::new(1),
21182141
stats: HotReloadStats::default(),
2142+
rule_registry,
21192143
})
21202144
}
21212145

2146+
/// Access the compiled-rule registry for hot-swap of inference rules.
2147+
pub fn rule_registry(&self) -> &Arc<crate::rules::RuleRegistry> {
2148+
&self.rule_registry
2149+
}
2150+
21222151
/// Create with default configuration.
21232152
pub fn with_defaults() -> Arc<Self> {
21242153
Self::new(HotReloadConfig::default())
crates/ringkernel-core/src/rules/mod.rs

Lines changed: 258 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -0,0 +1,258 @@
1+
//! Hot-swappable compiled rule artifacts.
2+
//!
3+
//! Per `docs/superpowers/specs/2026-04-17-v1.1-vyngraph-gaps.md` section 3.3,
4+
//! this module lets RingKernel accept opaque compiled rule artifacts (PTX +
5+
//! metadata) and hot-swap them atomically without runtime restart.
6+
//!
7+
//! ## Design philosophy
8+
//!
9+
//! RingKernel stays **rule-format-agnostic**. Callers such as VynGraph own
10+
//! OWL 2 RL / SHACL parsing and compile rules to PTX using our existing
11+
//! `ringkernel-cuda-codegen` pipeline. RingKernel receives the compiled
12+
//! artifact via [`CompiledRule`] and manages versioning, validation,
13+
//! rollback, and the atomic swap state machine.
14+
//!
15+
//! ## Artifact lifecycle
16+
//!
17+
//! ```text
18+
//! CompiledRule ─register_rule()─► RuleStatus::Registered
19+
//! │ │
20+
//! │ reload_rule() │
21+
//! ▼ ▼
22+
//! (new version) ─pre_stage/quiesce/swap─► RuleStatus::Active
23+
//! │
24+
//! prior version: Superseded(new_ver) │
25+
//! │
26+
//! rollback_rule() ◄──────┤
27+
//! current version: Rolledback │
28+
//! prior version: Active │
29+
//! ```
30+
//!
31+
//! ## Guarantees
32+
//!
33+
//! - Version monotonicity (downgrades rejected unless explicit rollback)
34+
//! - Bounded history (FIFO eviction beyond `max_history`)
35+
//! - Validation-before-swap (compute cap, dependencies, signature)
36+
//! - Pluggable swap backend (`NoopSwapBackend` for tests, CUDA in production)
37+
//!
38+
//! ## Example
39+
//!
40+
//! ```ignore
41+
//! use std::sync::Arc;
42+
//! use ringkernel_core::rules::{
43+
//! ActorConfig, CompiledRule, NoopSwapBackend, RuleMetadata, RuleRegistry,
44+
//! };
45+
//!
46+
//! # async fn example() {
47+
//! let registry = RuleRegistry::new(5, Arc::new(NoopSwapBackend));
48+
//! let rule = CompiledRule {
49+
//! rule_id: "gaap-consolidation".into(),
50+
//! version: 1,
51+
//! ptx: b".version 8.0\n.target sm_90\n".to_vec(),
52+
//! compute_cap: "sm_90".into(),
53+
//! depends_on: vec![],
54+
//! signature: None,
55+
//! actor_config: ActorConfig::default(),
56+
//! metadata: RuleMetadata::default(),
57+
//! };
58+
//! let handle = registry.register_rule(rule, "sm_90").await.unwrap();
59+
//! assert_eq!(handle.version, 1);
60+
//! # }
61+
//! ```
62+
//!
63+
//! [`HotReloadManager::rule_registry()`] exposes the registry for use by
64+
//! existing multi-GPU hot-reload plumbing.
65+
//!
66+
//! [`HotReloadManager::rule_registry()`]: crate::multi_gpu::HotReloadManager::rule_registry
67+
68+
use std::time::{Duration, SystemTime};
69+
70+
pub mod registry;
71+
72+
pub use registry::{
73+
NoopSwapBackend, RuleRegistry, RuleSwapBackend, SignatureVerifier,
74+
};
75+
76+
/// A compiled rule artifact ready for GPU hot-swap.
77+
///
78+
/// RingKernel does not inspect `ptx` beyond validating compute capability,
79+
/// dependencies and (optionally) signature. The caller owns semantic
80+
/// correctness of the compilation.
81+
#[derive(Debug, Clone)]
82+
pub struct CompiledRule {
83+
/// Caller-scoped rule set identifier (e.g. `"gaap-consolidation"`).
84+
pub rule_id: String,
85+
/// Monotonically increasing version; later versions must be strictly
86+
/// greater than the currently active version.
87+
pub version: u64,
88+
/// Compiled PTX bytes for the actor kernel.
89+
pub ptx: Vec<u8>,
90+
/// Required compute capability, e.g. `"sm_90"` for H100.
91+
pub compute_cap: String,
92+
/// Other `rule_id`s that must already be registered before this rule
93+
/// can be installed. Used for inference-rule dependency graphs.
94+
pub depends_on: Vec<String>,
95+
/// Optional integrity signature (format is verifier-specific).
96+
pub signature: Option<Vec<u8>>,
97+
/// Actor launch configuration.
98+
pub actor_config: ActorConfig,
99+
/// Opaque metadata passed through for audit/logging. RingKernel does
100+
/// not interpret any of these fields.
101+
pub metadata: RuleMetadata,
102+
}
103+
104+
/// Launch configuration for the rule's actor kernel.
105+
#[derive(Debug, Clone)]
106+
pub struct ActorConfig {
107+
/// CUDA block dimensions `(x, y, z)`.
108+
pub block_dim: (u32, u32, u32),
109+
/// CUDA grid dimensions `(x, y, z)`.
110+
pub grid_dim: (u32, u32, u32),
111+
/// Dynamic shared-memory bytes to allocate per block.
112+
pub shared_mem_bytes: u32,
113+
/// Maximum number of in-flight messages this actor accepts.
114+
pub max_in_flight: u32,
115+
}
116+
117+
impl Default for ActorConfig {
118+
fn default() -> Self {
119+
Self {
120+
block_dim: (1, 1, 1),
121+
grid_dim: (1, 1, 1),
122+
shared_mem_bytes: 0,
123+
max_in_flight: 1024,
124+
}
125+
}
126+
}
127+
128+
/// Opaque metadata attached to a compiled rule.
129+
///
130+
/// All fields are optional and none of them influence the swap state
131+
/// machine. They exist solely for audit trails, observability, and
132+
/// attribution. Callers are free to ignore them or fill them in as they
133+
/// see fit; RingKernel passes them through unchanged.
134+
#[derive(Debug, Clone, Default)]
135+
pub struct RuleMetadata {
136+
/// Human-readable description of the source language, e.g.
137+
/// `"OWL 2 RL"`, `"SHACL"`, `"custom DSL"`. Opaque to RingKernel.
138+
pub source_language: Option<String>,
139+
/// SHA-256 of the rule source text, for audit reproducibility.
140+
pub source_hash: Option<[u8; 32]>,
141+
/// When the rule was compiled.
142+
pub compiled_at: Option<SystemTime>,
143+
/// Version string of the compiler that produced this artifact.
144+
pub compiler_version: Option<String>,
145+
/// Principal who authored / compiled the rule.
146+
pub author: Option<String>,
147+
}
148+
149+
/// Lightweight handle returned after a successful registry operation.
150+
#[derive(Debug, Clone)]
151+
pub struct RuleHandle {
152+
/// Rule identifier.
153+
pub rule_id: String,
154+
/// Rule version.
155+
pub version: u64,
156+
/// Lifecycle status of this specific version.
157+
pub status: RuleStatus,
158+
/// When the version was registered with the registry.
159+
pub registered_at: SystemTime,
160+
}
161+
162+
/// Lifecycle status of a specific rule version.
163+
#[derive(Debug, Clone, Copy, PartialEq, Eq)]
164+
pub enum RuleStatus {
165+
/// Loaded and validated but not yet the active version.
166+
Registered,
167+
/// Currently executing on the device.
168+
Active,
169+
/// Being drained ahead of a swap.
170+
Quiescing,
171+
/// Replaced by the specified newer version.
172+
Superseded(u64),
173+
/// Previously `Active` version that the user rolled back away from.
174+
Rolledback,
175+
/// Validation or swap backend failed; this version is unusable.
176+
Failed,
177+
}
178+
179+
/// Report emitted after a successful reload (or rollback).
180+
#[derive(Debug, Clone)]
181+
pub struct ReloadReport {
182+
/// Rule identifier.
183+
pub rule_id: String,
184+
/// Version we moved away from (0 if this was the initial activation).
185+
pub from_version: u64,
186+
/// Version that is now `Active`.
187+
pub to_version: u64,
188+
/// Time spent draining the old actor.
189+
pub quiesce_duration: Duration,
190+
/// Time spent performing the atomic pointer swap.
191+
pub swap_duration: Duration,
192+
/// Messages that were in-flight during the swap window
193+
/// (as reported by the swap backend).
194+
pub messages_in_flight_during_swap: u64,
195+
/// Whether the previous version is still retained in history and can
196+
/// be the target of a subsequent rollback.
197+
pub rollback_available: bool,
198+
}
199+
200+
/// Errors produced by the rule registry.
201+
#[derive(Debug, thiserror::Error)]
202+
pub enum RuleError {
203+
/// No such rule in the registry.
204+
#[error("rule not found: {0}")]
205+
NotFound(String),
206+
207+
/// Incoming version is not strictly newer than the current active version.
208+
#[error("version downgrade rejected: current={current}, proposed={proposed}")]
209+
VersionDowngrade {
210+
/// Currently active version.
211+
current: u64,
212+
/// Version the caller tried to install.
213+
proposed: u64,
214+
},
215+
216+
/// Rule targets a compute capability the device does not meet.
217+
#[error("compute capability mismatch: rule={required}, device={available}")]
218+
ComputeCapMismatch {
219+
/// Compute cap the rule requires.
220+
required: String,
221+
/// Compute cap the device actually has.
222+
available: String,
223+
},
224+
225+
/// Rule depends on another rule that is not registered.
226+
#[error("dependency missing: {0}")]
227+
MissingDependency(String),
228+
229+
/// Signature check did not succeed.
230+
#[error("signature verification failed")]
231+
InvalidSignature,
232+
233+
/// Caller asked to roll back to a version no longer in history.
234+
#[error("rollback target not in history: version={0}")]
235+
RollbackTargetMissing(u64),
236+
237+
/// No version is currently active — nothing to roll back from.
238+
#[error("no active version to rollback")]
239+
NoActiveVersion,
240+
241+
/// Quiesce window elapsed before the actor finished draining.
242+
#[error("quiesce timeout after {0:?}")]
243+
QuiesceTimeout(Duration),
244+
245+
/// Swap backend refused the operation (wraps backend-specific detail).
246+
#[error("swap backend error: {0}")]
247+
BackendError(String),
248+
249+
/// Version was already registered and we do not allow re-register of
250+
/// the same `(rule_id, version)` tuple.
251+
#[error("duplicate version: rule={rule_id}, version={version}")]
252+
DuplicateVersion {
253+
/// Rule identifier.
254+
rule_id: String,
255+
/// Version that was already present.
256+
version: u64,
257+
},
258+
}

0 commit comments

Comments
 (0)