Skip to content

Commit febb724

Browse files
mivertowski and claude committed
feat(cuda): GPU-side tenant enforcement + migration kernels (v1.1)
Per spec section 3.1 GPU-side implementation.

persistent.rs:
- K2KRouteEntry: add tenant_id: u64 (64→72 bytes)
- Device-side tenant check: source_tenant != dest_tenant = reject + counter
- PersistentControlBlock stays 256 bytes (repurposed reserved 56 bytes):
  * cross_tenant_reject_count
  * migration_phase (Active/Quiescing/Transferring/Swapping)
  * is_quiesced flag
  * in_flight_drained count
  * state_checksum (CRC32)
  * tenant_id
- New SimCommand variants:
  * BeginQuiesce = 32 (stop accepting, flush in-flight)
  * FinalizeMigration = 33 (after transfer, terminate)
  * RestoreState = 34 (target GPU: accept state blob)
- New ResponseType variants:
  * QuiesceComplete = 32
  * ReadyToActivate = 33
  * MigrationComplete = 34
- H2KMessage::begin_quiesce / finalize_migration / restore_state

cuda/migration_kernels.cu:
- capture_actor_state: state → mapped buffer + CRC32
- restore_actor_state: buffer → state + CRC32 verification
- drain_inflight_queue: flush K2K inbox to external buffer
- Single-block launch for deterministic cross-GPU checksum
- __threadfence_system() before mapped-memory publishes
- #if __CUDA_ARCH__ >= 900 guard for future Hopper optimizations
- Verified PTX compile on sm_75 and sm_90

multi_gpu/migration_kernels.rs:
- MigrationKernels::load/capture_state/restore_state/drain_queue
- Graceful BackendUnavailable when PTX missing
- CaptureResult, DrainResult return types

build.rs:
- compile_migration_kernels() with nvcc fallback
- Stub generation for non-cooperative builds (fixes include! gap)

28 new tests (20 persistent + 8 migration_kernels). Hardware-dependent tests marked #[ignore = "Requires CUDA GPU"].

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent 84f0b61 commit febb724

5 files changed

Lines changed: 1364 additions & 4 deletions

File tree

crates/ringkernel-cuda/build.rs

Lines changed: 128 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,7 @@ fn main() {
1515
println!("cargo:rerun-if-changed=src/cuda/cooperative_kernels.cu");
1616
println!("cargo:rerun-if-changed=src/cuda/cluster_kernels.cu");
1717
println!("cargo:rerun-if-changed=src/cuda/actor_lifecycle_kernel.cu");
18+
println!("cargo:rerun-if-changed=src/cuda/migration_kernels.cu");
1819
println!("cargo:rerun-if-env-changed=CUDA_PATH");
1920
println!("cargo:rerun-if-env-changed=CUDA_HOME");
2021
println!("cargo:rerun-if-env-changed=RINGKERNEL_CUDA_ARCH");
@@ -25,8 +26,12 @@ fn main() {
2526
let cooperative_enabled = env::var("CARGO_FEATURE_COOPERATIVE").is_ok();
2627

2728
if !cooperative_enabled {
28-
// Generate stub when cooperative feature is not enabled
29+
// Generate stubs for every kernel group so `include!` sites compile
30+
// regardless of feature selection.
2931
generate_stub(&out_dir, "Cooperative feature not enabled");
32+
generate_cluster_stub(&out_dir, "Cooperative feature not enabled");
33+
generate_lifecycle_stub(&out_dir, "Cooperative feature not enabled");
34+
generate_migration_stub(&out_dir, "Cooperative feature not enabled");
3035
return;
3136
}
3237

@@ -68,12 +73,25 @@ fn main() {
6873
generate_lifecycle_stub(&out_dir, &format!("Compilation failed: {}", e));
6974
}
7075
}
76+
77+
// Compile migration kernels (portable sm_75+)
78+
match compile_migration_kernels(&nvcc, &out_dir) {
79+
Ok(()) => {
80+
println!("cargo:rustc-cfg=has_migration_kernels");
81+
println!("cargo:warning=Migration kernels compiled successfully");
82+
}
83+
Err(e) => {
84+
println!("cargo:warning=Migration kernels not available: {}", e);
85+
generate_migration_stub(&out_dir, &format!("Compilation failed: {}", e));
86+
}
87+
}
7188
}
7289
None => {
7390
println!("cargo:warning=nvcc not found - cooperative groups will use fallback");
7491
generate_stub(&out_dir, "nvcc not found at build time");
7592
generate_cluster_stub(&out_dir, "nvcc not found at build time");
7693
generate_lifecycle_stub(&out_dir, "nvcc not found at build time");
94+
generate_migration_stub(&out_dir, "nvcc not found at build time");
7795
}
7896
}
7997
}
@@ -391,12 +409,14 @@ fn write_cluster_rust_code(
391409
code.push_str(ptx);
392410
code.push_str("\"####;\n\n");
393411

412+
code.push_str("/// Whether Hopper cluster kernel support was compiled in.\n");
394413
code.push_str(&format!(
395414
"pub const HAS_CLUSTER_KERNEL_SUPPORT: bool = {};\n\n",
396415
has_support
397416
));
398417

399418
let escaped_message = message.replace('\\', "\\\\").replace('"', "\\\"");
419+
code.push_str("/// Build-time message about Hopper cluster kernel support.\n");
400420
code.push_str(&format!(
401421
"pub const CLUSTER_KERNEL_BUILD_MESSAGE: &str = \"{}\";\n",
402422
escaped_message
@@ -443,9 +463,11 @@ fn compile_lifecycle_kernel(nvcc: &Path, out_dir: &Path) -> Result<(), String> {
443463
let rust_file = out_dir.join("actor_lifecycle_kernel.rs");
444464
let mut code = String::new();
445465
code.push_str("// Auto-generated actor lifecycle kernel PTX.\n\n");
466+
code.push_str("/// Pre-compiled PTX for the actor lifecycle kernel.\n");
446467
code.push_str("pub const LIFECYCLE_KERNEL_PTX: &str = r####\"");
447468
code.push_str(&ptx_content);
448469
code.push_str("\"####;\n\n");
470+
code.push_str("/// Whether the actor lifecycle kernel was compiled in.\n");
449471
code.push_str("pub const HAS_LIFECYCLE_KERNEL: bool = true;\n");
450472

451473
fs::write(&rust_file, code).map_err(|e| format!("Write failed: {}", e))
@@ -456,9 +478,114 @@ fn generate_lifecycle_stub(out_dir: &Path, reason: &str) {
456478
let rust_file = out_dir.join("actor_lifecycle_kernel.rs");
457479
let code = format!(
458480
"// Actor lifecycle kernel not available: {}\n\n\
481+
/// Pre-compiled PTX for the actor lifecycle kernel (empty stub).\n\
459482
pub const LIFECYCLE_KERNEL_PTX: &str = \"\";\n\
483+
/// Whether the actor lifecycle kernel was compiled in.\n\
460484
pub const HAS_LIFECYCLE_KERNEL: bool = false;\n",
461485
reason
462486
);
463487
fs::write(&rust_file, code).expect("Failed to write lifecycle stub");
464488
}
489+
490+
/// Compile the migration kernels (portable `sm_75+`) to PTX.
491+
///
492+
/// Migration kernels handle the state capture / restore / in-flight queue
493+
/// drain side of the 3-phase multi-GPU migration protocol (v1.1). The PTX
494+
/// is embedded as a `const &str` for runtime loading via cudarc.
495+
fn compile_migration_kernels(nvcc: &Path, out_dir: &Path) -> Result<(), String> {
496+
let manifest_dir = PathBuf::from(env::var("CARGO_MANIFEST_DIR").unwrap());
497+
let cuda_src = manifest_dir.join("src/cuda/migration_kernels.cu");
498+
499+
if !cuda_src.exists() {
500+
return Err(format!(
501+
"Migration CUDA source not found: {:?}",
502+
cuda_src
503+
));
504+
}
505+
506+
let ptx_file = out_dir.join("migration_kernels.ptx");
507+
508+
// Migration kernels are portable — compile for the same multi-arch set
509+
// as the cooperative kernels so they run on any supported device.
510+
let arch_args = determine_cuda_arch(nvcc);
511+
512+
let mut cmd = Command::new(nvcc);
513+
cmd.args(["-ptx", "-O3", "--generate-line-info"]);
514+
for arg in &arch_args {
515+
cmd.arg(arg);
516+
}
517+
cmd.args(["-std=c++17", "-w", "-o"]);
518+
cmd.arg(ptx_file.to_str().unwrap());
519+
cmd.arg(cuda_src.to_str().unwrap());
520+
521+
let status = cmd
522+
.status()
523+
.map_err(|e| format!("Failed to execute nvcc for migration kernels: {}", e))?;
524+
525+
if !status.success() {
526+
return Err(format!(
527+
"nvcc migration kernel compilation failed with exit code: {:?}",
528+
status.code()
529+
));
530+
}
531+
532+
let ptx_content = fs::read_to_string(&ptx_file)
533+
.map_err(|e| format!("Failed to read migration PTX: {}", e))?;
534+
535+
let rust_file = out_dir.join("migration_kernels.rs");
536+
write_migration_rust_code(
537+
&rust_file,
538+
&ptx_content,
539+
true,
540+
"Migration kernels compiled successfully",
541+
)
542+
.map_err(|e| format!("Failed to write migration Rust bindings: {}", e))?;
543+
544+
Ok(())
545+
}
546+
547+
/// Generate migration kernel stub when nvcc is unavailable.
548+
fn generate_migration_stub(out_dir: &Path, reason: &str) {
549+
let rust_file = out_dir.join("migration_kernels.rs");
550+
write_migration_rust_code(&rust_file, "", false, reason)
551+
.expect("Failed to write migration Rust stub");
552+
}
553+
554+
/// Emit the Rust file that wraps the migration PTX blob.
///
/// The generated file defines three constants:
/// `MIGRATION_KERNEL_PTX` (the PTX text as a raw string literal),
/// `HAS_MIGRATION_KERNEL_SUPPORT` and `MIGRATION_KERNEL_BUILD_MESSAGE`.
///
/// # Arguments
///
/// * `path` - destination of the generated Rust source.
/// * `ptx` - PTX text to embed; pass `""` for a stub.
/// * `has_support` - value written to `HAS_MIGRATION_KERNEL_SUPPORT`.
/// * `message` - value written to `MIGRATION_KERNEL_BUILD_MESSAGE`.
///
/// # Errors
///
/// Returns the underlying I/O error if `path` cannot be written.
fn write_migration_rust_code(
    path: &Path,
    ptx: &str,
    has_support: bool,
    message: &str,
) -> std::io::Result<()> {
    // A raw string `r####"…"####` ends at the first `"####`. Measure the
    // longest run of `#` that follows any `"` in the PTX and use one more
    // hash than that (never fewer than four, which keeps the output
    // unchanged for ordinary PTX).
    let mut longest_run = 0usize;
    let mut rest = ptx;
    while let Some(pos) = rest.find('"') {
        rest = &rest[pos + 1..];
        let run = rest.bytes().take_while(|&b| b == b'#').count();
        longest_run = longest_run.max(run);
    }
    let hashes = "#".repeat(longest_run.max(3) + 1);

    let mut code = String::with_capacity(ptx.len() + 1024);

    code.push_str("// Auto-generated migration kernel PTX.\n");
    code.push_str("// Generated by build.rs at build time.\n");
    code.push_str("// Portable: sm_75+ (Turing through Hopper).\n\n");

    code.push_str("/// Pre-compiled PTX for v1.1 migration kernels.\n");
    code.push_str("/// Contains:\n");
    code.push_str("/// - capture_actor_state: snapshot live actor state with CRC32\n");
    code.push_str("/// - restore_actor_state: reload captured state with CRC32 verify\n");
    code.push_str("/// - drain_inflight_queue: drain K2K queue to external buffer\n");

    code.push_str("pub const MIGRATION_KERNEL_PTX: &str = r");
    code.push_str(&hashes);
    code.push('"');
    code.push_str(ptx);
    code.push('"');
    code.push_str(&hashes);
    code.push_str(";\n\n");

    code.push_str("/// `true` if the migration kernels were compiled and embedded.\n");
    code.push_str(&format!(
        "pub const HAS_MIGRATION_KERNEL_SUPPORT: bool = {};\n\n",
        has_support
    ));

    code.push_str("/// Build-time message describing migration kernel availability.\n");
    // `{:?}` on a `&str` yields a quoted, fully escaped Rust string literal
    // (backslashes, quotes, newlines, carriage returns, control characters),
    // so any error text is embedded as valid source.
    code.push_str(&format!(
        "pub const MIGRATION_KERNEL_BUILD_MESSAGE: &str = {:?};\n",
        message
    ));

    fs::write(path, code)
}

0 commit comments

Comments
 (0)