feat(cuda): add Blackwell sm_100 target + persistent mempool tuning

Ubuntu · claude · Ubuntu · commit ac826a6e6130 · 2026-04-16T15:14:27.000Z
- Add sm_100 (Blackwell B200/B300) to multi-arch fallback in build.rs
- Increase default mempool release threshold to 1GB for persistent workloads
- Add AsyncPoolConfig::for_persistent_actors() (4GB threshold, 256MB pre-alloc)
- Add AsyncPoolConfig::for_batch_processing() (128MB threshold, 32MB pre-alloc)
- Based on NVIDIA guidance for stream-ordered allocators with persistent kernels:
  high release threshold prevents OS reclaim during sustained operation

Also confirmed: CUDA Graph Conditional Nodes (IF/WHILE/SWITCH) available in
cudarc 0.19.3 via CU_GRAPH_NODE_TYPE_CONDITIONAL + cuGraphConditionalHandleCreate.
Documented as future optimization path for GPU-side actor state machines.

Co-Authored-By: Claude Opus 4.6 (1M context) &lt;noreply@anthropic.com&gt;
diff --git a/crates/ringkernel-cuda/build.rs b/crates/ringkernel-cuda/build.rs
@@ -213,7 +213,12 @@ fn determine_cuda_arch(nvcc: &Path) -> Vec<String> {
     }
 
     // 3. Fall back to multi-arch covering common architectures
-    println!("cargo:warning=Using multi-arch fallback (sm_75, sm_80, sm_89, sm_90)");
+    // sm_75: Turing (T4, RTX 2000)
+    // sm_80: Ampere (A100, RTX 3000)
+    // sm_89: Ada Lovelace (L40, RTX 4000)
+    // sm_90: Hopper (H100, H200)
+    // sm_100: Blackwell (B200, B300) — requires CUDA 12.8+
+    println!("cargo:warning=Using multi-arch fallback (sm_75, sm_80, sm_89, sm_90, sm_100)");
     vec![
         "-gencode".to_string(),
         "arch=compute_75,code=sm_75".to_string(),
@@ -223,6 +228,8 @@ fn determine_cuda_arch(nvcc: &Path) -> Vec<String> {
         "arch=compute_89,code=sm_89".to_string(),
         "-gencode".to_string(),
         "arch=compute_90,code=sm_90".to_string(),
+        "-gencode".to_string(),
+        "arch=compute_100,code=sm_100".to_string(),
     ]
 }
 
diff --git a/crates/ringkernel-cuda/src/hopper/async_mem.rs b/crates/ringkernel-cuda/src/hopper/async_mem.rs
@@ -43,7 +43,34 @@ impl Default for AsyncPoolConfig {
         Self {
             initial_size: 64 * 1024 * 1024,     // 64 MB
             max_size: 0,                          // Unlimited
-            release_threshold: 256 * 1024 * 1024, // 256 MB
+            // NVIDIA recommendation for persistent workloads: set high release
+            // threshold to prevent OS reclaim during sustained operation.
+            // For persistent actors that run for minutes/hours, we want to
+            // keep the pool warm — only release when truly excessive.
+            release_threshold: 1024 * 1024 * 1024, // 1 GB (high for persistent)
+        }
+    }
+}
+
+impl AsyncPoolConfig {
+    /// Configuration optimized for persistent actor workloads.
+    ///
+    /// High release threshold prevents pool shrinkage during sustained operation.
+    /// Based on NVIDIA guidance for stream-ordered allocators with persistent kernels.
+    pub fn for_persistent_actors() -> Self {
+        Self {
+            initial_size: 256 * 1024 * 1024,      // 256 MB pre-allocated
+            max_size: 0,                            // Unlimited
+            release_threshold: 4 * 1024 * 1024 * 1024, // 4 GB (very high)
+        }
+    }
+
+    /// Configuration for short-lived workloads (lower memory retention).
+    pub fn for_batch_processing() -> Self {
+        Self {
+            initial_size: 32 * 1024 * 1024,        // 32 MB
+            max_size: 0,
+            release_threshold: 128 * 1024 * 1024,  // 128 MB (release quickly)
         }
     }
 }