Skip to content

Commit ac826a6

Browse files
Ubuntu and claude committed
feat(cuda): add Blackwell sm_100 target + persistent mempool tuning
- Add sm_100 (Blackwell B200/B300) to multi-arch fallback in build.rs
- Increase default mempool release threshold to 1GB for persistent workloads
- Add AsyncPoolConfig::for_persistent_actors() (4GB threshold, 256MB pre-alloc)
- Add AsyncPoolConfig::for_batch_processing() (128MB threshold, 32MB pre-alloc)
- Based on NVIDIA guidance for stream-ordered allocators with persistent kernels: high release threshold prevents OS reclaim during sustained operation

Also confirmed: CUDA Graph Conditional Nodes (IF/WHILE/SWITCH) available in cudarc 0.19.3 via CU_GRAPH_NODE_TYPE_CONDITIONAL + cuGraphConditionalHandleCreate. Documented as future optimization path for GPU-side actor state machines.

Co-Authored-By: Claude Opus 4.6 (1M context) <noreply@anthropic.com>
1 parent d625e03 commit ac826a6

2 files changed

Lines changed: 36 additions & 2 deletions

File tree

crates/ringkernel-cuda/build.rs

Lines changed: 8 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -213,7 +213,12 @@ fn determine_cuda_arch(nvcc: &Path) -> Vec<String> {
213213
}
214214

215215
// 3. Fall back to multi-arch covering common architectures
216-
println!("cargo:warning=Using multi-arch fallback (sm_75, sm_80, sm_89, sm_90)");
216+
// sm_75: Turing (T4, RTX 2000)
217+
// sm_80: Ampere (A100, RTX 3000)
218+
// sm_89: Ada Lovelace (L40, RTX 4000)
219+
// sm_90: Hopper (H100, H200)
220+
// sm_100: Blackwell (B200, B300) — requires CUDA 12.8+
221+
println!("cargo:warning=Using multi-arch fallback (sm_75, sm_80, sm_89, sm_90, sm_100)");
217222
vec![
218223
"-gencode".to_string(),
219224
"arch=compute_75,code=sm_75".to_string(),
@@ -223,6 +228,8 @@ fn determine_cuda_arch(nvcc: &Path) -> Vec<String> {
223228
"arch=compute_89,code=sm_89".to_string(),
224229
"-gencode".to_string(),
225230
"arch=compute_90,code=sm_90".to_string(),
231+
"-gencode".to_string(),
232+
"arch=compute_100,code=sm_100".to_string(),
226233
]
227234
}
228235

crates/ringkernel-cuda/src/hopper/async_mem.rs

Lines changed: 28 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -43,7 +43,34 @@ impl Default for AsyncPoolConfig {
4343
Self {
4444
initial_size: 64 * 1024 * 1024, // 64 MB
4545
max_size: 0, // Unlimited
46-
release_threshold: 256 * 1024 * 1024, // 256 MB
46+
// NVIDIA recommendation for persistent workloads: set high release
47+
// threshold to prevent OS reclaim during sustained operation.
48+
// For persistent actors that run for minutes/hours, we want to
49+
// keep the pool warm — only release when truly excessive.
50+
release_threshold: 1024 * 1024 * 1024, // 1 GB (high for persistent)
51+
}
52+
}
53+
}
54+
55+
impl AsyncPoolConfig {
56+
/// Configuration optimized for persistent actor workloads.
57+
///
58+
/// High release threshold prevents pool shrinkage during sustained operation.
59+
/// Based on NVIDIA guidance for stream-ordered allocators with persistent kernels.
60+
pub fn for_persistent_actors() -> Self {
61+
Self {
62+
initial_size: 256 * 1024 * 1024, // 256 MB pre-allocated
63+
max_size: 0, // Unlimited
64+
release_threshold: 4 * 1024 * 1024 * 1024, // 4 GB (very high)
65+
}
66+
}
67+
68+
/// Configuration for short-lived workloads (lower memory retention).
69+
pub fn for_batch_processing() -> Self {
70+
Self {
71+
initial_size: 32 * 1024 * 1024, // 32 MB
72+
max_size: 0,
73+
release_threshold: 128 * 1024 * 1024, // 128 MB (release quickly)
4774
}
4875
}
4976
}

0 commit comments

Comments
 (0)