Skip to content

Commit cf0f7ad

Browse files
mivertowski and claude committed
Release v0.4.2: warp-shuffle reductions, __nanosleep, libcu++ atomics
Upgrade CUDA codegen with research findings from the CUDA wishlist:
- Two-phase warp-shuffle reductions (O(log N) barriers → 1 per block)
- __nanosleep() for idle spin-wait and barrier power efficiency
- Opt-in libcu++ cuda::atomic_ref with explicit memory ordering

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
1 parent a7f42ab commit cf0f7ad

16 files changed

Lines changed: 534 additions & 178 deletions

File tree

.gitignore

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,3 +21,7 @@ target
2121
#.idea/
2222
.env
2323
docs/PLAN-accounting-analytics.md
24+
docs/ringkernel-executive-overview.aux
25+
docs/ringkernel-executive-overview.log
26+
docs/ringkernel-executive-overview.out
27+
docs/ringkernel-executive-overview.toc

CHANGELOG.md

Lines changed: 28 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -7,6 +7,34 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
77

88
## [Unreleased]
99

10+
## [0.4.2] - 2026-02-06
11+
12+
### Added
13+
14+
#### CUDA Codegen: Warp-Shuffle Reductions
15+
- **Two-phase warp-shuffle reduction** replaces tree reduction in all generated CUDA code
16+
- Phase 1: Intra-warp `__shfl_down_sync(0xFFFFFFFF, val, offset)` — zero `__syncthreads()` calls
17+
- Phase 2: Cross-warp reduction via shared memory — one `__syncthreads()` call
18+
- Applies to: `block_reduce_energy` (persistent FDTD), `generate_block_reduce_fn`, `generate_grid_reduce_fn`, `generate_reduce_and_broadcast_fn`, and all inline reduction generators
19+
- Reduces barrier count from O(log N) to 1 per block reduction (e.g., 9 → 1 for 512-thread blocks)
20+
21+
#### CUDA Codegen: `__nanosleep()` Power Efficiency
22+
- **`PersistentFdtdConfig::idle_sleep_ns`** (default 1000ns): configurable idle sleep duration
23+
- Persistent FDTD idle spin-wait now uses `__nanosleep()` instead of a volatile counter loop
24+
- Software grid barrier spin-loop uses `__nanosleep(100)` to reduce power consumption
25+
- Builder: `with_idle_sleep(ns)` to customize sleep duration
26+
27+
#### CUDA Codegen: libcu++ Ordered Atomics (opt-in)
28+
- **`PersistentFdtdConfig::use_libcupp_atomics`** (default false): opt-in `cuda::atomic_ref` support
29+
- When enabled, H2K/K2H queue operations use `memory_order_acquire`/`memory_order_release` instead of `__threadfence_system()` pairs
30+
- Software barrier uses `cuda::thread_scope_device` (narrower than system scope) with `memory_order_acq_rel`
31+
- Compile-time guard: an `#if __CUDACC_VER_MAJOR__ < 11` check raises a compile error when the CUDA toolkit is older than version 11
32+
- Builder: `with_libcupp_atomics(true)` to enable
33+
34+
### Changed
35+
- `block_reduce_energy` in persistent FDTD now uses warp-shuffle instead of shared-memory tree reduction
36+
- All standalone reduction helpers in `reduction_intrinsics.rs` upgraded to warp-shuffle pattern
37+
1038
## [0.4.1] - 2026-02-06
1139

1240
### Added

Cargo.lock

Lines changed: 23 additions & 23 deletions
Some generated files are not rendered by default. Learn more about customizing how changed files appear on GitHub.

Cargo.toml

Lines changed: 13 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -27,7 +27,7 @@ members = [
2727
]
2828

2929
[workspace.package]
30-
version = "0.4.1"
30+
version = "0.4.2"
3131
edition = "2021"
3232
authors = ["Michael Ivertowski <mivertowski@outlook.com>"]
3333
license = "Apache-2.0"
@@ -92,18 +92,18 @@ aws-config = { version = "1.6", features = ["behavior-version-latest"] }
9292
bytes = "1.9"
9393

9494
# Internal crates - version must match workspace version for publishing
95-
ringkernel-core = { version = "0.4.1", path = "crates/ringkernel-core" }
96-
ringkernel-derive = { version = "0.4.1", path = "crates/ringkernel-derive" }
97-
ringkernel-cpu = { version = "0.4.1", path = "crates/ringkernel-cpu" }
98-
ringkernel-cuda = { version = "0.4.1", path = "crates/ringkernel-cuda" }
99-
ringkernel-wgpu = { version = "0.4.1", path = "crates/ringkernel-wgpu" }
100-
ringkernel-metal = { version = "0.4.1", path = "crates/ringkernel-metal" }
101-
ringkernel-codegen = { version = "0.4.1", path = "crates/ringkernel-codegen" }
102-
ringkernel-cuda-codegen = { version = "0.4.1", path = "crates/ringkernel-cuda-codegen" }
103-
ringkernel-wgpu-codegen = { version = "0.4.1", path = "crates/ringkernel-wgpu-codegen" }
104-
ringkernel-ir = { version = "0.4.1", path = "crates/ringkernel-ir" }
105-
ringkernel-wavesim = { version = "0.4.1", path = "crates/ringkernel-wavesim" }
106-
ringkernel-ecosystem = { version = "0.4.1", path = "crates/ringkernel-ecosystem" }
95+
ringkernel-core = { version = "0.4.2", path = "crates/ringkernel-core" }
96+
ringkernel-derive = { version = "0.4.2", path = "crates/ringkernel-derive" }
97+
ringkernel-cpu = { version = "0.4.2", path = "crates/ringkernel-cpu" }
98+
ringkernel-cuda = { version = "0.4.2", path = "crates/ringkernel-cuda" }
99+
ringkernel-wgpu = { version = "0.4.2", path = "crates/ringkernel-wgpu" }
100+
ringkernel-metal = { version = "0.4.2", path = "crates/ringkernel-metal" }
101+
ringkernel-codegen = { version = "0.4.2", path = "crates/ringkernel-codegen" }
102+
ringkernel-cuda-codegen = { version = "0.4.2", path = "crates/ringkernel-cuda-codegen" }
103+
ringkernel-wgpu-codegen = { version = "0.4.2", path = "crates/ringkernel-wgpu-codegen" }
104+
ringkernel-ir = { version = "0.4.2", path = "crates/ringkernel-ir" }
105+
ringkernel-wavesim = { version = "0.4.2", path = "crates/ringkernel-wavesim" }
106+
ringkernel-ecosystem = { version = "0.4.2", path = "crates/ringkernel-ecosystem" }
107107

108108
[profile.release]
109109
lto = true

crates/ringkernel-accnet/Cargo.toml

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -12,10 +12,10 @@ readme = "README.md"
1212

1313
[dependencies]
1414
# Core RingKernel dependencies
15-
ringkernel-core = { version = "0.4.1", path = "../ringkernel-core" }
16-
ringkernel-derive = { version = "0.4.1", path = "../ringkernel-derive" }
17-
ringkernel-cpu = { version = "0.4.1", path = "../ringkernel-cpu" }
18-
ringkernel-cuda-codegen = { version = "0.4.1", path = "../ringkernel-cuda-codegen", optional = true }
15+
ringkernel-core = { version = "0.4.2", path = "../ringkernel-core" }
16+
ringkernel-derive = { version = "0.4.2", path = "../ringkernel-derive" }
17+
ringkernel-cpu = { version = "0.4.2", path = "../ringkernel-cpu" }
18+
ringkernel-cuda-codegen = { version = "0.4.2", path = "../ringkernel-cuda-codegen", optional = true }
1919
syn = { version = "2.0", features = ["full", "parsing"], optional = true }
2020

2121
# Async runtime
@@ -52,7 +52,7 @@ env_logger = "0.11"
5252
log = "0.4"
5353

5454
# Optional CUDA backend
55-
ringkernel-cuda = { version = "0.4.1", path = "../ringkernel-cuda", optional = true, features = ["cuda"] }
55+
ringkernel-cuda = { version = "0.4.2", path = "../ringkernel-cuda", optional = true, features = ["cuda"] }
5656
cudarc = { workspace = true, optional = true }
5757

5858
[dev-dependencies]

crates/ringkernel-cli/Cargo.toml

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -53,9 +53,9 @@ tracing = { workspace = true }
5353
tracing-subscriber = { workspace = true }
5454

5555
# RingKernel crates (for codegen and validation)
56-
ringkernel-ir = { version = "0.4.1", path = "../ringkernel-ir" }
57-
ringkernel-cuda-codegen = { version = "0.4.1", path = "../ringkernel-cuda-codegen", optional = true }
58-
ringkernel-wgpu-codegen = { version = "0.4.1", path = "../ringkernel-wgpu-codegen", optional = true }
56+
ringkernel-ir = { version = "0.4.2", path = "../ringkernel-ir" }
57+
ringkernel-cuda-codegen = { version = "0.4.2", path = "../ringkernel-cuda-codegen", optional = true }
58+
ringkernel-wgpu-codegen = { version = "0.4.2", path = "../ringkernel-wgpu-codegen", optional = true }
5959

6060
# Proc macro parsing
6161
syn = { workspace = true }

0 commit comments

Comments
 (0)