mivertowski
diff --git a/.gitignore b/.gitignore
Lines changed: 4 additions & 0 deletions
diff --git a/CHANGELOG.md b/CHANGELOG.md
Lines changed: 28 additions & 0 deletions
diff --git a/Cargo.lock b/Cargo.lock
Lines changed: 23 additions & 23 deletions
diff --git a/Cargo.toml b/Cargo.toml
Lines changed: 13 additions & 13 deletions
diff --git a/crates/ringkernel-accnet/Cargo.toml b/crates/ringkernel-accnet/Cargo.toml
Lines changed: 5 additions & 5 deletions
diff --git a/crates/ringkernel-cli/Cargo.toml b/crates/ringkernel-cli/Cargo.toml
Lines changed: 3 additions & 3 deletions
@@ -21,3 +21,7 @@ target
 #.idea/
 .env
 docs/PLAN-accounting-analytics.md
+docs/ringkernel-executive-overview.aux
+docs/ringkernel-executive-overview.log
+docs/ringkernel-executive-overview.out
+docs/ringkernel-executive-overview.toc
@@ -7,6 +7,34 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
 
 ## [Unreleased]
 
+## [0.4.2] - 2026-02-06
+
+### Added
+
+#### CUDA Codegen: Warp-Shuffle Reductions
+- **Two-phase warp-shuffle reduction** replaces tree reduction in all generated CUDA code
+  - Phase 1: Intra-warp `__shfl_down_sync(0xFFFFFFFF, val, offset)` — zero `__syncthreads()` calls
+  - Phase 2: Cross-warp reduction via shared memory — one `__syncthreads()` call
+  - Applies to: `block_reduce_energy` (persistent FDTD), `generate_block_reduce_fn`, `generate_grid_reduce_fn`, `generate_reduce_and_broadcast_fn`, and all inline reduction generators
+  - Reduces barrier count from O(log N) to 1 per block reduction (e.g., 9 → 1 for 512-thread blocks)
+
+#### CUDA Codegen: `__nanosleep()` Power Efficiency
+- **`PersistentFdtdConfig::idle_sleep_ns`** (default 1000ns): configurable idle sleep duration
+- Persistent FDTD idle spin-wait now uses `__nanosleep()` instead of a volatile counter loop
+- Software grid barrier spin-loop uses `__nanosleep(100)` to reduce power consumption
+- Builder: `with_idle_sleep(ns)` to customize sleep duration
+
+#### CUDA Codegen: libcu++ Ordered Atomics (opt-in)
+- **`PersistentFdtdConfig::use_libcupp_atomics`** (default false): opt-in `cuda::atomic_ref` support
+- When enabled, H2K/K2H queue operations use `memory_order_acquire`/`memory_order_release` instead of `__threadfence_system()` pairs
+- Software barrier uses `cuda::thread_scope_device` (narrower than system scope) with `memory_order_acq_rel`
+- Compile-time guard: an `#if __CUDACC_VER_MAJOR__ < 11` check raises a compile error on CUDA toolkits older than 11
+- Builder: `with_libcupp_atomics(true)` to enable
+
+### Changed
+- `block_reduce_energy` in persistent FDTD now uses warp-shuffle instead of shared-memory tree reduction
+- All standalone reduction helpers in `reduction_intrinsics.rs` upgraded to warp-shuffle pattern
+
 ## [0.4.1] - 2026-02-06
 
 ### Added
 
@@ -27,7 +27,7 @@ members = [
 ]
 
 [workspace.package]
-version = "0.4.1"
+version = "0.4.2"
 edition = "2021"
 authors = ["Michael Ivertowski <mivertowski@outlook.com>"]
 license = "Apache-2.0"
@@ -92,18 +92,18 @@ aws-config = { version = "1.6", features = ["behavior-version-latest"] }
 bytes = "1.9"
 
 # Internal crates - version must match workspace version for publishing
-ringkernel-core = { version = "0.4.1", path = "crates/ringkernel-core" }
-ringkernel-derive = { version = "0.4.1", path = "crates/ringkernel-derive" }
-ringkernel-cpu = { version = "0.4.1", path = "crates/ringkernel-cpu" }
-ringkernel-cuda = { version = "0.4.1", path = "crates/ringkernel-cuda" }
-ringkernel-wgpu = { version = "0.4.1", path = "crates/ringkernel-wgpu" }
-ringkernel-metal = { version = "0.4.1", path = "crates/ringkernel-metal" }
-ringkernel-codegen = { version = "0.4.1", path = "crates/ringkernel-codegen" }
-ringkernel-cuda-codegen = { version = "0.4.1", path = "crates/ringkernel-cuda-codegen" }
-ringkernel-wgpu-codegen = { version = "0.4.1", path = "crates/ringkernel-wgpu-codegen" }
-ringkernel-ir = { version = "0.4.1", path = "crates/ringkernel-ir" }
-ringkernel-wavesim = { version = "0.4.1", path = "crates/ringkernel-wavesim" }
-ringkernel-ecosystem = { version = "0.4.1", path = "crates/ringkernel-ecosystem" }
+ringkernel-core = { version = "0.4.2", path = "crates/ringkernel-core" }
+ringkernel-derive = { version = "0.4.2", path = "crates/ringkernel-derive" }
+ringkernel-cpu = { version = "0.4.2", path = "crates/ringkernel-cpu" }
+ringkernel-cuda = { version = "0.4.2", path = "crates/ringkernel-cuda" }
+ringkernel-wgpu = { version = "0.4.2", path = "crates/ringkernel-wgpu" }
+ringkernel-metal = { version = "0.4.2", path = "crates/ringkernel-metal" }
+ringkernel-codegen = { version = "0.4.2", path = "crates/ringkernel-codegen" }
+ringkernel-cuda-codegen = { version = "0.4.2", path = "crates/ringkernel-cuda-codegen" }
+ringkernel-wgpu-codegen = { version = "0.4.2", path = "crates/ringkernel-wgpu-codegen" }
+ringkernel-ir = { version = "0.4.2", path = "crates/ringkernel-ir" }
+ringkernel-wavesim = { version = "0.4.2", path = "crates/ringkernel-wavesim" }
+ringkernel-ecosystem = { version = "0.4.2", path = "crates/ringkernel-ecosystem" }
 
 [profile.release]
 lto = true
 
@@ -12,10 +12,10 @@ readme = "README.md"
 
 [dependencies]
 # Core RingKernel dependencies
-ringkernel-core = { version = "0.4.1", path = "../ringkernel-core" }
-ringkernel-derive = { version = "0.4.1", path = "../ringkernel-derive" }
-ringkernel-cpu = { version = "0.4.1", path = "../ringkernel-cpu" }
-ringkernel-cuda-codegen = { version = "0.4.1", path = "../ringkernel-cuda-codegen", optional = true }
+ringkernel-core = { version = "0.4.2", path = "../ringkernel-core" }
+ringkernel-derive = { version = "0.4.2", path = "../ringkernel-derive" }
+ringkernel-cpu = { version = "0.4.2", path = "../ringkernel-cpu" }
+ringkernel-cuda-codegen = { version = "0.4.2", path = "../ringkernel-cuda-codegen", optional = true }
 syn = { version = "2.0", features = ["full", "parsing"], optional = true }
 
 # Async runtime
@@ -52,7 +52,7 @@ env_logger = "0.11"
 log = "0.4"
 
 # Optional CUDA backend
-ringkernel-cuda = { version = "0.4.1", path = "../ringkernel-cuda", optional = true, features = ["cuda"] }
+ringkernel-cuda = { version = "0.4.2", path = "../ringkernel-cuda", optional = true, features = ["cuda"] }
 cudarc = { workspace = true, optional = true }
 
 [dev-dependencies]
 
@@ -53,9 +53,9 @@ tracing = { workspace = true }
 tracing-subscriber = { workspace = true }
 
 # RingKernel crates (for codegen and validation)
-ringkernel-ir = { version = "0.4.1", path = "../ringkernel-ir" }
-ringkernel-cuda-codegen = { version = "0.4.1", path = "../ringkernel-cuda-codegen", optional = true }
-ringkernel-wgpu-codegen = { version = "0.4.1", path = "../ringkernel-wgpu-codegen", optional = true }
+ringkernel-ir = { version = "0.4.2", path = "../ringkernel-ir" }
+ringkernel-cuda-codegen = { version = "0.4.2", path = "../ringkernel-cuda-codegen", optional = true }
+ringkernel-wgpu-codegen = { version = "0.4.2", path = "../ringkernel-wgpu-codegen", optional = true }
 
 # Proc macro parsing
 syn = { workspace = true }