Skip to content

Commit 9d3aa0c

Browse files
mivertowski and claude committed
Update documentation for 120+ CUDA intrinsics expansion
- Add CUDA Codegen Intrinsics Expansion section to CHANGELOG
- Update README with 120+ intrinsics count and 3D stencil patterns
- Update docs/13-cuda-codegen.md with complete intrinsics reference
- Fix clippy excessive_precision warnings in dsl.rs erf() function
- Format code with cargo fmt

Changes from merged PR #8:
- Expanded GPU intrinsics from ~45 to 120+ operations
- Added 11 atomic operations (and, or, xor, inc, dec, etc.)
- Added 3D stencil intrinsics (up, down, at with dz)
- Added warp match/reduce operations (Volta+/SM 8.0+)
- Added bit manipulation, memory, special, and timing ops
- Updated tests from 143 to 171

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 9081dbc commit 9d3aa0c

6 files changed

Lines changed: 360 additions & 98 deletions

File tree

CHANGELOG.md

Lines changed: 16 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -21,9 +21,25 @@ and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0
2121
- Material absorption modeling with frequency-dependent coefficients
2222
- Interactive 3D camera controls and visualization modes
2323

24+
#### CUDA Codegen Intrinsics Expansion
25+
- Expanded GPU intrinsics from ~45 to **120+ operations** across 13 categories
26+
- **Atomic Operations** (11 ops): `atomic_add`, `atomic_sub`, `atomic_min`, `atomic_max`, `atomic_exchange`, `atomic_cas`, `atomic_and`, `atomic_or`, `atomic_xor`, `atomic_inc`, `atomic_dec`
27+
- **Synchronization** (7 ops): `sync_threads`, `sync_threads_count`, `sync_threads_and`, `sync_threads_or`, `thread_fence`, `thread_fence_block`, `thread_fence_system`
28+
- **Trigonometric** (10 ops): `sin`, `cos`, `tan`, `asin`, `acos`, `atan`, `atan2`, `sincos`, `sinpi`, `cospi`
29+
- **Hyperbolic** (6 ops): `sinh`, `cosh`, `tanh`, `asinh`, `acosh`, `atanh`
30+
- **Exponential/Logarithmic** (19 ops): `exp`, `exp2`, `exp10`, `expm1`, `log`, `ln`, `log2`, `log10`, `log1p`, `pow`, `ldexp`, `scalbn`, `ilogb`, `erf`, `erfc`, `erfinv`, `erfcinv`, `lgamma`, `tgamma`
31+
- **Classification** (7 ops): `is_nan`, `is_infinite`, `is_finite`, `is_normal`, `signbit`, `nextafter`, `fdim`
32+
- **Warp Operations** (16 ops): `warp_active_mask`, `warp_shfl`, `warp_shfl_up`, `warp_shfl_down`, `warp_shfl_xor`, `warp_ballot`, `warp_all`, `warp_any`, `warp_match_any`, `warp_match_all`, `warp_reduce_add/min/max/and/or/xor`
33+
- **Bit Manipulation** (8 ops): `popc`, `clz`, `ctz`, `ffs`, `brev`, `byte_perm`, `funnel_shift_left`, `funnel_shift_right`
34+
- **Memory Operations** (3 ops): `ldg`, `prefetch_l1`, `prefetch_l2`
35+
- **Special Functions** (13 ops): `rcp`, `fast_div`, `saturate`, `j0`, `j1`, `jn`, `y0`, `y1`, `yn`, `normcdf`, `normcdfinv`, `cyl_bessel_i0`, `cyl_bessel_i1`
36+
- **Timing** (3 ops): `clock`, `clock64`, `nanosleep`
37+
- **3D Stencil Intrinsics**: `pos.up(buf)`, `pos.down(buf)`, `pos.at(buf, dx, dy, dz)` for volumetric kernels
38+
2439
### Changed
2540
- Added `required-features` to CUDA-only wavesim binaries to fix build without CUDA
2641
- Updated GitHub Actions release workflow with proper feature flags and Ubuntu version
42+
- Updated ringkernel-cuda-codegen tests from 143 to 171 tests
2743

2844
### Fixed
2945
- Fixed release workflow feature flags for showcase applications

README.md

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -463,15 +463,16 @@ let cuda_code = transpile_ring_kernel(&handler, &config)?;
463463
```
464464

465465
**DSL Features:**
466-
- Thread/block indices: `thread_idx_x()`, `block_idx_x()`, `block_dim_x()`, `grid_dim_x()`
466+
- Thread/block indices: `thread_idx_x()`, `block_idx_x()`, `block_dim_x()`, `grid_dim_x()`, `warp_size()`
467467
- Control flow: `if/else`, `match` → switch/case, early `return`
468468
- Loops: `for i in 0..n`, `while cond`, `loop` with `break`/`continue`
469-
- Stencil patterns: `pos.north()`, `pos.south()`, `pos.east()`, `pos.west()`, `pos.at(dx, dy)`
469+
- Stencil patterns (2D): `pos.north()`, `pos.south()`, `pos.east()`, `pos.west()`, `pos.at(dx, dy)`
470+
- Stencil patterns (3D): `pos.up()`, `pos.down()`, `pos.at(dx, dy, dz)` for volumetric kernels
470471
- Shared memory: `__shared__` arrays and tiles
471472
- Struct literals: `Point { x: 1.0, y: 2.0 }` → C compound literals
472473
- Reference expressions: `&arr[idx]` → pointer with automatic `->` for field access
473474
- Type inference: Tracks pointer variables for correct accessor generation
474-
- 45+ GPU intrinsics (atomics, warp ops, sync, math)
475+
- **120+ GPU intrinsics** across 13 categories (atomics, warp ops, sync, math, trig, bit manipulation, memory, timing)
475476

476477
**Ring Kernel Features:**
477478
- Persistent message loop with ControlBlock lifecycle

crates/ringkernel-cuda-codegen/src/dsl.rs

Lines changed: 50 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -160,21 +160,33 @@ pub fn sync_threads() {
160160
/// Transpiles to: `__syncthreads_count(predicate)`
161161
#[inline]
162162
pub fn sync_threads_count(predicate: bool) -> i32 {
163-
if predicate { 1 } else { 0 }
163+
if predicate {
164+
1
165+
} else {
166+
0
167+
}
164168
}
165169

166170
/// Synchronize threads with AND of predicate.
167171
/// Transpiles to: `__syncthreads_and(predicate)`
168172
#[inline]
169173
pub fn sync_threads_and(predicate: bool) -> i32 {
170-
if predicate { 1 } else { 0 }
174+
if predicate {
175+
1
176+
} else {
177+
0
178+
}
171179
}
172180

173181
/// Synchronize threads with OR of predicate.
174182
/// Transpiles to: `__syncthreads_or(predicate)`
175183
#[inline]
176184
pub fn sync_threads_or(predicate: bool) -> i32 {
177-
if predicate { 1 } else { 0 }
185+
if predicate {
186+
1
187+
} else {
188+
0
189+
}
178190
}
179191

180192
/// Thread memory fence.
@@ -585,12 +597,12 @@ pub fn ilogb(x: f32) -> i32 {
585597
#[inline]
586598
pub fn erf(x: f32) -> f32 {
587599
// Approximation using Horner form
588-
let a1 = 0.254829592_f32;
589-
let a2 = -0.284496736_f32;
590-
let a3 = 1.421413741_f32;
591-
let a4 = -1.453152027_f32;
592-
let a5 = 1.061405429_f32;
593-
let p = 0.3275911_f32;
600+
let a1 = 0.254_829_6_f32;
601+
let a2 = -0.284_496_74_f32;
602+
let a3 = 1.421_413_7_f32;
603+
let a4 = -1.453_152_f32;
604+
let a5 = 1.061_405_4_f32;
605+
let p = 0.327_591_1_f32;
594606

595607
let sign = if x < 0.0 { -1.0 } else { 1.0 };
596608
let x = x.abs();
@@ -654,7 +666,11 @@ pub fn nextafter(x: f32, y: f32) -> f32 {
654666
/// Floating-point difference. Transpiles to: `fdimf(x, y)`
655667
#[inline]
656668
pub fn fdim(x: f32, y: f32) -> f32 {
657-
if x > y { x - y } else { 0.0 }
669+
if x > y {
670+
x - y
671+
} else {
672+
0.0
673+
}
658674
}
659675

660676
// ============================================================================
@@ -670,7 +686,11 @@ pub fn warp_active_mask() -> u32 {
670686
/// Warp ballot. Transpiles to: `__ballot_sync(mask, predicate)`
671687
#[inline]
672688
pub fn warp_ballot(_mask: u32, predicate: bool) -> u32 {
673-
if predicate { 1 } else { 0 }
689+
if predicate {
690+
1
691+
} else {
692+
0
693+
}
674694
}
675695

676696
/// Warp all predicate. Transpiles to: `__all_sync(mask, predicate)`
@@ -788,19 +808,31 @@ pub fn leading_zeros(x: i32) -> i32 {
788808
/// Count trailing zeros. Transpiles to: `__ffs(x) - 1` (note: this Rust implementation returns 32 for `x == 0`, whereas `__ffs(0) - 1` evaluates to -1)
789809
#[inline]
790810
pub fn ctz(x: u32) -> i32 {
791-
if x == 0 { 32 } else { x.trailing_zeros() as i32 }
811+
if x == 0 {
812+
32
813+
} else {
814+
x.trailing_zeros() as i32
815+
}
792816
}
793817

794818
/// Count trailing zeros (i32 version).
795819
#[inline]
796820
pub fn trailing_zeros(x: i32) -> i32 {
797-
if x == 0 { 32 } else { (x as u32).trailing_zeros() as i32 }
821+
if x == 0 {
822+
32
823+
} else {
824+
(x as u32).trailing_zeros() as i32
825+
}
798826
}
799827

800828
/// Find first set bit (1-indexed, 0 if none). Transpiles to: `__ffs(x)`
801829
#[inline]
802830
pub fn ffs(x: u32) -> i32 {
803-
if x == 0 { 0 } else { (x.trailing_zeros() + 1) as i32 }
831+
if x == 0 {
832+
0
833+
} else {
834+
(x.trailing_zeros() + 1) as i32
835+
}
804836
}
805837

806838
/// Bit reverse. Transpiles to: `__brev(x)`
@@ -1059,7 +1091,10 @@ mod tests {
10591091
#[test]
10601092
fn test_funnel_shift() {
10611093
assert_eq!(funnel_shift_left(0xFFFF_0000, 0x0000_FFFF, 16), 0xFFFF_FFFF);
1062-
assert_eq!(funnel_shift_right(0xFFFF_0000, 0x0000_FFFF, 16), 0xFFFF_FFFF);
1094+
assert_eq!(
1095+
funnel_shift_right(0xFFFF_0000, 0x0000_FFFF, 16),
1096+
0xFFFF_FFFF
1097+
);
10631098
}
10641099

10651100
#[test]

0 commit comments

Comments
 (0)