From ce8f4add3eef29b84e6231088a7eb54994a81eab Mon Sep 17 00:00:00 2001 From: Victor Camara Date: Wed, 15 Apr 2026 20:45:13 -0300 Subject: [PATCH 1/4] refactor: change CUDA and ROCm backends to per-stream synchronization --- ext/CUDAExt.jl | 4 ++-- ext/ROCExt.jl | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/ext/CUDAExt.jl b/ext/CUDAExt.jl index ea77889ff..4e232d76d 100644 --- a/ext/CUDAExt.jl +++ b/ext/CUDAExt.jl @@ -104,7 +104,7 @@ end function _sync_with_context(x::Union{Dagger.Processor,Dagger.MemorySpace}) with_context(x) do - CUDA.synchronize() + CUDA.synchronize(stream()) end end function sync_with_context(x::Union{Dagger.Processor,Dagger.MemorySpace}) @@ -391,7 +391,7 @@ Dagger.gpu_with_device(f, proc::CuArrayDeviceProc) = CUDA.device!(f, proc.device) function Dagger.gpu_synchronize(proc::CuArrayDeviceProc) with_context(proc) do - CUDA.synchronize() + CUDA.synchronize(stream()) end end function Dagger.gpu_synchronize(::Val{:CUDA}) diff --git a/ext/ROCExt.jl b/ext/ROCExt.jl index 3ab6d0731..c2058b829 100644 --- a/ext/ROCExt.jl +++ b/ext/ROCExt.jl @@ -98,7 +98,7 @@ end function _sync_with_context(x::Union{Dagger.Processor,Dagger.MemorySpace}) with_context(x) do - AMDGPU.synchronize() + AMDGPU.synchronize(stream()) end end function sync_with_context(x::Union{Dagger.Processor,Dagger.MemorySpace}) @@ -364,7 +364,7 @@ Dagger.gpu_with_device(f, proc::ROCArrayDeviceProc) = AMDGPU.device!(f, AMDGPU.devices()[proc.device_id]) function Dagger.gpu_synchronize(proc::ROCArrayDeviceProc) with_context(proc) do - AMDGPU.synchronize() + AMDGPU.synchronize(stream()) end end function Dagger.gpu_synchronize(::Val{:ROC}) From b4640706eec8ddb7744148d20ff924b9805e2662 Mon Sep 17 00:00:00 2001 From: Victor Camara Date: Wed, 15 Apr 2026 21:12:26 -0300 Subject: [PATCH 2/4] add CUDA test --- test/multistream/CUDAsync.jl | 26 ++++++++++++++++++++++++++ 1 file changed, 26 insertions(+) create mode 100644 test/multistream/CUDAsync.jl diff --git 
a/test/multistream/CUDAsync.jl b/test/multistream/CUDAsync.jl new file mode 100644 index 000000000..133026b34 --- /dev/null +++ b/test/multistream/CUDAsync.jl @@ -0,0 +1,26 @@ +using Test +using Dagger, CUDA + +processors = collect(Dagger.get_processors(Dagger.OSProc())) +proc = first(filter(p -> contains(string(typeof(p)), "CuArray"), processors)) +println("Found: ", proc) + +@testset "gpu_synchronize is per-stream (CUDA)" begin + @test begin + Dagger.gpu_synchronize(proc) + true + end + + @test begin + Dagger.gpu_synchronize(Val(:CUDA)) + true + end + + t = Dagger.@spawn CuArray(rand(Float32, 64, 64)) + arr = fetch(t) + Dagger.gpu_synchronize(proc) + + host = Array(arr) + @test size(host) == (64, 64) + @test !any(isnan, host) +end \ No newline at end of file From 96d48dbbc2cf695e6b4ce34ce6840b100f6e8c57 Mon Sep 17 00:00:00 2001 From: Victor Camara Date: Wed, 15 Apr 2026 21:23:20 -0300 Subject: [PATCH 3/4] add ROCm test, both tests were written by claude --- test/multistream/ROCmsync.jl | 38 ++++++++++++++++++++++++++++++++++++ 1 file changed, 38 insertions(+) create mode 100644 test/multistream/ROCmsync.jl diff --git a/test/multistream/ROCmsync.jl b/test/multistream/ROCmsync.jl new file mode 100644 index 000000000..a3e9e8d30 --- /dev/null +++ b/test/multistream/ROCmsync.jl @@ -0,0 +1,38 @@ +using Test +using Dagger, AMDGPU + +processors = collect(Dagger.get_processors(Dagger.OSProc())) +println("All processors: ") +println.(processors) + +roc_procs = filter(p -> contains(string(typeof(p)), "ROCArray"), processors) + +if isempty(roc_procs) + @warn "No ROCm GPU found, skipping tests" +else + proc = first(roc_procs) + println("\nFound: ", proc) + + @testset "gpu_synchronize is per-stream (ROCm)" begin + # Correctness: sync completes without error + @test begin + Dagger.gpu_synchronize(proc) + true + end + + # Val dispatch also works + @test begin + Dagger.gpu_synchronize(Val(:ROC)) + true + end + + # Data integrity: result is available after sync, no garbage values 
+ t = Dagger.@spawn ROCArray(rand(Float32, 64, 64)) + arr = fetch(t) + Dagger.gpu_synchronize(proc) + + host = Array(arr) + @test size(host) == (64, 64) + @test !any(isnan, host) + end +end \ No newline at end of file From 0682040f299e1e3cc76067ad2145ef7813bcb0bd Mon Sep 17 00:00:00 2001 From: Victor Camara Date: Wed, 15 Apr 2026 21:54:41 -0300 Subject: [PATCH 4/4] Remove both test files --- test/multistream/CUDAsync.jl | 26 ------------------------ test/multistream/ROCmsync.jl | 38 ------------------------------------ 2 files changed, 64 deletions(-) delete mode 100644 test/multistream/CUDAsync.jl delete mode 100644 test/multistream/ROCmsync.jl diff --git a/test/multistream/CUDAsync.jl b/test/multistream/CUDAsync.jl deleted file mode 100644 index 133026b34..000000000 --- a/test/multistream/CUDAsync.jl +++ /dev/null @@ -1,26 +0,0 @@ -using Test -using Dagger, CUDA - -processors = collect(Dagger.get_processors(Dagger.OSProc())) -proc = first(filter(p -> contains(string(typeof(p)), "CuArray"), processors)) -println("Found: ", proc) - -@testset "gpu_synchronize is per-stream (CUDA)" begin - @test begin - Dagger.gpu_synchronize(proc) - true - end - - @test begin - Dagger.gpu_synchronize(Val(:CUDA)) - true - end - - t = Dagger.@spawn CuArray(rand(Float32, 64, 64)) - arr = fetch(t) - Dagger.gpu_synchronize(proc) - - host = Array(arr) - @test size(host) == (64, 64) - @test !any(isnan, host) -end \ No newline at end of file diff --git a/test/multistream/ROCmsync.jl b/test/multistream/ROCmsync.jl deleted file mode 100644 index a3e9e8d30..000000000 --- a/test/multistream/ROCmsync.jl +++ /dev/null @@ -1,38 +0,0 @@ -using Test -using Dagger, AMDGPU - -processors = collect(Dagger.get_processors(Dagger.OSProc())) -println("All processors: ") -println.(processors) - -roc_procs = filter(p -> contains(string(typeof(p)), "ROCArray"), processors) - -if isempty(roc_procs) - @warn "No ROCm GPU found, skipping tests" -else - proc = first(roc_procs) - println("\nFound: ", proc) - - 
@testset "gpu_synchronize is per-stream (ROCm)" begin - # Correctness: sync completes without error - @test begin - Dagger.gpu_synchronize(proc) - true - end - - # Val dispatch also works - @test begin - Dagger.gpu_synchronize(Val(:ROC)) - true - end - - # Data integrity: result is available after sync, no garbage values - t = Dagger.@spawn ROCArray(rand(Float32, 64, 64)) - arr = fetch(t) - Dagger.gpu_synchronize(proc) - - host = Array(arr) - @test size(host) == (64, 64) - @test !any(isnan, host) - end -end \ No newline at end of file