From 09170f072e54afe9895c401cdf4a6c955c4e42f1 Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Wed, 27 May 2026 12:52:13 -0700 Subject: [PATCH 01/10] [SYCL][Graph] Allow capturing restricted host tasks in native recording mode --- sycl/include/sycl/handler.hpp | 2 + sycl/source/detail/host_task.hpp | 6 ++ sycl/source/handler.cpp | 39 +++++++++- .../enqueue_func_host_task.cpp | 75 +++++++++++++++++++ 4 files changed, 118 insertions(+), 4 deletions(-) create mode 100644 sycl/test-e2e/Graph/RecordReplay/NativeRecording/enqueue_func_host_task.cpp diff --git a/sycl/include/sycl/handler.hpp b/sycl/include/sycl/handler.hpp index 3e9a4125987f9..c1af708d90f49 100644 --- a/sycl/include/sycl/handler.hpp +++ b/sycl/include/sycl/handler.hpp @@ -3018,6 +3018,8 @@ class HandlerAccess { Handler.internalProfilingTagImpl(); } + static std::function getHostTaskFunc(detail::HostTask &HT); + template static std::enable_if_t< detail::check_fn_signature, void()>::value> diff --git a/sycl/source/detail/host_task.hpp b/sycl/source/detail/host_task.hpp index 83c9c503a5d3e..db55da8c7fb57 100644 --- a/sycl/source/detail/host_task.hpp +++ b/sycl/source/detail/host_task.hpp @@ -75,8 +75,14 @@ class HostTask { friend class DispatchHostTask; friend class ExecCGCommand; + friend class sycl::detail::HandlerAccess; }; +inline std::function +HandlerAccess::getHostTaskFunc(HostTask &HT) { + return std::move(HT.MHostTask); +} + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 7b7ba020cc07f..9aa1224137cf9 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -766,10 +766,41 @@ detail::EventImplPtr handler::finalize() { // Native graph recording limitation if (type == detail::CGType::CodeplayHostTask && Queue->isNativeRecording()) { - throw sycl::exception( - make_error_code(errc::feature_not_supported), - "SYCL host_task is not supported in native recording mode. Use " - "zeCommandListAppendHostFunction as a workaround."); + auto *HT = static_cast(CommandGroup.get()); + if (!HT->MHostTask->isCreatedFromEnqueueFunction()) { + throw sycl::exception( + make_error_code(errc::feature_not_supported), + "Only restricted host tasks may be captured in native recording mode."); + } + + bool NativeHostTaskSupport = false; + Queue->getAdapter().call( + detail::getSyclObjImpl(Queue->get_device())->getHandleRef(), + UR_DEVICE_INFO_ENQUEUE_HOST_TASK_SUPPORT_EXP, + sizeof(NativeHostTaskSupport), &NativeHostTaskSupport, nullptr); + if (!NativeHostTaskSupport) { + throw sycl::exception( + make_error_code(errc::feature_not_supported), + "Recording host tasks in native recording mode requires backend support" + "not available on this device."); + } + + auto CallbackData = std::make_unique>( + detail::HandlerAccess::getHostTaskFunc(*HT->MHostTask)); + + auto HostTaskCallback = [](void *Data) { + std::unique_ptr> F( + static_cast *>(Data)); + (*F)(); + }; + + Queue->getAdapter().call( + Queue->getHandleRef(), HostTaskCallback, CallbackData.get(), nullptr, 0, + nullptr, nullptr); + // Ownership transferred on success. + (void)CallbackData.release(); + + return detail::event_impl::create_completed_host_event(); } if (!CommandGroup->getRequirements().empty() && Queue->isNativeRecording()) { throw sycl::exception( diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/enqueue_func_host_task.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/enqueue_func_host_task.cpp new file mode 100644 index 0000000000000..ba36dd66649d9 --- /dev/null +++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/enqueue_func_host_task.cpp @@ -0,0 +1,75 @@ +// REQUIRES: level_zero_v2_adapter +// REQUIRES: level_zero_dev_kit +// REQUIRES: arch-intel_gpu_bmg_g21 +// UNSUPPORTED: windows && gpu-intel-gen12 + +// RUN: %{build} %level_zero_options -o %t.out +// RUN: %{run} %t.out +// RUN: %if level_zero %{%{l0_leak_check} %{run} %t.out 2>&1 | FileCheck %s --implicit-check-not=LEAK %} + +// Tests that syclex::host_task() can be recorded into a native-recording SYCL +// Graph and executes correctly between two SYCL kernels. + +#include "../../graph_common.hpp" +#include "../../ze_common.hpp" + +#include +#include + +namespace syclex = sycl::ext::oneapi::experimental; + +constexpr size_t N = 1024; + +int main() { + queue Queue{property::queue::in_order{}}; + + const sycl::context Context = Queue.get_context(); + const sycl::device Device = Queue.get_device(); + + uint32_t *Data = malloc_shared(N, Queue); + + ze_command_list_handle_t ZeCommandList; + bool success = getCommandListFromQueue(Queue, ZeCommandList); + assert(success); + + exp_ext::command_graph Graph{ + Context, Device, {exp_ext::property::graph::enable_native_recording{}}}; + + CommandListStateVerifier verifier(ZeCommandList); + verifier.verify(EXECUTING); + + Graph.begin_recording(Queue); + verifier.verify(RECORDING); + + Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>{N}, [=](id<1> idx) { + Data[idx] = static_cast(idx[0]) + 1; + }); + }); + + syclex::host_task(Queue, [=] { + for (size_t i = 0; i < N; i++) { + Data[i] *= 2; + } + }); + + Queue.submit([&](handler &CGH) { + CGH.parallel_for(range<1>{N}, + [=](id<1> idx) { Data[idx] += 10; }); + }); + + Graph.end_recording(Queue); + verifier.verify(EXECUTING); + + auto ExecutableGraph = Graph.finalize(); + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecutableGraph); }); + Queue.wait(); + + for (size_t i = 0; i < N; i++) { + uint32_t Expected = static_cast((i + 1) * 2 + 10); + assert(check_value(i, Expected, Data[i], "Data")); + } + + free(Data, Queue); + return 0; +} From 53a6d9c20f8571abc30cb70080236fb9c19ebd0e Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Wed, 27 May 2026 15:08:19 -0700 Subject: [PATCH 02/10] Code formatting --- sycl/source/detail/host_task.hpp | 3 +-- sycl/source/handler.cpp | 14 +++++++------- .../NativeRecording/enqueue_func_host_task.cpp | 3 +-- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/sycl/source/detail/host_task.hpp b/sycl/source/detail/host_task.hpp index db55da8c7fb57..ed51d86f9a61e 100644 --- a/sycl/source/detail/host_task.hpp +++ b/sycl/source/detail/host_task.hpp @@ -78,8 +78,7 @@ class HostTask { friend class sycl::detail::HandlerAccess; }; -inline std::function -HandlerAccess::getHostTaskFunc(HostTask &HT) { +inline std::function HandlerAccess::getHostTaskFunc(HostTask &HT) { return std::move(HT.MHostTask); } diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 9aa1224137cf9..6f88c7a3826a9 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -768,9 +768,9 @@ detail::EventImplPtr handler::finalize() { if (type == detail::CGType::CodeplayHostTask && Queue->isNativeRecording()) { auto *HT = static_cast(CommandGroup.get()); if (!HT->MHostTask->isCreatedFromEnqueueFunction()) { - throw sycl::exception( - make_error_code(errc::feature_not_supported), - "Only restricted host tasks may be captured in native recording mode."); + throw sycl::exception(make_error_code(errc::feature_not_supported), + "Only restricted host tasks may be captured in " + "native recording mode."); } bool NativeHostTaskSupport = false; @@ -779,10 +779,10 @@ detail::EventImplPtr handler::finalize() { UR_DEVICE_INFO_ENQUEUE_HOST_TASK_SUPPORT_EXP, sizeof(NativeHostTaskSupport), &NativeHostTaskSupport, nullptr); if (!NativeHostTaskSupport) { - throw sycl::exception( - make_error_code(errc::feature_not_supported), - "Recording host tasks in native recording mode requires backend support" - "not available on this device."); + throw sycl::exception(make_error_code(errc::feature_not_supported), + "Recording host tasks in native recording mode " + "requires backend support" + "not available on this device."); } auto CallbackData = std::make_unique>( diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/enqueue_func_host_task.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/enqueue_func_host_task.cpp index ba36dd66649d9..2b63eebd935bb 100644 --- a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/enqueue_func_host_task.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/enqueue_func_host_task.cpp @@ -54,8 +54,7 @@ int main() { }); Queue.submit([&](handler &CGH) { - CGH.parallel_for(range<1>{N}, - [=](id<1> idx) { Data[idx] += 10; }); + CGH.parallel_for(range<1>{N}, [=](id<1> idx) { Data[idx] += 10; }); }); Graph.end_recording(Queue); From 1e41cd9329036cd9499b04924d6e420ebe3d8999 Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Thu, 28 May 2026 09:38:57 -0700 Subject: [PATCH 03/10] Update spec --- .../experimental/sycl_ext_oneapi_graph.asciidoc | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc index ddda3cc602888..4ee19723d8d4e 100644 --- a/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc +++ b/sycl/doc/extensions/experimental/sycl_ext_oneapi_graph.asciidoc @@ -2119,6 +2119,13 @@ auto node = graph.add([&](sycl::handler& cgh){ Host-tasks can be updated using <>. +When using `property::graph::enable_native_recording`, host tasks submitted via +`sycl::handler::host_task` are not supported and will throw an exception. Host +tasks submitted via +`sycl::ext::oneapi::experimental::host_task` from the +link:../experimental/sycl_ext_oneapi_enqueue_functions.asciidoc[sycl_ext_oneapi_enqueue_functions] +extension are supported. + === Queue Behavior In Recording Mode @@ -2648,7 +2655,11 @@ if used in application code. . Using reductions in a graph node. . Using sycl streams in a graph node. -. Using host tasks via `sycl::handler::host_task` in a graph node when `property::graph::enable_native_recording` is set. +. Using host tasks via `sycl::handler::host_task` in a graph node when + `property::graph::enable_native_recording` is set. Host tasks submitted via + `sycl::ext::oneapi::experimental::host_task` from + link:../experimental/sycl_ext_oneapi_enqueue_functions.asciidoc[sycl_ext_oneapi_enqueue_functions] + are supported in native recording mode. . Calling `update()` on an executable graph created from a graph with `property::graph::enable_native_recording`. . Using an out-of-order queue with `property::graph::enable_native_recording`. From 1e8c085508bc1356bdc7a3ac0d11d36a2d4335cf Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Thu, 28 May 2026 12:07:51 -0500 Subject: [PATCH 04/10] Refactor: share EnqueueHostTaskData/NativeHostTask between handler and commands Move EnqueueHostTaskData and NativeHostTask from the anonymous namespace in commands.cpp into host_task.hpp so both the native recording path in handler.cpp and the scheduler path in commands.cpp use the same type and callback rather than duplicating the pattern. Co-Authored-By: Claude Sonnet 4.6 --- sycl/source/detail/host_task.hpp | 13 +++++++++++++ sycl/source/detail/scheduler/commands.cpp | 23 ++++++----------------- sycl/source/handler.cpp | 14 ++++---------- 3 files changed, 23 insertions(+), 27 deletions(-) diff --git a/sycl/source/detail/host_task.hpp b/sycl/source/detail/host_task.hpp index ed51d86f9a61e..aaad343d805f6 100644 --- a/sycl/source/detail/host_task.hpp +++ b/sycl/source/detail/host_task.hpp @@ -82,6 +82,19 @@ inline std::function HandlerAccess::getHostTaskFunc(HostTask &HT) { return std::move(HT.MHostTask); } +struct EnqueueHostTaskData { + explicit EnqueueHostTaskData(std::function HostTask) + : Func(std::move(HostTask)) {} + + std::function Func; +}; + +inline void NativeHostTask(void *Data) { + auto HostTaskData = std::unique_ptr( + static_cast(Data)); + HostTaskData->Func(); +} + } // namespace detail } // namespace _V1 } // namespace sycl diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 447ff0225eb03..75aa85b98a26f 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -7,6 +7,7 @@ #include "unified-runtime/ur_api.h" #include +#include #include #include @@ -249,20 +250,6 @@ void InteropFreeFunc(ur_queue_handle_t, void *InteropData) { return Data->func(Data->ih); } -struct EnqueueHostTaskData { - explicit EnqueueHostTaskData(std::function HostTask) - : Func(std::move(HostTask)) {} - - std::function Func; -}; - -void NativeHostTask(void *Data) { - // Callback data is heap-allocated at enqueue time and released here once - // the backend invokes the host task callback. - auto HostTaskData = std::unique_ptr( - static_cast(Data)); - HostTaskData->Func(); -} } // namespace class DispatchHostTask { @@ -392,11 +379,13 @@ class DispatchHostTask { UR_DEVICE_INFO_ENQUEUE_HOST_TASK_SUPPORT_EXP, sizeof(NativeHostTaskSupport), &NativeHostTaskSupport, nullptr); if (NativeHostTaskSupport) { - auto NativeHostTaskData = std::make_unique( - std::move(HostTask.MHostTask->MHostTask)); + auto NativeHostTaskData = + std::make_unique( + std::move(HostTask.MHostTask->MHostTask)); ur_event_handle_t HostTaskEvent{}; Queue->getAdapter().call( - Queue->getHandleRef(), NativeHostTask, NativeHostTaskData.get(), + Queue->getHandleRef(), detail::NativeHostTask, + NativeHostTaskData.get(), nullptr, 0, nullptr, &HostTaskEvent); // Ownership is transferred to NativeHostTask callback on success. (void)NativeHostTaskData.release(); diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 6f88c7a3826a9..ace9ec48d7b68 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -781,22 +781,16 @@ detail::EventImplPtr handler::finalize() { if (!NativeHostTaskSupport) { throw sycl::exception(make_error_code(errc::feature_not_supported), "Recording host tasks in native recording mode " - "requires backend support" + "requires backend support " "not available on this device."); } - auto CallbackData = std::make_unique>( + auto CallbackData = std::make_unique( detail::HandlerAccess::getHostTaskFunc(*HT->MHostTask)); - auto HostTaskCallback = [](void *Data) { - std::unique_ptr> F( - static_cast *>(Data)); - (*F)(); - }; - Queue->getAdapter().call( - Queue->getHandleRef(), HostTaskCallback, CallbackData.get(), nullptr, 0, - nullptr, nullptr); + Queue->getHandleRef(), detail::NativeHostTask, CallbackData.get(), + nullptr, 0, nullptr, nullptr); // Ownership transferred on success. (void)CallbackData.release(); From 37ce24e53ea787107189dae11fe3141353cb8e4a Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Thu, 28 May 2026 10:17:05 -0700 Subject: [PATCH 05/10] Unsupported reason --- .../RecordReplay/NativeRecording/enqueue_func_host_task.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/enqueue_func_host_task.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/enqueue_func_host_task.cpp index 2b63eebd935bb..e98901311bcd5 100644 --- a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/enqueue_func_host_task.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/enqueue_func_host_task.cpp @@ -2,6 +2,7 @@ // REQUIRES: level_zero_dev_kit // REQUIRES: arch-intel_gpu_bmg_g21 // UNSUPPORTED: windows && gpu-intel-gen12 +// UNSUPPORTED-INTENDED: UR_DEVICE_INFO_ENQUEUE_HOST_TASK_SUPPORT_EXP is not supported on win&gen12. // RUN: %{build} %level_zero_options -o %t.out // RUN: %{run} %t.out From 78830f63954a820d542885e6dd92ecfb069480bc Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Thu, 28 May 2026 11:18:21 -0700 Subject: [PATCH 06/10] Clang format --- sycl/source/detail/scheduler/commands.cpp | 3 +-- .../RecordReplay/NativeRecording/enqueue_func_host_task.cpp | 3 ++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 75aa85b98a26f..7888785e3c60f 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -385,8 +385,7 @@ class DispatchHostTask { ur_event_handle_t HostTaskEvent{}; Queue->getAdapter().call( Queue->getHandleRef(), detail::NativeHostTask, - NativeHostTaskData.get(), - nullptr, 0, nullptr, &HostTaskEvent); + NativeHostTaskData.get(), nullptr, 0, nullptr, &HostTaskEvent); // Ownership is transferred to NativeHostTask callback on success. (void)NativeHostTaskData.release(); diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/enqueue_func_host_task.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/enqueue_func_host_task.cpp index e98901311bcd5..5978ace36778d 100644 --- a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/enqueue_func_host_task.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/enqueue_func_host_task.cpp @@ -2,7 +2,8 @@ // REQUIRES: level_zero_dev_kit // REQUIRES: arch-intel_gpu_bmg_g21 // UNSUPPORTED: windows && gpu-intel-gen12 -// UNSUPPORTED-INTENDED: UR_DEVICE_INFO_ENQUEUE_HOST_TASK_SUPPORT_EXP is not supported on win&gen12. +// UNSUPPORTED-INTENDED: UR_DEVICE_INFO_ENQUEUE_HOST_TASK_SUPPORT_EXP is not +// supported on win&gen12. // RUN: %{build} %level_zero_options -o %t.out // RUN: %{run} %t.out From 8702567c79c7443ab01cb6706468927248290cc5 Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Fri, 29 May 2026 13:28:35 -0700 Subject: [PATCH 07/10] Store callbacks in graph to manage lifetime --- sycl/source/detail/graph/graph_impl.cpp | 11 ++++++++++- sycl/source/detail/graph/graph_impl.hpp | 10 ++++++++++ sycl/source/detail/host_task.hpp | 7 ++++--- sycl/source/detail/queue_impl.hpp | 14 ++++++++++++++ sycl/source/detail/scheduler/commands.cpp | 2 +- sycl/source/handler.cpp | 11 ++++++----- 6 files changed, 45 insertions(+), 10 deletions(-) diff --git a/sycl/source/detail/graph/graph_impl.cpp b/sycl/source/detail/graph/graph_impl.cpp index 4ea11db8c1d64..f33af9f8b664e 100644 --- a/sycl/source/detail/graph/graph_impl.cpp +++ b/sycl/source/detail/graph/graph_impl.cpp @@ -11,7 +11,8 @@ #include "graph_impl.hpp" #include "dynamic_impl.hpp" // for dynamic classes #include "node_impl.hpp" // for node_impl -#include // for CG, CGExecKernel, CGHostTask, ArgDesc, NDRDescT +#include // for CG, CGExecKernel, CGHostTask, ArgDesc, NDRDescT +#include // for EnqueueHostTaskData #include // for event_impl #include // for handler_impl #include // for KernelArgMask @@ -646,6 +647,12 @@ bool graph_impl::isQueueRecording(sycl::detail::queue_impl &Queue) { return MRecordingQueues.count(Queue.weak_from_this()) > 0; } +sycl::detail::EnqueueHostTaskData *graph_impl::addNativeHostTaskCallback( + std::unique_ptr Data) { + MNativeHostTaskCallbacks.push_back(std::move(Data)); + return MNativeHostTaskCallbacks.back().get(); +} + void graph_impl::clearQueues(bool NeedsLock) { graph_impl::RecQueuesStorage SwappedQueues; { @@ -672,6 +679,7 @@ void graph_impl::clearQueues(bool NeedsLock) { "Failed to end native graph capture"); } // CapturedGraph should be the same as MNativeGraphHandle + ValidQueue->setNativeRecordingGraph(nullptr); } else { // Only call setCommandGraph for traditional recording ValidQueue->setCommandGraph(nullptr); @@ -847,6 +855,7 @@ void graph_impl::beginRecordingImpl(sycl::detail::queue_impl &Queue, throw sycl::exception(sycl::make_error_code(errc::runtime), "Failed to begin native UR graph capture"); } + Queue.setNativeRecordingGraph(shared_from_this()); } else { // Non-native recording path if (AcquireQueueLock) { diff --git a/sycl/source/detail/graph/graph_impl.hpp b/sycl/source/detail/graph/graph_impl.hpp index 8151285368fe1..af73a5a29a3ab 100644 --- a/sycl/source/detail/graph/graph_impl.hpp +++ b/sycl/source/detail/graph/graph_impl.hpp @@ -44,6 +44,7 @@ class queue_impl; class NDRDescT; class ArgDesc; class CG; +struct EnqueueHostTaskData; } // namespace detail namespace ext { @@ -550,6 +551,11 @@ class graph_impl : public std::enable_shared_from_this { /// @return True if the queue is recording to this graph, false otherwise. bool isQueueRecording(sycl::detail::queue_impl &Queue); + /// Take ownership of callback data for a native-recorded host task and + /// returns a non-owning pointer for passing to UR + detail::EnqueueHostTaskData * + addNativeHostTaskCallback(std::unique_ptr Data); + private: /// Common implementation for beginRecording and beginRecordingUnlockedQueue. /// @param[in] Queue The queue to be recorded from. @@ -628,6 +634,10 @@ class graph_impl : public std::enable_shared_from_this { /// @note Native recording requires immediate command lists. ur_exp_graph_handle_t MNativeGraphHandle = nullptr; + /// Callback data for host tasks recorded in native recording mode. + std::vector> + MNativeHostTaskCallbacks; + /// Mapping from queues to barrier nodes. For each queue the last barrier /// node recorded to the graph from the queue is stored. std::map, node_impl *, diff --git a/sycl/source/detail/host_task.hpp b/sycl/source/detail/host_task.hpp index aaad343d805f6..4d0c2bb48c45e 100644 --- a/sycl/source/detail/host_task.hpp +++ b/sycl/source/detail/host_task.hpp @@ -89,10 +89,11 @@ struct EnqueueHostTaskData { std::function Func; }; -inline void NativeHostTask(void *Data) { - auto HostTaskData = std::unique_ptr( - static_cast(Data)); +template inline void NativeHostTask(void *Data) { + auto *HostTaskData = static_cast(Data); HostTaskData->Func(); + if constexpr (OwnsData) + delete HostTaskData; } } // namespace detail diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index e5716a2a57775..03cd4bd030fe2 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -653,6 +653,17 @@ class queue_impl : public std::enable_shared_from_this { bool isNativeRecording() const; + void setNativeRecordingGraph( + const std::shared_ptr + &Graph) { + MNativeRecordingGraph = Graph; + } + + std::shared_ptr + getNativeRecordingGraph() const { + return MNativeRecordingGraph.lock(); + } + ext::oneapi::experimental::queue_state ext_oneapi_get_state_impl() const; std::shared_ptr @@ -1130,6 +1141,9 @@ class queue_impl : public std::enable_shared_from_this { // recording commands to it. std::weak_ptr MGraph{}; + std::weak_ptr + MNativeRecordingGraph{}; + unsigned long long MQueueID; static std::atomic MNextAvailableQueueID; diff --git a/sycl/source/detail/scheduler/commands.cpp b/sycl/source/detail/scheduler/commands.cpp index 7888785e3c60f..32894d97c51e1 100644 --- a/sycl/source/detail/scheduler/commands.cpp +++ b/sycl/source/detail/scheduler/commands.cpp @@ -384,7 +384,7 @@ class DispatchHostTask { std::move(HostTask.MHostTask->MHostTask)); ur_event_handle_t HostTaskEvent{}; Queue->getAdapter().call( - Queue->getHandleRef(), detail::NativeHostTask, + Queue->getHandleRef(), detail::NativeHostTask, NativeHostTaskData.get(), nullptr, 0, nullptr, &HostTaskEvent); // Ownership is transferred to NativeHostTask callback on success. (void)NativeHostTaskData.release(); diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index ace9ec48d7b68..3c81d7bee799b 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -785,14 +785,15 @@ detail::EventImplPtr handler::finalize() { "not available on this device."); } - auto CallbackData = std::make_unique( - detail::HandlerAccess::getHostTaskFunc(*HT->MHostTask)); + // Store callback in the graph so it is available during replays + auto GraphImpl = Queue->getNativeRecordingGraph(); + auto *CallbackData = GraphImpl->addNativeHostTaskCallback( + std::make_unique( + detail::HandlerAccess::getHostTaskFunc(*HT->MHostTask))); Queue->getAdapter().call( - Queue->getHandleRef(), detail::NativeHostTask, CallbackData.get(), + Queue->getHandleRef(), detail::NativeHostTask, CallbackData, nullptr, 0, nullptr, nullptr); - // Ownership transferred on success. - (void)CallbackData.release(); return detail::event_impl::create_completed_host_event(); } From a1ff4698ebe7d893bfd04876a6337e6fb568aabd Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Fri, 29 May 2026 15:20:29 -0700 Subject: [PATCH 08/10] Grab native handle using urQueueGetGraphExp directly --- sycl/source/detail/graph/graph_impl.cpp | 2 -- sycl/source/detail/queue_impl.hpp | 14 -------------- sycl/source/handler.cpp | 7 +++++-- .../level_zero/v2/command_list_manager.cpp | 10 +++++++--- 4 files changed, 12 insertions(+), 21 deletions(-) diff --git a/sycl/source/detail/graph/graph_impl.cpp b/sycl/source/detail/graph/graph_impl.cpp index f33af9f8b664e..315750c6924e0 100644 --- a/sycl/source/detail/graph/graph_impl.cpp +++ b/sycl/source/detail/graph/graph_impl.cpp @@ -679,7 +679,6 @@ void graph_impl::clearQueues(bool NeedsLock) { "Failed to end native graph capture"); } // CapturedGraph should be the same as MNativeGraphHandle - ValidQueue->setNativeRecordingGraph(nullptr); } else { // Only call setCommandGraph for traditional recording ValidQueue->setCommandGraph(nullptr); @@ -855,7 +854,6 @@ void graph_impl::beginRecordingImpl(sycl::detail::queue_impl &Queue, throw sycl::exception(sycl::make_error_code(errc::runtime), "Failed to begin native UR graph capture"); } - Queue.setNativeRecordingGraph(shared_from_this()); } else { // Non-native recording path if (AcquireQueueLock) { diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index 03cd4bd030fe2..e5716a2a57775 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -653,17 +653,6 @@ class queue_impl : public std::enable_shared_from_this { bool isNativeRecording() const; - void setNativeRecordingGraph( - const std::shared_ptr - &Graph) { - MNativeRecordingGraph = Graph; - } - - std::shared_ptr - getNativeRecordingGraph() const { - return MNativeRecordingGraph.lock(); - } - ext::oneapi::experimental::queue_state ext_oneapi_get_state_impl() const; std::shared_ptr @@ -1141,9 +1130,6 @@ class queue_impl : public std::enable_shared_from_this { // recording commands to it. std::weak_ptr MGraph{}; - std::weak_ptr - MNativeRecordingGraph{}; - unsigned long long MQueueID; static std::atomic MNextAvailableQueueID; diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 3c81d7bee799b..9c8fe0a10e2cf 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -785,8 +785,11 @@ detail::EventImplPtr handler::finalize() { "not available on this device."); } - // Store callback in the graph so it is available during replays - auto GraphImpl = Queue->getNativeRecordingGraph(); + // Store callback in the graph so it is available during replays. + ur_exp_graph_handle_t UrGraphHandle = nullptr; + Queue->getAdapter().call( + Queue->getHandleRef(), &UrGraphHandle); + auto GraphImpl = Queue->getContextImpl().getNativeGraph(UrGraphHandle); auto *CallbackData = GraphImpl->addNativeHostTaskCallback( std::make_unique( detail::HandlerAccess::getHostTaskFunc(*HT->MHostTask))); diff --git a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp index 82571666081e4..f6cfffc2b942a 100644 --- a/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/command_list_manager.cpp @@ -1418,9 +1418,7 @@ ur_result_t ur_command_list_manager::isGraphCaptureActive(bool *pResult) { } ur_result_t ur_command_list_manager::getGraph(ur_exp_graph_handle_t *phGraph) { - auto zeGetGraph = - hContext.get()->getPlatform()->ZeGraphExt.zeCommandListGetGraphExp; - if (!checkGraphExtensionSupport(hContext.get()) || !zeGetGraph) { + if (!checkGraphExtensionSupport(hContext.get())) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; } @@ -1431,6 +1429,12 @@ ur_result_t ur_command_list_manager::getGraph(ur_exp_graph_handle_t *phGraph) { } // Fork-join and implicit capture scenarios + auto zeGetGraph = + hContext.get()->getPlatform()->ZeGraphExt.zeCommandListGetGraphExp; + if (!zeGetGraph) { + return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; + } + ze_graph_handle_t hZeGraph = nullptr; ze_result_t ZeResult = ZE_CALL_NOCHECK(zeGetGraph, (getZeCommandList(), &hZeGraph)); From 77678c51033074467c2ff9a2facfcdaf5e0026b1 Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Fri, 29 May 2026 15:39:48 -0700 Subject: [PATCH 09/10] Move UR calls out of finalize --- sycl/source/handler.cpp | 30 ++++++++++++++++++++---------- 1 file changed, 20 insertions(+), 10 deletions(-) diff --git a/sycl/source/handler.cpp b/sycl/source/handler.cpp index 9c8fe0a10e2cf..63db7e50a6185 100644 --- a/sycl/source/handler.cpp +++ b/sycl/source/handler.cpp @@ -365,6 +365,23 @@ fill_copy_args(detail::handler_impl *impl, DestOffset, DestExtent, CopyExtent); } +static bool checkDeviceSupports(device_impl &DeviceImpl, + ur_device_info_t InfoQuery) { + ur_bool_t SupportsOp = false; + DeviceImpl.getAdapter().call( + DeviceImpl.getHandleRef(), InfoQuery, sizeof(ur_bool_t), &SupportsOp, + nullptr); + return SupportsOp; +} + +static std::shared_ptr +getNativeGraphImpl(queue_impl &Queue) { + ur_exp_graph_handle_t UrGraphHandle = nullptr; + Queue.getAdapter().call(Queue.getHandleRef(), + &UrGraphHandle); + return Queue.getContextImpl().getNativeGraph(UrGraphHandle); +} + } // namespace detail handler::handler(detail::handler_impl &HandlerImpl) : impl(&HandlerImpl) {} @@ -773,12 +790,8 @@ detail::EventImplPtr handler::finalize() { "native recording mode."); } - bool NativeHostTaskSupport = false; - Queue->getAdapter().call( - detail::getSyclObjImpl(Queue->get_device())->getHandleRef(), - UR_DEVICE_INFO_ENQUEUE_HOST_TASK_SUPPORT_EXP, - sizeof(NativeHostTaskSupport), &NativeHostTaskSupport, nullptr); - if (!NativeHostTaskSupport) { + if (!checkDeviceSupports(*detail::getSyclObjImpl(Queue->get_device()), + UR_DEVICE_INFO_ENQUEUE_HOST_TASK_SUPPORT_EXP)) { throw sycl::exception(make_error_code(errc::feature_not_supported), "Recording host tasks in native recording mode " "requires backend support " @@ -786,10 +799,7 @@ detail::EventImplPtr handler::finalize() { } // Store callback in the graph so it is available during replays. - ur_exp_graph_handle_t UrGraphHandle = nullptr; - Queue->getAdapter().call( - Queue->getHandleRef(), &UrGraphHandle); - auto GraphImpl = Queue->getContextImpl().getNativeGraph(UrGraphHandle); + auto GraphImpl = detail::getNativeGraphImpl(*Queue); auto *CallbackData = GraphImpl->addNativeHostTaskCallback( std::make_unique( detail::HandlerAccess::getHostTaskFunc(*HT->MHostTask))); From c61a2091d03495c1f75a550e477ed9ecd41c1292 Mon Sep 17 00:00:00 2001 From: Adam Fidel Date: Fri, 29 May 2026 15:53:45 -0700 Subject: [PATCH 10/10] Rewrite test to replay the graph twice --- .../NativeRecording/enqueue_func_host_task.cpp | 12 +++++++++++- 1 file changed, 11 insertions(+), 1 deletion(-) diff --git a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/enqueue_func_host_task.cpp b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/enqueue_func_host_task.cpp index 5978ace36778d..7e367319942ce 100644 --- a/sycl/test-e2e/Graph/RecordReplay/NativeRecording/enqueue_func_host_task.cpp +++ b/sycl/test-e2e/Graph/RecordReplay/NativeRecording/enqueue_func_host_task.cpp @@ -29,6 +29,7 @@ int main() { const sycl::device Device = Queue.get_device(); uint32_t *Data = malloc_shared(N, Queue); + std::fill(Data, Data + N, 0); ze_command_list_handle_t ZeCommandList; bool success = getCommandListFromQueue(Queue, ZeCommandList); @@ -45,7 +46,7 @@ int main() { Queue.submit([&](handler &CGH) { CGH.parallel_for(range<1>{N}, [=](id<1> idx) { - Data[idx] = static_cast(idx[0]) + 1; + Data[idx] += static_cast(idx[0]) + 1; }); }); @@ -63,6 +64,7 @@ int main() { verifier.verify(EXECUTING); auto ExecutableGraph = Graph.finalize(); + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecutableGraph); }); Queue.wait(); @@ -71,6 +73,14 @@ int main() { assert(check_value(i, Expected, Data[i], "Data")); } + Queue.submit([&](handler &CGH) { CGH.ext_oneapi_graph(ExecutableGraph); }); + Queue.wait(); + + for (size_t i = 0; i < N; i++) { + uint32_t Expected = static_cast((i + 1) * 6 + 30); + assert(check_value(i, Expected, Data[i], "Data")); + } + free(Data, Queue); return 0; }