diff --git a/sycl/include/sycl/queue.hpp b/sycl/include/sycl/queue.hpp index 16068d0c81d84..ea03529d4d95f 100644 --- a/sycl/include/sycl/queue.hpp +++ b/sycl/include/sycl/queue.hpp @@ -3611,6 +3611,11 @@ class __SYCL_EXPORT queue : public detail::OwnerLessBase { /// completed, otherwise returns false. bool khr_empty() const; + /// Flushes all commands in the queue to the device, but doesn't wait for them + /// to complete unlike wait(). + /// + void khr_flush() const; + std::optional ext_oneapi_get_last_event() const { return static_cast>(ext_oneapi_get_last_event_impl()); } diff --git a/sycl/source/detail/queue_impl.cpp b/sycl/source/detail/queue_impl.cpp index 96ac7fbd17eeb..7c0f44688b741 100644 --- a/sycl/source/detail/queue_impl.cpp +++ b/sycl/source/detail/queue_impl.cpp @@ -1268,6 +1268,15 @@ bool queue_impl::queue_empty() const { return IsReady; } +void queue_impl::queue_flush() const { + if (MGraph.lock()) { + throw sycl::exception(make_error_code(errc::invalid), + "flush cannot be called for a queue which is " + "recording to a command graph."); + } + getAdapter().call(MQueue); +} + void queue_impl::revisitUnenqueuedCommandsState( const EventImplPtr &CompletedHostTask) { if (MIsInorder) diff --git a/sycl/source/detail/queue_impl.hpp b/sycl/source/detail/queue_impl.hpp index e5716a2a57775..954ed744ba831 100644 --- a/sycl/source/detail/queue_impl.hpp +++ b/sycl/source/detail/queue_impl.hpp @@ -613,6 +613,8 @@ class queue_impl : public std::enable_shared_from_this { bool queue_empty() const; + void queue_flush() const; + EventImplPtr memcpyToDeviceGlobal(void *DeviceGlobalPtr, const void *Src, bool IsDeviceImageScope, size_t NumBytes, size_t Offset, diff --git a/sycl/source/queue.cpp b/sycl/source/queue.cpp index a8588c3d3d1ce..c6f6dbc34f2ca 100644 --- a/sycl/source/queue.cpp +++ b/sycl/source/queue.cpp @@ -247,6 +247,8 @@ bool queue::ext_oneapi_empty() const { return impl->queue_empty(); } bool queue::khr_empty() const { return impl->queue_empty(); } +void queue::khr_flush() const { return impl->queue_flush(); } + void queue::ext_oneapi_prod() { impl->flush(); } ur_native_handle_t queue::getNative(int32_t &NativeHandleDesc) const { diff --git a/unified-runtime/include/unified-runtime/ur_api.h b/unified-runtime/include/unified-runtime/ur_api.h index 798423b6ae55a..6857958c88526 100644 --- a/unified-runtime/include/unified-runtime/ur_api.h +++ b/unified-runtime/include/unified-runtime/ur_api.h @@ -512,6 +512,8 @@ typedef enum ur_function_t { UR_FUNCTION_QUEUE_GET_GRAPH_EXP = 314, /// Enumerator for ::urGraphSetDestructionCallbackExp UR_FUNCTION_GRAPH_SET_DESTRUCTION_CALLBACK_EXP = 315, + /// Enumerator for ::urKhrFlush + UR_FUNCTION_KHR_FLUSH = 316, /// @cond UR_FUNCTION_FORCE_UINT32 = 0x7fffffff /// @endcond @@ -6607,6 +6609,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush( /// [in] handle of the queue to be flushed. ur_queue_handle_t hQueue); +UR_APIEXPORT ur_result_t UR_APICALL urKhrFlush( + /// [in] handle of the queue to be flushed. + ur_queue_handle_t hQueue); + #if !defined(__GNUC__) #pragma endregion #endif @@ -14700,6 +14706,14 @@ typedef struct ur_queue_flush_params_t { ur_queue_handle_t *phQueue; } ur_queue_flush_params_t; +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function parameters for urKhrFlush +/// @details Each entry is a pointer to the parameter passed to the function; +/// allowing the callback the ability to modify the parameter's value +typedef struct ur_khr_flush_params_t { + ur_queue_handle_t *phQueue; +} ur_khr_flush_params_t; + /////////////////////////////////////////////////////////////////////////////// /// @brief Function parameters for urQueueBeginGraphCaptureExp /// @details Each entry is a pointer to the parameter passed to the function; diff --git a/unified-runtime/include/unified-runtime/ur_api_funcs.def b/unified-runtime/include/unified-runtime/ur_api_funcs.def index fc481d947a8c7..5b6b67676227a 100644 --- a/unified-runtime/include/unified-runtime/ur_api_funcs.def +++ b/unified-runtime/include/unified-runtime/ur_api_funcs.def @@ -75,6 +75,7 @@ _UR_API(urKernelGetSuggestedLocalWorkSizeWithArgs) _UR_API(urKernelSetExecInfo) _UR_API(urKernelSetSpecializationConstants) _UR_API(urKernelSuggestMaxCooperativeGroupCount) +_UR_API(urKhrFlush) _UR_API(urQueueGetInfo) _UR_API(urQueueCreate) _UR_API(urQueueRetain) diff --git a/unified-runtime/include/unified-runtime/ur_ddi.h b/unified-runtime/include/unified-runtime/ur_ddi.h index cc734e6194d92..1f19ae132a2ec 100644 --- a/unified-runtime/include/unified-runtime/ur_ddi.h +++ b/unified-runtime/include/unified-runtime/ur_ddi.h @@ -623,6 +623,10 @@ typedef ur_result_t(UR_APICALL *ur_pfnQueueFinish_t)(ur_queue_handle_t); /// @brief Function-pointer for urQueueFlush typedef ur_result_t(UR_APICALL *ur_pfnQueueFlush_t)(ur_queue_handle_t); +/////////////////////////////////////////////////////////////////////////////// +/// @brief Function-pointer for urQueueFlush +typedef ur_result_t(UR_APICALL *ur_pfnKhrFlush_t)(ur_queue_handle_t); + /////////////////////////////////////////////////////////////////////////////// /// @brief Table of Queue functions pointers typedef struct ur_queue_dditable_t { @@ -634,6 +638,7 @@ typedef struct ur_queue_dditable_t { ur_pfnQueueCreateWithNativeHandle_t pfnCreateWithNativeHandle; ur_pfnQueueFinish_t pfnFinish; ur_pfnQueueFlush_t pfnFlush; + ur_pfnKhrFlush_t pfnKhrFlush; } ur_queue_dditable_t; /////////////////////////////////////////////////////////////////////////////// diff --git a/unified-runtime/source/adapters/cuda/queue.cpp b/unified-runtime/source/adapters/cuda/queue.cpp index f42f59cc8e516..420c845a90cec 100644 --- a/unified-runtime/source/adapters/cuda/queue.cpp +++ b/unified-runtime/source/adapters/cuda/queue.cpp @@ -171,6 +171,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t /*hQueue*/) { return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urKhrFlush(ur_queue_handle_t /*hQueue*/) { + return UR_RESULT_SUCCESS; +} + UR_APIEXPORT ur_result_t UR_APICALL urQueueGetNativeHandle( ur_queue_handle_t hQueue, ur_queue_native_desc_t * /*pDesc*/, ur_native_handle_t *phNativeQueue) { diff --git a/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp b/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp index 1f6434b2c140f..be438df40c560 100644 --- a/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/cuda/ur_interface_loader.cpp @@ -239,6 +239,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( pDdiTable->pfnCreateWithNativeHandle = urQueueCreateWithNativeHandle; pDdiTable->pfnFinish = urQueueFinish; pDdiTable->pfnFlush = urQueueFlush; + pDdiTable->pfnKhrFlush = urKhrFlush; pDdiTable->pfnGetInfo = urQueueGetInfo; pDdiTable->pfnGetNativeHandle = urQueueGetNativeHandle; pDdiTable->pfnRelease = urQueueRelease; diff --git a/unified-runtime/source/adapters/hip/queue.cpp b/unified-runtime/source/adapters/hip/queue.cpp index f9b58b6989a41..3736d2e5068f4 100644 --- a/unified-runtime/source/adapters/hip/queue.cpp +++ b/unified-runtime/source/adapters/hip/queue.cpp @@ -196,6 +196,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t) { return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urKhrFlush(ur_queue_handle_t) { + return UR_RESULT_SUCCESS; +} + /// Gets the native HIP handle of a UR queue object /// /// \param[in] hQueue The UR queue to get the native HIP object of. diff --git a/unified-runtime/source/adapters/hip/ur_interface_loader.cpp b/unified-runtime/source/adapters/hip/ur_interface_loader.cpp index 1d6fbdf7a9c0c..af2da615ca5b2 100644 --- a/unified-runtime/source/adapters/hip/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/hip/ur_interface_loader.cpp @@ -239,6 +239,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( pDdiTable->pfnCreateWithNativeHandle = urQueueCreateWithNativeHandle; pDdiTable->pfnFinish = urQueueFinish; pDdiTable->pfnFlush = urQueueFlush; + pDdiTable->pfnKhrFlush = urKhrFlush; pDdiTable->pfnGetInfo = urQueueGetInfo; pDdiTable->pfnGetNativeHandle = urQueueGetNativeHandle; pDdiTable->pfnRelease = urQueueRelease; diff --git a/unified-runtime/source/adapters/level_zero/queue.cpp b/unified-runtime/source/adapters/level_zero/queue.cpp index 2afab55a99c6d..4378cbb3ee908 100644 --- a/unified-runtime/source/adapters/level_zero/queue.cpp +++ b/unified-runtime/source/adapters/level_zero/queue.cpp @@ -929,6 +929,12 @@ ur_result_t urQueueFlush( return Queue->executeAllOpenCommandLists(); } +ur_result_t urKhrFlush( + /// [in] handle of the queue to be flushed. + ur_queue_handle_t Queue) { + return Queue->executeAllOpenCommandLists(); +} + ur_result_t urQueueBeginGraphCaptureExp(ur_queue_handle_t /* hQueue */) { UR_LOG_LEGACY(ERR, logger::LegacyMessage("[UR][L0] {} function not implemented!"), diff --git a/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp b/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp index dc76f7729f937..e3ed8ea67eed0 100644 --- a/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/level_zero/ur_interface_loader.cpp @@ -459,6 +459,7 @@ UR_APIEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( ur::level_zero::urQueueCreateWithNativeHandle; pDdiTable->pfnFinish = ur::level_zero::urQueueFinish; pDdiTable->pfnFlush = ur::level_zero::urQueueFlush; + pDdiTable->pfnKhrFlush = ur::level_zero::urKhrFlush; return result; } diff --git a/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp b/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp index b56cc5e803c5d..11c7d2ca448b1 100644 --- a/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp +++ b/unified-runtime/source/adapters/level_zero/ur_interface_loader.hpp @@ -254,6 +254,7 @@ ur_result_t urQueueCreateWithNativeHandle( ur_queue_handle_t *phQueue); ur_result_t urQueueFinish(ur_queue_handle_t hQueue); ur_result_t urQueueFlush(ur_queue_handle_t hQueue); +ur_result_t urKhrFlush(ur_queue_handle_t hQueue); ur_result_t urEventGetInfo(ur_event_handle_t hEvent, ur_event_info_t propName, size_t propSize, void *pPropValue, size_t *pPropSizeRet); diff --git a/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp b/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp index 0c501870bbf1f..c94b9bd5b16f6 100644 --- a/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp +++ b/unified-runtime/source/adapters/level_zero/v2/queue_api.cpp @@ -55,6 +55,11 @@ ur_result_t urQueueFlush(ur_queue_handle_t hQueue) try { } catch (...) { return exceptionToResult(std::current_exception()); } +ur_result_t urKhrFlush(ur_queue_handle_t hQueue) try { + return hQueue->get().queueFlush(); +} catch (...) { + return exceptionToResult(std::current_exception()); +} ur_result_t urEnqueueEventsWait(ur_queue_handle_t hQueue, uint32_t numEventsInWaitList, const ur_event_handle_t *phEventWaitList, diff --git a/unified-runtime/source/adapters/native_cpu/queue.cpp b/unified-runtime/source/adapters/native_cpu/queue.cpp index fb3a8c74cb15a..1026a32163673 100644 --- a/unified-runtime/source/adapters/native_cpu/queue.cpp +++ b/unified-runtime/source/adapters/native_cpu/queue.cpp @@ -94,6 +94,11 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t /*hQueue*/) { DIE_NO_IMPLEMENTATION; } +UR_APIEXPORT ur_result_t UR_APICALL urKhrFlush(ur_queue_handle_t /*hQueue*/) { + + DIE_NO_IMPLEMENTATION; +} + UR_APIEXPORT ur_result_t urQueueBeginGraphCaptureExp(ur_queue_handle_t /* hQueue */) { diff --git a/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp b/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp index c3b121bfb0cdb..e32ce43ea0be2 100644 --- a/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/native_cpu/ur_interface_loader.cpp @@ -239,6 +239,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( pDdiTable->pfnCreateWithNativeHandle = urQueueCreateWithNativeHandle; pDdiTable->pfnFinish = urQueueFinish; pDdiTable->pfnFlush = urQueueFlush; + pDdiTable->pfnKhrFlush = urKhrFlush; pDdiTable->pfnGetInfo = urQueueGetInfo; pDdiTable->pfnGetNativeHandle = urQueueGetNativeHandle; pDdiTable->pfnRelease = urQueueRelease; diff --git a/unified-runtime/source/adapters/offload/queue.cpp b/unified-runtime/source/adapters/offload/queue.cpp index 6c213e3bc6450..f8cffdf3b7370 100644 --- a/unified-runtime/source/adapters/offload/queue.cpp +++ b/unified-runtime/source/adapters/offload/queue.cpp @@ -124,6 +124,10 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t) { return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urKhrFlush(ur_queue_handle_t) { + return UR_RESULT_SUCCESS; +} + UR_APIEXPORT ur_result_t UR_APICALL urQueueBeginGraphCaptureExp(ur_queue_handle_t /* hQueue */) { return UR_RESULT_ERROR_UNSUPPORTED_FEATURE; diff --git a/unified-runtime/source/adapters/offload/ur_interface_loader.cpp b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp index ce4ab5351a527..86531a8410e21 100644 --- a/unified-runtime/source/adapters/offload/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/offload/ur_interface_loader.cpp @@ -249,6 +249,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( pDdiTable->pfnCreateWithNativeHandle = urQueueCreateWithNativeHandle; pDdiTable->pfnFinish = urQueueFinish; pDdiTable->pfnFlush = urQueueFlush; + pDdiTable->pfnKhrFlush = urKhrFlush; pDdiTable->pfnGetInfo = urQueueGetInfo; pDdiTable->pfnGetNativeHandle = urQueueGetNativeHandle; pDdiTable->pfnRelease = urQueueRelease; diff --git a/unified-runtime/source/adapters/opencl/queue.cpp b/unified-runtime/source/adapters/opencl/queue.cpp index 35d56df08e539..cdcc7ed36da5d 100644 --- a/unified-runtime/source/adapters/opencl/queue.cpp +++ b/unified-runtime/source/adapters/opencl/queue.cpp @@ -300,6 +300,12 @@ UR_APIEXPORT ur_result_t UR_APICALL urQueueFlush(ur_queue_handle_t hQueue) { return UR_RESULT_SUCCESS; } +UR_APIEXPORT ur_result_t UR_APICALL urKhrFlush(ur_queue_handle_t hQueue) { + cl_int RetErr = clFlush(hQueue->CLQueue); + CL_RETURN_ON_FAILURE(RetErr); + return UR_RESULT_SUCCESS; +} + UR_APIEXPORT ur_result_t UR_APICALL urQueueRetain(ur_queue_handle_t hQueue) { hQueue->RefCount.retain(); return UR_RESULT_SUCCESS; diff --git a/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp b/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp index 3205af2df209e..cbd2cc7a302ff 100644 --- a/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp +++ b/unified-runtime/source/adapters/opencl/ur_interface_loader.cpp @@ -223,6 +223,7 @@ UR_DLLEXPORT ur_result_t UR_APICALL urGetQueueProcAddrTable( pDdiTable->pfnCreateWithNativeHandle = urQueueCreateWithNativeHandle; pDdiTable->pfnFinish = urQueueFinish; pDdiTable->pfnFlush = urQueueFlush; + pDdiTable->pfnKhrFlush = urKhrFlush; pDdiTable->pfnGetInfo = urQueueGetInfo; pDdiTable->pfnGetNativeHandle = urQueueGetNativeHandle; pDdiTable->pfnRelease = urQueueRelease; diff --git a/unified-runtime/source/ur_api.cpp b/unified-runtime/source/ur_api.cpp index d6b7be91aa80d..1ed74e28aa91b 100644 --- a/unified-runtime/source/ur_api.cpp +++ b/unified-runtime/source/ur_api.cpp @@ -3582,6 +3582,13 @@ ur_result_t UR_APICALL urQueueFlush( return result; } +ur_result_t UR_APICALL urKhrFlush( + /// [in] handle of the queue to be flushed. + ur_queue_handle_t hQueue) { + ur_result_t result = UR_RESULT_SUCCESS; + return result; +} + /////////////////////////////////////////////////////////////////////////////// /// @brief Get event object information ///