diff --git a/sycl/source/detail/sycl_mem_obj_t.hpp b/sycl/source/detail/sycl_mem_obj_t.hpp index 4893d154a7566..60593231848a7 100644 --- a/sycl/source/detail/sycl_mem_obj_t.hpp +++ b/sycl/source/detail/sycl_mem_obj_t.hpp @@ -165,16 +165,6 @@ class SYCLMemObjT : public SYCLMemObjI { has_property(); } - bool canReadHostPtr(void *HostPtr, const size_t RequiredAlign) { - bool Aligned = - (reinterpret_cast(HostPtr) % RequiredAlign) == 0; - return Aligned || useHostPtr(); - } - - bool canReuseHostPtr(void *HostPtr, const size_t RequiredAlign) { - return !MHostPtrReadOnly && canReadHostPtr(HostPtr, RequiredAlign); - } - void handleHostData(void *HostPtr, const size_t RequiredAlign) { MHostPtrProvided = true; if (!MHostPtrReadOnly && HostPtr) { @@ -184,10 +174,16 @@ class SYCLMemObjT : public SYCLMemObjI { } if (HostPtr) { - if (canReuseHostPtr(HostPtr, RequiredAlign)) { - MUserPtr = HostPtr; - } else if (canReadHostPtr(HostPtr, RequiredAlign)) { - MUserPtr = HostPtr; + // Pass the user pointer to UR unchanged. Each adapter is responsible + // for handling pointers it cannot use directly (misaligned, not + // importable, etc.) by allocating its own backing storage and copying. + MUserPtr = HostPtr; + + // For a read-only host pointer we still need a writable backing store + // if the user later creates a write accessor. Defer the allocation + // until that happens. This is adapter-independent: the language rule + // is that we may not write through a const user pointer. + if (MHostPtrReadOnly) { std::lock_guard Lock(MCreateShadowCopyMtx); MCreateShadowCopy = [this, RequiredAlign, HostPtr]() -> void { setAlign(RequiredAlign); @@ -195,11 +191,6 @@ class SYCLMemObjT : public SYCLMemObjI { MUserPtr = MShadowCopy; std::memcpy(MUserPtr, HostPtr, MSizeInBytes); }; - } else { - setAlign(RequiredAlign); - MShadowCopy = allocateHostMem(); - MUserPtr = MShadowCopy; - std::memcpy(MUserPtr, HostPtr, MSizeInBytes); } } } @@ -218,10 +209,9 @@ class SYCLMemObjT : public SYCLMemObjI { if (!MHostPtrReadOnly) set_final_data_from_storage(); - if (canReuseHostPtr(HostPtr.get(), RequiredAlign)) { - MUserPtr = HostPtr.get(); - } else if (canReadHostPtr(HostPtr.get(), RequiredAlign)) { - MUserPtr = HostPtr.get(); + MUserPtr = HostPtr.get(); + + if (MHostPtrReadOnly) { std::lock_guard Lock(MCreateShadowCopyMtx); MCreateShadowCopy = [this, RequiredAlign, HostPtr]() -> void { setAlign(RequiredAlign); @@ -229,11 +219,6 @@ class SYCLMemObjT : public SYCLMemObjI { MUserPtr = MShadowCopy; std::memcpy(MUserPtr, HostPtr.get(), MSizeInBytes); }; - } else { - setAlign(RequiredAlign); - MShadowCopy = allocateHostMem(); - MUserPtr = MShadowCopy; - std::memcpy(MUserPtr, HostPtr.get(), MSizeInBytes); } } } diff --git a/unified-runtime/source/adapters/level_zero/helpers/memory_helpers.cpp b/unified-runtime/source/adapters/level_zero/helpers/memory_helpers.cpp index e9b037fec6037..523aa4652c90d 100644 --- a/unified-runtime/source/adapters/level_zero/helpers/memory_helpers.cpp +++ b/unified-runtime/source/adapters/level_zero/helpers/memory_helpers.cpp @@ -34,7 +34,13 @@ bool maybeImportUSM(ze_driver_handle_t hTranslatedDriver, if (ret == UR_RESULT_SUCCESS && properties.type == ZE_MEMORY_TYPE_UNKNOWN) { // Promote the host ptr to USM host memory ZeUSMImport.doZeUSMImport(hTranslatedDriver, ptr, size); - return true; + + // doZeUSMImport silently ignores driver-level failures (e.g., misaligned + // ptr), so re-query to confirm the import actually succeeded before + // reporting it to callers. + ret = getMemoryAttrs(hContext, ptr, nullptr, &properties); + return ret == UR_RESULT_SUCCESS && + properties.type != ZE_MEMORY_TYPE_UNKNOWN; } return false; } diff --git a/unified-runtime/source/adapters/opencl/context.cpp b/unified-runtime/source/adapters/opencl/context.cpp index 30f6f5b878e5e..02c436c3c2999 100644 --- a/unified-runtime/source/adapters/opencl/context.cpp +++ b/unified-runtime/source/adapters/opencl/context.cpp @@ -15,6 +15,18 @@ #include #include +cl_command_queue ur_context_handle_t_::getSyncQueue() { + std::lock_guard Lock(SyncQueueMtx); + if (!SyncQueue) { + cl_int Err; + SyncQueue = + clCreateCommandQueue(CLContext, Devices[0]->CLDevice, 0, &Err); + assert(Err == CL_SUCCESS); + (void)Err; + } + return SyncQueue; +} + ur_result_t ur_context_handle_t_::makeWithNative(native_type Ctx, uint32_t DevCount, const ur_device_handle_t *phDevices, diff --git a/unified-runtime/source/adapters/opencl/context.hpp b/unified-runtime/source/adapters/opencl/context.hpp index 4060d17d25982..d6907d3c7f05b 100644 --- a/unified-runtime/source/adapters/opencl/context.hpp +++ b/unified-runtime/source/adapters/opencl/context.hpp @@ -14,6 +14,7 @@ #include "common/ur_ref_count.hpp" #include "device.hpp" +#include #include struct ur_context_handle_t_ : ur::opencl::handle_base { @@ -24,6 +25,9 @@ struct ur_context_handle_t_ : ur::opencl::handle_base { bool IsNativeHandleOwned = true; ur::RefCount RefCount; + cl_command_queue SyncQueue = nullptr; + std::mutex SyncQueueMtx; + ur_context_handle_t_(const ur_context_handle_t_ &) = delete; ur_context_handle_t_ &operator=(const ur_context_handle_t_ &) = delete; @@ -39,7 +43,13 @@ struct ur_context_handle_t_ : ur::opencl::handle_base { static ur_result_t makeWithNative(native_type Ctx, uint32_t DevCount, const ur_device_handle_t *phDevices, ur_context_handle_t &Context); + + cl_command_queue getSyncQueue(); + ~ur_context_handle_t_() noexcept { + if (SyncQueue) { + clReleaseCommandQueue(SyncQueue); + } // If we're reasonably sure this context is about to be destroyed we should // clear the ext function pointer cache. This isn't foolproof sadly but it // should drastically reduce the chances of the pathological case described diff --git a/unified-runtime/source/adapters/opencl/memory.cpp b/unified-runtime/source/adapters/opencl/memory.cpp index 3507c003db0b0..158b6adf55bba 100644 --- a/unified-runtime/source/adapters/opencl/memory.cpp +++ b/unified-runtime/source/adapters/opencl/memory.cpp @@ -336,10 +336,53 @@ ur_result_t ur_mem_handle_t_::makeWithNative(native_type NativeMem, return UR_RESULT_SUCCESS; } +static bool canUseHostPtrDirectly(ur_context_handle_t hContext, void *HostPtr) { + if (!HostPtr) + return false; + + cl_uint MaxAlignBits = 0; + bool AllUnifiedMem = true; + + for (uint32_t I = 0; I < hContext->DeviceCount; ++I) { + cl_uint AlignBits = 0; + clGetDeviceInfo(hContext->Devices[I]->CLDevice, + CL_DEVICE_MEM_BASE_ADDR_ALIGN, sizeof(AlignBits), + &AlignBits, nullptr); + if (AlignBits > MaxAlignBits) + MaxAlignBits = AlignBits; + + cl_bool Unified = CL_FALSE; + clGetDeviceInfo(hContext->Devices[I]->CLDevice, + CL_DEVICE_HOST_UNIFIED_MEMORY, sizeof(Unified), &Unified, + nullptr); + if (!Unified) + AllUnifiedMem = false; + } + + size_t RequiredAlign = MaxAlignBits / 8; + bool IsAligned = + RequiredAlign == 0 || + (reinterpret_cast(HostPtr) % RequiredAlign) == 0; + + return IsAligned && AllUnifiedMem; +} + UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( ur_context_handle_t hContext, ur_mem_flags_t flags, size_t size, const ur_buffer_properties_t *pProperties, ur_mem_handle_t *phBuffer) { cl_int RetErr = CL_INVALID_OPERATION; + + void *HostPtr = pProperties ? pProperties->pHost : nullptr; + cl_mem_flags CLFlags = convertURMemFlagsToCL(flags); + + bool NeedsWriteBack = false; + if (HostPtr && (CLFlags & CL_MEM_USE_HOST_PTR)) { + if (!canUseHostPtrDirectly(hContext, HostPtr)) { + CLFlags = (CLFlags & ~CL_MEM_USE_HOST_PTR) | CL_MEM_COPY_HOST_PTR; + NeedsWriteBack = true; + } + } + if (pProperties) { // TODO: need to check if all properties are supported by OpenCL RT and // ignore unsupported @@ -377,11 +420,15 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( PropertiesIntel.push_back(0); try { - cl_mem Buffer = FuncPtr( - CLContext, PropertiesIntel.data(), static_cast(flags), - size, pProperties->pHost, static_cast(&RetErr)); + cl_mem Buffer = + FuncPtr(CLContext, PropertiesIntel.data(), CLFlags, size, HostPtr, + static_cast(&RetErr)); CL_RETURN_ON_FAILURE(RetErr); auto URMem = std::make_unique(Buffer, hContext); + if (NeedsWriteBack) { + URMem->WriteBackPtr = HostPtr; + URMem->Size = size; + } *phBuffer = URMem.release(); } catch (std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_RESOURCES; @@ -392,13 +439,16 @@ UR_APIEXPORT ur_result_t UR_APICALL urMemBufferCreate( } } - void *HostPtr = pProperties ? pProperties->pHost : nullptr; try { cl_mem Buffer = - clCreateBuffer(hContext->CLContext, static_cast(flags), - size, HostPtr, static_cast(&RetErr)); + clCreateBuffer(hContext->CLContext, CLFlags, size, HostPtr, + static_cast(&RetErr)); CL_RETURN_ON_FAILURE(RetErr); auto URMem = std::make_unique(Buffer, hContext); + if (NeedsWriteBack) { + URMem->WriteBackPtr = HostPtr; + URMem->Size = size; + } *phBuffer = URMem.release(); } catch (std::bad_alloc &) { return UR_RESULT_ERROR_OUT_OF_RESOURCES; diff --git a/unified-runtime/source/adapters/opencl/memory.hpp b/unified-runtime/source/adapters/opencl/memory.hpp index 025b7f4acef8b..801ed62416a32 100644 --- a/unified-runtime/source/adapters/opencl/memory.hpp +++ b/unified-runtime/source/adapters/opencl/memory.hpp @@ -22,6 +22,9 @@ struct ur_mem_handle_t_ : ur::opencl::handle_base { bool IsNativeHandleOwned = true; ur::RefCount RefCount; + void *WriteBackPtr = nullptr; + size_t Size = 0; + ur_mem_handle_t_(const ur_mem_handle_t_ &) = delete; ur_mem_handle_t_ &operator=(const ur_mem_handle_t_ &) = delete; @@ -31,6 +34,11 @@ struct ur_mem_handle_t_ : ur::opencl::handle_base { } ~ur_mem_handle_t_() { + if (WriteBackPtr && IsNativeHandleOwned) { + cl_command_queue Q = Context->getSyncQueue(); + clEnqueueReadBuffer(Q, CLMemory, CL_TRUE, 0, Size, WriteBackPtr, 0, + nullptr, nullptr); + } urContextRelease(Context); if (IsNativeHandleOwned) { clReleaseMemObject(CLMemory);