Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
106 changes: 98 additions & 8 deletions unified-runtime/source/adapters/level_zero/v2/memory.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -251,6 +251,32 @@ void *ur_discrete_buffer_handle_t::allocateOnDevice(ur_device_handle_t hDevice,
return ptr;
}

// Returns the allocation that backs this buffer on hDevice, creating it on
// first use. An already existing allocation for the device is returned as-is
// (the requested size is not re-checked in that case).
//
// hDevice: device whose allocation is requested; must be non-null.
// size:    number of bytes to allocate when no allocation exists yet.
//
// This function never modifies activeAllocationDevice; callers update it
// themselves once the data has actually been moved. Allocation failures are
// reported through UR_CALL_THROWS.
void *ur_discrete_buffer_handle_t::ensureDeviceAlloc(ur_device_handle_t hDevice,
                                                     size_t size) {
  assert(hDevice);

  const auto deviceId = hDevice->Id.value();

  // Fast path: the device already owns an allocation for this buffer.
  void *const cached = deviceAllocations[deviceId].get();
  if (cached != nullptr) {
    return cached;
  }

  // Slow path: obtain device memory from the context's default USM pool.
  void *devMem;
  UR_CALL_THROWS(hContext->getDefaultUSMPool()->allocate(
      hContext, hDevice, nullptr, UR_USM_TYPE_DEVICE, size, &devMem));

  // The deleter captures the context by value and hands the memory back to
  // the same pool; a failed free is logged rather than propagated.
  auto releaseToPool = [hContext = this->hContext](void *mem) {
    auto ret = hContext->getDefaultUSMPool()->free(mem);
    if (ret != UR_RESULT_SUCCESS) {
      UR_LOG(ERR, "Failed to free device memory: {}", ret);
    }
  };
  deviceAllocations[deviceId] = usm_unique_ptr_t(devMem, releaseToPool);

  return devMem;
}

ur_result_t
ur_discrete_buffer_handle_t::migrateBufferTo(ur_device_handle_t hDevice,
void *src, size_t size) {
Expand Down Expand Up @@ -340,8 +366,8 @@ void *ur_discrete_buffer_handle_t::getActiveDeviceAlloc(size_t offset) {

void *ur_discrete_buffer_handle_t::getDevicePtr(
ur_device_handle_t hDevice, device_access_mode_t /*access*/, size_t offset,
size_t /*size*/, ze_command_list_handle_t /*cmdList*/,
wait_list_view & /*waitListView*/) {
size_t /*size*/, ze_command_list_handle_t cmdList,
wait_list_view &waitListView) {
TRACK_SCOPE_LATENCY("ur_discrete_buffer_handle_t::getDevicePtr");

if (!activeAllocationDevice) {
Expand All @@ -366,12 +392,76 @@ void *ur_discrete_buffer_handle_t::getDevicePtr(
activeAllocationDevice) != p2pDevices.end();

if (!p2pAccessible) {
// TODO: migrate buffer through the host
UR_LOG(WARN,
"p2p is not accessible: requesting device ptr:{} cannot access "
"allocation on device ptr:{}",
(void *)hDevice, (void *)activeAllocationDevice);
throw UR_RESULT_ERROR_UNSUPPORTED_FEATURE;
// P2P is not accessible between the two devices; migrate through the host.
UR_LOG(DEBUG,
"p2p is not accessible, migrating buffer through host: "
"src device ptr:{} -> dst device ptr:{}",
(void *)activeAllocationDevice, (void *)hDevice);

auto bufferSize = getSize();

// Allocate a USM HOST staging buffer for the migration.
void *hostBuf = nullptr;
UR_CALL_THROWS(hContext->getDefaultUSMPool()->allocate(
hContext, nullptr, nullptr, UR_USM_TYPE_HOST, bufferSize, &hostBuf));
usm_unique_ptr_t hostBufPtr(
hostBuf, [hContext = this->hContext](void *ptr) {
auto ret = hContext->getDefaultUSMPool()->free(ptr);
if (ret != UR_RESULT_SUCCESS) {
UR_LOG(ERR, "Failed to free migration staging buffer: {}", ret);
}
});

if (cmdList) {
// Order the migration relative to both the explicit wait events and any
// in-flight work already on the destination command list, then drain it
// so the host can safely read from the source device.
if (waitListView.num > 0) {
ZE2UR_CALL_THROWS(zeCommandListAppendWaitOnEvents,
(cmdList, waitListView.num, waitListView.handles));
}
ZE2UR_CALL_THROWS(zeCommandListHostSynchronize, (cmdList, UINT64_MAX));
waitListView.clear();

// The destination device's command list cannot access source device
// memory (P2P is not available), so use the source device's own
// synchronous command list for the device->host copy.
UR_CALL_THROWS(synchronousZeCopy(hContext, activeAllocationDevice,
hostBuf, getActiveDeviceAlloc(),
bufferSize));

// Use ensureDeviceAlloc instead of allocateOnDevice: the latter has a
// side-effect of setting activeAllocationDevice = hDevice immediately,
// before the copy is enqueued. activeAllocationDevice must only be
// updated after the copy is successfully complete (see below).
void *dstDevPtr = ensureDeviceAlloc(hDevice, bufferSize);

// Host memory is accessible by all devices; enqueue the host->dest
// copy on the provided command list.
ZE2UR_CALL_THROWS(
zeCommandListAppendMemoryCopy,
(cmdList, dstDevPtr, hostBuf, bufferSize, nullptr, 0, nullptr));

// Drain the command list so the staging buffer is fully consumed and
// can be freed immediately when hostBufPtr goes out of scope.
ZE2UR_CALL_THROWS(zeCommandListHostSynchronize, (cmdList, UINT64_MAX));
} else {
// Synchronous fallback when no command list is available
// (e.g. urMemGetNativeHandle).
for (uint32_t i = 0; i < waitListView.num; i++) {
ZE2UR_CALL_THROWS(zeEventHostSynchronize,
(waitListView.handles[i], UINT64_MAX));
}
waitListView.clear();

UR_CALL_THROWS(synchronousZeCopy(hContext, activeAllocationDevice,
hostBuf, getActiveDeviceAlloc(),
bufferSize));
UR_CALL_THROWS(migrateBufferTo(hDevice, hostBuf, bufferSize));
}

activeAllocationDevice = hDevice;
return getActiveDeviceAlloc(offset);
}

// TODO: see if it's better to migrate the memory to the specified device
Expand Down
4 changes: 4 additions & 0 deletions unified-runtime/source/adapters/level_zero/v2/memory.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -173,6 +173,10 @@ struct ur_discrete_buffer_handle_t : ur_mem_buffer_t {

void *getActiveDeviceAlloc(size_t offset = 0);
void *allocateOnDevice(ur_device_handle_t hDevice, size_t size);
// Ensures a device allocation exists for hDevice and returns its pointer.
// Unlike allocateOnDevice, does NOT update activeAllocationDevice, so it
// is safe to call before the data migration is complete.
void *ensureDeviceAlloc(ur_device_handle_t hDevice, size_t size);
ur_result_t migrateBufferTo(ur_device_handle_t hDevice, void *src,
size_t size);
};
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -562,12 +562,18 @@ TEST_P(urMemoryMultiResidencyTest, p2pReadFailsAfterRevokingAccess) {
}

// Verify that a USM allocation on devices[0] is NOT made resident on
// devices[1] when P2P access has not been enabled. The feature under test
// restricts residency, not hardware access: Level Zero hardware can still
// transfer data cross-device via the interconnect regardless of residency
// state, so the copy result is not checked here. The observable guarantee
// is that devices[1] free memory must not decrease by a full allocSize,
// proving the allocation was never pinned on the peer device.
// devices[1] when P2P access has not been enabled. This test runs after
// several P2P enable/disable cycles to confirm that the residency restriction
// is still enforced once P2P is turned back off.
//
// The memory check is done immediately after urUSMDeviceAlloc, without
// creating a queue or issuing any GPU work. Waiting for GPU operations to
// complete (e.g. urQueueFinish) introduces a timing window during which
// background activity — async driver cleanup from earlier tests, other
// concurrent GPU workloads on shared CI hardware — can change the free-memory
// reading on devices[1] and cause spurious failures. The allocation step
// alone is sufficient to trigger any peer-residency side-effects, so
// measuring immediately after it keeps the window as short as possible.
TEST_P(urMemoryMultiResidencyTest, allocationNotResidentOnPeerWithoutP2P) {
constexpr size_t allocSize = kAllocSize;
static constexpr uint8_t fillPattern = 0xAB;
Expand Down Expand Up @@ -596,8 +602,8 @@ TEST_P(urMemoryMultiResidencyTest, allocationNotResidentOnPeerWithoutP2P) {
// Allocate on devices[0] WITHOUT enabling P2P — must not consume
// devices[1] memory.
void *srcPtr = nullptr;
ASSERT_NO_FATAL_FAILURE(
allocAndFillOnDevice0(allocSize, fillPattern, &srcPtr));
ASSERT_SUCCESS(urUSMDeviceAlloc(context, devices[0], nullptr, nullptr,
allocSize, &srcPtr));

uint64_t currentMemFreePeer = 0;
ur_result_t memRes =
Expand Down
1 change: 1 addition & 0 deletions unified-runtime/test/conformance/enqueue/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -14,6 +14,7 @@ add_conformance_kernels_test(enqueue
urEnqueueMemBufferCopy.cpp
urEnqueueMemBufferFill.cpp
urEnqueueMemBufferMap.cpp
urEnqueueMemBufferMultiDeviceMigration.cpp
urEnqueueMemBufferRead.cpp
urEnqueueMemBufferReadRect.cpp
urEnqueueMemBufferWrite.cpp
Expand Down
Original file line number Diff line number Diff line change
@@ -0,0 +1,163 @@
// Part of the LLVM Project, under the Apache License v2.0 with LLVM
// Exceptions. See https://llvm.org/LICENSE.txt for license information.
//
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
//
// Multi-device buffer tests that stress host-staged migration when a discrete
// buffer is accessed from different devices/queues (for example when device
// peer access is not available). Corresponds to L0 v2 discrete-buffer
// getDevicePtr migration ordering.
//
// The tests cover two migration paths inside getDevicePtr:
// - Async path (cmdList != nullptr): triggered by urEnqueueMem* operations.
// - Sync fallback (cmdList == nullptr): triggered by urMemGetNativeHandle.

#include <uur/fixtures.h>
#include <vector>

// Fixture for the host-staged buffer migration tests. On top of the
// multi-device buffer/queue fixture it skips the test unless:
//   1. at least two devices are present,
//   2. devices[0] and devices[1] both report the experimental USM P2P
//      extension, and
//   3. peer access from devices[0] to devices[1] is NOT reported as
//      supported (otherwise the host-migration fallback would not run).
struct urEnqueueMemBufferMultiDeviceMigrationTest
    : uur::urMultiDeviceMemBufferQueueTest {
  void SetUp() override {
    UUR_RETURN_ON_FATAL_FAILURE(uur::urMultiDeviceMemBufferQueueTest::SetUp());

    // A migration needs a source and a destination device.
    if (devices.size() < 2) {
      GTEST_SKIP() << "Test requires at least 2 devices";
    }

    // Both devices of the pair must expose the USM P2P extension, since the
    // peer-access query below belongs to it.
    for (size_t deviceIdx = 0; deviceIdx != 2; ++deviceIdx) {
      ur_bool_t hasP2PExtension = false;
      ASSERT_SUCCESS(urDeviceGetInfo(
          devices[deviceIdx], UR_DEVICE_INFO_USM_P2P_SUPPORT_EXP,
          sizeof(hasP2PExtension), &hasP2PExtension, nullptr));
      if (!hasP2PExtension) {
        GTEST_SKIP() << "EXP usm p2p feature is not supported on device "
                     << deviceIdx;
      }
    }

    // The tests target the host-mediated fallback, which only runs when the
    // two devices cannot access each other directly. A failed query is
    // treated the same as "no peer access", so the tests still run.
    int peerAccessSupported = 0;
    const ur_result_t queryResult = urUsmP2PPeerAccessGetInfoExp(
        devices[0], devices[1], UR_EXP_PEER_INFO_UR_PEER_ACCESS_SUPPORT,
        sizeof(peerAccessSupported), &peerAccessSupported, nullptr);
    const bool hardwareP2PAvailable =
        (queryResult == UR_RESULT_SUCCESS) && peerAccessSupported;
    if (hardwareP2PAvailable) {
      GTEST_SKIP() << "Devices have P2P access; host-migration path is not "
                      "exercised";
    }
  }
};
UUR_INSTANTIATE_PLATFORM_TEST_SUITE(urEnqueueMemBufferMultiDeviceMigrationTest);

// Fills the buffer on queues[0] without waiting, then performs a blocking
// read on queues[1] that depends on the fill event. The read must observe the
// fill pattern in every element.
TEST_P(urEnqueueMemBufferMultiDeviceMigrationTest,
       AsyncFillThenReadOnSecondQueueWithWait) {
  const uint32_t fillValue = 0xA5A5A501;

  // Producer: non-blocking fill on the first queue.
  ur_event_handle_t fillDone = nullptr;
  ASSERT_SUCCESS(urEnqueueMemBufferFill(queues[0], buffer, &fillValue,
                                        sizeof(fillValue), 0, size, 0, nullptr,
                                        &fillDone));
  ASSERT_NE(fillDone, nullptr);

  // Consumer: blocking read on the second queue, ordered after the fill
  // through the wait list.
  std::vector<uint32_t> readBack(count, 0);
  ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, true, 0, size,
                                        readBack.data(), 1, &fillDone,
                                        nullptr));

  ASSERT_SUCCESS(urEventRelease(fillDone));

  for (size_t idx = 0; idx < count; ++idx) {
    ASSERT_EQ(fillValue, readBack[idx]) << "Mismatch at index " << idx;
  }
}

// Bounces the buffer between the two queues:
//   stage 1: fill on queues[0], blocking read on queues[1];
//   stage 2: fill on queues[1], blocking read on queues[0].
// Each read waits on the preceding fill's event and must observe that fill's
// pattern in every element. Mismatch messages name the stage and the element
// index so a failure can be attributed to one direction.
TEST_P(urEnqueueMemBufferMultiDeviceMigrationTest,
       PingPongFillBetweenTwoDeviceQueues) {
  // Stage 1: queues[0] -> queues[1].
  const uint32_t pattern1 = 0xC001D00u;
  ur_event_handle_t evFill1 = nullptr;
  ASSERT_SUCCESS(urEnqueueMemBufferFill(queues[0], buffer, &pattern1,
                                        sizeof(pattern1), 0, size, 0, nullptr,
                                        &evFill1));
  ASSERT_NE(evFill1, nullptr);

  std::vector<uint32_t> stage1(count, 0);
  ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, true, 0, size,
                                        stage1.data(), 1, &evFill1, nullptr));
  ASSERT_SUCCESS(urEventRelease(evFill1));
  for (size_t i = 0; i < count; ++i) {
    ASSERT_EQ(pattern1, stage1[i])
        << "Mismatch at index " << i
        << " (stage 1: fill on queues[0], read on queues[1])";
  }

  // Stage 2: queues[1] -> queues[0].
  const uint32_t pattern2 = 0xD00DAD00u;
  ur_event_handle_t evFill2 = nullptr;
  ASSERT_SUCCESS(urEnqueueMemBufferFill(queues[1], buffer, &pattern2,
                                        sizeof(pattern2), 0, size, 0, nullptr,
                                        &evFill2));
  ASSERT_NE(evFill2, nullptr);

  std::vector<uint32_t> stage2(count, 0);
  ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[0], buffer, true, 0, size,
                                        stage2.data(), 1, &evFill2, nullptr));
  ASSERT_SUCCESS(urEventRelease(evFill2));
  for (size_t i = 0; i < count; ++i) {
    ASSERT_EQ(pattern2, stage2[i])
        << "Mismatch at index " << i
        << " (stage 2: fill on queues[1], read on queues[0])";
  }
}

// Chains operations across the two queues using events:
//   step A: fill on queues[0], blocking read on queues[1] waiting on the fill;
//   step B: blocking write on queues[1], blocking read on queues[0] waiting on
//           the write.
// Each read must return the data produced by the preceding operation.
// Mismatch messages name the step and the element index so a failure can be
// attributed to one direction.
TEST_P(urEnqueueMemBufferMultiDeviceMigrationTest,
       ChainedAsyncOpsAcrossQueuesWithEvents) {
  // Step A: produce on queues[0], consume on queues[1].
  const uint32_t patternA = 0x11111111u;
  ur_event_handle_t evFill = nullptr;
  ASSERT_SUCCESS(urEnqueueMemBufferFill(queues[0], buffer, &patternA,
                                        sizeof(patternA), 0, size, 0, nullptr,
                                        &evFill));
  ASSERT_NE(evFill, nullptr);

  std::vector<uint32_t> verifyA(count, 0);
  ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, true, 0, size,
                                        verifyA.data(), 1, &evFill, nullptr));
  ASSERT_SUCCESS(urEventRelease(evFill));
  for (size_t i = 0; i < count; ++i) {
    ASSERT_EQ(patternA, verifyA[i])
        << "Mismatch at index " << i
        << " (step A: fill on queues[0], read on queues[1])";
  }

  // Step B: overwrite from host memory on queues[1], consume on queues[0].
  const uint32_t patternB = 0x22222222u;
  std::vector<uint32_t> hostB(count, patternB);
  ur_event_handle_t evWrite = nullptr;
  ASSERT_SUCCESS(urEnqueueMemBufferWrite(queues[1], buffer, true, 0, size,
                                         hostB.data(), 0, nullptr, &evWrite));
  ASSERT_NE(evWrite, nullptr);

  std::vector<uint32_t> verifyB(count, 0);
  ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[0], buffer, true, 0, size,
                                        verifyB.data(), 1, &evWrite, nullptr));
  ASSERT_SUCCESS(urEventRelease(evWrite));
  for (size_t i = 0; i < count; ++i) {
    ASSERT_EQ(patternB, verifyB[i])
        << "Mismatch at index " << i
        << " (step B: write on queues[1], read on queues[0])";
  }
}

// Covers the migration path taken when no command list is supplied to
// getDevicePtr (cmdList == nullptr), reached through urMemGetNativeHandle.
// Sequence: fill on queues[0] and wait for completion, request the buffer's
// native handle for devices[1], then read back on queues[1] and compare with
// the fill pattern.
TEST_P(urEnqueueMemBufferMultiDeviceMigrationTest,
       SyncFallbackMigrationViaNativeHandle) {
  const uint32_t fillValue = 0xDEADBEEFu;

  // Populate the buffer from the first device and drain its queue, so no
  // event is needed to order the following steps.
  ASSERT_SUCCESS(urEnqueueMemBufferFill(queues[0], buffer, &fillValue,
                                        sizeof(fillValue), 0, size, 0, nullptr,
                                        nullptr));
  ASSERT_SUCCESS(urQueueFinish(queues[0]));

  // Asking for the native handle on the second device calls getDevicePtr
  // with cmdList == nullptr, which performs the device->host->device
  // migration synchronously.
  ur_native_handle_t nativeHandle = 0;
  ASSERT_SUCCESS(urMemGetNativeHandle(buffer, devices[1], &nativeHandle));
  ASSERT_NE(nativeHandle, static_cast<ur_native_handle_t>(0));

  // Blocking read on the second device's queue to check the migrated data.
  std::vector<uint32_t> readBack(count, 0);
  ASSERT_SUCCESS(urEnqueueMemBufferRead(queues[1], buffer, true, 0, size,
                                        readBack.data(), 0, nullptr, nullptr));
  for (size_t idx = 0; idx < count; ++idx) {
    ASSERT_EQ(fillValue, readBack[idx]) << "Mismatch at index " << idx;
  }
}
Loading